commit d09d6f1cc0cd2b144148a189b92383dd5e920896
Author: feie9456 <feie9454@gmail.com>
Date:   Wed Mar 18 12:54:00 2026 +0800

    添加AI服务，支持麦克风、摄像头输入

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a27acfc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+*.iml
+.gradle
+/local.properties
+/.idea/
+.DS_Store
+/build/
+*/build/
+*/*/build/
+*/debug/
+*/release/
+/captures
+.externalNativeBuild
+.cxx
+*.bat
+*.apk
+output-metadata.json
+# app用到zip的请忽略
+# 自定义了local.properties的请删除这条
+local.properties
+~$*
+gradlew
+.idea
+# 自定义了根目录gradle/gradle-wrapper.properties的请删除这条（不推荐自定义）
+#gradle
+build
+
+# 预编译的静态库和第三方二进制文件（opencv-mobile 等）
+# 这些文件体积大、为二进制，应从发布源获取而非纳入版本控制
+*.a
+*.so
+duix-sdk/src/main/cpp/third/opencv-mobile-*/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d65d7d3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,405 @@
+# Duix Mobile for Android SDK Documentation
+
+English | [中文](./README_zh.md)
+
+## 1. Product Overview
+
+`Duix Mobile for Android` is a lightweight, fully offline 2D digital human solution for Android, supporting real-time rendering of digital avatars driven by voice audio.
+
+### 1.1 Application Scenarios
+
+- **Low deployment cost**: Suitable for unattended scenarios such as large-screen terminals, government halls, and banks.
+- **Minimal network dependency**: Runs entirely locally, no internet required, stable operation in subways and remote areas.
+- **Diverse functionality**: Can serve as a guide, Q&A customer service, intelligent companion, and more.
+
+### 1.2 Core Features
+
+- Customizable digital avatar and local rendering
+- Real-time voice-driven playback (supports WAV playback and PCM streaming)
+- Motion playback control (specific or random actions)
+- Automatic resource download management
+
+---
+
+## 2. Terminology
+
+| Term               | Meaning                                                                    |
+|--------------------|----------------------------------------------------------------------------|
+| PCM                | Pulse-Code Modulation, raw audio stream with 16kHz sample rate, 16-bit depth, Mono channel |
+| WAV                | An audio file format that supports PCM encoding, suitable for short voice playback |
+| RenderSink         | Rendering data reception interface, implemented by the SDK, can be used for custom rendering or default display |
+| DUIX               | Main control object of the digital human, integrates model loading, rendering, broadcasting, and motion control |
+| GLES               | OpenGL ES, a graphics interface for rendering images on Android               |
+| SpecialAction      | A JSON file attached to the model that marks action intervals (e.g., greetings, waving) |
+
+---
+
+## 3. SDK Access
+
+### 3.1 Module Reference (Recommended)
+
+1. Obtain the complete source package, unzip it, and copy the `duix-sdk` directory to the project root directory.
+2. In the project `settings.gradle`, add:
+
+```gradle
+include ':duix-sdk'
+```
+
+3. In the module's `build.gradle`, add the dependency:
+
+```gradle
+dependencies {
+    api project(":duix-sdk")
+}
+```
+
+### 3.2 AAR Reference (Optional)
+
+1. Place the `duix-sdk-release.aar` built from the `duix-sdk` module into the `libs/` directory.
+2. Add the dependency:
+
+```gradle
+dependencies {
+    api fileTree(include: ['*.jar', '*.aar'], dir: 'libs')
+}
+```
+
+---
+
+## 4. Integration Requirements
+
+| Item           | Description                                                     |
+|----------------|-----------------------------------------------------------------|
+| System         | Supports Android 10+ systems.                                    |
+| CPU Architecture | armeabi-v7a, arm64-v8a                                           |
+| Hardware Requirements | Device CPU with 8 or more cores (Snapdragon 8 Gen 2), 8GB or more memory, available storage space of 1GB or more |
+| Network        | None (Fully local operation)                                    |
+| Development IDE | Android Studio Giraffe 2022.3.1 Patch 2                         |
+| Memory Requirements | Minimum 800MB memory available for the digital human          |
+
+---
+
+## 5. Usage Flow Overview
+
+```mermaid
+graph TD
+A[Check Configuration and Models] --> B[Build DUIX Instance]
+B --> C[Call init to Initialize]
+C --> D[Display Avatar / Render]
+D --> E[PCM or WAV Audio Driving]
+E --> F[Playback Control & Motion Triggering]
+F --> G[Resource Release]
+```
+
+---
+
+## 6. Key Interfaces and Example Calls
+
+### 6.1 Model Check and Download
+
+Before using the rendering service, ensure that the basic configuration and model files are synchronized to local storage. The SDK provides a simple demonstration of the model download and decompression process using `VirtualModelUtil`. If model download is slow or fails, developers can choose to cache the model package to their own storage service.
+
+> Function Definition: `ai.guiji.duix.sdk.client.VirtualModelUtil`
+
+```
+// Check if base configuration is downloaded
+boolean checkBaseConfig(Context context)
+
+// Check if the model is downloaded
+boolean checkModel(Context context, String name)
+
+// Base configuration download
+void baseConfigDownload(Context context, String url, ModelDownloadCallback callback)
+
+// Model download
+void modelDownload(Context context, String modelUrl, ModelDownloadCallback callback)
+```
+
+`ModelDownloadCallback` includes progress, completion, failure callbacks, etc., as defined in the SDK.
+
+```
+interface ModelDownloadCallback {
+    // Download progress
+    void onDownloadProgress(String url, long current, long total);
+    // Unzip progress
+    void onUnzipProgress(String url, long current, long total);
+    // Download and unzip complete
+    void onDownloadComplete(String url, File dir);
+    // Download and unzip failed
+    void onDownloadFail(String url, int code, String msg);
+}
+```
+
+**Call Example**:
+
+```kotlin
+if (!VirtualModelUtil.checkBaseConfig(mContext)){
+    VirtualModelUtil.baseConfigDownload(mContext, baseConfigUrl, callback)
+}
+```
+
+```kotlin
+if (!VirtualModelUtil.checkModel(mContext, modelUrl)){
+    VirtualModelUtil.modelDownload(mContext, modelUrl, callback)
+}
+```
+
+---
+
+### 6.2 Initialization and Rendering Start
+
+In the `onCreate()` stage of the rendering page, build the DUIX object and call the init interface.
+
+> Function Definition: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+// Build DUIX object
+public DUIX(Context context, String modelName, RenderSink sink, Callback callback)
+
+// Initialize DUIX service
+void init()
+```
+
+**DUIX Object Construction Explanation**:
+
+| Parameter     | Type      | Description                                                    |
+|---------------|-----------|----------------------------------------------------------------|
+| context       | Context   | System context                                                  |
+| modelName     | String    | The model's download URL (once the model has been downloaded) or the cached file name |
+| sink          | RenderSink| Rendering data interface; the SDK provides a default rendering component that implements this interface, or you can implement it yourself |
+| callback      | Callback  | Various callback events handled by the SDK                      |
+
+Where **Callback** is defined as: `ai.guiji.duix.sdk.client.Callback`
+
+```
+interface Callback {
+    void onEvent(String event, String msg, Object info);
+}
+```
+
+**Call Example**:
+
+```kotlin
+duix = DUIX(mContext, modelUrl, mDUIXRender) { event, msg, info ->
+    when (event) {
+        ai.guiji.duix.sdk.client.Constant.CALLBACK_EVENT_INIT_READY -> {
+            initOK()
+        }
+
+        ai.guiji.duix.sdk.client.Constant.CALLBACK_EVENT_INIT_ERROR -> {
+            initError()
+        }
+        // ...
+    }
+}
+// Asynchronous callback result
+duix?.init()
+```
+
+In the `init` callback, confirm the initialization result.
+
+---
+
+### 6.3 Digital Human Avatar Display
+
+Use the SDK-provided `DUIXRenderer` and `DUIXTextureView` to quickly implement rendering with transparency support. Alternatively, you can implement the `RenderSink` interface to customize the rendering logic.
+
+The **RenderSink** definition is as follows: `ai.guiji.duix.sdk.client.render.RenderSink`
+
+```java
+/**
+ * Rendering pipeline, returns rendering data through this interface
+ */
+public interface RenderSink {
+
+    // The frame's buffer data is arranged in BGR order
+    void onVideoFrame(ImageFrame imageFrame);
+
+}
+```
+
+**Call Example**:
+
+Use `DUIXRenderer` and `DUIXTextureView` to quickly implement rendering. These controls support transparency and can freely set the background and foreground.
+
+```kotlin
+override fun onCreate(savedInstanceState: Bundle?) {
+    super.onCreate(savedInstanceState)
+    // ...
+    mDUIXRender =
+        DUIXRenderer(
+            mContext,
+            binding.glTextureView
+        )
+
+    binding.glTextureView.setEGLContextClientVersion(GL_CONTEXT_VERSION)
+    binding.glTextureView.setEGLConfigChooser(8, 8, 8, 8, 16, 0) // Transparency
+    binding.glTextureView.isOpaque = false           // Transparency
+    binding.glTextureView.setRenderer(mDUIXRender)
+    binding.glTextureView.renderMode =
+        GLSurfaceView.RENDERMODE_WHEN_DIRTY      // Must be called after setting the renderer
+
+    duix = DUIX(mContext, modelUrl, mDUIXRender) { event, msg, _ ->
+    }
+    // ...
+}
+```
+
+---
+
+### 6.4 Broadcasting Control
+
+#### Use Streaming PCM to Drive Digital Human Broadcasting
+
+**PCM Format: 16kHz sample rate, single channel, 16-bit depth**
+
+> Function Definition: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+// Notify service to start pushing audio
+void startPush()
+
+// Push PCM data
+void pushPcm(byte[] buffer)
+
+// Finish a segment of audio push (Call this after the audio push is complete, not after playback finishes)
+void stopPush()
+```
+
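+`startPush`, `pushPcm`, and `stopPush` must be used together: every `startPush` must be matched by a `stopPush`, with the `pushPcm` calls in between. The audio pushed with `pushPcm` should not be too long. After the entire audio segment has been pushed, call `stopPush` to end the session, and call `startPush` again to begin the next segment.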
+
+**The audio pushed between each `startPush` and `stopPush` must be at least 1 second long (32000 bytes); otherwise lip-sync driving is not triggered. Shorter audio can be padded with blank frames.**
+
+**Call Example**:
+
+```kotlin
+val thread = Thread {
+            duix?.startPush()
+            val inputStream = assets.open("pcm/2.pcm")
+            val buffer = ByteArray(320)
+            var length = 0
+            while (inputStream.read(buffer).also { length = it } > 0){
+                val data = buffer.copyOfRange(0, length)
+                duix?.pushPcm(data)
+            }
+            duix?.stopPush()
+            inputStream.close()
+}
+thread.start()
+```
+
+---
+
+### 6.5 Motion Control
+
+#### Play Specific Motion Interval
+
+The model supports new motion intervals marked in `SpecialAction.json`
+
+> Function Definition: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+/**
+ * Play specific motion interval
+ * @param name The motion interval name, which can be obtained from @{ModelInfo.getSilenceRegion()} after init callback
+ * @param now Whether to play immediately: true: play now; false: wait for current silent or motion interval to finish
+ */
+void startMotion(String name, boolean now)
+```
+
+**Call Example**:
+
+```kotlin
+duix?.startMotion("Greeting", true)
+```
+
+#### Randomly Play Motion Interval
+
+> Function Definition: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+/**
+ * Randomly play a motion interval
+ * @param now Whether to play immediately: true: play now; false: wait for current silent or motion interval to finish
+ */
+void startRandomMotion(boolean now);
+```
+
+**Call Example**:
+
+```kotlin
+duix?.startRandomMotion(true)
+```
+
+---
+
+## 7. Proguard Configuration
+
+If using obfuscation, add the following in `proguard-rules.pro`:
+
+```proguard
+-keep class ai.guiji.duix.DuixNcnn{*; }
+```
+
+---
+
+## 8. Precautions
+
+1. Ensure that the base configuration file and model are downloaded to the specified location before driving rendering initialization.
+2. PCM audio should not be too long: PCM buffers are cached in memory, so very long audio streams may cause out-of-memory errors.
+3. To replace the preview model, modify the `modelUrl` value in `MainActivity.kt` and use the SDK's built-in file download and decompression management to obtain the complete model files.
+4. Audio driving format: 16kHz sample rate, single channel, 16-bit depth.
+5. Insufficient device performance may result in the audio feature extraction speed not matching the playback speed. You can use `duix?.setReporter()` to monitor frame rendering information.
+
+---
+
+## 9. FAQ and Troubleshooting Guide
+
+| Issue                          | Possible Cause               | Solution                     |
+|---------------------------------|------------------------------|------------------------------|
+| init callback failed            | Model path error or model not downloaded | Use `checkModel` to check model status |
+| Rendering black screen          | EGL configuration or texture view error | Use SDK-provided example settings |
+| No PCM playback effect          | Incorrect format or `startPush` not called | Ensure audio format is correct and call push method |
+| Model download slow             | Unstable network or restricted CDN | Host the model files on your own storage service |
+
+---
+
+## 10. Version History
+
+**<a>4.0.1</a>**
+
+1. Supports PCM audio stream driving the digital human, improving audio playback response speed.
+2. Optimized motion interval playback, allowing specific motion intervals based on model configuration.
+3. Custom audio player, removed Exoplayer playback dependency.
+4. Provided simplified model download synchronization management tools.
+5. The audio pushed between each `startPush` and `stopPush` must be at least 1 second long (32000 bytes); otherwise lip-sync driving is not triggered. Shorter audio can be padded with blank frames.
+
+**<a>3.0.5</a>**
+
+```text
+1. Updated arm32 CPU libonnxruntime.so version to fix compatibility issues.
+2. Modified motion interval playback function, supports random and sequential playback, requires manual call to stop playback to return to silent interval.
+```
+
+**<a>3.0.4</a>**
+
+```text
+1. Fixed model display issue due to low float precision on some devices.
+```
+
+**<a>3.0.3</a>**
+
+```text
+1. Optimized local rendering.
+```
+
+## 11. 🔗 Open-source Dependencies
+
+| Module                                   | Description                    |
+|------------------------------------------|--------------------------------|
+| [onnx](https://github.com/onnx/onnx)     | General AI model standard format |
+| [ncnn](https://github.com/Tencent/ncnn)  | High-performance neural network computing framework (Tencent) |
+
+---
+
+For more help, please contact the technical support team.
\ No newline at end of file
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000..d16e9b6
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,485 @@
+# Duix Mobile for Android SDK 文档
+
+中文 | [English](./README.md)
+
+## 一、产品介绍
+
+`Duix Mobile for Android` 是一套轻量级、纯离线的 Android 平台 2D 虚拟人解决方案，支持通过语音音频驱动数字人形象并进行实时渲染。
+
+### 1.1 应用场景
+
+- **部署成本低**：适用于大屏终端、政务展厅、银行等无人值守场景。
+- **网络依赖小**：完全本地运行，无需联网，可在地铁、偏远地区稳定运行。
+- **功能多样化**：可服务于导览讲解、问答客服、智能陪伴等多种业务形态。
+
+### 1.2 核心功能
+
+- 数字人形象定制与本地渲染
+- 实时语音驱动播报（支持 WAV 播放和 PCM 推送）
+- 动作播放控制（指定动作、随机动作）
+- 资源自动下载管理
+
+---
+
+## 二、术语说明
+
+| 术语                | 含义                                                                     |
+|-------------------|------------------------------------------------------------------------|
+| PCM               | Pulse-Code Modulation，16kHz 采样率、16bit 位深、Mono 单通道的原始音频流                |
+| WAV               | 一种音频文件格式，支持 PCM 编码，适合短语音播放                                             |
+| RenderSink        | 渲染数据接收接口，由 SDK 提供实现，可用于自定义渲染或默认展示                                      |
+| DUIX              | 数字人主控对象，集成了模型加载、渲染、播报、动作等能力                                            |
+| GLES              | OpenGL ES，Android 渲染图像用到的图形接口                                          |
+| SpecialAction     | 模型附带的 JSON 文件，标注动作区间（例如打招呼、挥手等）                                        |
+
+---
+
+## 三、SDK 获取方式
+
+### 3.1 Module 引用（推荐）
+
+1. 获取完整源码包，解压后将 `duix-sdk` 目录复制到项目根目录下。
+2. 在项目 `settings.gradle` 中添加：
+
+```gradle
+include ':duix-sdk'
+```
+
+3. 在模块 `build.gradle` 中添加依赖：
+
+```gradle
+dependencies {
+    api project(":duix-sdk")
+}
+```
+
+### 3.2 AAR 引用（可选）
+
+1. 将duix-sdk模块编译的 `duix-sdk-release.aar` 放入 `libs/` 目录。
+2. 添加依赖：
+
+```gradle
+dependencies {
+    api fileTree(include: ['*.jar', '*.aar'], dir: 'libs')
+}
+```
+
+---
+
+## 四、集成要求
+
+| 项目     | 描述                                                 |
+|--------|----------------------------------------------------|
+| 系统     | 支持 Android 10+ 系统。                                 |
+| CPU架构  | armeabi-v7a, arm64-v8a                             |
+| 硬件要求   | 要求设备 CPU8 核及以上(骁龙8 Gen2),内存 8G 及以上。可用存储空间 1GB 及以上。 |
+| 网络     | 无（完全本地运行）                                          |
+| 开发 IDE | Android Studio Giraffe 2022.3.1 Patch 2            |
+| 内存要求   | 可用于数字人的内存 >= 800MB                                 |
+
+
+**编译项目的 Gradle 使用的 JDK 版本为 17，需要在 File -> Settings -> Build, Execution, Deployment -> Build Tools -> Gradle -> Gradle JDK 中选择一个 17 版本的 JDK。**
+
+---
+
+## 五、使用流程概览
+
+```mermaid
+graph TD
+A[检查配置与模型] --> B[构建 DUIX 实例]
+B --> C[调用 init 初始化]
+C --> D[展示形象 / 渲染]
+D --> E[PCM 或 WAV 音频驱动]
+E --> F[播放控制与动作触发]
+F --> G[资源释放]
+```
+
+---
+
+## 六、关键接口与调用示例
+
+### 6.1. 模型检查及下载
+
+使用渲染服务前需要将基础配置及模型文件同步到本地存储中,SDK中提供了VirtualModelUtil简单演示了模型下载解压流程。
+若模型下载过慢或无法下载，开发者可以选择将模型包缓存到自己的存储服务。
+
+> 函数定义: `ai.guiji.duix.sdk.client.VirtualModelUtil`
+
+```
+// 检查基础配置是否已下载
+boolean checkBaseConfig(Context context)
+
+// 检查模型是否已下载
+boolean checkModel(Context context, String name)
+
+// 基础配置下载
+void baseConfigDownload(Context context, String url, ModelDownloadCallback callback)
+
+// 模型下载
+void modelDownload(Context context, String modelUrl, ModelDownloadCallback callback)
+```
+
+`ModelDownloadCallback` 包含进度、完成、失败等回调，详见 SDK 定义。
+
+```
+interface ModelDownloadCallback {
+    // 下载进度
+    void onDownloadProgress(String url, long current, long total);
+    // 解压进度
+    void onUnzipProgress(String url, long current, long total);
+    // 下载解压完成
+    void onDownloadComplete(String url, File dir);
+    // 下载解压失败
+    void onDownloadFail(String url, int code, String msg);
+}
+```
+
+**调用示例**:
+
+```kotlin
+if (!VirtualModelUtil.checkBaseConfig(mContext)){
+    VirtualModelUtil.baseConfigDownload(mContext, baseConfigUrl, callback)
+}
+```
+
+```kotlin
+if (!VirtualModelUtil.checkModel(mContext, modelUrl)){
+    VirtualModelUtil.modelDownload(mContext, modelUrl, callback)
+}
+
+```
+
+---
+
+### 6.2. 初始化与渲染启动
+
+在渲染页onCreate()阶段构建DUIX对象并调用init接口
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+// 构建DUIX对象
+public DUIX(Context context, String modelName, RenderSink sink, Callback callback)
+
+// 初始化DUIX服务
+void init()
+```
+
+**DUIX对象构建说明**:
+
+| 参数         | 类型         | 描述                                  |
+|------------|------------|-------------------------------------|
+| context    | Context    | 系统上下文                               |
+| modelName  | String     | 可以传递模型下载的URL(已下载完成)或缓存的文件名          |
+| sink       | RenderSink | 渲染数据接口，sdk提供了默认的渲染组件继承自该接口，也可以自己实现  |
+| callback   | Callback   | SDK处理的各种回调事件                        |
+
+
+其中**Callback**的定义: `ai.guiji.duix.sdk.client.Callback`
+
+```
+interface Callback {
+    void onEvent(String event, String msg, Object info);
+}
+```
+
+**调用示例**:
+
+```kotlin
+duix = DUIX(mContext, modelUrl, mDUIXRender) { event, msg, info ->
+    when (event) {
+        ai.guiji.duix.sdk.client.Constant.CALLBACK_EVENT_INIT_READY -> {
+            initOK()
+        }
+
+        ai.guiji.duix.sdk.client.Constant.CALLBACK_EVENT_INIT_ERROR -> {
+            initError()
+        }
+        // ...
+
+    }
+}
+// 异步回调结果
+duix?.init()
+```
+
+在init回调中确认初始化结果
+
+---
+
+### 6.3. 数字人形象展示
+
+使用 SDK 提供的 `DUIXRenderer` 和 `DUIXTextureView` 可快速实现支持透明通道的渲染。也可以自己实现RenderSink接口自定义渲染逻辑。
+
+其中**RenderSink**的定义如下: `ai.guiji.duix.sdk.client.render.RenderSink`
+
+```java
+/**
+ * 渲染管道，通过该接口返回渲染数据
+ */
+public interface RenderSink {
+
+    // frame中的buffer数据以bgr顺序排列
+    void onVideoFrame(ImageFrame imageFrame);
+
+}
+```
+
+**调用示例**:
+
+使用DUIXRenderer及DUIXTextureView控件简单实现渲染展示,该控件支持透明通道可以自由设置背景及前景
+
+```kotlin
+override fun onCreate(savedInstanceState: Bundle?) {
+    super.onCreate(savedInstanceState)
+    // ...
+    mDUIXRender =
+        DUIXRenderer(
+            mContext,
+            binding.glTextureView
+        )
+
+    binding.glTextureView.setEGLContextClientVersion(GL_CONTEXT_VERSION)
+    binding.glTextureView.setEGLConfigChooser(8, 8, 8, 8, 16, 0) // 透明
+    binding.glTextureView.isOpaque = false           // 透明
+    binding.glTextureView.setRenderer(mDUIXRender)
+    binding.glTextureView.renderMode =
+        GLSurfaceView.RENDERMODE_WHEN_DIRTY      // 一定要在设置完Render之后再调用
+
+    duix = DUIX(mContext, modelUrl, mDUIXRender) { event, msg, _ ->
+    }
+    // ...
+}
+```
+
+---
+
+### 6.4 播报控制
+
+#### 使用流式推送PCM驱动数字人播报
+
+**PCM格式:16k采样率单通道16位深**
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+// 通知服务开始推送音频
+void startPush()
+
+// 推送PCM数据
+void pushPcm(byte[] buffer)
+
+// 完成一段音频推送(音频推送完就调用该函数，而不是等播放完成再调用。)
+void stopPush()
+
+```
+
+startPush、pushPcm、stopPush需要成对调用，pushPcm不宜过长。可以在一整段音频推送完后调用stopPush结束当前会话，下一段音频再使用startPush重新开启推送。
+
+**每段startPush到stopPush中间的音频数据最少要1秒(32000字节)否则无法触发口型驱动，可以自行使用空白帧填充。**
+
+**调用示例**:
+
+```kotlin
+val thread = Thread {
+            duix?.startPush()
+            val inputStream = assets.open("pcm/2.pcm")
+            val buffer = ByteArray(320)
+            var length = 0
+            while (inputStream.read(buffer).also { length = it } > 0){
+                val data = buffer.copyOfRange(0, length)
+                duix?.pushPcm(data)
+            }
+            duix?.stopPush()
+            inputStream.close()
+}
+thread.start()
+```
+
+---
+
+#### WAV 播放驱动
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+void playAudio(String wavPath) 
+```
+
+**该函数兼容旧的wav驱动数字人接口，在内部实际是调用了PCM推流方式实现驱动。**
+
+
+**参数说明**:
+
+| 参数      | 类型     | 描述                    |
+|---------|--------|-----------------------|
+| wavPath | String | 16k采样率单通道16位深的wav本地文件 |
+
+
+**调用示例**:
+
+```kotlin
+duix?.playAudio(wavPath)
+```
+
+音频播放状态及进度回调:
+
+```kotlin
+object : Callback {
+    fun onEvent(event: String, msg: String, info: Object) {
+        when (event) {
+            // ...
+
+            "play.start" -> {
+                // 开始播放音频
+            }
+
+            "play.end" -> {
+                // 完成播放音频
+            }
+            "play.error" -> {
+                // 音频播放异常
+            }
+        }
+    }
+}
+```
+
+---
+
+#### 终止当前播报
+
+当数字人正在播报时调用该接口终止播报。
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+boolean stopAudio();
+```
+
+**调用示例如下**：
+
+```kotlin
+duix?.stopAudio()
+```
+
+---
+
+### 6.5. 动作控制
+
+
+#### 播放指定动作区间
+
+模型中支持新的动作区间标注(SpecialAction.json)
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+/**
+ * 播放指定动作区间
+ * @param name 动作区间名称，在init成功回调时，可以在@{ModelInfo.getSilenceRegion()}中获取到可用的动作区间
+ * @param now 是否立即播放 true: 立即播放; false: 等待当前静默区间或动作区间播放完毕后播放
+ */
+void startMotion(String name, boolean now)
+```
+
+**调用示例如下**：
+
+```kotlin
+duix?.startMotion("打招呼", true)
+```
+
+#### 随机播放动作区间
+
+随机播放场景及旧的标注协议(config.json)
+
+> 函数定义: `ai.guiji.duix.sdk.client.DUIX`
+
+```
+/**
+ * 随机播放一个动作区间
+ * @param now 是否立即播放 true: 立即播放; false: 等待当前静默区间或动作区间播放完毕后播放
+ */
+void startRandomMotion(boolean now);
+```
+
+**调用示例如下**：
+
+```kotlin
+duix?.startRandomMotion(true)
+```
+
+---
+
+## 七、Proguard 配置
+
+如果代码使用了混淆，请在proguard-rules.pro中配置：
+
+```proguard
+-keep class ai.guiji.duix.DuixNcnn{*; }
+```
+
+---
+
+## 八、注意事项
+
+1. 驱动渲染初始化前需要确保基础配置文件及模型下载到指定位置。
+2. 播放的PCM音频不宜过长，播放的PCM缓存在内存中，过长的音频流可能导致内存溢出。
+3. 替换预览模型可以在MainActivity.kt文件中修改modelUrl的值，使用SDK中自带的文件下载解压管理以获得完整的模型文件。
+4. 音频驱动的格式: 16k采样率单通道16位深度
+5. 设备性能不足时可能导致音频特征提取的速度跟不上音频播放的速度，可以使用duix?.setReporter()函数添加一个监控观察帧渲染返回的信息。
+6. 每段startPush到stopPush中间的音频数据最少要1秒(32000字节)否则无法触发口型驱动，可以自行使用空白帧填充。
+
+---
+
+## 九、常见问题与排查指南
+
+| 问题现象                | 可能原因                     | 解决方案                   |
+|---------------------|--------------------------|------------------------|
+| init 回调失败           | 模型路径错误或未下载完成             | 使用 `checkModel` 检查模型状态 |
+| 渲染黑屏                | EGL 配置或纹理视图设置错误          | 使用 SDK 提供示例中的设置方法      |
+| PCM 无播报效果           | 格式不符或未调用 startPush       | 确保音频格式正确并调用推送方法        |
+| 模型下载过慢              | 网络不稳定或 CDN 受限            | 支持自建模型文件托管服务           |
+
+---
+
+## 十、版本记录
+
+**<a>4.0.1</a>**
+
+```text
+1. 支持PCM音频流驱动数字人，提升音频播放响应速度。
+2. 优化动作区间播放，可根据模型配置指定播放动作区间。
+3. 自定义音频播放器，去除Exoplayer播放依赖
+4. 提供简洁的模型下载同步管理工具
+```
+
+**<a>3.0.5</a>**
+
+```text
+1. 更新arm32位cpu的libonnxruntime.so版本以修复兼容问题。
+2. 修改动作区间播放函数，可以使用随机播放和顺序播放，需要主动调用停止播放动作区间以回到静默区间。
+```
+
+**<a>3.0.4</a>**
+
+```text
+1. 修复部分设备gl默认float低精度导致无法正常显示形象问题。
+```
+
+**<a>3.0.3</a>**
+
+```text
+1. 优化本地渲染。
+```
+
+## 十一、🔗 开源依赖
+
+| 模块                                        | 描述                |
+|-------------------------------------------|-------------------|
+| [onnx](https://github.com/onnx/onnx)      | 通用AI模型标准格式        |
+| [ncnn](https://github.com/Tencent/ncnn)   | 高性能神经网络计算框架（腾讯）   |
+
+---
+
+如需更多帮助，请联系技术支持团队。
\ No newline at end of file
diff --git a/android_glide_lint.xml b/android_glide_lint.xml
new file mode 100644
index 0000000..90c96a4
--- /dev/null
+++ b/android_glide_lint.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!-- https://github.com/bumptech/glide/issues/4940 -->
+<lint>
+    <issue id="NotificationPermission">
+        <ignore regexp="com.bumptech.glide.request.target.NotificationTarget" />
+    </issue>
+</lint>
\ No newline at end of file
diff --git a/build.gradle b/build.gradle
new file mode 100644
index 0000000..d22ff0c
--- /dev/null
+++ b/build.gradle
@@ -0,0 +1,39 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+buildscript {
+    repositories {
+
+        maven { url 'https://maven.aliyun.com/repository/public/' }
+        maven { url 'https://maven.aliyun.com/repository/central' }
+        maven { url 'https://maven.aliyun.com/repository/google' }
+        maven { url 'https://maven.aliyun.com/repository/gradle-plugin' }
+        maven { url 'https://jitpack.io' }
+        maven { url 'https://repo1.maven.org/maven2/' }
+        google()
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:8.1.2'
+        classpath 'org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.10'
+    }
+}
+
+allprojects {
+    repositories {
+
+        maven { url 'https://maven.aliyun.com/repository/public/' }
+        maven { url 'https://maven.aliyun.com/repository/central' }
+        maven { url 'https://maven.aliyun.com/repository/google' }
+        maven { url 'https://maven.aliyun.com/repository/gradle-plugin' }
+        maven { url 'https://jitpack.io' }
+        maven { url 'https://repo1.maven.org/maven2/' }
+        google()
+    }
+}
+
+ext {
+    compileSdkVersion = 33
+    buildToolsVersion = '30.0.2'
+    minSdkVersion = 24
+    targetSdkVersion = 33
+    versionCode = 2
+    versionName = "0.0.2"
+}
diff --git a/demo.jks b/demo.jks
new file mode 100644
index 0000000..3cfe0f5
Binary files /dev/null and b/demo.jks differ
diff --git a/duix-sdk/.gitignore b/duix-sdk/.gitignore
new file mode 100644
index 0000000..42afabf
--- /dev/null
+++ b/duix-sdk/.gitignore
@@ -0,0 +1 @@
+/build
\ No newline at end of file
diff --git a/duix-sdk/build.gradle b/duix-sdk/build.gradle
new file mode 100644
index 0000000..608ea47
--- /dev/null
+++ b/duix-sdk/build.gradle
@@ -0,0 +1,68 @@
+plugins {
+    id 'com.android.library'
+}
+
+android {
+    namespace 'ai.guiji.duix.sdk.client'
+    compileSdk 33
+
+    defaultConfig {
+        minSdk 24
+        versionCode 13
+        versionName '4.1.1'
+
+        externalNativeBuild {
+            cmake {
+                abiFilters 'arm64-v8a', "armeabi-v7a"
+                cppFlags "-std=c++17", "-fexceptions"
+                //arguments "-DANDROID_STL=c++_shared","-DANDROID_TOOLCHAIN=clang"
+            }
+        }
+    }
+
+    buildTypes {
+        debug {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+
+            buildConfigField("String", "VERSION_NAME", "\"${defaultConfig.versionName}\"")
+            buildConfigField('int', 'VERSION_CODE', "${defaultConfig.versionCode}")
+        }
+
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+
+            buildConfigField("String", "VERSION_NAME", "\"${defaultConfig.versionName}\"")
+            buildConfigField('int', 'VERSION_CODE', "${defaultConfig.versionCode}")
+
+            android.libraryVariants.all { variant ->
+                variant.outputs.all {
+                    outputFileName = "duix_client_sdk_${buildType.name}_${defaultConfig.versionName}.aar"
+                }
+            }
+        }
+    }
+
+    externalNativeBuild {
+        cmake {
+            path "src/main/cpp/CMakeLists.txt"
+            version "3.18.1"
+        }
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+//    kotlinOptions {
+//        jvmTarget = '1.8'
+//    }
+//    packagingOptions {
+//        exclude 'lib/**/libonnxruntime.so'
+//    }
+}
+
+dependencies {
+    api fileTree(include: ['*.jar', '*.aar'], dir: 'libs')
+}
\ No newline at end of file
diff --git a/duix-sdk/consumer-rules.pro b/duix-sdk/consumer-rules.pro
new file mode 100644
index 0000000..e69de29
diff --git a/duix-sdk/libs/resource_loader.jar b/duix-sdk/libs/resource_loader.jar
new file mode 100644
index 0000000..c372c59
Binary files /dev/null and b/duix-sdk/libs/resource_loader.jar differ
diff --git a/duix-sdk/proguard-rules.pro b/duix-sdk/proguard-rules.pro
new file mode 100644
index 0000000..15502f3
--- /dev/null
+++ b/duix-sdk/proguard-rules.pro
@@ -0,0 +1,90 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
+
+# ---- General shrinking / obfuscation settings ----
+-optimizationpasses 5  # code compression (optimization) level, 0 - 7; usually 5, no need to change
+-dontusemixedcaseclassnames # do not use mixed-case class names
+# Tell ProGuard not to skip non-public library classes (they are skipped by default)
+-dontskipnonpubliclibraryclasses # relevant when the app pulls in jars and the classes inside them are obfuscated
+-verbose # log while obfuscating (a mapping file, original class name -> obfuscated name, is produced afterwards)
+# Select the algorithm used while obfuscating; the argument is a filter.
+# This filter is the one recommended by Google and is normally left unchanged.
+-optimizations !code/simplification/arithmetic,!field/*,!class/merging/*
+# Keep generic signatures: without this, type-cast errors may occur (generally just add it), i.e. avoid generics being obfuscated
+-keepattributes Signature
+# Add this line if the project uses annotations; also important for JSON entity mapping, e.g. fastjson
+-keepattributes *Annotation*
+# Preserve source line numbers in exception stack traces
+-keepattributes SourceFile,LineNumberTable
+# Keep native methods from being obfuscated
+-keepclasseswithmembernames class * {
+    native <methods>;
+}
+
+# Keep methods matching this rule from being obfuscated (onClick handlers configured for views in Android layout files must not be obfuscated)
+-keepclassmembers class * extends android.app.Activity {
+    public void *(android.view.View);
+}
+# Keep the listed constructors and setters of custom views from being obfuscated
+-keep public class * extends android.view.View {
+    public <init>(android.content.Context);
+    public <init>(android.content.Context, android.util.AttributeSet);
+    public <init>(android.content.Context, android.util.AttributeSet, int);
+    public void set*(...);
+}
+# Keep enums from being obfuscated
+-keepclassmembers enum * {
+    public static **[] values();
+    public static ** valueOf(java.lang.String);
+}
+# Keep Parcelable implementations from being obfuscated (aidl files must not be obfuscated)
+-keep class * implements android.os.Parcelable {
+    public static final android.os.Parcelable$Creator *;
+}
+# Classes that are serialized / deserialized must not be obfuscated (note: classes used through Java reflection must not be either)
+-keepnames class * implements java.io.Serializable
+# For classes implementing Serializable, keep the listed members from being obfuscated
+-keepclassmembers class * implements java.io.Serializable {
+    static final long serialVersionUID;
+    private static final java.io.ObjectStreamField[] serialPersistentFields;
+    !static !transient <fields>;
+    private void writeObject(java.io.ObjectOutputStream);
+    private void readObject(java.io.ObjectInputStream);
+    java.lang.Object writeReplace();
+    java.lang.Object readResolve();
+}
+# Keep R classes from being obfuscated, otherwise reflection cannot look up resource ids
+-keep class **.R$* { *; }
+
+# Keep public constructors that take a single org.json.JSONObject argument
+-keepclassmembers class * {
+   public <init> (org.json.JSONObject);
+}
+
+# NOTE(review): exact duplicate of the enum rule above; one of the two can be removed
+-keepclassmembers enum * {
+    public static **[] values();
+    public static ** valueOf(java.lang.String);
+}
+
+
+# The rules below are specific to this app
+
+
+# Keep ai.guiji.duix.DuixNcnn and all of its members
+# NOTE(review): presumably the JNI bridge class accessed by name from native code - confirm
+-keep class ai.guiji.duix.DuixNcnn{*; }
\ No newline at end of file
diff --git a/duix-sdk/src/main/AndroidManifest.xml b/duix-sdk/src/main/AndroidManifest.xml
new file mode 100644
index 0000000..a5918e6
--- /dev/null
+++ b/duix-sdk/src/main/AndroidManifest.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Manifest of the duix-sdk library module: it declares no permissions or components of its own. -->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/CMakeLists.txt b/duix-sdk/src/main/cpp/CMakeLists.txt
new file mode 100644
index 0000000..9d2577e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,199 @@
+# Native build script for the Duix SDK (CMake project "gjmywrt").
+cmake_minimum_required(VERSION 3.13.2)
+project(gjmywrt)
+
+#set(CMAKE_CXX_COMPILER g++)
+#set(CMAKE_C_COMPILER gcc)
+# All C++ sources are compiled as C++17, position independent, with unwind
+# tables and frame pointers kept.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fPIC  -funwind-tables -fno-omit-frame-pointer")
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC ")
+# Emit compile_commands.json for tooling.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# Fall back to Debug only when the caller (Gradle / command line) did not pick a
+# build type. Setting it unconditionally would silently turn release variants
+# into Debug builds as well.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Debug")
+endif()
+# NOTE(review): ORT_NO_EXCEPTIONS is not referenced again in this file; presumably
+# it is meant for the ONNX Runtime integration - confirm it has any effect.
+set(ORT_NO_EXCEPTIONS FALSE)
+
+#set(DEVAUD false)
+# DEVARM (ON by default) selects the Android configuration; when it is OFF the
+# else() branches further down use host include/library paths instead.
+# NOTE(review): the help string "shared library support" does not describe this switch.
+option(DEVARM "shared library support" TRUE)
+
+if(DEVARM)
+  # OpenCV (core, imgproc, highgui) from the bundled opencv-mobile 4.6.0 Android package.
+  set(OpenCV_DIR ${CMAKE_SOURCE_DIR}/third/opencv-mobile-4.6.0-android/sdk/native/jni)
+  find_package(OpenCV REQUIRED core imgproc highgui)
+
+  # ncnn from the bundled per-ABI package (directory selected by ANDROID_ABI).
+  set(ncnn_DIR ${CMAKE_SOURCE_DIR}/third/ncnn-20231027-android-shared/${ANDROID_ABI}/lib/cmake/ncnn)
+  find_package(ncnn REQUIRED)
+
+  # Prebuilt static JPEG libraries, one copy per ABI under third/arm/<abi>/.
+  add_library(turbojpeg STATIC IMPORTED)
+  set_target_properties(turbojpeg
+    PROPERTIES IMPORTED_LOCATION
+    ${CMAKE_SOURCE_DIR}/third/arm/${ANDROID_ABI}/libturbojpeg.a)
+
+  add_library(libjpeg STATIC IMPORTED)
+  set_target_properties(libjpeg
+    PROPERTIES IMPORTED_LOCATION
+    ${CMAKE_SOURCE_DIR}/third/arm/${ANDROID_ABI}/libjpeg.a)
+
+  # Prebuilt ONNX Runtime shared library, exposed as the imported target "onnx-lib".
+  add_library(onnx-lib SHARED IMPORTED)
+  set_target_properties(
+    onnx-lib
+    PROPERTIES IMPORTED_LOCATION
+    ${CMAKE_SOURCE_DIR}/third/arm/${ANDROID_ABI}/libonnxruntime.so)
+endif()
+
+# NOTE(review): USE_OPENCV / USE_NCNN / USE_OPENVINO, THIRD_INC and THIRD_LIB are
+# defined here but not referenced anywhere else in this CMakeLists.txt; the help
+# string "shared library support" is shared by all three options.
+option(USE_OPENCV "shared library support" TRUE)
+option(USE_NCNN "shared library support" TRUE)
+option(USE_OPENVINO "shared library support" FALSE)
+set(THIRD_INC "third/include")
+
+if(DEVARM)
+  set(THIRD_LIB "third/libarm")
+else()
+  set(THIRD_LIB "third/lib64")
+endif()
+
+
+
+# Header search paths, applied to every target declared after this point.
+if(DEVARM)
+
+  # Android: project headers plus the bundled third/arm headers.
+  include_directories(
+    include
+    dhcore
+    dhmfcc
+    aes
+    android
+    third/arm/include
+    third/arm/include/onnx
+    third/arm/include/ncnn
+    third/arm/include/turbojpeg
+  )
+else()
+
+  # Host: project headers, the third2 tree and a system-wide OpenCV 4 install.
+  include_directories(
+    include
+    dhcore
+    dhmfcc
+    aes
+    third2/include
+    third2/inc2404
+    third2/include/onnx
+    third2/include/turbojpeg
+    third2/include/ncnn
+    /usr/local/include/opencv4
+  )
+
+  # Host: directories searched for the -l<name> libraries linked into gjduix below.
+  link_directories(
+    ${CMAKE_SOURCE_DIR}/lib64
+    ${CMAKE_SOURCE_DIR}/third2/lib64
+    ${CMAKE_SOURCE_DIR}/third2/lib2404
+    /usr/local/lib
+  )
+endif()
+
+# dhcore: static library built from the dhcore/ sources (memory, data and queue helpers
+# judging by the file names).
+add_library(dhcore STATIC
+  dhcore/dh_mem.c
+  dhcore/dh_data.cpp
+  dhcore/dh_que.cpp
+)
+
+# For a static library these are link requirements passed on to whatever links dhcore.
+target_link_libraries(dhcore
+  -lm -lz -pthread
+)
+
+
+
+
+
+# dhmfcc: static library built from the dhmfcc/ sources; depends on dhcore.
+add_library(dhmfcc STATIC
+  dhmfcc/dhpcm.cpp
+  dhmfcc/dhwenet.cpp
+  dhmfcc/wenetai.cpp
+  dhmfcc/AudioFFT.cpp
+  dhmfcc/iir_filter.cpp
+  dhmfcc/mfcc.cpp
+)
+
+target_link_libraries(dhmfcc
+  dhcore
+  -lz -lm 
+)
+
+# Repeats the -std=c++17 already present in CMAKE_CXX_FLAGS.
+target_compile_options(dhmfcc   PRIVATE
+  -std=c++17
+)
+
+# Adds dhunet/ to the header search path ("include" is already listed above).
+include_directories(
+  include
+  dhunet
+)
+
+# dhunet: static library built from the dhunet/ sources; depends on dhcore and dhmfcc.
+add_library(dhunet STATIC
+  dhunet/jmat.cpp
+  dhunet/blendgram.cpp
+  dhunet/face_utils.cpp
+  dhunet/malpha.cpp
+  dhunet/munet.cpp
+)
+
+target_link_libraries(dhunet
+  dhcore
+  dhmfcc
+  -lz -lm 
+)
+
+# gjduix: the shared library that bundles the three static libraries above.
+if(DEVARM)
+
+  # Android flavour: core sources plus the JNI / logging glue and the AES sources.
+  add_library(gjduix SHARED
+    duix/gjduix.cpp
+    duix/gjsimp.cpp
+    android/Log.cpp
+    android/DuixJni.cpp
+    android/JniHelper.cpp
+    aes/aes_cbc.c  aes/aes_core.c  aes/aes_ecb.c  aes/base64.c  aes/cbc128.c  aes/gj_aes.c
+    aes/aesmain.c
+  )
+
+  # Locate the NDK logging library. ${log-lib} is linked below but was never
+  # defined in this file, so it used to expand to nothing.
+  find_library(log-lib log)
+
+  target_link_libraries(gjduix
+    dhcore
+    dhmfcc
+    dhunet
+    ${OpenCV_LIBS}
+    ${log-lib}
+    ncnn
+    onnx-lib
+    libjpeg
+    turbojpeg
+    -lz -lm 
+    -landroid
+  )
+
+else()
+  # Host flavour: core sources only, linked against libraries found through
+  # the link_directories() declared earlier in this file.
+  add_library(gjduix SHARED
+    duix/gjduix.cpp
+    duix/gjsimp.cpp
+  )
+
+  target_link_libraries(gjduix
+    dhcore
+    dhmfcc
+    dhunet
+    -ljpeg
+    -lopencv_core
+    -lopencv_imgproc
+    -lopencv_highgui
+    -lturbojpeg
+    -lonnxruntime
+    -lncnn
+    -lz -lm 
+  )
+
+
+endif()
+
+
+# duixtest: test executable built from iostest/testsimp.cpp and linked against gjduix.
+# NOTE(review): declared unconditionally, so it is also part of the Android (DEVARM)
+# configuration - confirm that is intended.
+add_executable(duixtest
+  #iostest/testduix.cpp
+  iostest/testsimp.cpp
+)
+
+target_link_libraries(duixtest
+  dhcore
+  gjduix
+)
+
diff --git a/duix-sdk/src/main/cpp/aes/aes.h b/duix-sdk/src/main/cpp/aes/aes.h
new file mode 100644
index 0000000..854538e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aes.h
@@ -0,0 +1,41 @@
+
+
+/* Public interface of the bundled AES implementation: key setup, single-block
+ * encrypt/decrypt, and the ECB / CBC wrappers. */
+#ifndef HEADER_AES_H
+# define HEADER_AES_H
+
+# include <stddef.h>
+
+/* Values for the `enc` argument of the mode functions declared below. */
+# define AES_ENCRYPT     1
+# define AES_DECRYPT     0
+
+/* Upper bound on the number of rounds; sizes the rd_key array in aes_key_st. */
+# define AES_MAXNR 14
+/* AES block size - presumably in bytes (16 bytes = 128 bits); confirm against users. */
+# define AES_BLOCK_SIZE 16
+
+/* Expanded key: 4 * (AES_MAXNR + 1) round-key words plus the round count.
+ * The word type is unsigned long when AES_LONG is defined, unsigned int otherwise. */
+struct aes_key_st {
+# ifdef AES_LONG
+    unsigned long rd_key[4 * (AES_MAXNR + 1)];
+# else
+    unsigned int rd_key[4 * (AES_MAXNR + 1)];
+# endif
+    int rounds;
+};
+
+typedef struct aes_key_st AES_KEY;
+
+
+/* Key setup: fill *key from userKey for encryption / decryption.
+ * `bits` is presumably the key length in bits - confirm in aes_core.c. */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+
+/* Block primitives; aes_cbc.c passes them to the CBC helpers as block128_f callbacks. */
+void AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
+void AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
+
+/* ECB wrapper; `enc` selects the direction (AES_ENCRYPT / AES_DECRYPT). */
+void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key, 
+					const int enc);
+
+/* CBC wrapper over `length` bytes; `ivec` is the IV buffer and `enc` selects the direction. */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                     size_t length, const AES_KEY *key,
+                     unsigned char *ivec, const int enc);
+
+#endif
+
+
diff --git a/duix-sdk/src/main/cpp/aes/aes_cbc.c b/duix-sdk/src/main/cpp/aes/aes_cbc.c
new file mode 100644
index 0000000..3925bb1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aes_cbc.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "aes.h"
+#include "modes.h"
+
+/*
+ * AES in CBC mode: forwards the buffers, key and IV to the generic CBC-128
+ * helpers, choosing the direction from `enc`.
+ *
+ *   enc != 0  -> CRYPTO_cbc128_encrypt with AES_encrypt as the block function
+ *   enc == 0  -> CRYPTO_cbc128_decrypt with AES_decrypt as the block function
+ */
+void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                     size_t len, const AES_KEY *key,
+                     unsigned char *ivec, const int enc)
+{
+    /* Decryption is handled first so the encrypt path is the fall-through. */
+    if (!enc) {
+        CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+                              (block128_f) AES_decrypt);
+        return;
+    }
+
+    CRYPTO_cbc128_encrypt(in, out, len, key, ivec, (block128_f) AES_encrypt);
+}
diff --git a/duix-sdk/src/main/cpp/aes/aes_core.c b/duix-sdk/src/main/cpp/aes/aes_core.c
new file mode 100644
index 0000000..5801309
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aes_core.c
@@ -0,0 +1,1127 @@
+#include <assert.h>
+
+#include "aes.h"
+#include "aes_locl.h"
+
+
+
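+/*-
+Precomputed lookup tables for the table-driven AES rounds.
+
+S is the forward S-box and Si the inverse S-box. "S[x].[a, b, c, d]" denotes
+the 32-bit word whose bytes, most significant first, are a*S[x], b*S[x],
+c*S[x] and d*S[x], with the products taken in GF(2^8). For example
+S[0] = 0x63 gives Te0[0] = 0xc66363a5. Each following table holds the
+previous one rotated right by one byte.
+
+Te0[x] = S [x].[02, 01, 01, 03];
+Te1[x] = S [x].[03, 02, 01, 01];
+Te2[x] = S [x].[01, 03, 02, 01];
+Te3[x] = S [x].[01, 01, 03, 02];
+
+Td0[x] = Si[x].[0e, 09, 0d, 0b];
+Td1[x] = Si[x].[0b, 0e, 09, 0d];
+Td2[x] = Si[x].[0d, 0b, 0e, 09];
+Td3[x] = Si[x].[09, 0d, 0b, 0e];
+Td4[x] = Si[x].[01];
+*/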
+
+static const u32 Te0[256] = {
+    0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+    0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+    0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+    0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+    0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+    0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+    0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+    0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+    0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+    0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+    0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+    0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+    0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+    0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+    0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+    0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+    0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+    0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+    0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+    0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+    0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+    0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+    0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+    0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+    0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+    0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+    0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+    0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+    0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+    0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+    0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+    0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+    0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+    0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+    0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+    0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+    0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+    0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+    0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+    0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+    0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+    0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+    0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+    0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+    0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+    0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+    0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+    0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+    0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+    0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+    0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+    0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+    0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+    0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+    0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+    0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+    0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+    0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+    0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+    0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+    0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+    0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+    0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+    0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
+};
+static const u32 Te1[256] = {
+    0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+    0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+    0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+    0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+    0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+    0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+    0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+    0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+    0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+    0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+    0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+    0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+    0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+    0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+    0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+    0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+    0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+    0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+    0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+    0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+    0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+    0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+    0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+    0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+    0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+    0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+    0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+    0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+    0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+    0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+    0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+    0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+    0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+    0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+    0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+    0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+    0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+    0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+    0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+    0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+    0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+    0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+    0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+    0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+    0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+    0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+    0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+    0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+    0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+    0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+    0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+    0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+    0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+    0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+    0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+    0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+    0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+    0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+    0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+    0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+    0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+    0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+    0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+    0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
+};
+static const u32 Te2[256] = {
+    0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+    0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+    0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+    0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+    0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+    0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+    0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+    0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+    0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+    0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+    0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+    0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+    0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+    0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+    0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+    0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+    0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+    0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+    0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+    0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+    0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+    0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+    0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+    0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+    0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+    0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+    0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+    0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+    0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+    0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+    0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+    0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+    0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+    0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+    0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+    0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+    0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+    0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+    0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+    0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+    0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+    0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+    0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+    0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+    0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+    0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+    0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+    0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+    0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+    0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+    0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+    0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+    0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+    0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+    0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+    0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+    0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+    0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+    0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+    0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+    0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+    0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+    0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+    0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
+};
+static const u32 Te3[256] = {
+    0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+    0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+    0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+    0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+    0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+    0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+    0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+    0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+    0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+    0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+    0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+    0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+    0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+    0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+    0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+    0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+    0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+    0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+    0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+    0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+    0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+    0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+    0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+    0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+    0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+    0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+    0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+    0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+    0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+    0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+    0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+    0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+    0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+    0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+    0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+    0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+    0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+    0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+    0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+    0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+    0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+    0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+    0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+    0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+    0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+    0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+    0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+    0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+    0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+    0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+    0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+    0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+    0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+    0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+    0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+    0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+    0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+    0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+    0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+    0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+    0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+    0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+    0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+    0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
+};
+
+static const u32 Td0[256] = {
+    0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+    0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+    0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+    0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+    0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+    0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+    0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+    0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+    0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+    0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+    0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+    0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+    0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+    0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+    0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+    0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+    0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+    0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+    0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+    0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+    0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+    0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+    0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+    0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+    0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+    0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+    0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+    0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+    0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+    0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+    0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+    0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+    0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+    0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+    0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+    0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+    0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+    0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+    0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+    0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+    0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+    0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+    0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+    0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+    0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+    0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+    0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+    0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+    0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+    0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+    0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+    0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+    0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+    0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+    0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+    0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+    0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+    0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+    0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+    0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+    0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+    0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+    0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+    0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U,
+};
+static const u32 Td1[256] = {
+    0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+    0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+    0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+    0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+    0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+    0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+    0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+    0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+    0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+    0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+    0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+    0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+    0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+    0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+    0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+    0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+    0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+    0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+    0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+    0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+    0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+    0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+    0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+    0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+    0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+    0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+    0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+    0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+    0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+    0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+    0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+    0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+    0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+    0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+    0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+    0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+    0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+    0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+    0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+    0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+    0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+    0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+    0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+    0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+    0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+    0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+    0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+    0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+    0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+    0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+    0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+    0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+    0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+    0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+    0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+    0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+    0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+    0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+    0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+    0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+    0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+    0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+    0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+    0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U,
+};
+static const u32 Td2[256] = { /* decryption round table; AES_decrypt indexes it with bits 15..8 of a state word. Entries look like Td3 rotated left by one byte (Td2[0] = 0xa75051f4, Td3[0] = 0xf4a75051). */
+    0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+    0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+    0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+    0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+    0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+    0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+    0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+    0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+    0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+    0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+    0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+    0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+    0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+    0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+    0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+    0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+    0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+    0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+    0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+    0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+    0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+    0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+    0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+    0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+    0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+    0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+    0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+    0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+    0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+    0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+    0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+    0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+    0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+    0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+    0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+    0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+    0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+    0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+    0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+    0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+    0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+    0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+    0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+    0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+    0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+    0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+    0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+    0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+    0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+    0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+    0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+    0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+    0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+    0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+    0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+    0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+    0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+    0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+    0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+    0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+    0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+    0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+    0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+    0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U,
+};
+static const u32 Td3[256] = { /* decryption round table; AES_decrypt indexes it with the low byte of a state word. Entries look like Td2 rotated right by one byte (Td3[0] = 0xf4a75051, Td2[0] = 0xa75051f4). */
+    0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+    0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+    0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+    0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+    0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+    0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+    0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+    0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+    0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+    0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+    0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+    0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+    0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+    0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+    0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+    0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+    0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+    0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+    0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+    0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+    0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+    0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+    0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+    0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+    0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+    0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+    0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+    0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+    0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+    0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+    0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+    0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+    0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+    0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+    0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+    0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+    0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+    0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+    0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+    0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+    0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+    0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+    0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+    0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+    0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+    0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+    0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+    0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+    0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+    0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+    0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+    0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+    0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+    0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+    0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+    0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+    0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+    0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+    0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+    0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+    0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+    0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+    0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+    0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
+};
+static const u8 Td4[256] = { /* byte-to-byte substitution applied in the final round of AES_decrypt (the values match the AES inverse S-box) */
+    0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
+    0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
+    0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
+    0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
+    0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
+    0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
+    0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
+    0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
+    0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
+    0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
+    0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
+    0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
+    0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
+    0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
+    0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
+    0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
+    0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
+    0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
+    0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
+    0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
+    0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
+    0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
+    0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
+    0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
+    0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
+    0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
+    0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
+    0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
+    0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
+    0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
+    0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
+    0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
+};
+static const u32 rcon[] = { /* round constants: AES_set_encrypt_key XORs rcon[i] into the first word of each expansion pass */
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+};
+
+/**
+ * Expand the cipher key into the encryption key schedule (fills key->rd_key and key->rounds).
+ * Returns 0 on success, -1 if userKey or key is NULL, -2 if bits is not 128, 192 or 256. */
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
+{
+
+    u32 *rk;
+    int i = 0; /* number of completed expansion passes; doubles as the rcon index */
+    u32 temp;
+
+    if (!userKey || !key)
+        return -1;
+    if (bits != 128 && bits != 192 && bits != 256)
+        return -2;
+
+    rk = key->rd_key;
+
+    if (bits == 128)
+        key->rounds = 10;
+    else if (bits == 192)
+        key->rounds = 12;
+    else
+        key->rounds = 14;
+
+    rk[0] = GETU32(userKey     ); /* the first 16 key bytes become rk[0..3] as-is */
+    rk[1] = GETU32(userKey +  4);
+    rk[2] = GETU32(userKey +  8);
+    rk[3] = GETU32(userKey + 12);
+    if (bits == 128) {
+        while (1) {
+            temp  = rk[3]; /* last word of the previous 4-word group */
+            rk[4] = rk[0] ^
+                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^ /* one masked table lookup per output byte lane; */
+                (Te3[(temp >>  8) & 0xff] & 0x00ff0000) ^ /* the input bytes are taken rotated by one position: */
+                (Te0[(temp      ) & 0xff] & 0x0000ff00) ^ /* bits 23..16 feed the top lane and */
+                (Te1[(temp >> 24)       ] & 0x000000ff) ^ /* bits 31..24 feed the bottom lane */
+                rcon[i];
+            rk[5] = rk[1] ^ rk[4]; /* remaining words: previous group XOR the word just computed */
+            rk[6] = rk[2] ^ rk[5];
+            rk[7] = rk[3] ^ rk[6];
+            if (++i == 10) { /* 4 key words + 10 passes x 4 words = 44 schedule words */
+                return 0;
+            }
+            rk += 4; /* slide the window so rk[0..3] is the group just written */
+        }
+    }
+    rk[4] = GETU32(userKey + 16); /* only reached for 192- and 256-bit keys: next 8 key bytes */
+    rk[5] = GETU32(userKey + 20);
+    if (bits == 192) {
+        while (1) {
+            temp = rk[ 5]; /* last word of the previous 6-word group */
+            rk[ 6] = rk[ 0] ^
+                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
+                (Te3[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                (Te0[(temp      ) & 0xff] & 0x0000ff00) ^
+                (Te1[(temp >> 24)       ] & 0x000000ff) ^
+                rcon[i];
+            rk[ 7] = rk[ 1] ^ rk[ 6];
+            rk[ 8] = rk[ 2] ^ rk[ 7];
+            rk[ 9] = rk[ 3] ^ rk[ 8];
+            if (++i == 8) { /* the last pass stops after 4 words: 6 + 7*6 + 4 = 52 schedule words */
+                return 0;
+            }
+            rk[10] = rk[ 4] ^ rk[ 9];
+            rk[11] = rk[ 5] ^ rk[10];
+            rk += 6;
+        }
+    }
+    rk[6] = GETU32(userKey + 24); /* only reached for 256-bit keys: final 8 key bytes */
+    rk[7] = GETU32(userKey + 28);
+    if (bits == 256) {
+        while (1) {
+            temp = rk[ 7]; /* last word of the previous 8-word group */
+            rk[ 8] = rk[ 0] ^
+                (Te2[(temp >> 16) & 0xff] & 0xff000000) ^
+                (Te3[(temp >>  8) & 0xff] & 0x00ff0000) ^
+                (Te0[(temp      ) & 0xff] & 0x0000ff00) ^
+                (Te1[(temp >> 24)       ] & 0x000000ff) ^
+                rcon[i];
+            rk[ 9] = rk[ 1] ^ rk[ 8];
+            rk[10] = rk[ 2] ^ rk[ 9];
+            rk[11] = rk[ 3] ^ rk[10];
+            if (++i == 7) { /* the last pass stops after 4 words: 8 + 6*8 + 4 = 60 schedule words */
+                return 0;
+            }
+            temp = rk[11];
+            rk[12] = rk[ 4] ^ /* second half of the group: same masked lookups, but no byte rotation and no rcon */
+                (Te2[(temp >> 24)       ] & 0xff000000) ^
+                (Te3[(temp >> 16) & 0xff] & 0x00ff0000) ^
+                (Te0[(temp >>  8) & 0xff] & 0x0000ff00) ^
+                (Te1[(temp      ) & 0xff] & 0x000000ff);
+            rk[13] = rk[ 5] ^ rk[12];
+            rk[14] = rk[ 6] ^ rk[13];
+            rk[15] = rk[ 7] ^ rk[14];
+
+            rk += 8;
+            }
+    }
+    return 0; /* not reached: every accepted key size returns from inside its own loop */
+}
+
+/**
+ * Expand the cipher key into the decryption key schedule: the encryption schedule with its round keys reversed and the inner ones transformed.
+ * Returns 0 on success, or the negative status reported by AES_set_encrypt_key. */
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+                        AES_KEY *key)
+{
+
+    u32 *rk;
+    int i, j, status;
+    u32 temp;
+
+    /* first, start with an encryption schedule */
+    status = AES_set_encrypt_key(userKey, bits, key);
+    if (status < 0)
+        return status;
+
+    rk = key->rd_key;
+
+    /* invert the order of the round keys: */
+    for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) { /* swap 4-word round keys pairwise, working inward from both ends */
+        temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
+        temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
+        temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
+        temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
+    }
+    /* apply the inverse MixColumn transform to all round keys but the first and the last: */
+    for (i = 1; i < (key->rounds); i++) {
+        rk += 4; /* advance before use, so round key 0 is left untouched */
+        rk[0] =
+            Td0[Te1[(rk[0] >> 24)       ] & 0xff] ^ /* the low byte of the Te1 entry selects the Td entry for this byte position */
+            Td1[Te1[(rk[0] >> 16) & 0xff] & 0xff] ^ /* NOTE(review): presumably this cancels the substitution built into Td0..Td3 - confirm against the table definitions */
+            Td2[Te1[(rk[0] >>  8) & 0xff] & 0xff] ^
+            Td3[Te1[(rk[0]      ) & 0xff] & 0xff];
+        rk[1] =
+            Td0[Te1[(rk[1] >> 24)       ] & 0xff] ^
+            Td1[Te1[(rk[1] >> 16) & 0xff] & 0xff] ^
+            Td2[Te1[(rk[1] >>  8) & 0xff] & 0xff] ^
+            Td3[Te1[(rk[1]      ) & 0xff] & 0xff];
+        rk[2] =
+            Td0[Te1[(rk[2] >> 24)       ] & 0xff] ^
+            Td1[Te1[(rk[2] >> 16) & 0xff] & 0xff] ^
+            Td2[Te1[(rk[2] >>  8) & 0xff] & 0xff] ^
+            Td3[Te1[(rk[2]      ) & 0xff] & 0xff];
+        rk[3] =
+            Td0[Te1[(rk[3] >> 24)       ] & 0xff] ^
+            Td1[Te1[(rk[3] >> 16) & 0xff] & 0xff] ^
+            Td2[Te1[(rk[3] >>  8) & 0xff] & 0xff] ^
+            Td3[Te1[(rk[3]      ) & 0xff] & 0xff];
+    }
+    return 0;
+}
+
+/*
+ * Encrypt a single 16-byte block from in to out with the round keys in key->rd_key
+ * in and out can overlap (all four input words are loaded before out is written)
+ */
+void AES_encrypt(const unsigned char *in, unsigned char *out,
+                 const AES_KEY *key) {
+
+    const u32 *rk;
+    u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+    int r;
+#endif /* ?FULL_UNROLL */
+
+    assert(in && out && key); /* debug-only check: compiled out when NDEBUG is defined */
+    rk = key->rd_key;
+
+    /*
+     * map byte array block to cipher state
+     * and add initial round key:
+     */
+    s0 = GETU32(in     ) ^ rk[0];
+    s1 = GETU32(in +  4) ^ rk[1];
+    s2 = GETU32(in +  8) ^ rk[2];
+    s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
+    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
+    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
+    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
+    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
+    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
+    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];
+    if (key->rounds > 10) { /* schedules with 12 or 14 rounds need two more rounds */
+        /* round 10: */
+        s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40];
+        s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41];
+        s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42];
+        s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44];
+        t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45];
+        t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46];
+        t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47];
+        if (key->rounds > 12) { /* 14-round schedules need two more again */
+            /* round 12: */
+            s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48];
+            s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49];
+            s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50];
+            s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52];
+            t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53];
+            t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54];
+            t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55];
+        }
+    }
+    rk += key->rounds << 2; /* point rk at the last round key (4 words per round) */
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = key->rounds >> 1; /* each pass of the loop performs two rounds (s -> t, then t -> s) */
+    for (;;) {
+        t0 =
+            Te0[(s0 >> 24)       ] ^
+            Te1[(s1 >> 16) & 0xff] ^
+            Te2[(s2 >>  8) & 0xff] ^
+            Te3[(s3      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Te0[(s1 >> 24)       ] ^
+            Te1[(s2 >> 16) & 0xff] ^
+            Te2[(s3 >>  8) & 0xff] ^
+            Te3[(s0      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Te0[(s2 >> 24)       ] ^
+            Te1[(s3 >> 16) & 0xff] ^
+            Te2[(s0 >>  8) & 0xff] ^
+            Te3[(s1      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Te0[(s3 >> 24)       ] ^
+            Te1[(s0 >> 16) & 0xff] ^
+            Te2[(s1 >>  8) & 0xff] ^
+            Te3[(s2      ) & 0xff] ^
+            rk[7];
+
+        rk += 8; /* advance two round keys; the second half-pass reads its key as rk[0..3] */
+        if (--r == 0) { /* leave with the state in t0..t3 and rk at the last round key */
+            break;
+        }
+
+        s0 =
+            Te0[(t0 >> 24)       ] ^
+            Te1[(t1 >> 16) & 0xff] ^
+            Te2[(t2 >>  8) & 0xff] ^
+            Te3[(t3      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Te0[(t1 >> 24)       ] ^
+            Te1[(t2 >> 16) & 0xff] ^
+            Te2[(t3 >>  8) & 0xff] ^
+            Te3[(t0      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Te0[(t2 >> 24)       ] ^
+            Te1[(t3 >> 16) & 0xff] ^
+            Te2[(t0 >>  8) & 0xff] ^
+            Te3[(t1      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Te0[(t3 >> 24)       ] ^
+            Te1[(t0 >> 16) & 0xff] ^
+            Te2[(t1 >>  8) & 0xff] ^
+            Te3[(t2      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+     * apply last round and
+     * map cipher state to byte array block:
+     */
+    s0 = /* each table entry is masked down to a single byte lane before the last round key is XORed in */
+        (Te2[(t0 >> 24)       ] & 0xff000000) ^
+        (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te1[(t3      ) & 0xff] & 0x000000ff) ^
+        rk[0];
+    PUTU32(out     , s0);
+    s1 =
+        (Te2[(t1 >> 24)       ] & 0xff000000) ^
+        (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te1[(t0      ) & 0xff] & 0x000000ff) ^
+        rk[1];
+    PUTU32(out +  4, s1);
+    s2 =
+        (Te2[(t2 >> 24)       ] & 0xff000000) ^
+        (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te1[(t1      ) & 0xff] & 0x000000ff) ^
+        rk[2];
+    PUTU32(out +  8, s2);
+    s3 =
+        (Te2[(t3 >> 24)       ] & 0xff000000) ^
+        (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+        (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
+        (Te1[(t2      ) & 0xff] & 0x000000ff) ^
+        rk[3];
+    PUTU32(out + 12, s3);
+}
+
+/*
+ * Decrypt a single 16-byte block from in to out; key is presumably a schedule built by AES_set_decrypt_key (confirm at call sites)
+ * in and out can overlap (all four input words are loaded before out is written)
+ */
+void AES_decrypt(const unsigned char *in, unsigned char *out,
+                 const AES_KEY *key)
+{
+
+    const u32 *rk;
+    u32 s0, s1, s2, s3, t0, t1, t2, t3;
+#ifndef FULL_UNROLL
+    int r;
+#endif /* ?FULL_UNROLL */
+
+    assert(in && out && key); /* debug-only check: compiled out when NDEBUG is defined */
+    rk = key->rd_key;
+
+    /*
+     * map byte array block to cipher state
+     * and add initial round key:
+     */
+    s0 = GETU32(in     ) ^ rk[0];
+    s1 = GETU32(in +  4) ^ rk[1];
+    s2 = GETU32(in +  8) ^ rk[2];
+    s3 = GETU32(in + 12) ^ rk[3];
+#ifdef FULL_UNROLL
+    /* round 1: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7];
+    /* round 2: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11];
+    /* round 3: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15];
+    /* round 4: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19];
+    /* round 5: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23];
+    /* round 6: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27];
+    /* round 7: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31];
+    /* round 8: */
+    s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32];
+    s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33];
+    s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34];
+    s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35];
+    /* round 9: */
+    t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36];
+    t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37];
+    t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38];
+    t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39];
+    if (key->rounds > 10) { /* schedules with 12 or 14 rounds need two more rounds */
+        /* round 10: */
+        s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40];
+        s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41];
+        s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42];
+        s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43];
+        /* round 11: */
+        t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44];
+        t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45];
+        t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46];
+        t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47];
+        if (key->rounds > 12) { /* 14-round schedules need two more again */
+            /* round 12: */
+            s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >>  8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48];
+            s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >>  8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49];
+            s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >>  8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50];
+            s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >>  8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51];
+            /* round 13: */
+            t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >>  8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52];
+            t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >>  8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53];
+            t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >>  8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54];
+            t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >>  8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55];
+        }
+    }
+    rk += key->rounds << 2; /* point rk at the last round key (4 words per round) */
+#else  /* !FULL_UNROLL */
+    /*
+     * Nr - 1 full rounds:
+     */
+    r = key->rounds >> 1; /* each pass of the loop performs two rounds (s -> t, then t -> s) */
+    for (;;) {
+        t0 =
+            Td0[(s0 >> 24)       ] ^
+            Td1[(s3 >> 16) & 0xff] ^
+            Td2[(s2 >>  8) & 0xff] ^
+            Td3[(s1      ) & 0xff] ^
+            rk[4];
+        t1 =
+            Td0[(s1 >> 24)       ] ^
+            Td1[(s0 >> 16) & 0xff] ^
+            Td2[(s3 >>  8) & 0xff] ^
+            Td3[(s2      ) & 0xff] ^
+            rk[5];
+        t2 =
+            Td0[(s2 >> 24)       ] ^
+            Td1[(s1 >> 16) & 0xff] ^
+            Td2[(s0 >>  8) & 0xff] ^
+            Td3[(s3      ) & 0xff] ^
+            rk[6];
+        t3 =
+            Td0[(s3 >> 24)       ] ^
+            Td1[(s2 >> 16) & 0xff] ^
+            Td2[(s1 >>  8) & 0xff] ^
+            Td3[(s0      ) & 0xff] ^
+            rk[7];
+
+        rk += 8; /* advance two round keys; the second half-pass reads its key as rk[0..3] */
+        if (--r == 0) { /* leave with the state in t0..t3 and rk at the last round key */
+            break;
+        }
+
+        s0 =
+            Td0[(t0 >> 24)       ] ^
+            Td1[(t3 >> 16) & 0xff] ^
+            Td2[(t2 >>  8) & 0xff] ^
+            Td3[(t1      ) & 0xff] ^
+            rk[0];
+        s1 =
+            Td0[(t1 >> 24)       ] ^
+            Td1[(t0 >> 16) & 0xff] ^
+            Td2[(t3 >>  8) & 0xff] ^
+            Td3[(t2      ) & 0xff] ^
+            rk[1];
+        s2 =
+            Td0[(t2 >> 24)       ] ^
+            Td1[(t1 >> 16) & 0xff] ^
+            Td2[(t0 >>  8) & 0xff] ^
+            Td3[(t3      ) & 0xff] ^
+            rk[2];
+        s3 =
+            Td0[(t3 >> 24)       ] ^
+            Td1[(t2 >> 16) & 0xff] ^
+            Td2[(t1 >>  8) & 0xff] ^
+            Td3[(t0      ) & 0xff] ^
+            rk[3];
+    }
+#endif /* ?FULL_UNROLL */
+    /*
+     * apply last round and
+     * map cipher state to byte array block:
+     */
+    s0 = /* plain Td4 byte substitutions shifted into their lanes, then the last round key is XORed in */
+        ((u32)Td4[(t0 >> 24)       ] << 24) ^
+        ((u32)Td4[(t3 >> 16) & 0xff] << 16) ^
+        ((u32)Td4[(t2 >>  8) & 0xff] <<  8) ^
+        ((u32)Td4[(t1      ) & 0xff])       ^
+        rk[0];
+    PUTU32(out     , s0);
+    s1 =
+        ((u32)Td4[(t1 >> 24)       ] << 24) ^
+        ((u32)Td4[(t0 >> 16) & 0xff] << 16) ^
+        ((u32)Td4[(t3 >>  8) & 0xff] <<  8) ^
+        ((u32)Td4[(t2      ) & 0xff])       ^
+        rk[1];
+    PUTU32(out +  4, s1);
+    s2 =
+        ((u32)Td4[(t2 >> 24)       ] << 24) ^
+        ((u32)Td4[(t1 >> 16) & 0xff] << 16) ^
+        ((u32)Td4[(t0 >>  8) & 0xff] <<  8) ^
+        ((u32)Td4[(t3      ) & 0xff])       ^
+        rk[2];
+    PUTU32(out +  8, s2);
+    s3 =
+        ((u32)Td4[(t3 >> 24)       ] << 24) ^
+        ((u32)Td4[(t2 >> 16) & 0xff] << 16) ^
+        ((u32)Td4[(t1 >>  8) & 0xff] <<  8) ^
+        ((u32)Td4[(t0      ) & 0xff])       ^
+        rk[3];
+    PUTU32(out + 12, s3);
+}
+
+
diff --git a/duix-sdk/src/main/cpp/aes/aes_ecb.c b/duix-sdk/src/main/cpp/aes/aes_ecb.c
new file mode 100644
index 0000000..6476803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aes_ecb.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <assert.h>
+
+#include "aes.h"
+#include "aes_locl.h"
+
+/*
+ * Run the AES block primitive once in ECB fashion: `enc` selects
+ * AES_encrypt (AES_ENCRYPT) or AES_decrypt (AES_DECRYPT) on the block at
+ * `in`, writing the result to `out`.  Arguments are validated with
+ * assert() only, so the checks disappear in NDEBUG builds.
+ */
+void AES_ecb_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key, const int enc)
+{
+    assert(in && out && key);
+    assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
+
+    if (enc != AES_ENCRYPT) {
+        AES_decrypt(in, out, key);
+        return;
+    }
+    AES_encrypt(in, out, key);
+}
diff --git a/duix-sdk/src/main/cpp/aes/aes_locl.h b/duix-sdk/src/main/cpp/aes/aes_locl.h
new file mode 100644
index 0000000..47d1dfa
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aes_locl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef HEADER_AES_LOCL_H
+# define HEADER_AES_LOCL_H
+
+//# include <e_os2.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+/*
+ * GETU32 / PUTU32 load and store a 32-bit word in big-endian byte order.
+ * The MSVC x86/x64 variant performs a direct 32-bit access followed by a
+ * byte swap (SWAP); the portable variant assembles or splits the word one
+ * byte at a time.  Both rely on the u32/u8 typedefs declared further down,
+ * which is fine because macros are only expanded at their point of use.
+ */
+# if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+#  define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
+#  define GETU32(p) SWAP(*((u32 *)(p)))
+#  define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
+# else
+#  define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))
+#  define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >>  8); (ct)[3] = (u8)(st); }
+# endif
+
+/* u32 is `unsigned long` when AES_LONG is defined, `unsigned int` otherwise. */
+# ifdef AES_LONG
+typedef unsigned long u32;
+# else
+typedef unsigned int u32;
+# endif
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+/* NOTE(review): presumably the maximum key size in 32-bit words / bytes and
+ * the maximum round count for a 256-bit key -- confirm against aes_core.c. */
+# define MAXKC   (256/32)
+# define MAXKB   (256/8)
+# define MAXNR   14
+
+/* This controls loop-unrolling in aes_core.c */
+# undef FULL_UNROLL
+
+#endif                          /* !HEADER_AES_LOCL_H */
diff --git a/duix-sdk/src/main/cpp/aes/aesmain.c b/duix-sdk/src/main/cpp/aes/aesmain.c
new file mode 100644
index 0000000..547eaf5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aesmain.c
@@ -0,0 +1,111 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include "gj_aes.h"
+#include "aesmain.h"
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) the file `infn` into `outfn`.
+ *
+ * Container layout produced when encrypting: the 8-byte tag "gjdigits", the
+ * plaintext length as a native-endian uint64_t, 16 zero bytes, then the
+ * cipher output in 16-byte blocks (the final block is zero-padded).  When
+ * decrypting, only the first two tag bytes ('g','j') are checked and the
+ * output is truncated to the stored plaintext length.
+ *
+ * Returns 0 on success, otherwise:
+ *   -1001  input file could not be opened
+ *   -1002  output file could not be opened
+ *   -1003  header missing or shorter than 32 bytes (decrypt)
+ *   -1004  header tag mismatch (decrypt)
+ *   -1005  stored length exceeds the sanity limit (decrypt)
+ *   -1006  cipher context could not be created
+ *   -1007  read or write error while streaming the payload
+ *
+ * NOTE(review): the key and IV are identical and hard-coded, so this only
+ * obfuscates the payload; it is not a secret-keeping mechanism.
+ */
+int mainenc(int enc,char* infn,char* outfn){
+    /* Arrays instead of string literals: init_aesc takes non-const char*. */
+    char key[] = "yymrjzbwyrbjszrk";
+    char aiv[] = "yymrjzbwyrbjszrk";
+    char result[255];
+    char data[16];
+    int outlen = 0;
+    int encrst = 0;
+    uint64_t size = 0;
+    uint64_t realsize = 0;
+    gj_aesc_t* aesc = NULL;
+    FILE* fr = infn ? fopen(infn,"rb") : NULL;
+    FILE* fw = outfn ? fopen(outfn,"wb") : NULL;
+
+    memset(result,0,sizeof(result));
+    if(!fr){
+        encrst = -1001;
+        goto done;
+    }
+    if(!fw){
+        encrst = -1002;
+        goto done;
+    }
+    if(init_aesc(key,aiv,enc,&aesc)!=0 || !aesc){
+        encrst = -1006;
+        goto done;
+    }
+
+    if(enc){
+        /* Header placeholder; the real length is patched in afterwards. */
+        fwrite("gjdigits",1,8,fw);
+        fwrite(&size,1,8,fw);
+        fwrite(&size,1,8,fw);
+        fwrite(&size,1,8,fw);
+
+        for(;;){
+            size_t rst;
+            memset(data,0,sizeof(data));
+            rst = fread(data,1,sizeof(data),fr);
+            /* Stop on EOF *and* on read errors; testing feof() alone would
+             * spin forever if the stream entered the error state. */
+            if(!rst) break;
+            size += rst;
+            do_aesc(aesc,data,16,result,&outlen);
+            if(outlen>0) fwrite(result,1,(size_t)outlen,fw);
+        }
+        fseek(fw,8,SEEK_SET);
+        fwrite(&size,1,8,fw);
+    }else{
+        size_t got = fread(result,1,32,fr);
+        if(!got){
+            encrst = -1003;
+            goto done;
+        }
+        if((result[0]!='g')||(result[1]!='j')){
+            encrst = -1004;
+            goto done;
+        }
+        if(got<32){
+            /* Truncated header: the length field cannot be trusted. */
+            encrst = -1003;
+            goto done;
+        }
+        /* memcpy avoids an unaligned / aliasing-violating uint64_t load. */
+        memcpy(&realsize,result+8,sizeof(realsize));
+        /* NOTE(review): 1034 (not 1024) is kept from the original limit. */
+        if(realsize>(uint64_t)1034*1024*1024){
+            encrst = -1005;
+            goto done;
+        }
+        for(;;){
+            size_t rst;
+            memset(data,0,sizeof(data));
+            rst = fread(data,1,sizeof(data),fr);
+            if(!rst) break;
+            size += rst;
+            do_aesc(aesc,data,16,result,&outlen);
+            if(size>realsize){
+                /* Drop the padding; clamp so surplus trailing blocks can
+                 * never produce a negative length for fwrite. */
+                uint64_t extra = size-realsize;
+                outlen = (outlen<=0 || extra>=(uint64_t)outlen) ? 0 : outlen-(int)extra;
+            }
+            if(outlen>0) fwrite(result,1,(size_t)outlen,fw);
+        }
+    }
+    if(ferror(fr)||ferror(fw)){
+        encrst = -1007;
+    }
+
+done:
+    if(aesc) free_aesc(&aesc);
+    if(fr) fclose(fr);
+    if(fw) fclose(fw);
+    return encrst;
+}
+
+
+#ifdef TEST
+/*
+ * Command-line driver (TEST builds only): `e...` encrypts and `d...`
+ * decrypts argv[2] into argv[3]; anything else prints the usage line.
+ */
+int main(int argc,char** argv){
+    int rst;
+    if(argc>=4){
+        switch(argv[1][0]){
+        case 'e':
+            rst = mainenc(1,argv[2],argv[3]);
+            printf("====enc %s to %s rst %d\n",argv[2],argv[3],rst);
+            return rst;
+        case 'd':
+            rst = mainenc(0,argv[2],argv[3]);
+            printf("====dec %s to %s rst %d\n",argv[2],argv[3],rst);
+            return rst;
+        default:
+            break;
+        }
+    }
+    printf("gaes enc|dec filein fileout\n");
+    return 0;
+}
+#endif
diff --git a/duix-sdk/src/main/cpp/aes/aesmain.h b/duix-sdk/src/main/cpp/aes/aesmain.h
new file mode 100644
index 0000000..48e53df
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/aesmain.h
@@ -0,0 +1,16 @@
+
+#ifndef __AESMAIN_H
+#define __AESMAIN_H
+
+#include "gj_dll.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Encrypts (enc != 0) or decrypts (enc == 0) file `infn` into file `outfn`.
+ * Returns 0 on success or a negative error code.
+ */
+int mainenc(int enc,char* infn,char* outfn);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
diff --git a/duix-sdk/src/main/cpp/aes/base64.c b/duix-sdk/src/main/cpp/aes/base64.c
new file mode 100644
index 0000000..a35725d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/base64.c
@@ -0,0 +1,164 @@
+/* This is a public domain base64 implementation written by WEI Zhicheng. */
+
+#include "base64.h"
+
+/* Padding character appended to the final output group. */
+#define BASE64_PAD '='
+/* Lowest and highest ASCII codes that can appear in base64 text; used by the
+ * decoder as a cheap range check before indexing the decode table. */
+#define BASE64DE_FIRST '+'
+#define BASE64DE_LAST 'z'
+
+/* BASE 64 encode table: maps a 6-bit value (0..63) to its output character. */
+static const char base64en[] = {
+	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+	'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+	'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+	'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+	'w', 'x', 'y', 'z', '0', '1', '2', '3',
+	'4', '5', '6', '7', '8', '9', '+', '/',
+};
+
+/* Decode table indexed by ASCII code; 255 marks a character outside the base64 alphabet. */
+static const unsigned char base64de[] = {
+	/* nul, soh, stx, etx, eot, enq, ack, bel, */
+	   255, 255, 255, 255, 255, 255, 255, 255,
+
+	/*  bs,  ht,  nl,  vt,  np,  cr,  so,  si, */
+	   255, 255, 255, 255, 255, 255, 255, 255,
+
+	/* dle, dc1, dc2, dc3, dc4, nak, syn, etb, */
+	   255, 255, 255, 255, 255, 255, 255, 255,
+
+	/* can,  em, sub, esc,  fs,  gs,  rs,  us, */
+	   255, 255, 255, 255, 255, 255, 255, 255,
+
+	/*  sp, '!', '"', '#', '$', '%', '&', ''', */
+	   255, 255, 255, 255, 255, 255, 255, 255,
+
+	/* '(', ')', '*', '+', ',', '-', '.', '/', */
+	   255, 255, 255,  62, 255, 255, 255,  63,
+
+	/* '0', '1', '2', '3', '4', '5', '6', '7', */
+	    52,  53,  54,  55,  56,  57,  58,  59,
+
+	/* '8', '9', ':', ';', '<', '=', '>', '?', */
+	    60,  61, 255, 255, 255, 255, 255, 255,
+
+	/* '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', */
+	   255,   0,   1,  2,   3,   4,   5,    6,
+
+	/* 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', */
+	     7,   8,   9,  10,  11,  12,  13,  14,
+
+	/* 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', */
+	    15,  16,  17,  18,  19,  20,  21,  22,
+
+	/* 'X', 'Y', 'Z', '[', '\', ']', '^', '_', */
+	    23,  24,  25, 255, 255, 255, 255, 255,
+
+	/* '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', */
+	   255,  26,  27,  28,  29,  30,  31,  32,
+
+	/* 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', */
+	    33,  34,  35,  36,  37,  38,  39,  40,
+
+	/* 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', */
+	    41,  42,  43,  44,  45,  46,  47,  48,
+
+	/* 'x', 'y', 'z', '{', '|', '}', '~', del, */
+	    49,  50,  51, 255, 255, 255, 255, 255
+};
+
+/*
+ * Base64-encode `inlen` bytes from `in` into `out` and NUL-terminate the
+ * result.  Complete 3-byte groups become four characters; a trailing group
+ * of one or two bytes is padded with '='.  Returns the number of characters
+ * written, not counting the terminating NUL.
+ */
+unsigned int
+gjbase64_encode(const unsigned char *in, unsigned int inlen, char *out)
+{
+	unsigned int src = 0;
+	unsigned int dst = 0;
+	unsigned int rest;
+
+	/* Whole groups: 3 input bytes -> 4 output characters. */
+	while (inlen - src >= 3) {
+		const unsigned char b0 = in[src];
+		const unsigned char b1 = in[src + 1];
+		const unsigned char b2 = in[src + 2];
+
+		out[dst++] = base64en[(b0 >> 2) & 0x3F];
+		out[dst++] = base64en[((b0 & 0x3) << 4) | ((b1 >> 4) & 0xF)];
+		out[dst++] = base64en[((b1 & 0xF) << 2) | ((b2 >> 6) & 0x3)];
+		out[dst++] = base64en[b2 & 0x3F];
+		src += 3;
+	}
+
+	/* Trailing partial group, padded up to four characters. */
+	rest = inlen - src;
+	if (rest == 1) {
+		const unsigned char b0 = in[src];
+
+		out[dst++] = base64en[(b0 >> 2) & 0x3F];
+		out[dst++] = base64en[(b0 & 0x3) << 4];
+		out[dst++] = BASE64_PAD;
+		out[dst++] = BASE64_PAD;
+	} else if (rest == 2) {
+		const unsigned char b0 = in[src];
+		const unsigned char b1 = in[src + 1];
+
+		out[dst++] = base64en[(b0 >> 2) & 0x3F];
+		out[dst++] = base64en[((b0 & 0x3) << 4) | ((b1 >> 4) & 0xF)];
+		out[dst++] = base64en[(b1 & 0xF) << 2];
+		out[dst++] = BASE64_PAD;
+	}
+
+	out[dst] = 0;
+
+	return dst;
+}
+
+/*
+ * Decode `inlen` base64 characters from `in` into `out`.  `inlen` must be a
+ * multiple of four; decoding stops at the first '=' character.  Returns the
+ * number of complete bytes produced, or 0 when the length is invalid or a
+ * character outside the base64 alphabet is met.
+ */
+unsigned int
+gjbase64_decode(const char *in, unsigned int inlen, unsigned char *out)
+{
+	unsigned int pos;
+	unsigned int written = 0;
+
+	if ((inlen & 0x3) != 0)
+		return 0;
+
+	for (pos = 0; pos < inlen && in[pos] != BASE64_PAD; pos++) {
+		unsigned char sextet;
+		const unsigned int phase = pos & 0x3;
+
+		/* Range check first so the table lookup stays in bounds. */
+		if (in[pos] < BASE64DE_FIRST || in[pos] > BASE64DE_LAST)
+			return 0;
+
+		sextet = base64de[(unsigned char)in[pos]];
+		if (sextet == 255)
+			return 0;
+
+		/* Each character contributes 6 bits spread over up to two bytes. */
+		if (phase == 0) {
+			out[written] = (sextet << 2) & 0xFF;
+		} else if (phase == 1) {
+			out[written++] |= (sextet >> 4) & 0x3;
+			out[written] = (sextet & 0xF) << 4;
+		} else if (phase == 2) {
+			out[written++] |= (sextet >> 2) & 0xF;
+			out[written] = (sextet & 0x3) << 6;
+		} else {
+			out[written++] |= sextet;
+		}
+	}
+
+	return written;
+}
diff --git a/duix-sdk/src/main/cpp/aes/base64.h b/duix-sdk/src/main/cpp/aes/base64.h
new file mode 100644
index 0000000..34ad948
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/base64.h
@@ -0,0 +1,29 @@
+#ifndef BASE64_H
+#define BASE64_H
+
+#define BASE64_ENCODE_OUT_SIZE(s) ((unsigned int)((((s) + 2) / 3) * 4 + 1))
+#define BASE64_DECODE_OUT_SIZE(s) ((unsigned int)(((s) / 4) * 3))
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * `out` receives the NUL-terminated encoded string.
+ * The return value is the length of `out`, excluding the terminating `\0'.
+ */
+unsigned int
+gjbase64_encode(const unsigned char *in, unsigned int inlen, char *out);
+
+/*
+ * The return value is the number of bytes written to `out`.
+ */
+unsigned int
+gjbase64_decode(const char *in, unsigned int inlen, unsigned char *out);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* BASE64_H */
diff --git a/duix-sdk/src/main/cpp/aes/cbc128.c b/duix-sdk/src/main/cpp/aes/cbc128.c
new file mode 100644
index 0000000..f10d9ca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/cbc128.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+// #include <openssl/crypto.h>
+#include "modes.h"
+#include <string.h>
+
+#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
+# define STRICT_ALIGNMENT 0
+#endif
+
+/*
+ * CBC-encrypt `len` bytes from `in` to `out` using the single-block cipher
+ * `block` with key schedule `key`.
+ *
+ * Each 16-byte block is XORed with the previous ciphertext block (initially
+ * `ivec`) and then passed through `block`.  A trailing partial block is
+ * treated as if it were zero-padded, so a full 16-byte block is still
+ * written for it.  On return `ivec` holds the last ciphertext block.
+ * Nothing happens when `len` is 0.
+ */
+void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block)
+{
+    size_t n;
+    const unsigned char *iv = ivec;
+
+    if (len == 0)
+        return;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+    /* Byte-wise path: only when STRICT_ALIGNMENT is non-zero and at least
+     * one pointer is not size_t-aligned. */
+    if (STRICT_ALIGNMENT &&
+        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+        while (len >= 16) {
+            for (n = 0; n < 16; ++n)
+                out[n] = in[n] ^ iv[n];
+            (*block) (out, out, key);
+            iv = out;
+            len -= 16;
+            in += 16;
+            out += 16;
+        }
+    } else {
+        /* Word-wise path: XOR one size_t at a time. */
+        while (len >= 16) {
+            for (n = 0; n < 16; n += sizeof(size_t))
+                *(size_t *)(out + n) =
+                    *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+            (*block) (out, out, key);
+            iv = out;
+            len -= 16;
+            in += 16;
+            out += 16;
+        }
+    }
+#endif
+    /* Remaining bytes (or everything, in small-footprint builds). */
+    while (len) {
+        for (n = 0; n < 16 && n < len; ++n)
+            out[n] = in[n] ^ iv[n];
+        /* Missing input bytes count as zero, so the XOR leaves iv[n]. */
+        for (; n < 16; ++n)
+            out[n] = iv[n];
+        (*block) (out, out, key);
+        iv = out;
+        if (len <= 16)
+            break;
+        len -= 16;
+        in += 16;
+        out += 16;
+    }
+    /* Hand the chaining value back so a later call can continue the stream. */
+    memcpy(ivec, iv, 16);
+}
+
+/*
+ * CBC-decrypt `len` bytes from `in` to `out` using the single-block cipher
+ * `block` with key schedule `key`.
+ *
+ * Works both with separate buffers (in != out) and in place (in == out); in
+ * the in-place case every ciphertext word/byte is saved before it is
+ * overwritten so that it can serve as the next chaining value.  `ivec` is
+ * updated to the last ciphertext block.  For a trailing partial block only
+ * `len` output bytes are written, although 16 bytes are still read from
+ * `in`.  Nothing happens when `len` is 0.
+ */
+void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block)
+{
+    size_t n;
+    /* Scratch block, addressable as bytes or as size_t words. */
+    union {
+        size_t t[16 / sizeof(size_t)];
+        unsigned char c[16];
+    } tmp;
+
+    if (len == 0)
+        return;
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+    if (in != out) {
+        /* Separate buffers: the previous ciphertext block stays readable in
+         * `in`, so `iv` can simply point at it. */
+        const unsigned char *iv = ivec;
+
+        if (STRICT_ALIGNMENT &&
+            ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+            while (len >= 16) {
+                (*block) (in, out, key);
+                for (n = 0; n < 16; ++n)
+                    out[n] ^= iv[n];
+                iv = in;
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        } else if (16 % sizeof(size_t) == 0) { /* always true */
+            while (len >= 16) {
+                size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
+
+                (*block) (in, out, key);
+                for (n = 0; n < 16 / sizeof(size_t); n++)
+                    out_t[n] ^= iv_t[n];
+                iv = in;
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        }
+        memcpy(ivec, iv, 16);
+    } else {
+        /* In place: decrypt into tmp and update ivec as we go, because the
+         * ciphertext is about to be overwritten by the plaintext. */
+        if (STRICT_ALIGNMENT &&
+            ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+            unsigned char c;
+            while (len >= 16) {
+                (*block) (in, tmp.c, key);
+                for (n = 0; n < 16; ++n) {
+                    c = in[n];
+                    out[n] = tmp.c[n] ^ ivec[n];
+                    ivec[n] = c;
+                }
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        } else if (16 % sizeof(size_t) == 0) { /* always true */
+            while (len >= 16) {
+                size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
+                const size_t *in_t = (const size_t *)in;
+
+                (*block) (in, tmp.c, key);
+                for (n = 0; n < 16 / sizeof(size_t); n++) {
+                    c = in_t[n];
+                    out_t[n] = tmp.t[n] ^ ivec_t[n];
+                    ivec_t[n] = c;
+                }
+                len -= 16;
+                in += 16;
+                out += 16;
+            }
+        }
+    }
+#endif
+    /* Remaining bytes (or everything, in small-footprint builds). */
+    while (len) {
+        unsigned char c;
+        (*block) (in, tmp.c, key);
+        for (n = 0; n < 16 && n < len; ++n) {
+            c = in[n];
+            out[n] = tmp.c[n] ^ ivec[n];
+            ivec[n] = c;
+        }
+        if (len <= 16) {
+            /* Partial block: still record the full ciphertext block in ivec. */
+            for (; n < 16; ++n)
+                ivec[n] = in[n];
+            break;
+        }
+        len -= 16;
+        in += 16;
+        out += 16;
+    }
+}
diff --git a/duix-sdk/src/main/cpp/aes/gaes_stream.cc b/duix-sdk/src/main/cpp/aes/gaes_stream.cc
new file mode 100644
index 0000000..b8812cd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/gaes_stream.cc
@@ -0,0 +1,213 @@
+
+#include "gaes_stream.h"
+
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstdio>
+#include <cstdlib>
+#include "gj_aes.h"
+
+
+/*
+ * Read-only std::streambuf that decrypts a "gaes" container while reading.
+ *
+ * The constructor opens the file, skips the 32-byte header and sets up a
+ * decrypting cipher context; underflow() then reads ciphertext in multiples
+ * of 16 bytes into a scratch buffer and decrypts it into the get area.
+ * If the file cannot be opened (or the cipher context cannot be created)
+ * the buffer simply reports end-of-file.
+ *
+ * NOTE(review): the plaintext length stored in the header is only printed,
+ * not applied, so the zero padding of the last block is delivered to the
+ * reader as well -- confirm that consumers tolerate the trailing bytes.
+ * NOTE(review): key and IV are identical and hard-coded (obfuscation only).
+ */
+class GaesIStreamBuf final: public std::streambuf
+{
+private:
+    char *m_inbuf;          // get area handed to std::streambuf
+    size_t m_inbufsize;     // capacity of m_inbuf and m_leftbuf
+    bool m_owns_inbuf;      // true when m_inbuf was allocated by setbuf()
+    char *m_leftbuf;        // scratch buffer for ciphertext, always owned
+
+    FILE *file;
+    uint64_t cur_size;      // file bytes consumed so far, header included
+    uint64_t file_size;     // total size of the file on disk
+    gj_aesc_t* aesc ;
+
+    // The class owns raw buffers, a FILE* and a cipher context.
+    GaesIStreamBuf(const GaesIStreamBuf&) = delete;
+    GaesIStreamBuf& operator=(const GaesIStreamBuf&) = delete;
+protected:
+    // Installs `s` (or a freshly allocated buffer when `s` is null) as the
+    // get area and (re)allocates the ciphertext scratch buffer to match.
+    std::streambuf* setbuf(char *s, std::streamsize n) override {
+        setg(0, 0, 0);
+        if (m_owns_inbuf) {
+            delete [] m_inbuf;
+        }
+        delete [] m_leftbuf;
+        m_leftbuf = 0;
+        m_inbufsize = n > 0 ? static_cast<size_t>(n) : 0;
+        if (s) {
+            m_inbuf = s;
+            m_owns_inbuf = false;
+        } else {
+            m_inbuf = new char[m_inbufsize];
+            m_owns_inbuf = true;
+        }
+        // The scratch buffer is needed for caller-supplied buffers too.
+        m_leftbuf = new char[m_inbufsize];
+        return this;
+    }
+
+    int sync() override {
+        return 0;
+    }
+
+    // Refills the get area with the next decrypted chunk.
+    int_type underflow() override {
+        const int_type eof = traits_type::eof();
+        if (!file || !aesc) return eof;
+        if (cur_size >= file_size) {
+            printf("===eof %llu ===%llu\n",
+                   (unsigned long long)cur_size, (unsigned long long)file_size);
+            return eof;
+        }
+        bool initial = false;
+        if (eback() == 0) {
+            setg(m_inbuf, m_inbuf + m_inbufsize, m_inbuf + m_inbufsize);
+            initial = true;
+        }
+        if (gptr() != egptr()) {
+            return traits_type::to_int_type(*gptr());
+        }
+
+        // Keep up to 4 already-consumed bytes in front for unget().
+        const size_t unget_sz = initial ? 0 : std::min<size_t>((egptr() - eback()) / 2, 4);
+        memmove(eback(), egptr() - unget_sz, unget_sz);
+        const size_t nmemb = static_cast<size_t>(egptr() - eback() - unget_sz);
+        char* pdst = eback() + unget_sz;
+
+        // Only whole cipher blocks are requested, capped by what is left.
+        size_t want = nmemb - (nmemb % 16);
+        const uint64_t remaining = file_size - cur_size;
+        if (want > remaining) want = static_cast<size_t>(remaining);
+
+        memset(m_leftbuf, 0, m_inbufsize);
+        const size_t rd = fread(m_leftbuf, 1, want, file);
+        if (rd == 0) return eof;
+        cur_size += rd;
+
+        // Decrypt exactly the blocks that were read; a short final block is
+        // zero-padded by the memset above.  Rounded-up size still fits since
+        // rd never exceeds the original multiple-of-16 request.
+        const size_t blocks = (rd + 15) / 16;
+        for (size_t k = 0; k < blocks; ++k) {
+            int outlen = 0;
+            do_aesc(aesc, m_leftbuf + 16 * k, 16, pdst + 16 * k, &outlen);
+        }
+        setg(eback(), eback() + unget_sz, eback() + unget_sz + rd);
+        return traits_type::to_int_type(*gptr());
+    }
+public:
+    GaesIStreamBuf(const std::string& filename)
+        : m_inbuf(0), m_inbufsize(0), m_owns_inbuf(false), m_leftbuf(0),
+          file(0), cur_size(0), file_size(0), aesc(0) {
+        setbuf(0, 1024);
+        file = fopen(filename.c_str(), "rb");
+        if (!file) return;
+
+        long end = -1;
+        if (fseek(file, 0, SEEK_END) == 0) end = ftell(file); // total file size
+        if (end < 0 || fseek(file, 0, SEEK_SET) != 0) {
+            close();
+            return;
+        }
+        file_size = static_cast<uint64_t>(end);
+
+        // Arrays instead of literals: init_aesc takes non-const char*.
+        char key[] = "yymrjzbwyrbjszrk";
+        char aiv[] = "yymrjzbwyrbjszrk";
+        if (init_aesc(key, aiv, 0, &this->aesc) != 0) {
+            aesc = 0;
+        }
+
+        // Header: 8-byte tag, 8-byte stored length, 16 further bytes.
+        char head[50];
+        memset(head, 0, sizeof(head));
+        uint64_t stored = 0;
+        size_t rst = fread(head, 1, 8, file);
+        rst = fread(&stored, 1, 8, file);
+        printf("===head %s size %llu\n", head, (unsigned long long)stored);
+        rst = fread(head, 1, 16, file);
+        (void)rst;
+        cur_size = 32;
+    }
+
+    ~GaesIStreamBuf(){
+        close();
+        if (m_owns_inbuf) {
+            delete[] m_inbuf;
+        }
+        delete[] m_leftbuf;
+    }
+
+    // Releases the cipher context and the file; safe to call repeatedly.
+    void close(){
+        if(aesc){
+            free_aesc(&this->aesc);
+        }
+        if (file){
+            fclose(file);
+            file = NULL;
+        }
+    }
+};
+
+
+
+// Builds an istream on top of a heap-allocated GaesIStreamBuf for `filename`.
+GaesIStream::GaesIStream(std::string filename)
+    : std::istream(new GaesIStreamBuf(filename))
+{
+}
+
+// Destroys the stream buffer created by the constructor.
+GaesIStream::~GaesIStream()
+{
+    std::streambuf* owned = rdbuf();
+    delete owned;
+}
+
+#ifdef TEST
+// Test helper: decrypts argv[1] through GaesIStream and writes the result
+// to argv[2].  Only the bytes actually delivered by each read are written,
+// so a short or failed final read no longer appends stale buffer contents.
+int maindec(int argc,char** argv){
+    std::string filename(argv[1]);
+    GaesIStream fin(filename);
+    std::string fn2(argv[2]);
+    std::ofstream fout(fn2,std::ios::binary);
+
+    char buf[1024];
+    for(;;){
+        fin.read(buf,16);
+        const std::streamsize got = fin.gcount();
+        if(got<=0) break;
+        fout.write(buf,got);
+    }
+    return 0;
+}
+
+
+
+// Test helper: encrypts argv[1] into argv[2] using the container layout
+// (8-byte tag, 8-byte length, 16 zero bytes, 16-byte cipher blocks).
+// Returns 0 on success and -1 when a file or the cipher context cannot be
+// set up.
+int mainenc(int argc,char** argv){
+    char result[255] ;
+    memset(result,0,255);
+    // Arrays instead of literals: init_aesc takes non-const char*.
+    char key[] = "yymrjzbwyrbjszrk";
+    char aiv[] = "yymrjzbwyrbjszrk";
+    int outlen = 0;
+    FILE* fr = fopen(argv[1],"rb");
+    FILE* fw = fopen(argv[2],"wb");
+    gj_aesc_t* aesc = NULL;
+    if(!fr || !fw || init_aesc(key,aiv,1,&aesc)!=0 || !aesc){
+        if(aesc) free_aesc(&aesc);
+        if(fr) fclose(fr);
+        if(fw) fclose(fw);
+        return -1;
+    }
+    fwrite("abcdefgh",1,8,fw);
+    uint64_t size = 0;
+    fwrite(&size,1,8,fw);
+    fwrite(&size,1,8,fw);
+    fwrite(&size,1,8,fw);
+    for(;;){
+        char data[16];
+        memset(data,0,16);
+        size_t rst = fread(data,1,16,fr);
+        // rst is at most 16; cast so the %d conversion is well defined.
+        printf("===rst %d\n",(int)rst);
+        // Stop on EOF and on read errors (feof() alone never ends on error).
+        if(!rst) break;
+        size +=rst;
+        do_aesc(aesc,data,16,result,&outlen);
+        printf("===out %d\n",outlen);
+        fwrite(result,1,16,fw);
+    }
+    fseek(fw,8,SEEK_SET);
+    fwrite(&size,1,8,fw);
+    free_aesc(&aesc);
+    fclose(fr);
+    fclose(fw);
+    return 0;
+}
+
+// Test driver: two file arguments encrypt, any additional argument selects
+// decryption.  Both helpers read argv[1] and argv[2], so fewer than two
+// file arguments is rejected instead of dereferencing missing entries.
+int main(int argc,char** argv){
+    if(argc<3){
+        printf("usage: %s filein fileout [dec]\n",argc>0?argv[0]:"gaes");
+        return 1;
+    }
+    if(argc<4){
+        return mainenc(argc,argv);
+    }else{
+        return maindec(argc,argv);
+    }
+}
+#endif
diff --git a/duix-sdk/src/main/cpp/aes/gaes_stream.h b/duix-sdk/src/main/cpp/aes/gaes_stream.h
new file mode 100644
index 0000000..d3e2d1d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/gaes_stream.h
@@ -0,0 +1,22 @@
+#ifndef COMPRESSED_STREAMS_ZSTD_STREAM_H
+#define COMPRESSED_STREAMS_ZSTD_STREAM_H
+
+#include <iostream>
+
+
+
+
+
+// Input stream over the file `filename`.  The constructor and destructor
+// are defined in gaes_stream.cc.
+class GaesIStream: public std::istream
+{
+public:
+    // Opens `filename` for reading.
+    GaesIStream(std::string filename);
+
+    virtual ~GaesIStream();
+};
+
+
+
+
+
+#endif // COMPRESSED_STREAMS_ZSTD_STREAM_H
diff --git a/duix-sdk/src/main/cpp/aes/gaesmain b/duix-sdk/src/main/cpp/aes/gaesmain
new file mode 100644
index 0000000..c3cae0f
Binary files /dev/null and b/duix-sdk/src/main/cpp/aes/gaesmain differ
diff --git a/duix-sdk/src/main/cpp/aes/gj_aes.c b/duix-sdk/src/main/cpp/aes/gj_aes.c
new file mode 100644
index 0000000..3314ef6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/gj_aes.c
@@ -0,0 +1,69 @@
+#include <stdlib.h>
+#include <string.h>
+#include "gj_aes.h"
+#include "base64.h"
+
+#include "aes.h"
+
+
+/* Cipher context created by init_aesc() and released by free_aesc(). */
+struct gj_aesc_s{
+    char key[16];       /* copy of the 16-byte key passed to init_aesc() */
+    char iv[16];        /* chaining value; handed to AES_cbc_encrypt on every call */
+    int enc;            /* direction flag passed through to AES_cbc_encrypt */
+	AES_KEY *aeskey;    /* heap-allocated key schedule */
+};
+
+/*
+ * Release the context referenced by `*paesc` together with its key
+ * schedule and reset `*paesc` to NULL.  Returns 0 on success, -1 when
+ * there is nothing to free.
+ */
+int free_aesc(gj_aesc_t** paesc){
+    gj_aesc_t* ctx = paesc ? *paesc : NULL;
+    if(ctx==NULL){
+        return -1;
+    }
+    if(ctx->aeskey!=NULL){
+        free(ctx->aeskey);
+    }
+    free(ctx);
+    *paesc = NULL;
+    return 0;
+}
+
+
+/*
+ * Create a cipher context for a 16-character key and a 16-character IV.
+ * `enc` selects the encrypt (non-zero) or decrypt (zero) key schedule.
+ * On success the new context is stored in `*paesc` and 0 is returned.
+ *
+ * Errors (nothing is stored in `*paesc`):
+ *   -1  key is NULL or not exactly 16 characters long
+ *   -2  iv is NULL or not exactly 16 characters long
+ *   -3  paesc is NULL or memory allocation failed
+ */
+int init_aesc(char* key,char* iv,int enc,gj_aesc_t** paesc){
+    gj_aesc_t* aesc;
+
+    if(!key||strlen(key)!=16) return -1;
+    if(!iv||strlen(iv)!=16) return -2;
+    if(!paesc) return -3;
+
+    aesc = (gj_aesc_t*)malloc(sizeof(gj_aesc_t));
+    if(!aesc) return -3;
+    aesc->aeskey = (AES_KEY*)malloc(sizeof(AES_KEY));
+    if(!aesc->aeskey){
+        free(aesc);
+        return -3;
+    }
+
+    memcpy(aesc->key,key,16);
+    memcpy(aesc->iv,iv,16);
+    aesc->enc = enc;
+    if(enc){
+        AES_set_encrypt_key((const unsigned char*)aesc->key, 128, aesc->aeskey);
+    }else{
+        AES_set_decrypt_key((const unsigned char*)aesc->key, 128, aesc->aeskey);
+    }
+    *paesc = aesc;
+    return 0;
+}
+
+/*
+ * Run AES-CBC over `in` in 16-byte steps, writing to `out`.  The number of
+ * steps is inlen rounded up to whole blocks, so both buffers must be
+ * readable/writable up to that rounded size.  `*outlen` receives the
+ * number of bytes produced; the function always returns 0.
+ */
+int do_aesc(gj_aesc_t* aesc,char* in,int inlen,char* out,int* outlen){
+    int done;
+    for(done=0;done<inlen;done+=16){
+        AES_cbc_encrypt((const unsigned char*)(in+done),(unsigned char*)(out+done),16,
+                        aesc->aeskey,(unsigned char*)aesc->iv,aesc->enc);
+    }
+    *outlen = done;
+    return 0;
+}
+
+/*
+ * Base64-encode (enc != 0) or decode (enc == 0) `inlen` bytes from `in`
+ * into `out`, storing the produced length in `*outlen`.  Always returns 0.
+ */
+int do_base64(int enc,char* in,int inlen,char* out,int* outlen){
+    if(!enc){
+        *outlen = gjbase64_decode(in,inlen,(unsigned char*)out);
+        return 0;
+    }
+    gjbase64_encode((unsigned char*)in,inlen,out);
+    *outlen = strlen(out);
+    return 0;
+}
diff --git a/duix-sdk/src/main/cpp/aes/gj_aes.h b/duix-sdk/src/main/cpp/aes/gj_aes.h
new file mode 100644
index 0000000..a09338f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/gj_aes.h
@@ -0,0 +1,22 @@
+#ifndef __GJ_AES_H__
+#define __GJ_AES_H__
+
+#include "gj_dll.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque cipher context; the layout is private to gj_aes.c. */
+typedef struct gj_aesc_s gj_aesc_t;
+
+/* Releases *paesc and sets it to NULL; returns -1 if there is nothing to free. */
+GJLIBAPI int free_aesc(gj_aesc_t** paesc);
+/* Creates a context from a 16-character key and IV; enc != 0 selects encryption. */
+GJLIBAPI int init_aesc(char* key,char* iv,int enc,gj_aesc_t** paesc);
+
+/* Processes `in` in 16-byte blocks into `out`; *outlen receives the bytes produced. */
+GJLIBAPI int do_aesc(gj_aesc_t* aesc,char* in,int inlen,char* out,int* outlen);
+
+/* Base64-encodes (enc != 0) or decodes (enc == 0) `in` into `out`. */
+GJLIBAPI int do_base64(int enc,char* in,int inlen,char* out,int* outlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/aes/gj_dll.h b/duix-sdk/src/main/cpp/aes/gj_dll.h
new file mode 100644
index 0000000..9d1c953
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/gj_dll.h
@@ -0,0 +1,21 @@
+#ifndef __GJ_DLL_H__
+#define __GJ_DLL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * GJLIBAPI marks functions exported from the library.  GJLIB_EXPORT is
+ * defined unconditionally right here, so the empty fallback in the #else
+ * branch below is never selected.
+ */
+#define GJLIB_EXPORT 1
+#if defined(GJLIB_EXPORT)
+    #if defined _WIN32 || defined __CYGWIN__
+        #define GJLIBAPI __declspec(dllexport)
+    #else
+        #define GJLIBAPI __attribute__((visibility("default")))
+    #endif
+#else
+    #define GJLIBAPI
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/duix-sdk/src/main/cpp/aes/makefile b/duix-sdk/src/main/cpp/aes/makefile
new file mode 100644
index 0000000..dbe0cac
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/makefile
@@ -0,0 +1,3 @@
+# Builds the stand-alone gjaesmain tool from aesmain.c and the AES/base64
+# sources.  -DTEST enables the main() that is guarded by #ifdef TEST.
+all:
+	g++ -fPIC -o gjaesmain -g aesmain.c \
+		aes_cbc.c aes_core.c aes_ecb.c cbc128.c base64.c gj_aes.c -lm --std=c++11 -I.   -DTEST
diff --git a/duix-sdk/src/main/cpp/aes/modes.h b/duix-sdk/src/main/cpp/aes/modes.h
new file mode 100644
index 0000000..8d2c17e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/aes/modes.h
@@ -0,0 +1,22 @@
+#ifndef HEADER_MODES_H
+# define HEADER_MODES_H
+
+# include <stddef.h>
+
+/* Single-block primitive: transforms one 16-byte block from `in` to `out`. */
+typedef void (*block128_f) (const unsigned char in[16],
+                            unsigned char out[16], const void *key);
+
+/* Whole-buffer CBC routine with an explicit direction flag `enc`. */
+typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
+                          size_t len, const void *key,
+                          unsigned char ivec[16], int enc);
+
+/* Generic CBC mode built on a block128_f; `ivec` is updated in place so that
+ * consecutive calls continue the same chain. */
+void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block);
+void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
+                           size_t len, const void *key,
+                           unsigned char ivec[16], block128_f block);
+
+
+
+#endif
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/android/DuixJni.cpp b/duix-sdk/src/main/cpp/android/DuixJni.cpp
new file mode 100644
index 0000000..42f5a01
--- /dev/null
+++ b/duix-sdk/src/main/cpp/android/DuixJni.cpp
@@ -0,0 +1,243 @@
+#include <android/asset_manager_jni.h>
+#include <android/native_window_jni.h>
+#include <android/native_window.h>
+#include <android/log.h>
+#include <jni.h>
+#include <string>
+#include <vector>
+#include <unistd.h>
+#include "gjsimp.h"
+#include "JniHelper.h"
+#include "aesmain.h"
+#include "jmat.h"
+#include "Log.h"
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+       //
+       //
+#define TAG  "tooken"
+#ifdef DEBUGME
+#define JNIEXPORT 
+#define JNI_OnLoad
+#define jint int
+#define jlong long
+#define jstring string
+#define JNICALL 
+#define JavaVM void
+#define LOGI(...)
+#define JNIEnv void
+#define jobject void*
+#endif
+extern "C" {
+
+  // Process-wide engine handle: set by alloc(), released by free() and
+  // JNI_OnUnload(); every other entry point bails out while it is null.
+  static dhduix_t* g_digit = 0;
+  // Scratch image used by startgpg(); deleted by stopgpg().
+  static JMat*    g_gpgmat = NULL;
+  // NOTE(review): g_width / g_height are only referenced by commented-out
+  // code in the visible part of this file.
+  static int  g_width = 540;
+  static int  g_height = 960;
+  // Task id recorded by alloc(); free() only acts when it matches.
+  static int  g_taskid = -1;
+
+  // Called by the VM when the library is loaded: remembers the JavaVM for
+  // JniHelper and reports the JNI version this library requires.
+  JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *reserved) {
+    LOGD(TAG, "JNI_OnLoad");
+    //g_digit = new GDigit(g_width,g_height,g_msgcb);
+    JniHelper::sJavaVM = vm;
+    return JNI_VERSION_1_4;
+  }
+
+  // Called by the VM when the library is unloaded: releases the engine
+  // instance if one is still allocated.
+  JNIEXPORT void JNI_OnUnload(JavaVM *vm, void *reserved) {
+    LOGI(TAG, "unload");
+    if(!g_digit){
+      return;
+    }
+    dhduix_free(g_digit);
+    g_digit = nullptr;
+  }
+
+  // Copies a Java string into a std::string (modified UTF-8).  Returns an
+  // empty string when `obj` is null or the VM cannot provide the characters
+  // (in which case an OutOfMemoryError is pending), instead of constructing
+  // a std::string from a null pointer.
+  static std::string getStringUTF(JNIEnv *env, jstring obj) {
+    if (obj == nullptr) {
+      return std::string();
+    }
+    const char *utf = env->GetStringUTFChars(obj, nullptr);
+    if (utf == nullptr) {
+      return std::string();
+    }
+    std::string copy(utf);
+    env->ReleaseStringUTFChars(obj, utf);
+    return copy;
+  }
+
+
+  // Records `taskid` and asks dhduix_alloc to create the engine instance
+  // stored in g_digit.  Always returns 0; the result of dhduix_alloc is not
+  // inspected.
+  // NOTE(review): calling this while g_digit is already set overwrites the
+  // handle without freeing it first -- confirm whether dhduix_alloc copes.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_alloc(JNIEnv *env, jobject thiz,
+      jint taskid,jint mincalc,jint width,jint height){
+    LOGI(TAG, "create");
+    g_taskid = taskid;
+    dhduix_alloc(&g_digit,mincalc,width,height);
+    return 0;
+  }
+
+  // Releases the engine instance when `taskid` matches the id recorded by
+  // alloc().  Like JNI_OnUnload, it only calls dhduix_free when an instance
+  // exists, so a repeated free() is harmless.  Always returns 0.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_free(JNIEnv *env, jobject thiz,jint taskid){
+    if(g_taskid==taskid && g_digit){
+      dhduix_free(g_digit);
+      g_digit = nullptr;
+    }
+    return 0;
+  }
+
+  // Forwards the PCM buffer parameters to dhduix_initPcmex.  Returns its
+  // result, or -1 when no engine instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_initPcmex(JNIEnv *env, jobject thiz,
+      jint maxsize,jint minoff,jint minblock,jint maxblock,jint rgb){
+    if(g_digit){
+      return dhduix_initPcmex(g_digit,maxsize,minoff,minblock,maxblock,rgb);
+    }
+    return -1;
+  }
+
+  // Passes the path `fnwenet` to dhduix_initWenet.  Returns its result, or
+  // -1 when no engine instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_initWenet(JNIEnv *env, jobject thiz,
+      jstring fnwenet){
+    if(g_digit){
+      std::string path = getStringUTF(env,fnwenet);
+      return dhduix_initWenet(g_digit,const_cast<char*>(path.c_str()));
+    }
+    return -1;
+  }
+
+  // Passes the three file paths to dhduix_initMunet.  Returns its result,
+  // or -1 when no engine instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_initMunet(JNIEnv *env, jobject thiz,
+      jstring fnparam,jstring fnbin,jstring fnmask){
+    if(g_digit){
+      std::string paramPath = getStringUTF(env,fnparam);
+      std::string binPath = getStringUTF(env,fnbin);
+      std::string maskPath = getStringUTF(env,fnmask);
+      return dhduix_initMunet(g_digit,
+          const_cast<char*>(paramPath.c_str()),
+          const_cast<char*>(binPath.c_str()),
+          const_cast<char*>(maskPath.c_str()));
+    }
+    return -1;
+  }
+
+  // Like initMunet, with an extra `kind` argument; a `kind` of 0 is
+  // replaced by 168 before the call to dhduix_initMunetex.  Returns its
+  // result, or -1 when no engine instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_initMunetex(JNIEnv *env, jobject thiz,
+      jstring fnparam,jstring fnbin,jstring fnmask,jint kind){
+    if(g_digit){
+      std::string paramPath = getStringUTF(env,fnparam);
+      std::string binPath = getStringUTF(env,fnbin);
+      std::string maskPath = getStringUTF(env,fnmask);
+      return dhduix_initMunetex(g_digit,
+          const_cast<char*>(paramPath.c_str()),
+          const_cast<char*>(binPath.c_str()),
+          const_cast<char*>(maskPath.c_str()),
+          kind?kind:168);
+    }
+    return -1;
+  }
+
+  // Returns the session id produced by dhduix_newsession, or -1 when no
+  // engine instance has been allocated.
+  JNIEXPORT jlong JNICALL Java_ai_guiji_duix_DuixNcnn_newsession(JNIEnv *env, jobject thiz){
+    if(g_digit){
+      const uint64_t created = dhduix_newsession(g_digit);
+      return static_cast<jlong>(created);
+    }
+    return -1;
+  }
+
+  // Hands `size` bytes of `arrbuf` to dhduix_pushpcm for session `sessid`.
+  // Returns its result, or -1 when no engine instance exists, the array is
+  // null, `size` is negative or larger than the array, or the VM cannot pin
+  // the array.
+  // NOTE(review): the engine call runs inside a JNI critical region, where
+  // blocking and other JNI calls are not allowed -- confirm it is short.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_pushpcm(JNIEnv *env, jobject thiz,
+      jlong sessid,jbyteArray arrbuf,jint size,jint kind){
+    if(!g_digit)return -1;
+    // Reject lengths the array cannot back; the native side trusts `size`.
+    if(!arrbuf || size<0 || size>env->GetArrayLength(arrbuf))return -1;
+    jbyte *pcmbuf = (jbyte *) env->GetPrimitiveArrayCritical(arrbuf, 0);
+    if(!pcmbuf)return -1;
+    uint64_t sid = sessid;
+    int rst = dhduix_pushpcm(g_digit,sid,(char*)pcmbuf,size,kind);
+    env->ReleasePrimitiveArrayCritical(arrbuf,pcmbuf, 0);
+    return rst;
+  }
+
+  // Forwards to dhduix_finsession for session `sessid`; -1 when no engine
+  // instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_finsession(JNIEnv *env, jobject thiz,jlong sessid){
+    if(g_digit){
+      return dhduix_finsession(g_digit,static_cast<uint64_t>(sessid));
+    }
+    return -1;
+  }
+
+  // Forwards to dhduix_consession for session `sessid`; -1 when no engine
+  // instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_consession(JNIEnv *env, jobject thiz,jlong sessid){
+    if(g_digit){
+      return dhduix_consession(g_digit,static_cast<uint64_t>(sessid));
+    }
+    return -1;
+  }
+
+  // Forwards to dhduix_allcnt for session `sessid`; -1 when no engine
+  // instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_allcnt(JNIEnv *env, jobject thiz,jlong sessid){
+    if(g_digit){
+      return dhduix_allcnt(g_digit,static_cast<uint64_t>(sessid));
+    }
+    return -1;
+  }
+
+  // Forwards to dhduix_readycnt for session `sessid`; -1 when no engine
+  // instance has been allocated.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_readycnt(JNIEnv *env, jobject thiz,jlong sessid){
+    if(g_digit){
+      return dhduix_readycnt(g_digit,static_cast<uint64_t>(sessid));
+    }
+    return -1;
+  }
+
+  // Loads the JPEG `picfile` into `arrpic` and, when `mskfile` is not
+  // empty, the JPEG `mskfile` into `arrmsk`, each through a JMat that wraps
+  // the Java array as a width x height image.
+  // Returns 0 on completion (the loadjpg results are not inspected), or -1
+  // when a required array is null or cannot be pinned by the VM.
+  // NOTE(review): the arrays are assumed to be large enough for a
+  // width x height image in JMat's layout -- confirm against the caller.
+  // NOTE(review): loadjpg performs file I/O inside a JNI critical region.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_fileload(JNIEnv* env, jobject thiz,
+      jstring picfile, jstring mskfile,jint width,jint height,
+      jbyteArray arrpic,jbyteArray arrmsk,jint bursize){
+    std::string s_pic = getStringUTF(env,picfile);
+    std::string s_msk = getStringUTF(env,mskfile);
+    if(!arrpic)return -1;
+    jbyte *picbuf = (jbyte *) env->GetPrimitiveArrayCritical(arrpic, 0);
+    if(!picbuf)return -1;
+    JMat* mat_pic = new JMat(width,height,(uint8_t*)picbuf);
+    mat_pic->loadjpg(s_pic,1);
+    env->ReleasePrimitiveArrayCritical( arrpic,picbuf, 0);
+    delete mat_pic;
+
+    if(s_msk.length()){
+        if(!arrmsk)return -1;
+        jbyte *mskbuf = (jbyte *) env->GetPrimitiveArrayCritical(arrmsk, 0);
+        if(!mskbuf)return -1;
+        JMat* mat_msk = new JMat(width,height,(uint8_t*)mskbuf);
+        mat_msk->loadjpg(s_msk,1);
+        env->ReleasePrimitiveArrayCritical( arrmsk,mskbuf, 0);
+        delete mat_msk;
+    }
+    return 0;
+  }
+
+  // Calls dhduix_simpinx for session `sessid` and index `inx`, with the
+  // image bytes in `arrimg` and the box values in `arrbox`.
+  // Returns its result, or -1 when no engine instance exists, an array is
+  // null, or the VM cannot pin an array.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_bufrst(JNIEnv* env, jobject thiz,
+      jlong sessid, jintArray arrbox, jint inx,
+      jbyteArray arrimg,jint imgsize){
+    if(!g_digit)return -1;
+    if(!arrbox || !arrimg)return -1;
+    uint64_t sid = sessid;
+    jint *boxData = (jint*) env->GetPrimitiveArrayCritical( arrbox, 0);
+    if(!boxData)return -1;
+    jbyte *imgbuf = (jbyte*) env->GetPrimitiveArrayCritical(arrimg, 0);
+    if(!imgbuf){
+      // Nothing was written yet, so the box array can be dropped unchanged.
+      env->ReleasePrimitiveArrayCritical( arrbox, boxData, JNI_ABORT);
+      return -1;
+    }
+    int bnfinx = inx;
+    int rst = dhduix_simpinx(g_digit,sid,(uint8_t*)imgbuf, 0,0,
+        (int*)boxData,NULL,NULL,bnfinx);
+    env->ReleasePrimitiveArrayCritical( arrimg,imgbuf, 0);
+    env->ReleasePrimitiveArrayCritical( arrbox, boxData, 0);
+    return rst;
+  }
+
+  // Calls dhduix_fileinx for session `sessid` and index `inx` with the
+  // picture, mask and foreground file paths, the box values in `arrbox`
+  // and the `arrimg` / `arrmsk` byte buffers of size `imgsize`.
+  // Returns its result, or -1 when no engine instance exists, an array is
+  // null, or the VM cannot pin an array.
+  JNIEXPORT jint JNICALL Java_ai_guiji_duix_DuixNcnn_filerst(JNIEnv* env, jobject thiz,
+      jlong sessid,jstring picfile, jstring mskfile,
+      jintArray arrbox, jstring fgfile,jint inx,
+      jbyteArray arrimg,jbyteArray arrmsk,jint imgsize){
+    if(!g_digit)return -1;
+    if(!arrbox || !arrimg || !arrmsk)return -1;
+    uint64_t sid = sessid;
+    std::string s_pic = getStringUTF(env,picfile);
+    std::string s_msk = getStringUTF(env,mskfile);
+    std::string s_fg = getStringUTF(env,fgfile);
+    jint *boxData = (jint*) env->GetPrimitiveArrayCritical( arrbox, 0);
+    if(!boxData)return -1;
+    jbyte *imgbuf = (jbyte*) env->GetPrimitiveArrayCritical(arrimg, 0);
+    if(!imgbuf){
+      env->ReleasePrimitiveArrayCritical( arrbox, boxData, JNI_ABORT);
+      return -1;
+    }
+    jbyte *mskbuf = (jbyte*) env->GetPrimitiveArrayCritical(arrmsk, 0);
+    if(!mskbuf){
+      // Unwind the pins taken so far; nothing has been modified yet.
+      env->ReleasePrimitiveArrayCritical( arrimg,imgbuf, JNI_ABORT);
+      env->ReleasePrimitiveArrayCritical( arrbox, boxData, JNI_ABORT);
+      return -1;
+    }
+    int rst = dhduix_fileinx(g_digit,sid,
+        (char*)s_pic.c_str(),(int*)boxData,
+        (char*)s_msk.c_str(),(char*)s_fg.c_str(),
+        inx,(char*)imgbuf,(char*)mskbuf,imgsize);
+    env->ReleasePrimitiveArrayCritical( arrimg,imgbuf, 0);
+    env->ReleasePrimitiveArrayCritical( arrmsk,mskbuf, 0);
+    env->ReleasePrimitiveArrayCritical( arrbox, boxData, 0);
+    return rst;
+  }
+
+    // Loads the JPEG `picfn` into the shared scratch JMat (created on first
+    // use) and saves it to `gpgfn` via savegpg.  Returns the first non-zero
+    // result of loadjpg, otherwise the result of savegpg.
+    JNIEXPORT jint JNICALL
+        Java_ai_guiji_duix_DuixNcnn_startgpg(JNIEnv *env, jobject thiz, jstring picfn,jstring gpgfn){
+            std::string s_pic = getStringUTF(env,picfn);
+            std::string s_gpg = getStringUTF(env,gpgfn);
+            if(g_gpgmat==NULL){
+                g_gpgmat = new JMat();
+            }
+            int rst = g_gpgmat->loadjpg(s_pic);
+            if(rst==0){
+                rst = g_gpgmat->savegpg(s_gpg);
+            }
+            return rst;
+        }
+
+    // Runs mainenc on the two file paths; `kind` is passed as its `enc`
+    // flag (non-zero encrypts, zero decrypts).  Returns mainenc's result.
+    JNIEXPORT jint JNICALL
+        Java_ai_guiji_duix_DuixNcnn_processmd5(JNIEnv *env, jobject thiz, jint kind,jstring infn,jstring outfn){
+            std::string s_in = getStringUTF(env,infn);
+            std::string s_out = getStringUTF(env,outfn);
+            return mainenc(kind,const_cast<char*>(s_in.c_str()),const_cast<char*>(s_out.c_str()));
+        }
+
+    // Destroys the scratch JMat created by startgpg, if any.  Always
+    // returns 0.
+    JNIEXPORT jint JNICALL
+        Java_ai_guiji_duix_DuixNcnn_stopgpg(JNIEnv *env, jobject thiz){
+            // delete on a null pointer is a no-op, so no guard is needed.
+            delete g_gpgmat;
+            g_gpgmat = NULL;
+            return 0;
+    }
+}
+
diff --git a/duix-sdk/src/main/cpp/android/JniHelper.cpp b/duix-sdk/src/main/cpp/android/JniHelper.cpp
new file mode 100644
index 0000000..aa6f3fc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/android/JniHelper.cpp
@@ -0,0 +1,384 @@
+#include <malloc.h>
+#include "JniHelper.h"
+#include "Log.h"
+
+#define TAG "JniHelper"
+
+using namespace std;
+
+JavaVM *JniHelper::sJavaVM = nullptr;
+
+JNIEnv *JniHelper::getJNIEnv() {
+    // Returns the JNIEnv of the calling thread, attaching the thread to the VM first if needed.
+    // Returns nullptr when sJavaVM is unset, the attach fails, or the JNI version is unsupported.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return nullptr;
+    }
+
+    JNIEnv *env = nullptr;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            // A JNIEnv is only usable while its thread is attached. The thread is therefore left
+            // attached (detaching before returning would hand back a stale pointer); a thread
+            // attached here must call detachCurrentThread() before it exits.
+            if (sJavaVM->AttachCurrentThread(&env, nullptr) != JNI_OK) {
+                LOGE(TAG, "Could not attach current thread");
+                env = nullptr;
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    return env;
+}
+
+bool JniHelper::attachCurrentThread() {
+    // Attaches the calling thread to the VM when it is currently detached.
+    // Returns true only if this call performed the attach; false if the thread was already
+    // attached, sJavaVM is unset, the JNI version is rejected, or the attach failed.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return false;
+    }
+
+    JNIEnv *threadEnv = nullptr;
+    const jint envStatus = sJavaVM->GetEnv((void **) &threadEnv, JNI_VERSION_1_4);
+
+    if (envStatus == JNI_EVERSION) {
+        LOGE(TAG, "Invalid java version");
+        return false;
+    }
+    if (envStatus != JNI_EDETACHED) {
+        // JNI_OK (already attached) or any other status: nothing for this call to attach.
+        return false;
+    }
+
+    if (sJavaVM->AttachCurrentThread(&threadEnv, nullptr) == 0) {
+        return true;
+    }
+    LOGE(TAG, "Could not attach current thread");
+    return false;
+}
+
+void JniHelper::detachCurrentThread() {
+    if (sJavaVM != nullptr) sJavaVM->DetachCurrentThread();  // detach the calling thread; no-op (not a null dereference) when no VM is registered
+}
+
+void JniHelper::throwException(JNIEnv *env, const char *className, const char *msg) {
+    jclass exception = env->FindClass(className);  // className is a JNI class name, e.g. "java/lang/IllegalStateException"
+    if (exception != nullptr) { env->ThrowNew(exception, msg); env->DeleteLocalRef(exception); }  // on nullptr FindClass has already raised its own exception
+}
+
+jstring JniHelper::newStringUTF(JNIEnv *env, const char *data) {
+    // Decodes NUL-terminated UTF-8 bytes into a new java.lang.String; nullptr on null input or failure.
+    if (env == nullptr || data == nullptr) return nullptr;
+    const jsize size = static_cast<jsize>(strlen(data));
+    jbyteArray array = env->NewByteArray(size);
+    if (!array) {  // OutOfMemoryError exception has already been thrown.
+        LOGE(TAG, "convertString: OutOfMemoryError is thrown.");
+        return nullptr;
+    }
+    env->SetByteArrayRegion(array, 0, size, (const jbyte *) data);
+    jclass string_Clazz = env->FindClass("java/lang/String");
+    jmethodID string_initMethodID = env->GetMethodID(string_Clazz, "<init>", "([BLjava/lang/String;)V");
+    jstring utf = env->NewStringUTF("UTF-8");
+    jstring str = (jstring) env->NewObject(string_Clazz, string_initMethodID, array, utf);  // new String(bytes, "UTF-8")
+    env->DeleteLocalRef(utf);
+    env->DeleteLocalRef(string_Clazz);  // previously leaked one local reference per call
+    env->DeleteLocalRef(array);
+    return str;
+}
+
+jobject JniHelper::createByteBuffer(JNIEnv *env, unsigned char *buffer, int size) {
+    // Wraps the caller-supplied native memory (buffer, size bytes) in a direct ByteBuffer.
+    // The result is returned as-is from NewDirectByteBuffer; nullptr when env or buffer is null.
+    // NOTE(review): the memory presumably has to outlive the ByteBuffer — confirm with callers.
+    const bool usable = (env != nullptr) && (buffer != nullptr);
+    if (!usable) {
+        return nullptr;
+    }
+    return env->NewDirectByteBuffer(buffer, size);
+}
+
+jobject JniHelper::createByteBuffer(JNIEnv *env, int size) {
+    // Returns a new direct ByteBuffer of size bytes whose storage is owned by the JVM (nullptr on bad
+    // arguments or failure). Wrapping malloc'd memory and freeing it here left the buffer dangling.
+    if (env == nullptr || size < 0) return nullptr;
+    jclass cls = env->FindClass("java/nio/ByteBuffer");
+    jmethodID mid = cls ? env->GetStaticMethodID(cls, "allocateDirect", "(I)Ljava/nio/ByteBuffer;") : nullptr;
+    jobject byteBuffer = mid ? env->CallStaticObjectMethod(cls, mid, static_cast<jint>(size)) : nullptr;
+    if (cls != nullptr) env->DeleteLocalRef(cls);
+    return byteBuffer;
+}
+
+void JniHelper::deleteLocalRef(jobject jobj) {
+    // Deletes the local reference jobj through the JNIEnv obtained from getJNIEnv().
+    // Does nothing when no JNIEnv is available or jobj is null.
+    JNIEnv *threadEnv = getJNIEnv();
+    if (threadEnv != nullptr && jobj != nullptr) {
+        threadEnv->DeleteLocalRef(jobj);
+    }
+}
+
+string JniHelper::getStringUTF(JNIEnv *env, jstring obj) {
+    const char *c_str = (env && obj) ? env->GetStringUTFChars(obj, nullptr) : nullptr;  // nullptr for null env/obj or a failed lookup
+    string tmpString(c_str ? c_str : "");  // copy before releasing; empty string instead of std::string(nullptr)
+    if (c_str) env->ReleaseStringUTFChars(obj, c_str);
+    return tmpString;
+}
+
+char *JniHelper::getCharArrayUTF(JNIEnv *env, jstring obj) {
+    // The chars from GetStringUTFChars die at ReleaseStringUTFChars, so a pointer to them must not be
+    // returned. Hand back a heap copy instead; the caller owns it and must free() it (nullptr on OOM).
+    return strdup(getStringUTF(env, obj).c_str());
+}
+
+void JniHelper::callVoidMethod(jobject obj, jmethodID methodId) {
+    // Calls the no-argument void Java method methodId on obj, attaching this thread for the call if it is detached.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    if (env != nullptr) {
+        env->CallVoidMethod(obj, methodId);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+}
+
+void JniHelper::callVoidMethod(jobject obj, jmethodID methodId, jint arg1, jint arg2, jint arg3, jint arg4) {
+    // Calls the void Java method methodId on obj with four int arguments, attaching this thread for the call if it is detached.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    if (env != nullptr) {
+        env->CallVoidMethod(obj, methodId, arg1, arg2, arg3, arg4);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+}
+
+void
+JniHelper::callVoidMethod(jobject obj, jmethodID methodId, jint arg1, jint arg2, jint arg3,
+                          jstring arg4, jstring arg5, jobject arg6) {
+    // Calls the void Java method methodId on obj with (int, int, int, String, String, Object), attaching this thread if needed.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    if (env != nullptr) {
+        env->CallVoidMethod(obj, methodId, arg1, arg2, arg3, arg4, arg5, arg6);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+}
+
+int JniHelper::callIntMethod(jobject obj, jmethodID methodId, jobject arg1, jint arg2) {
+    // Calls the int-returning Java method methodId on obj with (arg1, arg2); returns -1 when no JNIEnv could be obtained.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return -1;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    int ret = -1;
+    if (env != nullptr) {
+        ret = env->CallIntMethod(obj, methodId, arg1, arg2);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+
+    return ret;
+}
+
+
+void JniHelper::callStaticVoidMethod(jclass cls, jmethodID methodId, jint arg1) {
+    // Calls the static void Java method methodId of cls with one int argument, attaching this thread for the call if needed.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    if (env != nullptr) {
+        env->CallStaticVoidMethod(cls, methodId, arg1);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+}
+
+jobject JniHelper::callObjectMethod(jobject obj, jmethodID methodId) {
+    // Calls the no-argument object-returning Java method methodId on obj; nullptr when no JNIEnv could be obtained.
+    // NOTE(review): when this call had to attach the thread, the detach below releases the returned local reference.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return nullptr;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    jobject ret = nullptr;
+    if (env != nullptr) {
+        ret = env->CallObjectMethod(obj, methodId);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+    return ret;
+}
+
+jboolean JniHelper::callBooleanMethod(jobject obj, jmethodID methodId) {
+    // Calls the no-argument boolean Java method methodId on obj; returns JNI_FALSE when no JNIEnv could be obtained.
+    if (sJavaVM == nullptr) {
+        LOGE(TAG, "sJavaVM is nullptr");
+        return false;
+    }
+    JNIEnv *env = nullptr;
+    bool attached = false;
+    switch (sJavaVM->GetEnv((void **) &env, JNI_VERSION_1_4)) {
+        case JNI_OK:
+            break;
+        case JNI_EDETACHED:
+            attached = (sJavaVM->AttachCurrentThread(&env, nullptr) == JNI_OK);
+            if (!attached) {
+                env = nullptr;  // a failed attach must not be followed by a call or a detach
+                LOGE(TAG, "Could not attach current thread");
+            }
+            break;
+        case JNI_EVERSION:
+            LOGE(TAG, "Invalid java version");
+            break;
+        default:
+            break;
+    }
+
+    jboolean ret = JNI_FALSE;  // was uninitialised, so the no-env path returned an indeterminate value
+    if (env != nullptr) {
+        ret = env->CallBooleanMethod(obj, methodId);
+    }
+    if (attached) {
+        sJavaVM->DetachCurrentThread();
+    }
+
+    return ret;
+}
diff --git a/duix-sdk/src/main/cpp/android/JniHelper.h b/duix-sdk/src/main/cpp/android/JniHelper.h
new file mode 100644
index 0000000..aae30f4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/android/JniHelper.h
@@ -0,0 +1,50 @@
+#ifndef GPLAYER_JNIHELPER_H
+#define GPLAYER_JNIHELPER_H
+
+#include <jni.h>
+#include <string>
+
+using namespace std;
+
+class JniHelper {  // Static-only JNI convenience helpers built around the process-wide sJavaVM.
+public:
+    static JNIEnv *getJNIEnv();  // JNIEnv looked up for the calling thread through sJavaVM; nullptr if sJavaVM is unset
+    // Attaches the calling thread if it is detached; returns true only when this call did the attach.
+    static bool attachCurrentThread();
+    // Detaches the calling thread from sJavaVM.
+    static void detachCurrentThread();
+    // Throws a new Java exception of class className with message msg on env.
+    static void throwException(JNIEnv *env, const char *className, const char *msg);
+    // Builds a Java String from the UTF-8 bytes in data; nullptr when data is null or allocation fails.
+    static jstring newStringUTF(JNIEnv *env, const char *data);
+    // Copies the characters of a Java string (as returned by GetStringUTFChars) into a std::string.
+    static string getStringUTF(JNIEnv *env, jstring obj);
+    // Returns the characters of obj as a C string. NOTE(review): check the implementation for pointer lifetime/ownership.
+    static char *getCharArrayUTF(JNIEnv *env, jstring obj);
+    // Wraps existing native memory (buffer, size bytes) in a direct ByteBuffer; nullptr if env or buffer is null.
+    static jobject createByteBuffer(JNIEnv *env, unsigned char *buffer, int size);
+    // Creates a direct ByteBuffer with a capacity of size bytes; nullptr if env is null.
+    static jobject createByteBuffer(JNIEnv *env, int size);
+    // Deletes the local reference jobj using the JNIEnv obtained from getJNIEnv().
+    static void deleteLocalRef(jobject jobj);
+    // The call* helpers below obtain a JNIEnv via sJavaVM (attaching a detached thread for the call) and invoke methodId.
+    static void callVoidMethod(jobject obj, jmethodID methodId);
+    // Same, for a void method taking four ints.
+    static void callVoidMethod(jobject obj, jmethodID methodId, jint arg1, jint arg2, jint arg3, jint arg4);
+    // Same, for a void method taking three ints, two strings and an object.
+    static void callVoidMethod(jobject obj, jmethodID methodId, jint arg1, jint arg2,
+                               jint arg3, jstring arg4, jstring arg5, jobject arg6);
+    // Int-returning method taking (object, int); yields -1 when no JNIEnv could be obtained.
+    static int callIntMethod(jobject obj, jmethodID methodId, jobject arg1, jint arg2);
+    // Static void method of cls taking one int.
+    static void callStaticVoidMethod(jclass cls, jmethodID methodId, jint arg1);
+    // Object-returning method without arguments; nullptr when no JNIEnv could be obtained.
+    static jobject callObjectMethod(jobject obj, jmethodID methodId);
+    // Boolean-returning method without arguments.
+    static jboolean callBooleanMethod(jobject obj, jmethodID methodId);
+    // VM handle used by every helper; starts as nullptr. NOTE(review): presumably assigned in JNI_OnLoad — confirm.
+public:
+    static JavaVM *sJavaVM;
+};
+
+#endif //GPLAYER_JNIHELPER_H
diff --git a/duix-sdk/src/main/cpp/android/Log.cpp b/duix-sdk/src/main/cpp/android/Log.cpp
new file mode 100644
index 0000000..2c01e67
--- /dev/null
+++ b/duix-sdk/src/main/cpp/android/Log.cpp
@@ -0,0 +1,80 @@
+#if defined(_WIN32)
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include "Log.h"
+
+#include <stdio.h>
+#include <time.h>
+#include <stdarg.h>
+
+#ifdef __ANDROID__
+#include <android/log.h>
+android_LogPriority s_android_logprio[LOG_TRACE + 1] = {  // Android priority for each LogLevel, indexed by the LogLevel value
+        ANDROID_LOG_UNKNOWN,  // LOG_OFF
+        ANDROID_LOG_FATAL,  // LOG_FATAL
+        ANDROID_LOG_ERROR,  // LOG_ERROR
+        ANDROID_LOG_WARN,  // LOG_WARN
+        ANDROID_LOG_INFO,  // LOG_INFO
+        ANDROID_LOG_DEBUG,  // LOG_DEBUG
+        ANDROID_LOG_VERBOSE  // LOG_TRACE
+};
+
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+void __log_print(int lv, const char *tag, const char *funame, int line, const char *fmt, ...) {
+    // Formats fmt/... (printf-style) into a fixed stack buffer and, on Android, sends the text to
+    // logcat under tag with the priority mapped from lv (a LogLevel value); long text is cut off.
+    char log_info[2040];
+    char *buf = log_info;
+    int ret, len = sizeof(log_info);
+
+// No timestamp / source-location prefix is added on Android.
+#ifndef __ANDROID__
+    /*
+    if (lv <= LogLevel::LOG_INFO) {    // levels at least as severe as INFO carry a timestamp
+        *buf++ = '[';
+        _get_curtime_str(buf);
+        //buf = buf + strlen(buf);
+        buf += 23;  // the timestamp "XXXX-XX-XX XX:XX:XX.XXX" occupies 23 bytes
+        *buf++ = ']';
+        *buf++ = ' ';
+
+        len -= buf - log_info;
+    }
+    */
+
+    if (lv <= LogLevel::LOG_WARN) {    // levels at least as severe as WARN carry function name and line
+        ret = sprintf(buf, "%s line:%-4d ", funame, line);
+        buf += ret;
+        len -= ret;
+    }
+#endif
+
+    va_list arglist;
+    va_start(arglist, fmt);
+#if defined(_WIN32)
+    ret = _vsnprintf(buf, len - 1, fmt, arglist);
+#else
+    ret = vsnprintf(buf, len - 1, fmt, arglist);
+#endif
+    va_end(arglist);
+    if (ret < 0) {  // formatting error, or truncation reported by _vsnprintf: terminate by hand
+        buf[len - 1] = 0;
+        buf[len - 2] = '\n';
+    }
+
+#if defined(__ANDROID__)
+    // An out-of-range level would index past s_android_logprio; report it at ERROR instead.
+    if (lv < LOG_OFF || lv > LOG_TRACE) lv = LOG_ERROR;
+    // The text is already formatted: pass it as data so a leftover '%' is not interpreted again.
+    __android_log_print(s_android_logprio[lv], tag, "%s", log_info);
+#else
+    // local output
+    //printf("Tag=%s %s\n", tag, log_info);
+#endif
+}
diff --git a/duix-sdk/src/main/cpp/android/Log.h b/duix-sdk/src/main/cpp/android/Log.h
new file mode 100644
index 0000000..c506c6f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/android/Log.h
@@ -0,0 +1,44 @@
+#ifndef __GPLAYER_LOG_H__
+#define __GPLAYER_LOG_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Debug-log switch: 1 enables LOGD, any other value disables it
+#define LOG_OPEN 0
+// NOTE(review): forces the Android code paths unconditionally; presumably redundant on NDK builds, where the compiler predefines __ANDROID__ — confirm
+#define __ANDROID__ 1
+enum LogLevel
+{
+    LOG_OFF    = 0,		        //!< logging disabled
+    LOG_FATAL  = 1,	 	        //!< fatal
+    LOG_ERROR  = 2,				//!< error
+    LOG_WARN   = 3,				//!< warning
+    LOG_INFO   = 4,				//!< informational
+    LOG_DEBUG  = 5,				//!< debug
+    LOG_TRACE  = 6,				//!< trace
+};
+
+void __log_print(int lv, const char* tag, const char* funame, int line, const char *fmt, ...);
+// Level-specific wrappers: each passes TAG, the calling function's name and line number, and a printf-style format with its arguments.
+#define LOGI(TAG, ...)  __log_print(LogLevel::LOG_INFO,  TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define LOGW(TAG, ...)  __log_print(LogLevel::LOG_WARN,  TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define LOGE(TAG, ...)  __log_print(LogLevel::LOG_ERROR, TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define LOGF(TAG, ...)  __log_print(LogLevel::LOG_FATAL, TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#if defined(__ANDROID__)
+#if(LOG_OPEN == 1)
+#define LOGD(TAG,...)  __log_print(LogLevel::LOG_DEBUG, TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else  /* debug logging off: expand to a void no-op that does not depend on NULL being defined */
+#define LOGD(TAG, ...)  ((void)0)
+#endif
+#else
+#define LOGD(TAG, ...)  __log_print(LogLevel::LOG_DEBUG, TAG, __FUNCTION__, __LINE__, __VA_ARGS__)
+#endif
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif // !__GPLAYER_LOG_H__
diff --git a/duix-sdk/src/main/cpp/dhcore/atomicops.h b/duix-sdk/src/main/cpp/dhcore/atomicops.h
new file mode 100644
index 0000000..b103bc6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/atomicops.h
@@ -0,0 +1,761 @@
+﻿// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
+// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
+// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
+// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
+
+#include <cerrno>
+#include <cassert>
+#include <type_traits>
+#include <cerrno>
+#include <cstdint>
+#include <ctime>
+
+// Platform detection
+#if defined(__INTEL_COMPILER)
+#define AE_ICC
+#elif defined(_MSC_VER)
+#define AE_VCPP
+#elif defined(__GNUC__)
+#define AE_GCC
+#endif
+
+#if defined(_M_IA64) || defined(__ia64__)
+#define AE_ARCH_IA64
+#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
+#define AE_ARCH_X64
+#elif defined(_M_IX86) || defined(__i386__)
+#define AE_ARCH_X86
+#elif defined(_M_PPC) || defined(__powerpc__)
+#define AE_ARCH_PPC
+#else
+#define AE_ARCH_UNKNOWN
+#endif
+
+
+// AE_UNUSED
+#define AE_UNUSED(x) ((void)x)
+
+// AE_NO_TSAN/AE_TSAN_ANNOTATE_*
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#if __cplusplus >= 201703L  // inline variables require C++17
+namespace moodycamel { inline int ae_tsan_global; }
+#define AE_TSAN_ANNOTATE_RELEASE() AnnotateHappensBefore(__FILE__, __LINE__, (void *)(&::moodycamel::ae_tsan_global))
+#define AE_TSAN_ANNOTATE_ACQUIRE() AnnotateHappensAfter(__FILE__, __LINE__, (void *)(&::moodycamel::ae_tsan_global))
+extern "C" void AnnotateHappensBefore(const char*, int, void*);
+extern "C" void AnnotateHappensAfter(const char*, int, void*);
+#else  // when we can't work with tsan, attempt to disable its warnings
+#define AE_NO_TSAN __attribute__((no_sanitize("thread")))
+#endif
+#endif
+#endif
+#ifndef AE_NO_TSAN
+#define AE_NO_TSAN
+#endif
+#ifndef AE_TSAN_ANNOTATE_RELEASE
+#define AE_TSAN_ANNOTATE_RELEASE()
+#define AE_TSAN_ANNOTATE_ACQUIRE()
+#endif
+
+
+// AE_FORCEINLINE
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_FORCEINLINE __forceinline
+#elif defined(AE_GCC)
+//#define AE_FORCEINLINE __attribute__((always_inline)) 
+#define AE_FORCEINLINE inline
+#else
+#define AE_FORCEINLINE inline
+#endif
+
+
+// AE_ALIGN
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_ALIGN(x) __declspec(align(x))
+#elif defined(AE_GCC)
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#else
+// Assume GCC compliant syntax...
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+// Portable atomic fences implemented below:
+
+namespace moodycamel {
+// Memory-ordering levels accepted by compiler_fence() and fence() in this header.
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+
+	// memory_order_sync: Forces a full sync:
+	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
+	memory_order_sync = memory_order_seq_cst
+};
+
+}    // end namespace moodycamel
+
+#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || (defined(AE_ICC) && __INTEL_COMPILER < 1600)
+// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
+
+#include <intrin.h>
+
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+#define AeFullSync _mm_mfence
+#define AeLiteSync _mm_mfence
+#elif defined(AE_ARCH_IA64)
+#define AeFullSync __mf
+#define AeLiteSync __mf
+#elif defined(AE_ARCH_PPC)
+#include <ppcintrinsics.h>
+#define AeFullSync __sync
+#define AeLiteSync __lwsync
+#endif
+
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
+#ifdef __cplusplus_cli
+#pragma managed(push, off)
+#endif
+#endif
+
+namespace moodycamel {
+// Compiler-only barrier for the pre-std::atomic toolchains selected by the enclosing #if, built from the _Read/_Write/_ReadWriteBarrier intrinsics.
+AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst: _ReadWriteBarrier(); break;
+		default: assert(false);
+	}
+}
+
+// x86/x64 have a strong memory model -- all loads and stores have
+// acquire and release semantics automatically (so only need compiler
+// barriers for those).
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst:
+			// only seq_cst issues a hardware fence (AeFullSync), bracketed by compiler barriers
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#else
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
+{
+	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
+	switch (order) {
+		case memory_order_relaxed:
+			break;
+		case memory_order_acquire:
+			_ReadBarrier();
+			AeLiteSync();
+			_ReadBarrier();
+			break;
+		case memory_order_release:
+			_WriteBarrier();
+			AeLiteSync();
+			_WriteBarrier();
+			break;
+		case memory_order_acq_rel:
+			_ReadWriteBarrier();
+			AeLiteSync();
+			_ReadWriteBarrier();
+			break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#endif
+}    // end namespace moodycamel
+#else
+// Use standard library of atomics
+#include <atomic>
+
+namespace moodycamel {
+// Compiler-only barrier: forwards each order to std::atomic_signal_fence; relaxed does nothing.
+AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+// Thread fence: forwards each order to std::atomic_thread_fence, preceded by the AE_TSAN_ANNOTATE_* macros (empty unless TSAN support is compiled in).
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: AE_TSAN_ANNOTATE_ACQUIRE(); std::atomic_thread_fence(std::memory_order_acquire); break;
+		case memory_order_release: AE_TSAN_ANNOTATE_RELEASE(); std::atomic_thread_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: AE_TSAN_ANNOTATE_ACQUIRE(); AE_TSAN_ANNOTATE_RELEASE(); std::atomic_thread_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: AE_TSAN_ANNOTATE_ACQUIRE(); AE_TSAN_ANNOTATE_RELEASE(); std::atomic_thread_fence(std::memory_order_seq_cst); break;
+		default: assert(false);
+	}
+}
+
+}    // end namespace moodycamel
+
+#endif
+
+
+#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
+#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#endif
+
+#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#include <atomic>
+#endif
+#include <utility>
+
+// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
+// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
+// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
+// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
+namespace moodycamel {
+template<typename T>
+class weak_atomic
+{
+public:
+	AE_NO_TSAN weak_atomic() : value() { }
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
+#endif
+	template<typename U> AE_NO_TSAN weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
+#ifdef __cplusplus_cli
+	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
+	AE_NO_TSAN weak_atomic(nullptr_t) : value(nullptr) {  }
+#endif
+	AE_NO_TSAN weak_atomic(weak_atomic const& other) : value(other.load()) {  }
+	AE_NO_TSAN weak_atomic(weak_atomic&& other) : value(std::move(other.load())) {  }
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
+	// Implicit conversion to T reads the current value through load().
+	AE_FORCEINLINE operator T() const AE_NO_TSAN { return load(); }
+
+	// Two implementations follow: a volatile value with interlocked intrinsics, or std::atomic<T> accessed with relaxed loads/stores.
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN { value = std::forward<U>(x); return *this; }
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN { value = other.value; return *this; }
+	
+	AE_FORCEINLINE T load() const AE_NO_TSAN { return value; }
+	// Interlocked add of increment; only 4-byte T (and 8-byte T when _M_AMD64 is defined) is handled.
+	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+#else
+	template<typename U>
+	AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN
+	{
+		value.store(std::forward<U>(x), std::memory_order_relaxed);
+		return *this;
+	}
+	
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN
+	{
+		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		return *this;
+	}
+
+	AE_FORCEINLINE T load() const AE_NO_TSAN { return value.load(std::memory_order_relaxed); }
+	// std::atomic fetch_add with acquire ordering.
+	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
+	{
+		return value.fetch_add(increment, std::memory_order_acquire);
+	}
+	// std::atomic fetch_add with release ordering.
+	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
+	{
+		return value.fetch_add(increment, std::memory_order_release);
+	}
+#endif
+	
+
+private:
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	// No std::atomic support, but still need to circumvent compiler optimizations.
+	// `volatile` will make memory access slow, but is guaranteed to be reliable.
+	volatile T value;
+#else
+	std::atomic<T> value;
+#endif
+};
+
+}	// end namespace moodycamel
+
+
+
+// Portable single-producer, single-consumer semaphore below:
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#elif defined(FREERTOS)
+#include <FreeRTOS.h>
+#include <semphr.h>
+#include <task.h>
+#endif
+
+namespace moodycamel
+{
+	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
+	// portable + lightweight semaphore implementations, originally from
+	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+	// LICENSE:
+	// Copyright (c) 2015 Jeff Preshing
+	//
+	// This software is provided 'as-is', without any express or implied
+	// warranty. In no event will the authors be held liable for any damages
+	// arising from the use of this software.
+	//
+	// Permission is granted to anyone to use this software for any purpose,
+	// including commercial applications, and to alter it and redistribute it
+	// freely, subject to the following restrictions:
+	//
+	// 1. The origin of this software must not be misrepresented; you must not
+	//    claim that you wrote the original software. If you use this software
+	//    in a product, an acknowledgement in the product documentation would be
+	//    appreciated but is not required.
+	// 2. Altered source versions must be plainly marked as such, and must not be
+	//    misrepresented as being the original software.
+	// 3. This notice may not be removed or altered from any source distribution.
+	namespace spsc_sema
+	{
+#if defined(_WIN32)
+		class Semaphore
+		{
+		private:
+		    void* m_hSema;
+		    // Copy construction and assignment are declared private, so the semaphore cannot be copied from outside.
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0) : m_hSema()
+		    {
+		        assert(initialCount >= 0);
+		        const long maxLong = 0x7fffffff;
+		        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		        assert(m_hSema);
+		    }
+
+		    AE_NO_TSAN ~Semaphore()
+		    {
+		        CloseHandle(m_hSema);
+		    }
+		    // Waits with the 0xffffffff timeout value; true when WaitForSingleObject returns 0.
+		    bool wait() AE_NO_TSAN
+		    {
+		    	const unsigned long infinite = 0xffffffff;
+		        return WaitForSingleObject(m_hSema, infinite) == 0;
+		    }
+			// Zero-timeout wait; true when WaitForSingleObject returns 0.
+			bool try_wait() AE_NO_TSAN
+			{
+				return WaitForSingleObject(m_hSema, 0) == 0;
+			}
+			// usecs is divided by 1000 to get the millisecond timeout WaitForSingleObject takes.
+			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN
+			{
+				return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
+			}
+		    // Releases count units, retrying until ReleaseSemaphore reports success.
+		    void signal(int count = 1) AE_NO_TSAN
+		    {
+		        while (!ReleaseSemaphore(m_hSema, count, nullptr));
+		    }
+		};
+#elif defined(__MACH__)
+		//---------------------------------------------------------
+		// Semaphore (Apple iOS and OSX)
+		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    semaphore_t m_sema;
+		    // Copy construction and assignment are declared private, so the semaphore cannot be copied from outside.
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0) : m_sema()
+		    {
+		        assert(initialCount >= 0);
+		        kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		        assert(rc == KERN_SUCCESS);
+		        AE_UNUSED(rc);
+		    }
+
+		    AE_NO_TSAN ~Semaphore()
+		    {
+		        semaphore_destroy(mach_task_self(), m_sema);
+		    }
+		    // True when semaphore_wait returns KERN_SUCCESS.
+		    bool wait() AE_NO_TSAN
+		    {
+		        return semaphore_wait(m_sema) == KERN_SUCCESS;
+		    }
+			// Implemented as a timed wait with a zero timeout.
+			bool try_wait() AE_NO_TSAN
+			{
+				return timed_wait(0);
+			}
+			// Splits the microsecond timeout into whole seconds plus nanoseconds for semaphore_timedwait.
+			bool timed_wait(std::uint64_t timeout_usecs) AE_NO_TSAN
+			{
+				mach_timespec_t ts;
+				ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
+				ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);
+
+				// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+				kern_return_t rc = semaphore_timedwait(m_sema, ts);
+				return rc == KERN_SUCCESS;
+			}
+		    // Signals once, retrying until semaphore_signal reports KERN_SUCCESS.
+		    void signal() AE_NO_TSAN
+		    {
+		        while (semaphore_signal(m_sema) != KERN_SUCCESS);
+		    }
+		    // Signals count times, each one retried until it succeeds.
+		    void signal(int count) AE_NO_TSAN
+		    {
+		        while (count-- > 0)
+		        {
+		            while (semaphore_signal(m_sema) != KERN_SUCCESS);
+		        }
+		    }
+		};
+#elif defined(__unix__)
+		//---------------------------------------------------------
+		// Semaphore (POSIX, Linux)
+		//---------------------------------------------------------
+		class Semaphore  // Non-copyable wrapper around an unnamed POSIX semaphore (sem_t).
+		{
+		private:
+		    sem_t m_sema;  // initialised by sem_init in the constructor, torn down by sem_destroy
+
+		    Semaphore(const Semaphore& other);  // private declaration only: copy construction is not offered
+		    Semaphore& operator=(const Semaphore& other);  // private declaration only: copy assignment is not offered
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0) : m_sema()  // initialCount is the starting count and must be >= 0
+		    {
+		        assert(initialCount >= 0);
+		        int rc = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));  // second argument 0 is pshared
+		        assert(rc == 0);  // an init failure is only caught where assert is active
+		        AE_UNUSED(rc);  // presumably silences the unused-variable warning when assert is compiled out
+		    }
+
+		    AE_NO_TSAN ~Semaphore()  // Destroys the semaphore initialised in the constructor.
+		    {
+		        sem_destroy(&m_sema);
+		    }
+
+		    bool wait() AE_NO_TSAN  // Waits without a timeout; true if sem_wait eventually returns 0.
+		    {
+		        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		        int rc;
+		        do
+		        {
+		            rc = sem_wait(&m_sema);
+		        }
+		        while (rc == -1 && errno == EINTR);  // restart the wait when it was merely interrupted
+		        return rc == 0;
+		    }
+
+			bool try_wait() AE_NO_TSAN  // Non-blocking attempt via sem_trywait; true if a unit was taken.
+			{
+				int rc;
+				do {
+					rc = sem_trywait(&m_sema);
+				} while (rc == -1 && errno == EINTR);  // retry only on EINTR; any other failure returns false
+				return rc == 0;
+			}
+
+			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN  // Waits up to usecs microseconds; true if acquired.
+			{
+				struct timespec ts;
+				const int usecs_in_1_sec = 1000000;
+				const int nsecs_in_1_sec = 1000000000;
+				clock_gettime(CLOCK_REALTIME, &ts);  // deadline is built as "now" on CLOCK_REALTIME plus usecs
+				ts.tv_sec += static_cast<time_t>(usecs / usecs_in_1_sec);
+				ts.tv_nsec += static_cast<long>(usecs % usecs_in_1_sec) * 1000;
+				// sem_timedwait rejects a tv_nsec of 1e9 or more,
+				// so carry the overflow into tv_sec before passing it in
+				if (ts.tv_nsec >= nsecs_in_1_sec) {
+					ts.tv_nsec -= nsecs_in_1_sec;
+					++ts.tv_sec;
+				}
+
+				int rc;
+				do {
+					rc = sem_timedwait(&m_sema, &ts);  // ts is not recomputed, so retries share the same deadline
+				} while (rc == -1 && errno == EINTR);
+				return rc == 0;
+			}
+
+		    void signal() AE_NO_TSAN  // Posts once, retrying for as long as sem_post returns -1.
+		    {
+		        while (sem_post(&m_sema) == -1);
+		    }
+
+		    void signal(int count) AE_NO_TSAN  // Posts `count` times; a count <= 0 does nothing.
+		    {
+		        while (count-- > 0)
+		        {
+		            while (sem_post(&m_sema) == -1);  // same retry loop as signal()
+		        }
+		    }
+		};
+#elif defined(FREERTOS)
+		//---------------------------------------------------------
+		// Semaphore (FreeRTOS)
+		//---------------------------------------------------------
+		class Semaphore  // Non-copyable wrapper around a FreeRTOS counting semaphore.
+		{
+		private:
+			SemaphoreHandle_t m_sema;  // handle returned by xSemaphoreCreateCounting
+
+			Semaphore(const Semaphore& other);  // private declaration only: copy construction is not offered
+			Semaphore& operator=(const Semaphore& other);  // private declaration only: copy assignment is not offered
+
+		public:
+			AE_NO_TSAN Semaphore(int initialCount = 0) : m_sema()  // initialCount is the starting count and must be >= 0
+			{
+				assert(initialCount >= 0);
+				m_sema = xSemaphoreCreateCounting(static_cast<UBaseType_t>(~0ull), static_cast<UBaseType_t>(initialCount));  // first argument: all bits set
+				assert(m_sema);  // a null handle is only caught where assert is active
+			}
+
+			AE_NO_TSAN ~Semaphore()  // Deletes the semaphore created in the constructor.
+			{
+				vSemaphoreDelete(m_sema);
+			}
+
+			bool wait() AE_NO_TSAN  // Takes the semaphore with portMAX_DELAY as the timeout; true on pdTRUE.
+			{
+				return xSemaphoreTake(m_sema, portMAX_DELAY) == pdTRUE;
+			}
+
+			bool try_wait() AE_NO_TSAN  // Non-blocking take; uses the FromISR variant inside an interrupt.
+			{
+				// Note: In an ISR context, if this causes a task to unblock,
+				// the caller won't know about it
+				if (xPortIsInsideInterrupt())
+					return xSemaphoreTakeFromISR(m_sema, NULL) == pdTRUE;  // NULL: the "task woken" flag is discarded
+				return xSemaphoreTake(m_sema, 0) == pdTRUE;
+			}
+
+			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN  // Waits up to usecs, rounded down to whole ticks; true if taken.
+			{
+				std::uint64_t msecs = usecs / 1000;  // integer division: sub-millisecond remainder is dropped
+				TickType_t ticks = static_cast<TickType_t>(msecs / portTICK_PERIOD_MS);
+				if (ticks == 0)
+					return try_wait();  // a timeout shorter than one tick degrades to a non-blocking attempt
+				return xSemaphoreTake(m_sema, ticks) == pdTRUE;
+			}
+
+			void signal() AE_NO_TSAN  // Gives the semaphore once; uses the FromISR variant inside an interrupt.
+			{
+				// Note: In an ISR context, if this causes a task to unblock,
+				// the caller won't know about it
+				BaseType_t rc;
+				if (xPortIsInsideInterrupt())
+					rc = xSemaphoreGiveFromISR(m_sema, NULL);
+				else
+					rc = xSemaphoreGive(m_sema);
+				assert(rc == pdTRUE);  // unlike the other ports there is no retry; failure is only asserted
+				AE_UNUSED(rc);
+			}
+
+			void signal(int count) AE_NO_TSAN  // Calls signal() `count` times; a count <= 0 does nothing.
+			{
+				while (count-- > 0)
+					signal();
+			}
+		};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+		//---------------------------------------------------------
+		// LightweightSemaphore
+		//---------------------------------------------------------
+		class LightweightSemaphore  // Counting semaphore that polls an atomic count before falling back to the platform Semaphore.
+		{
+		public:
+			typedef std::make_signed<std::size_t>::type ssize_t;  // signed counterpart of std::size_t
+			
+		private:
+		    weak_atomic<ssize_t> m_count;  // available units; driven negative while a waiter has committed to blocking
+		    Semaphore m_sema;  // platform semaphore, touched only after the spin phase fails
+
+		    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) AE_NO_TSAN  // Slow path: <0 waits unbounded, 0 never blocks on m_sema, >0 bounds the wait in microseconds.
+		    {
+		        ssize_t oldCount;
+		        // Is there a better way to set the initial spin count?
+		        // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
+		        // as threads start hitting the kernel semaphore.
+		        int spin = 1024;
+		        while (--spin >= 0)  // bounded polling phase before committing to a blocking wait
+		        {
+		            if (m_count.load() > 0)
+		            {
+		                m_count.fetch_add_acquire(-1);
+		                return true;
+		            }
+		            compiler_fence(memory_order_acquire);     // Prevent the compiler from collapsing the loop.
+		        }
+		        oldCount = m_count.fetch_add_acquire(-1);  // commit: take a unit, possibly driving the count negative
+				if (oldCount > 0)
+					return true;  // a unit became available after the spin loop ended
+		        if (timeout_usecs < 0)
+				{
+					if (m_sema.wait())
+						return true;  // a failed wait falls through to the count-restoring loop below
+				}
+				if (timeout_usecs > 0 && m_sema.timed_wait(static_cast<uint64_t>(timeout_usecs)))
+					return true;
+				// At this point, we've timed out waiting for the semaphore, but the
+				// count is still decremented indicating we may still be waiting on
+				// it. So we have to re-adjust the count, but only if the semaphore
+				// wasn't signaled enough times for us too since then. If it was, we
+				// need to release the semaphore too.
+				while (true)
+				{
+					oldCount = m_count.fetch_add_release(1);  // undo the earlier decrement
+					if (oldCount < 0)
+						return false;    // successfully restored things to the way they were
+					// Oh, the producer thread just signaled the semaphore after all. Try again:
+					oldCount = m_count.fetch_add_acquire(-1);
+					if (oldCount > 0 && m_sema.try_wait())
+						return true;
+				}
+		    }
+
+		public:
+		    AE_NO_TSAN LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount), m_sema()  // initialCount must be >= 0
+		    {
+		        assert(initialCount >= 0);
+		    }
+
+		    bool tryWait() AE_NO_TSAN  // Takes a unit only if the count is currently positive; never blocks.
+		    {
+		        if (m_count.load() > 0)  // NOTE(review): check and decrement are separate steps; presumably safe because only one thread waits (namespace spsc_sema) - confirm
+		        {
+		        	m_count.fetch_add_acquire(-1);
+		        	return true;
+		        }
+		        return false;
+		    }
+
+		    bool wait() AE_NO_TSAN  // Acquires a unit with no time bound: fast path first, then the spinning/blocking path.
+		    {
+		        return tryWait() || waitWithPartialSpinning();
+		    }
+
+			bool wait(std::int64_t timeout_usecs) AE_NO_TSAN  // Like wait(), bounded by timeout_usecs microseconds (negative means unbounded).
+			{
+				return tryWait() || waitWithPartialSpinning(timeout_usecs);
+			}
+
+		    void signal(ssize_t count = 1) AE_NO_TSAN  // Adds `count` units and wakes m_sema once if the previous count was negative.
+		    {
+		    	assert(count >= 0);
+		        ssize_t oldCount = m_count.fetch_add_release(count);
+		        assert(oldCount >= -1);  // the count is expected never to drop below -1
+		        if (oldCount < 0)
+		        {
+		            m_sema.signal(1);  // exactly one platform signal, independent of `count`
+		        }
+		    }
+		    
+		    std::size_t availableApprox() const AE_NO_TSAN  // Snapshot of the count, with negative values reported as 0.
+		    {
+		    	ssize_t count = m_count.load();
+		    	return count > 0 ? static_cast<std::size_t>(count) : 0;
+		    }
+		};
+	}	// end namespace spsc_sema
+}	// end namespace moodycamel
+
+#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
+#pragma warning(pop)
+#ifdef __cplusplus_cli
+#pragma managed(pop)
+#endif
+#endif
diff --git a/duix-sdk/src/main/cpp/dhcore/blockingconcurrentqueue.h b/duix-sdk/src/main/cpp/dhcore/blockingconcurrentqueue.h
new file mode 100644
index 0000000..205a4db
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/blockingconcurrentqueue.h
@@ -0,0 +1,582 @@
+// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
+// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified
+// BSD license, available at the top of concurrentqueue.h.
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, see lightweightsemaphore.h).
+
+#pragma once
+
+#include "concurrentqueue.h"
+#include "lightweightsemaphore.h"
+
+#include <type_traits>
+#include <cerrno>
+#include <memory>
+#include <chrono>
+#include <ctime>
+
+namespace moodycamel
+{
+// This is a blocking version of the queue. It has an almost identical interface to
+// the normal non-blocking version, with the addition of various wait_dequeue() methods
+// and the removal of producer-specific dequeue methods.
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class BlockingConcurrentQueue  // Pairs a ConcurrentQueue with a LightweightSemaphore: enqueues signal it, dequeues wait on it.
+{
+private:
+	typedef ::moodycamel::ConcurrentQueue<T, Traits> ConcurrentQueue;
+	typedef ::moodycamel::LightweightSemaphore LightweightSemaphore;
+
+public:
+	typedef typename ConcurrentQueue::producer_token_t producer_token_t;
+	typedef typename ConcurrentQueue::consumer_token_t consumer_token_t;
+	
+	typedef typename ConcurrentQueue::index_t index_t;
+	typedef typename ConcurrentQueue::size_t size_t;
+	typedef typename std::make_signed<size_t>::type ssize_t;
+	
+	static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE;
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD;
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE;
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE;
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE;
+	static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE;
+	
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
+		: inner(capacity), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
+	{
+		assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
+		if (!sema) {  // create() yields nullptr when Traits::malloc fails
+			MOODYCAMEL_THROW(std::bad_alloc());
+		}
+	}
+	
+	BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
+	{
+		assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
+		if (!sema) {  // create() yields nullptr when Traits::malloc fails
+			MOODYCAMEL_THROW(std::bad_alloc());
+		}
+	}
+	
+	// Disable copying and copy assignment
+	BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		: inner(std::move(other.inner)), sema(std::move(other.sema))
+	{ }
+	
+	inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		return swap_internal(other);  // move assignment is implemented as a swap with `other`
+	}
+	
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other);
+	}
+	
+private:
+	BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other)  // Exchanges both the inner queue and the semaphore pointer.
+	{
+		if (this == &other) {  // swapping with itself is a no-op
+			return *this;
+		}
+		
+		inner.swap(other.inner);
+		sema.swap(other.sema);
+		return *this;
+	}
+	
+public:
+	// Enqueues a single item (by copying it).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		if ((details::likely)(inner.enqueue(item))) {
+			sema->signal();  // one semaphore unit per successfully enqueued item
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		if ((details::likely)(inner.enqueue(std::move(item)))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		if ((details::likely)(inner.enqueue(token, item))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		if ((details::likely)(inner.enqueue(token, std::move(item)))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues several items.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		if ((details::likely)(inner.enqueue_bulk(std::forward<It>(itemFirst), count))) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);  // `count` units at once, one per item
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		if ((details::likely)(inner.enqueue_bulk(token, std::forward<It>(itemFirst), count))) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory. Fails if not enough room to enqueue (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		if (inner.try_enqueue(item)) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		if (inner.try_enqueue(std::move(item))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		if (inner.try_enqueue(token, item)) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		if (inner.try_enqueue(token, std::move(item))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues several items.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		if (inner.try_enqueue_bulk(std::forward<It>(itemFirst), count)) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+			return true;
+		}
+		return false;
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		if (inner.try_enqueue_bulk(token, std::forward<It>(itemFirst), count)) {
+			sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
+			return true;
+		}
+		return false;
+	}
+	
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue(U& item)
+	{
+		if (sema->tryWait()) {  // a unit was taken without blocking, so one item is reserved for this caller
+			while (!inner.try_dequeue(item)) {  // presumably the item may not be visible yet; retry until it is
+				continue;
+			}
+			return true;
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		if (sema->tryWait()) {
+			while (!inner.try_dequeue(token, item)) {
+				continue;
+			}
+			return true;
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);  // `max` becomes the number of units actually obtained
+		while (count != max) {  // keep pulling until exactly that many items have been collected
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);  // It& - presumably so itemFirst advances across calls
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+		while (count != max) {
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+		}
+		return count;
+	}
+	
+	
+	
+	// Blocks the current thread until there's something to dequeue, then
+	// dequeues it.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline void wait_dequeue(U& item)
+	{
+		while (!sema->wait()) {  // retry whenever the semaphore wait reports failure
+			continue;
+		}
+		while (!inner.try_dequeue(item)) {
+			continue;
+		}
+	}
+
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout (specified in microseconds) expires. Returns false
+	// without setting `item` if the timeout expires, otherwise assigns
+	// to `item` and returns true.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs)
+	{
+		if (!sema->wait(timeout_usecs)) {  // no unit obtained within the timeout; `item` is left untouched
+			return false;
+		}
+		while (!inner.try_dequeue(item)) {
+			continue;
+		}
+		return true;
+	}
+    
+    // Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+	// Never allocates. Thread-safe.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());  // converts to microseconds, then defers to the integer overload
+    }
+	
+	// Blocks the current thread until there's something to dequeue, then
+	// dequeues it using an explicit consumer token.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline void wait_dequeue(consumer_token_t& token, U& item)
+	{
+		while (!sema->wait()) {
+			continue;
+		}
+		while (!inner.try_dequeue(token, item)) {
+			continue;
+		}
+	}
+	
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout (specified in microseconds) expires. Returns false
+	// without setting `item` if the timeout expires, otherwise assigns
+	// to `item` and returns true.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs)
+	{
+		if (!sema->wait(timeout_usecs)) {
+			return false;
+		}
+		while (!inner.try_dequeue(token, item)) {
+			continue;
+		}
+		return true;
+	}
+    
+    // Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+    // timeout expires, otherwise assigns to `item` and returns true.
+	// Never allocates. Thread-safe.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_timed(token, item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which will
+	// always be at least one (this method blocks until the queue
+	// is non-empty) and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);  // `max` becomes the number of units actually obtained
+		while (count != max) {
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue_bulk.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
+		while (count != max) {
+			count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
+		}
+		return count;
+	}
+    
+    // Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It, typename Rep, typename Period>
+	inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which will
+	// always be at least one (this method blocks until the queue
+	// is non-empty) and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
+		while (count != max) {
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue_bulk.
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs)
+	{
+		size_t count = 0;
+		max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
+		while (count != max) {
+			count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued, which can
+	// be 0 if the timeout expires while waiting for elements,
+	// and at most max.
+	// Never allocates. Thread-safe.
+	template<typename It, typename Rep, typename Period>
+	inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
+    {
+        return wait_dequeue_bulk_timed<It&>(token, itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+    }
+	
+	
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	inline size_t size_approx() const
+	{
+		return (size_t)sema->availableApprox();  // the estimate is read from the semaphore, not from `inner`
+	}
+	
+	
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe.
+	static constexpr bool is_lock_free()
+	{
+		return ConcurrentQueue::is_lock_free();
+	}
+	
+
+private:
+	template<typename U, typename A1, typename A2>
+	static inline U* create(A1&& a1, A2&& a2)  // Allocates via Traits::malloc and constructs U(a1, a2) in place.
+	{
+		void* p = (Traits::malloc)(sizeof(U));
+		return p != nullptr ? new (p) U(std::forward<A1>(a1), std::forward<A2>(a2)) : nullptr;  // nullptr when allocation failed
+	}
+	
+	template<typename U>
+	static inline void destroy(U* p)  // Counterpart of create(): destructs, then frees via Traits::free.
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		(Traits::free)(p);  // called even when p is null
+	}
+	
+private:
+	ConcurrentQueue inner;  // must stay the first data member (asserted in the constructors)
+	std::unique_ptr<LightweightSemaphore, void (*)(LightweightSemaphore*)> sema;  // built by create(), released through destroy()
+};
+
+
+template<typename T, typename Traits>  // Free-function swap for BlockingConcurrentQueue; forwards to the member swap.
+inline void swap(BlockingConcurrentQueue<T, Traits>& a, BlockingConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}	// end namespace moodycamel
diff --git a/duix-sdk/src/main/cpp/dhcore/concurrentqueue.h b/duix-sdk/src/main/cpp/dhcore/concurrentqueue.h
new file mode 100644
index 0000000..99caefc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/concurrentqueue.h
@@ -0,0 +1,3747 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2020, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Also dual-licensed under the Boost Software License (see LICENSE.md)
+
+#pragma once
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+#ifdef MCDBGQ_USE_RELACY
+#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
+#endif
+#endif
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+// VS2019 with /W4 warns about constant conditional expressions, but `if constexpr` is not
+// available unless /std:c++17 or higher is used, so we have no choice but to simply disable the warning
+#pragma warning(push)
+#pragma warning(disable: 4127)  // conditional expression is constant
+#endif
+
+#if defined(__APPLE__)
+#include "TargetConditionals.h"
+#endif
+
+#ifdef MCDBGQ_USE_RELACY
+#include "relacy/relacy_std.hpp"
+#include "relacy_shims.h"
+// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
+// We'll override the default trait malloc ourselves without a macro.
+#undef new
+#undef delete
+#undef malloc
+#undef free
+#else
+#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <cassert>
+#endif
+#include <cstddef>              // for max_align_t
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>		// for CHAR_BIT
+#include <array>
+#include <thread>		// partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+#include <mutex>        // used for thread exit synchronization
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+namespace moodycamel { namespace details {
+	template<typename thread_id_t> struct thread_id_converter {	// Default mapping from a thread ID to a hashable number: the ID type itself is used for both typedefs
+		typedef thread_id_t thread_id_numeric_size_t;
+		typedef thread_id_t thread_id_hash_t;
+		static thread_id_hash_t prehash(thread_id_t const& x) { return x; }	// identity; the std::thread::id branch further down specialises this template instead
+	};
+} }
+#if defined(MCDBGQ_USE_RELACY)
+namespace moodycamel { namespace details {	// Relacy (MCDBGQ_USE_RELACY) build: thread IDs are the values returned by rl::thread_index()
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;	// the two largest 32-bit values are reserved as "invalid" sentinels
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
+	static inline thread_id_t thread_id() { return rl::thread_index(); }
+} }
+#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel { namespace details {	// Windows build: the numeric thread ID is the value returned by GetCurrentThreadId()
+	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+	typedef std::uint32_t thread_id_t;
+	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }	// unsigned long narrowed to uint32_t; the static_assert above guarantees the sizes match
+} }
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL)
+namespace moodycamel { namespace details {	// Branch for ARM/AArch64, iOS, z/OS (__MVS__) or MOODYCAMEL_NO_THREAD_LOCAL: thread IDs are std::thread::id objects
+	static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
+	
+	typedef std::thread::id thread_id_t;
+	static const thread_id_t invalid_thread_id;         // Default ctor creates invalid ID
+
+	// Note we don't define an invalid_thread_id2 since std::thread::id doesn't have one; it's
+	// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
+	// be.
+	static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
+
+	template<std::size_t> struct thread_id_size { };	// maps sizeof(std::thread::id) to an unsigned integer type of the same size
+	template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; };
+	template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; };
+
+	template<> struct thread_id_converter<thread_id_t> {	// specialisation of the generic converter for std::thread::id
+		typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+#ifndef __APPLE__
+		typedef std::size_t thread_id_hash_t;	// non-Apple: the pre-hash is the result of std::hash, i.e. a size_t
+#else
+		typedef thread_id_numeric_size_t thread_id_hash_t;	// Apple: the pre-hash is the ID's own bits, viewed as a same-sized integer
+#endif
+
+		static thread_id_hash_t prehash(thread_id_t const& x)
+		{
+#ifndef __APPLE__
+			return std::hash<std::thread::id>()(x);
+#else
+			return *reinterpret_cast<thread_id_hash_t const*>(&x);	// reads the object representation of the ID directly
+#endif
+		}
+	};
+} }
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel { namespace details {
+	typedef std::uintptr_t thread_id_t;	// an address converted to an integer
+	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
+	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+	inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }	// the address of the calling thread's own copy of `x` serves as its ID
+} }
+#endif
+
+// Constexpr if: `if constexpr` plus [[maybe_unused]] when C++17 is detected, otherwise a plain `if` and an empty attribute macro
+#ifndef MOODYCAMEL_CONSTEXPR_IF	// NOTE(review): MOODYCAMEL_MAYBE_UNUSED is only defined inside this guard, so a user who predefines MOODYCAMEL_CONSTEXPR_IF gets no MOODYCAMEL_MAYBE_UNUSED from here
+#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L
+#define MOODYCAMEL_CONSTEXPR_IF if constexpr
+#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
+#else
+#define MOODYCAMEL_CONSTEXPR_IF if
+#define MOODYCAMEL_MAYBE_UNUSED
+#endif
+#endif
+
+// Exceptions: auto-detect whether exceptions are enabled (unless the user already defined MOODYCAMEL_EXCEPTIONS_ENABLED), then define try/catch/throw wrappers accordingly
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw (expr)
+#else	// exceptions disabled: TRY { ... } CATCH(...) { ... } becomes `if (true) { ... } else if (false) { ... }`, so the handler body is still parsed but never taken
+#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true)
+#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT	// defines MOODYCAMEL_NOEXCEPT plus two predicates answering "can constructing/assigning `type` from `valueType` throw?"
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)	// no exceptions: the specifier is empty and both predicates are the constant `true`
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900	// older MSVC with _NOEXCEPT: type traits are used and the `expr` argument is ignored
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else	// everything else: real `noexcept`, and the predicates evaluate noexcept(expr) on the supplied expression
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED	// decides whether `thread_local` may be relied on; users can pre-define the macro to force it on
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__)
+// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // tentatively enabled for now; years ago several users reported having problems with it on certain platforms
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions.
+// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif	// MOODYCAMEL_DELETE_FUNCTION; it is appended after a declaration, e.g. `Foo(Foo const&) MOODYCAMEL_DELETE_FUNCTION;`
+
+namespace moodycamel { namespace details {
+#ifndef MOODYCAMEL_ALIGNAS	// all three alignment macros (and their helper templates) are skipped when the user pre-defines MOODYCAMEL_ALIGNAS
+// VS2013 doesn't support alignas or alignof, and align() requires a constant literal
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
+#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
+	template<int Align, typename T> struct Vs2013Aligned { };  // default, unsupported alignment
+	template<typename T> struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; };	// one specialisation per supported power-of-two alignment (1..256), since align() needs a literal
+	template<typename T> struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; };
+	template<typename T> struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; };
+	template<typename T> struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; };
+	template<typename T> struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; };
+	template<typename T> struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; };
+	template<typename T> struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; };
+	template<typename T> struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; };
+	template<typename T> struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; };
+#else
+	template<typename T> struct identity { typedef T type; };	// wraps T so the macro below can spell the type as a single `typename ...::type`
+#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
+#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
+#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity<T>::type
+#endif
+#endif
+} }
+
+
+// TSAN can falsely report races in lock-free code.  To enable TSAN to be used from projects that use this one,
+// we can apply per-function compile-time suppression.
+// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
+#define MOODYCAMEL_NO_TSAN	// default: expands to nothing
+#if defined(__has_feature)
+ #if __has_feature(thread_sanitizer)
+  #undef MOODYCAMEL_NO_TSAN
+  #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
+ #endif // TSAN
+#endif // __has_feature
+
+// Compiler-specific likely/unlikely hints. Both functions return their argument unchanged; only the GCC-style variants add a branch-prediction hint.
+namespace moodycamel { namespace details {
+#if defined(__GNUC__)
+	static inline bool (likely)(bool x) { return __builtin_expect((x), true); }	// names are parenthesised, presumably to stop function-like `likely`/`unlikely` macros from expanding
+	static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); }
+#else
+	static inline bool (likely)(bool x) { return x; }
+	static inline bool (unlikely)(bool x) { return x; }
+#endif
+} }
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel {
+namespace details {
+	template<typename T>
+	struct const_numeric_max {	// largest value of integer type T as a static constant, without calling std::numeric_limits<T>::max()
+		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+		static const T value = std::numeric_limits<T>::is_signed
+			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)	// signed: (1 << (bits - 1)) - 1. NOTE(review): this shifts into the sign bit; confirm it is only instantiated with unsigned types
+			: static_cast<T>(-1);	// unsigned: -1 converts to the all-ones value
+	};
+
+#if defined(__GLIBCXX__)
+	typedef ::max_align_t std_max_align_t;      // libstdc++ forgot to add it to std:: for a while
+#else
+	typedef std::max_align_t std_max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
+#endif
+
+	// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
+	// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
+	typedef union {	// its alignment is the strictest of the three members below
+		std_max_align_t x;
+		long long y;
+		void* z;
+	} max_align_t;
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+	// General-purpose size type. std::size_t is strongly recommended.
+	typedef std::size_t size_t;
+	
+	// The type used for the enqueue and dequeue indices. Must be at least as
+	// large as size_t. Should be significantly larger than the number of elements
+	// you expect to hold at once, especially if you have a high turnover rate;
+	// for example, on 32-bit x86, if you expect to have over a hundred million
+	// elements or pump several million elements through your queue in a very
+	// short space of time, using a 32-bit type *may* trigger a race condition.
+	// A 64-bit int type is recommended in that case, and in practice will
+	// prevent a race condition no matter the usage of the queue. Note that
+	// whether the queue is lock-free with a 64-bit int type depends on whether
+	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+	typedef std::size_t index_t;
+	
+	// Internally, all elements are enqueued and dequeued from multi-element
+	// blocks; this is the smallest controllable unit. If you expect few elements
+	// but many producers, a smaller block size should be favoured. For few producers
+	// and/or many elements, a larger block size is preferred. A sane default
+	// is provided. Must be a power of 2.
+	static const size_t BLOCK_SIZE = 32;
+	
+	// For explicit producers (i.e. when using a producer token), the block is
+	// checked for being empty by iterating through a list of flags, one per element.
+	// For large block sizes, this is too inefficient, and switching to an atomic
+	// counter-based approach is faster. The switch is made for block sizes strictly
+	// larger than this threshold.
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+	
+	// How many full blocks can be expected for a single explicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// How many full blocks can be expected for a single implicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// The initial size of the hash table mapping thread IDs to implicit producers.
+	// Note that the hash is resized every time it becomes half full.
+	// Must be either 0 or a power of two. If 0, implicit production
+	// (using the enqueue methods without an explicit producer token) is disabled.
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+	
+	// Controls the number of items that an explicit consumer (i.e. one with a token)
+	// must consume before it causes all consumers to rotate and move on to the next
+	// internal queue.
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+	
+	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+	// Enqueue operations that would cause this limit to be surpassed will fail. Note
+	// that this limit is enforced at the block level (for performance reasons), i.e.
+	// it's rounded up to the nearest block size.
+	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;	// default: the largest size_t, i.e. effectively unlimited
+
+	// The number of times to spin before sleeping when waiting on a semaphore.
+	// Recommended values are on the order of 1000-10000 unless the number of
+	// consumer threads exceeds the number of idle cores (in which case try 0-100).
+	// Only affects instances of the BlockingConcurrentQueue.
+	static const int MAX_SEMA_SPINS = 10000;
+
+	// Whether to recycle dynamically-allocated blocks into an internal free list or
+	// not. If false, only pre-allocated blocks (controlled by the constructor
+	// arguments) will be recycled, and all others will be `free`d back to the heap.
+	// Note that blocks consumed by explicit producers are only freed on destruction
+	// of the queue (not following destruction of the token) regardless of this trait.
+	static const bool RECYCLE_ALLOCATED_BLOCKS = false;
+
+	
+#ifndef MCDBGQ_USE_RELACY
+	// Memory allocation can be customized if needed.
+	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+#if defined(malloc) || defined(free)
+	// Gah, this is 2015, stop defining macros that break standard code already!
+	// Work around malloc/free being special macros:
+	static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }	// the macro (if any) is allowed to expand inside these two helpers
+	static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+	static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }	// parenthesised names keep a function-like macro from expanding in the declaration
+	static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+	static inline void* malloc(size_t size) { return std::malloc(size); }	// default allocator: plain std::malloc / std::free
+	static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+	// Debug versions when running under the Relacy race detector (ignore
+	// these in user code)
+	static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+	static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+template<typename T, typename Traits> class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+
+namespace details
+{
+	struct ConcurrentQueueProducerTypelessBase	// Element-type-independent part of a producer, so tokens can refer to a producer without knowing T
+	{
+		ConcurrentQueueProducerTypelessBase* next;	// link to another producer base (presumably the queue's list of producers; the list code is outside this section)
+		std::atomic<bool> inactive;	// starts false; ~ProducerToken stores true (release) when its token goes away
+		ProducerToken* token;	// back-pointer to the owning token; kept current by ProducerToken's move constructor, swap() and destructor
+		
+		ConcurrentQueueProducerTypelessBase()
+			: next(nullptr), inactive(false), token(nullptr)
+		{
+		}
+	};
+	
+	template<bool use32> struct _hash_32_or_64 {	// primary template = 32-bit mixer. Despite the parameter name, it is the `true` (1) specialisation below that is the 64-bit one.
+		static inline std::uint32_t hash(std::uint32_t h)
+		{
+			// MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+			// Since the thread ID is already unique, all we really want to do is propagate that
+			// uniqueness evenly across all the bits, so that we can use a subset of the bits while
+			// reducing collisions significantly
+			h ^= h >> 16;
+			h *= 0x85ebca6b;
+			h ^= h >> 13;
+			h *= 0xc2b2ae35;
+			return h ^ (h >> 16);
+		}
+	};
+	template<> struct _hash_32_or_64<1> {	// 64-bit mixer with the same xor-shift / multiply structure as the 32-bit one
+		static inline std::uint64_t hash(std::uint64_t h)
+		{
+			h ^= h >> 33;
+			h *= 0xff51afd7ed558ccd;
+			h ^= h >> 33;
+			h *= 0xc4ceb9fe1a85ec53;
+			return h ^ (h >> 33);
+		}
+	};
+	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };	// picks the 64-bit mixer when `size` (in bytes) exceeds 4, otherwise the 32-bit one
+	
+	static inline size_t hash_thread_id(thread_id_t id)
+	{	// Hashes a thread ID: prehash() turns it into an integer, the mixer matching that integer's size scrambles it, and the result is cast to size_t.
+		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+		return static_cast<size_t>(hash_32_or_64<sizeof(thread_id_converter<thread_id_t>::thread_id_hash_t)>::hash(
+			thread_id_converter<thread_id_t>::prehash(id)));
+	}
+	
+	template<typename T>
+	static inline bool circular_less_than(T a, T b)
+	{	// Wrap-around "a < b": true when the modular difference a - b is strictly greater than 2^(bits-1), i.e. lies in the upper half of T's range.
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
+		return static_cast<T>(a - b) > static_cast<T>(static_cast<T>(1) << (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
+		// Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
+		//       silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here.
+	}
+	
+	template<typename U>
+	static inline char* align_for(char* ptr)
+	{	// Rounds ptr up to the next address that is a multiple of U's alignment; an already-aligned ptr is returned unchanged.
+		const std::size_t misalignment = reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of<U>::value;
+		return misalignment == 0 ? ptr : ptr + (std::alignment_of<U>::value - misalignment);
+	}
+
+	template<typename T>
+	static inline T ceil_to_pow_2(T x)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+		// Rounds x up to the nearest power of two (a power of two maps to itself).
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2:
+		// after the decrement, the highest set bit is smeared into every lower position by
+		// OR-ing in right-shifts of 1, 2, 4, ... (up to half the width of T); adding one
+		// then carries into the next power of two. An input of 0 wraps around to 0.
+		--x;
+		for (std::size_t shift = 1; shift < sizeof(T) * CHAR_BIT; shift <<= 1) {
+			x |= x >> shift;
+		}
+		return ++x;
+	}
+	
+	template<typename T>
+	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)
+	{	// Exchanges the two stored values using relaxed loads/stores. The swap as a whole is NOT atomic and imposes no ordering.
+		T temp = left.load(std::memory_order_relaxed);	// load() already yields a prvalue; wrapping it in std::move only inhibited copy elision
+		left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		right.store(temp, std::memory_order_relaxed);	// std::atomic<T> requires a trivially copyable T, so a plain copy is equivalent to a move
+	}
+	
+	template<typename T>
+	static inline T const& nomove(T const& x)
+	{	// Returns its argument as a const lvalue reference, so the result cannot bind to a move constructor/assignment.
+		return x;
+	}
+	
+	template<bool Enable>
+	struct nomove_if	// Compile-time switch: the primary template (used for Enable == true) behaves like nomove()
+	{
+		template<typename T>
+		static inline T const& eval(T const& x)
+		{
+			return x;
+		}
+	};
+	
+	template<>
+	struct nomove_if<false>	// Enable == false: the argument is perfect-forwarded, preserving its value category
+	{
+		template<typename U>
+		static inline auto eval(U&& x)
+			-> decltype(std::forward<U>(x))
+		{
+			return std::forward<U>(x);
+		}
+	};
+	
+	template<typename It>
+	static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it)
+	{	// Returns *it from a function that is unconditionally marked MOODYCAMEL_NOEXCEPT, whatever the iterator's own operator* declares (presumably for use inside noexcept(...) checks; confirm at call sites).
+		return *it;
+	}
+	
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };	// standard trait where the toolchain provides it
+#else
+	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };	// g++ older than 4.8: fall back to the pre-standard trait name
+#endif
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+	typedef RelacyThreadExitListener ThreadExitListener;
+	typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+	class ThreadExitNotifier;
+
+	struct ThreadExitListener	// One registration with ThreadExitNotifier: a callback plus the opaque pointer passed to it
+	{
+		typedef void (*callback_t)(void*);
+		callback_t callback;	// invoked as callback(userData) from ~ThreadExitNotifier
+		void* userData;
+		
+		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
+		ThreadExitNotifier* chain;		// reserved for use by the ThreadExitNotifier
+	};
+
+	class ThreadExitNotifier	// Per-thread (thread_local) list of listeners that are called back when that thread's instance is destroyed
+	{
+	public:
+		static void subscribe(ThreadExitListener* listener)
+		{	// Pushes `listener` onto the front of the CALLING thread's list (`tail` is the head of that singly linked list).
+			auto& tlsInst = instance();
+			std::lock_guard<std::mutex> guard(mutex());
+			listener->next = tlsInst.tail;
+			listener->chain = &tlsInst;	// remembers which thread's notifier owns the listener, for unsubscribe()
+			tlsInst.tail = listener;
+		}
+		
+		static void unsubscribe(ThreadExitListener* listener)
+		{	// Unlinks `listener` from whichever notifier it is registered with; it goes through listener->chain rather than instance().
+			std::lock_guard<std::mutex> guard(mutex());
+			if (!listener->chain) {
+				return;  // race with ~ThreadExitNotifier
+			}
+			auto& tlsInst = *listener->chain;
+			listener->chain = nullptr;
+			ThreadExitListener** prev = &tlsInst.tail;	// address of the link that points at the node under inspection
+			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+				if (ptr == listener) {
+					*prev = ptr->next;
+					break;
+				}
+				prev = &ptr->next;
+			}
+		}
+		
+	private:
+		ThreadExitNotifier() : tail(nullptr) { }
+		ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		~ThreadExitNotifier()
+		{
+			// This thread is about to exit, let everyone know!
+			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+			std::lock_guard<std::mutex> guard(mutex());	// held across every callback below, so a callback must not call subscribe()/unsubscribe() (std::mutex is not recursive)
+			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {	// ptr->next is read after the callback returns, so the callback must leave the listener object alive
+				ptr->chain = nullptr;
+				ptr->callback(ptr->userData);
+			}
+		}
+		
+		// Thread-local
+		static inline ThreadExitNotifier& instance()
+		{
+			static thread_local ThreadExitNotifier notifier;
+			return notifier;
+		}
+
+		static inline std::mutex& mutex()
+		{
+			// Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called
+			static std::mutex mutex;
+			return mutex;
+		}
+		
+	private:
+		ThreadExitListener* tail;	// most recently subscribed listener of this thread, or nullptr when the list is empty
+	};
+#endif
+#endif
+	
+	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };	// compile-time lookup of the ATOMIC_*_LOCK_FREE macro for a signed integer type; 0 for any type not listed below
+	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
+	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };	// general entry point: T is mapped to its signed counterpart first, so unsigned types share the entries above
+	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
+	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };	// every pointer type uses the single pointer macro
+}
+
+
+struct ProducerToken	// Move-only handle to one producer of a queue; the producer keeps a back-pointer to the token that owns it
+{
+	template<typename T, typename Traits>
+	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+	
+	template<typename T, typename Traits>
+	explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+	
+	ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+		: producer(other.producer)
+	{	// Takes over other's producer and leaves `other` invalid (producer == nullptr).
+		other.producer = nullptr;
+		if (producer != nullptr) {
+			producer->token = this;	// re-point the producer's back-pointer at the new owner
+		}
+	}
+	
+	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+	{	// Move assignment is implemented as a swap, so `other` ends up holding this token's previous producer.
+		swap(other);
+		return *this;
+	}
+	
+	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+	{	// Exchanges the producers, then fixes each producer's back-pointer to name its new owning token.
+		std::swap(producer, other.producer);
+		if (producer != nullptr) {
+			producer->token = this;
+		}
+		if (other.producer != nullptr) {
+			other.producer->token = &other;
+		}
+	}
+	
+	// A token is always valid unless:
+	//     1) Memory allocation failed during construction
+	//     2) It was moved via the move constructor
+	//        (Note: assignment does a swap, leaving both potentially valid)
+	//     3) The associated queue was destroyed
+	// Note that if valid() returns true, that only indicates
+	// that the token is valid for use with a specific queue,
+	// but not which one; that's up to the user to track.
+	inline bool valid() const { return producer != nullptr; }
+	
+	~ProducerToken()
+	{	// Detaches from the producer (if any): clears its back-pointer and flags it inactive with a release store.
+		if (producer != nullptr) {
+			producer->token = nullptr;
+			producer->inactive.store(true, std::memory_order_release);
+		}
+	}
+	
+	// Disable copying and assignment
+	ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+protected:
+	details::ConcurrentQueueProducerTypelessBase* producer;	// nullptr means the token is invalid (see valid())
+};
+
+
+struct ConsumerToken	// Move-only bundle of per-consumer bookkeeping; every field is private and read/written by the friend ConcurrentQueue
+{
+	template<typename T, typename Traits>
+	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+	
+	template<typename T, typename Traits>
+	explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+	
+	ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+		: initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer)
+	{	// Copies every field from `other`; unlike ProducerToken's move constructor, `other` is left unmodified.
+	}
+	
+	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+	{	// Move assignment is implemented as a swap of all fields.
+		swap(other);
+		return *this;
+	}
+	
+	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+	{	// Member-wise exchange of the five bookkeeping fields.
+		std::swap(initialOffset, other.initialOffset);
+		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+		std::swap(currentProducer, other.currentProducer);
+		std::swap(desiredProducer, other.desiredProducer);
+	}
+	
+	// Disable copying and assignment
+	ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+private: // but shared with ConcurrentQueue
+	std::uint32_t initialOffset;
+	std::uint32_t lastKnownGlobalOffset;
+	std::uint32_t itemsConsumedFromCurrent;	// presumably compared against EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; confirm in ConcurrentQueue
+	details::ConcurrentQueueProducerTypelessBase* currentProducer;
+	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Forward declaration of swap() for ImplicitProducerKVP; it is needed up front because the function lives in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+	typedef ::moodycamel::ProducerToken producer_token_t;	// Token type taken by the explicit-producer enqueue overloads
+	typedef ::moodycamel::ConsumerToken consumer_token_t;	// Token type taken by the token-based try_dequeue overloads
+	
+	typedef typename Traits::index_t index_t;
+	typedef typename Traits::size_t size_t;	// Inside this class `size_t` means Traits::size_t (asserted unsigned below)
+	
+	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);	// The constants below are compile-time copies of the Traits values, cast to the queue's own types
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);	// 0 disables the tokenless (implicit) enqueue overloads
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);	// Items one consumer token takes from a producer before bumping the global offset
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#endif
+	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);	// Traits value rounded up to a multiple of BLOCK_SIZE, or the size_t maximum when rounding up would overflow
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");
+	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
+	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");	// x & (x - 1) == 0 is the power-of-two test used throughout
+	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");	// Cannot fail for an unsigned value; it documents the intent
+
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE)	// Default capacity is 32 blocks' worth of elements
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1));	// ceil(capacity / BLOCK_SIZE); the mask works because BLOCK_SIZE is asserted to be a power of 2
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		// Track all the producers using a fully-resolved typed list for
+		// each kind; this makes it possible to debug them starting from
+		// the root queue object (otherwise wacky casts are needed that
+		// don't compile in the debugger's expression evaluator).
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Pre-allocates a block pool sized from `minCapacity` (the minimum number of elements that must be
+	// available at any given time) and the maximum concurrent number of explicit and implicit producers.
+	// Not thread safe. A `minCapacity` of 0 reserves no per-capacity blocks (the unsigned `- 1` must not wrap).
+	ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		size_t blocks = (minCapacity == 0 ? 0 : ((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers);	// (ceil(minCapacity / BLOCK_SIZE) - 1) blocks per explicit producer plus one extra share, plus 2 blocks per producer of either kind
+		populate_initial_block_list(blocks);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(nullptr, std::memory_order_relaxed);	// Debug-only typed producer lists start out empty
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Destroys all producers, the dynamically allocated implicit-producer hash tables and the block storage.
+	// Note: The queue should not be accessed concurrently while it's being deleted; it's up to the
+	// user to synchronize this. This method is not thread safe.
+	~ConcurrentQueue()
+	{
+		// Destroy producers
+		auto ptr = producerListTail.load(std::memory_order_relaxed);
+		while (ptr != nullptr) {
+			auto next = ptr->next_prod();	// Read the link before the node is destroyed
+			if (ptr->token != nullptr) {
+				ptr->token->producer = nullptr;	// A still-live ProducerToken will report valid() == false from now on
+			}
+			destroy(ptr);
+			ptr = next;
+		}
+		
+		// Destroy implicit producer hash tables
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+			while (hash != nullptr) {
+				auto prev = hash->prev;	// Saved first because `hash` may be freed below
+				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
+					for (size_t i = 0; i != hash->capacity; ++i) {
+						hash->entries[i].~ImplicitProducerKVP();
+					}
+					hash->~ImplicitProducerHash();
+					(Traits::free)(hash);
+				}
+				hash = prev;
+			}
+		}
+		
+		// Destroy global free list
+		auto block = freeList.head_unsafe();
+		while (block != nullptr) {
+			auto next = block->freeListNext.load(std::memory_order_relaxed);
+			if (block->dynamicallyAllocated) {	// Other blocks are presumably part of initialBlockPool, released below -- TODO confirm
+				destroy(block);
+			}
+			block = next;
+		}
+		
+		// Destroy initial free list
+		destroy_array(initialBlockPool, initialBlockPoolSize);
+	}
+
+	// Copy construction and copy assignment are disabled; the queue can only be moved or swapped
+	ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT	// Steals other's producers, block pool, free list and consumer counters
+		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+		producerCount(other.producerCount.load(std::memory_order_relaxed)),
+		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+		initialBlockPool(other.initialBlockPool),
+		initialBlockPoolSize(other.initialBlockPoolSize),
+		freeList(std::move(other.freeList)),
+		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+	{
+		// Move the other one into this, and leave the other one as an empty queue
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();	// Set up this object's own initial hash first, then trade hashes with `other`
+		swap_implicit_producer_hashes(other);
+		
+		other.producerListTail.store(nullptr, std::memory_order_relaxed);	// From here on `other` is reset to the state of an empty queue
+		other.producerCount.store(0, std::memory_order_relaxed);
+		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+		
+		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
+		other.initialBlockPoolSize = 0;
+		other.initialBlockPool = nullptr;	// Ownership of the pool moved to this queue, so other's destructor must not see it
+		
+		reown_producers();	// Defined elsewhere; presumably re-points every moved producer at this queue -- TODO confirm
+	}
+	
+	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT	// Move-assignment is a swap: `other` ends up holding this queue's previous state
+	{
+		return swap_internal(other);
+	}
+	
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other);	// Same implementation as move-assignment; the returned reference is discarded
+	}
+	
+private:
+	ConcurrentQueue& swap_internal(ConcurrentQueue& other)	// Shared body of swap() and move-assignment; exchanges all state and returns *this
+	{
+		if (this == &other) {	// Self-swap is a no-op
+			return *this;
+		}
+		
+		details::swap_relaxed(producerListTail, other.producerListTail);	// Atomics go through details::swap_relaxed; plain members use std::swap
+		details::swap_relaxed(producerCount, other.producerCount);
+		details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+		std::swap(initialBlockPool, other.initialBlockPool);
+		std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+		freeList.swap(other.freeList);
+		details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+		details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+		
+		swap_implicit_producer_hashes(other);
+		
+		reown_producers();	// Defined elsewhere; presumably re-points each producer at the queue that now holds it -- TODO confirm
+		other.reown_producers();
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		details::swap_relaxed(explicitProducers, other.explicitProducers);
+		details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+		
+		return *this;
+	}
+	
+public:
+	// Enqueues a single item (by copying it). Returns true on success.
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue<CanAlloc>(item);	// Goes through get_or_add_implicit_producer()
+	}
+	
+	// Enqueues a single item (by moving it, if possible). Returns true on success.
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue<CanAlloc>(std::move(item));	// Goes through get_or_add_implicit_producer()
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe. The token's producer pointer is used unchecked, so the token must be valid().
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CanAlloc>(token, item);	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe. The token's producer pointer is used unchecked, so the token must be valid().
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CanAlloc>(token, std::move(item));	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	// Enqueues `count` items read from the iterator `itemFirst`. Returns true on success.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue_bulk<CanAlloc>(itemFirst, count);	// Goes through get_or_add_implicit_producer()
+	}
+	
+	// Enqueues `count` items read from the iterator `itemFirst` using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied. The token must be valid(); its producer pointer is used unchecked.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue<CannotAlloc>(item);	// Same path as enqueue(), but with the CannotAlloc mode
+	}
+	
+	// Enqueues a single item (by moving it, if possible). Returns true on success.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue<CannotAlloc>(std::move(item));	// Same path as enqueue(), but with the CannotAlloc mode
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe. The token's producer pointer is used unchecked, so the token must be valid().
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, item);	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe. The token's producer pointer is used unchecked, so the token must be valid().
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, std::move(item));	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	// Enqueues `count` items read from the iterator `itemFirst`. Returns true on success.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false;	// Implicit production is disabled by the Traits
+		else return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);	// Same path as enqueue_bulk(), but with the CannotAlloc mode
+	}
+	
+	// Enqueues `count` items read from the iterator `itemFirst` using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied. The token must be valid(); its producer pointer is used unchecked.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);	// Forwards to the ExplicitProducer behind token.producer
+	}
+	
+	
+	
+	// Attempts to dequeue one element from the queue into `item`; returns true if one was taken.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		// Instead of simply trying each producer in turn (which could cause needless contention on the first
+		// producer), we score them heuristically.
+		size_t nonEmptyCount = 0;
+		ProducerBase* best = nullptr;
+		size_t bestSize = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {	// Scan stops once 3 non-empty producers have been seen
+			auto size = ptr->size_approx();
+			if (size > 0) {
+				if (size > bestSize) {	// Remember the producer with the largest approximate size so far
+					bestSize = size;
+					best = ptr;
+				}
+				++nonEmptyCount;
+			}
+		}
+		
+		// If there was at least one non-empty queue but it appears empty at the time
+		// we try to dequeue from it, we need to make sure every queue's been tried
+		if (nonEmptyCount > 0) {	// Implies `best` is non-null
+			if ((details::likely)(best->dequeue(item))) {
+				return true;
+			}
+			for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {	// Fallback: walk the whole list, skipping `best`
+				if (ptr != best && ptr->dequeue(item)) {
+					return true;
+				}
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue one element from the queue into `item`; returns true if one was taken.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// This differs from the try_dequeue(item) method in that this one does
+	// not attempt to reduce contention by interleaving the order that producer
+	// streams are dequeued from. So, using this method can reduce overall throughput
+	// under contention, but will give more predictable results in single-threaded
+	// consumer scenarios. This is mostly only useful for internal unit tests.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue_non_interleaved(U& item)
+	{
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {	// Producers are tried strictly in list order, starting at the tail
+			if (ptr->dequeue(item)) {
+				return true;
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue one element into `item` using an explicit consumer token (updated in place).
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		// The idea is roughly as follows:
+		// Every EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+		// If there's no items where you're supposed to be, keep moving until you find a producer with some items
+		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
+		
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {	// Token was never positioned, or a rotation happened since its last use
+			if (!update_current_producer_after_rotation(token)) {
+				return false;
+			}
+		}
+		
+		// Fast path: try the producer this token currently points at. When the token's count for it
+		// hits the quota exactly, bump the global offset so that consumer tokens re-position.
+		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return true;
+		}
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {	// Walked off the end of the list: wrap around to the tail
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {	// Visit every other producer once, stopping when back at the start
+			if (ptr->dequeue(item)) {
+				token.currentProducer = ptr;	// Stay on the producer that had an item
+				token.itemsConsumedFromCurrent = 1;
+				return true;
+			}
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue up to `max` elements from the queue, writing them through `itemFirst`.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t count = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			count += ptr->dequeue_bulk(itemFirst, max - count);	// Ask each producer only for what is still missing
+			if (count == max) {
+				break;
+			}
+		}
+		return count;
+	}
+	
+	// Attempts to dequeue up to `max` elements using an explicit consumer token (updated in place).
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {	// Token was never positioned, or a rotation happened since its last use
+			if (!update_current_producer_after_rotation(token)) {
+				return 0;
+			}
+		}
+		
+		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);
+		if (count == max) {	// The current producer satisfied the whole request
+			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {	// Quota reached or exceeded: bump the global offset
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return max;
+		}
+		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+		max -= count;	// From here on `max` is the number of items still wanted
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {	// Walked off the end of the list: wrap around to the tail
+			ptr = tail;
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+			count += dequeued;
+			if (dequeued != 0) {
+				token.currentProducer = ptr;	// Moving currentProducer also moves the point at which this loop stops
+				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+			}
+			if (dequeued == max) {
+				break;
+			}
+			max -= dequeued;
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return count;
+	}
+	
+	
+	
+	// Attempts to dequeue from a specific producer's inner queue.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns false if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);	// No null check: the token must be valid()
+	}
+	
+	// Attempts to dequeue up to `max` elements from a specific producer's inner queue.
+	// Returns the number of items actually dequeued.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns 0 if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);	// No null check: the token must be valid()
+	}
+	
+	
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	size_t size_approx() const
+	{
+		size_t size = 0;
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			size += ptr->size_approx();	// Sum of every producer's own approximate size
+		}
+		return size;
+	}
+	
+	
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe. Evaluated at compile time (constexpr).
+	static constexpr bool is_lock_free()
+	{
+		return
+			details::static_is_lock_free<bool>::value == 2 &&	// 2 presumably mirrors ATOMIC_*_LOCK_FREE "always lock-free" -- TODO confirm
+			details::static_is_lock_free<size_t>::value == 2 &&
+			details::static_is_lock_free<std::uint32_t>::value == 2 &&
+			details::static_is_lock_free<index_t>::value == 2 &&
+			details::static_is_lock_free<void*>::value == 2 &&
+			details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+	}
+
+
+private:
+	friend struct ProducerToken;	// The token types and both producer kinds get access to the queue's private members
+	friend struct ConsumerToken;
+	struct ExplicitProducer;
+	friend struct ExplicitProducer;
+	struct ImplicitProducer;
+	friend struct ImplicitProducer;
+	friend class ConcurrentQueueTests;
+		
+	enum AllocationMode { CanAlloc, CannotAlloc };	// Template argument of the inner_enqueue* helpers: CanAlloc from enqueue*, CannotAlloc from try_enqueue*
+	
+	
+	///////////////////////////////
+	// Queue methods
+	///////////////////////////////
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(producer_token_t const& token, U&& element)	// Token path: enqueue on the ExplicitProducer the token points at
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));	// token.producer is dereferenced without a null check
+	}
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(U&& element)	// Tokenless path: enqueue on the implicit producer returned by get_or_add_implicit_producer()
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));	// A null producer makes the enqueue fail
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)	// Token path: bulk-enqueue on the ExplicitProducer the token points at
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);	// token.producer is dereferenced without a null check
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(It itemFirst, size_t count)	// Tokenless path: bulk-enqueue on the implicit producer returned by get_or_add_implicit_producer()
+	{
+		auto producer = get_or_add_implicit_producer();
+		return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);	// A null producer makes the enqueue fail
+	}
+	
+	inline bool update_current_producer_after_rotation(consumer_token_t& token)	// Re-positions `token` for the current global offset; false only if it was never positioned and there are no producers
+	{
+		// Ah, there's been a rotation, figure out where we should be!
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		if (token.desiredProducer == nullptr && tail == nullptr) {	// Nothing to position on yet
+			return false;
+		}
+		auto prodCount = producerCount.load(std::memory_order_relaxed);
+		auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+		if ((details::unlikely)(token.desiredProducer == nullptr)) {
+			// Aha, first time we're dequeueing anything.
+			// Figure out our local position
+			// Note: offset is from start, not end, but we're traversing from end -- subtract from count first
+			std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
+			token.desiredProducer = tail;
+			for (std::uint32_t i = 0; i != offset; ++i) {	// Walk `offset` links from the tail, wrapping at the end of the list
+				token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+				if (token.desiredProducer == nullptr) {
+					token.desiredProducer = tail;
+				}
+			}
+		}
+		
+		std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;	// Change in the global offset since this token last synchronized
+		if (delta >= prodCount) {	// Advancing by prodCount links is treated as a full lap, so only the remainder is walked
+			delta = delta % prodCount;
+		}
+		for (std::uint32_t i = 0; i != delta; ++i) {
+			token.desiredProducer = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+			if (token.desiredProducer == nullptr) {
+				token.desiredProducer = tail;
+			}
+		}
+		
+		token.lastKnownGlobalOffset = globalOffset;
+		token.currentProducer = token.desiredProducer;	// Start consuming from the newly assigned producer
+		token.itemsConsumedFromCurrent = 0;
+		return true;
+	}
+	
+	
+	///////////////////////////
+	// Free list
+	///////////////////////////
+	
+	template <typename N>
+	struct FreeListNode	// Fields a node type N needs in order to be stored in FreeList<N>
+	{
+		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }	// Starts with no references and no successor
+		
+		std::atomic<std::uint32_t> freeListRefs;	// FreeList uses this as a reference count combined with its SHOULD_BE_ON_FREELIST flag
+		std::atomic<N*> freeListNext;	// Next node in the free list
+	};
+	
+	// A simple CAS-based lock-free free list: a stack of N nodes headed by freeListHead, where each node carries a
+	// reference count in freeListRefs. Not the fastest thing in the world under heavy contention, but simple and correct
+	// (assuming nodes are never freed until after the free list is destroyed), and fairly speedy under low contention.
+	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
+	struct FreeList
+	{
+		FreeList() : freeListHead(nullptr) { }
+		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }	// Takes over other's head and leaves other empty
+		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+		
+		FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		inline void add(N* node)	// Flags node for the list; links it in right away only if nobody holds a reference to it
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+			// set it using a fetch_add
+			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+				// Oh look! We were the last ones referencing this node, and we know
+				// we want to add it to the free list, so let's do it!
+		 		add_knowing_refcount_is_zero(node);
+			}
+		}
+		
+		inline N* try_get()	// Pops and returns a node, or nullptr once the head is observed to be null
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			auto head = freeListHead.load(std::memory_order_acquire);
+			while (head != nullptr) {
+				auto prevHead = head;
+				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) {
+					head = freeListHead.load(std::memory_order_acquire);	// Refcount was zero or the increment raced: reload the head and retry
+					continue;
+				}
+				
+				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
+				// next and not worry about it changing between now and the time we do the CAS
+				auto next = head->freeListNext.load(std::memory_order_relaxed);
+				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+					
+					// Decrease refcount twice, once for our ref, and once for the list's ref
+					head->freeListRefs.fetch_sub(2, std::memory_order_release);
+					return head;
+				}
+				
+				// OK, the head must have changed on us, but we still need to decrease the refcount we increased.
+				// Note that we don't need to release any memory effects, but we do need to ensure that the reference
+				// count decrement happens-after the CAS on the head.
+				refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
+				if (refs == SHOULD_BE_ON_FREELIST + 1) {	// Ours was the last reference and the node is flagged for re-adding
+					add_knowing_refcount_is_zero(prevHead);
+				}
+			}
+			
+			return nullptr;
+		}
+		
+		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+		
+	private:
+		inline void add_knowing_refcount_is_zero(N* node)
+		{
+			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+			// only one copy of this method per node at a time, i.e. the single thread case), then we know
+			// we can safely change the next pointer of the node; however, once the refcount is back above
+			// zero, then other threads could increase it (happens under heavy contention, when the refcount
+			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
+			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
+			auto head = freeListHead.load(std::memory_order_relaxed);
+			while (true) {
+				node->freeListNext.store(head, std::memory_order_relaxed);
+				node->freeListRefs.store(1, std::memory_order_release);	// 1 == the list's own reference; this store also clears SHOULD_BE_ON_FREELIST
+				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
+					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) {	// One RMW drops the list's ref and sets the flag again; a previous value of 1 means the count is now zero
+						continue;
+					}
+				}
+				return;
+			}
+		}
+		
+	private:
+		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+		std::atomic<N*> freeListHead;
+	
+	static const std::uint32_t REFS_MASK = 0x7FFFFFFF;	// Low 31 bits of freeListRefs: the reference count
+	static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;	// High bit of freeListRefs: node is waiting to be put on the list
+		
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+		debug::DebugMutex mutex;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Block
+	///////////////////////////
+	
+	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };	// Template tag for Block's emptiness helpers: explicit_context enables per-slot flags when BLOCK_SIZE is small enough
+	
+	struct Block	// Storage for BLOCK_SIZE elements of T plus the bookkeeping that tracks which slots have been dequeued
+	{
+		Block()
+			: next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true)
+		{
+#ifdef MCDBGQ_TRACKMEM
+			owner = nullptr;
+#endif
+		}
+		
+		template<InnerQueueContext context>
+		inline bool is_empty() const	// True when every slot is marked empty (flag mode) or the counter equals BLOCK_SIZE (counter mode)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Check flags
+				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+						return false;
+					}
+				}
+				
+				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+				std::atomic_thread_fence(std::memory_order_acquire);
+				return true;
+			}
+			else {
+				// Check counter
+				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+					std::atomic_thread_fence(std::memory_order_acquire);
+					return true;
+				}
+				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+				return false;
+			}
+		}
+		
+		// Returns true if the block is now empty (does not apply in explicit context)
+		template<InnerQueueContext context>
+		inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flag (flags are stored in reverse slot order: slot 0 maps to emptyFlags[BLOCK_SIZE - 1])
+				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
+				assert(prevVal < BLOCK_SIZE);
+				return prevVal == BLOCK_SIZE - 1;
+			}
+		}
+		
+		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+		// Returns true if the block is now empty (does not apply in explicit context).
+		template<InnerQueueContext context>
+		inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flags (one release fence up front, then relaxed stores; i becomes the lowest flag index of the reversed range)
+				std::atomic_thread_fence(std::memory_order_release);
+				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;
+				for (size_t j = 0; j != count; ++j) {
+					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+					emptyFlags[i + j].store(true, std::memory_order_relaxed);
+				}
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
+				assert(prevVal + count <= BLOCK_SIZE);
+				return prevVal + count == BLOCK_SIZE;
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void set_all_empty()	// Puts the block in the state is_empty() reports as empty (relaxed stores only)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set all flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(true, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Set counter to its "everything dequeued" value
+				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void reset_empty()	// Clears all emptiness state so the block reads as not empty again (relaxed stores only)
+		{
+			MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Reset flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(false, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+			}
+		}
+		
+		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		
+	private:
+		static_assert(std::alignment_of<T>::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time");
+		MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements;
+	public:
+		Block* next;	// Link to the next block; ExplicitProducer chains its blocks into a circular list through this
+		std::atomic<size_t> elementsCompletelyDequeued;	// Dequeue counter, used whenever the per-slot flags are not
+		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];	// Per-slot "dequeued" flags; sized 1 when BLOCK_SIZE exceeds the threshold
+	public:
+		std::atomic<std::uint32_t> freeListRefs;	// FreeList bookkeeping (same fields as FreeListNode)
+		std::atomic<Block*> freeListNext;
+		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+		
+#ifdef MCDBGQ_TRACKMEM
+		void* owner;
+#endif
+	};
+	static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+
+
+#ifdef MCDBGQ_TRACKMEM
+public:
+	struct MemStats;
+private:
+#endif
+	
+	///////////////////////////
+	// Producer base
+	///////////////////////////
+	
+	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase	// State shared by the explicit and implicit producer types
+	{
+		ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) :
+			tailIndex(0),
+			headIndex(0),
+			dequeueOptimisticCount(0),
+			dequeueOvercommit(0),
+			tailBlock(nullptr),
+			isExplicit(isExplicit_),
+			parent(parent_)
+		{
+		}
+		
+		virtual ~ProducerBase() { }
+		
+		template<typename U>
+		inline bool dequeue(U& element)	// Forwards to the concrete producer chosen by isExplicit via static_cast (not a virtual call)
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue(element);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue(element);
+			}
+		}
+		
+		template<typename It>
+		inline size_t dequeue_bulk(It& itemFirst, size_t max)	// Same isExplicit-based forwarding as dequeue()
+		{
+			if (isExplicit) {
+				return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+			else {
+				return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+		}
+		
+		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+		
+		inline size_t size_approx() const	// tail - head from two relaxed loads; 0 unless head is circularly behind tail
+		{
+			auto tail = tailIndex.load(std::memory_order_relaxed);
+			auto head = headIndex.load(std::memory_order_relaxed);
+			return details::circular_less_than(head, tail) ? static_cast<size_t>(tail - head) : 0;
+		}
+		
+		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+	protected:
+		std::atomic<index_t> tailIndex;		// Where to enqueue to next
+		std::atomic<index_t> headIndex;		// Where to dequeue from next
+		
+		std::atomic<index_t> dequeueOptimisticCount;	// Bumped by every dequeue attempt that looks promising (see ExplicitProducer::dequeue)
+		std::atomic<index_t> dequeueOvercommit;	// Bumped when such an attempt finds nothing, so count - overcommit stays meaningful
+		
+		Block* tailBlock;	// Block currently being enqueued into; nullptr until the first block is obtained
+		
+	public:
+		bool isExplicit;	// Selects the static_cast target in dequeue()/dequeue_bulk()
+		ConcurrentQueue* parent;
+		
+	protected:
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Explicit queue
+	///////////////////////////
+		
+	struct ExplicitProducer : public ProducerBase
+	{
+		explicit ExplicitProducer(ConcurrentQueue* parent_) :	// Sets up empty producer state and requests the initial block index
+			ProducerBase(parent_, true),
+			blockIndex(nullptr),
+			pr_blockIndexSlotsUsed(0),
+			pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),	// Halved on purpose: new_block_index(0) below is documented as doubling it
+			pr_blockIndexFront(0),
+			pr_blockIndexEntries(nullptr),
+			pr_blockIndexRaw(nullptr)
+		{
+			size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;	// Alternative size derived from the parent's initial block pool (also halved)
+			if (poolBasedIndexSize > pr_blockIndexSize) {
+				pr_blockIndexSize = poolBasedIndexSize;
+			}
+			
+			new_block_index(0);		// This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+		}
+		
+		~ExplicitProducer()	// Destroys leftover elements, hands blocks back to the parent, then frees the chain of block indices
+		{
+			// Destruct any elements not yet dequeued.
+			// Since we're in the destructor, we can assume all elements
+			// are either completely dequeued or completely not (no halfways).
+			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
+				// First find the block that's partially dequeued, if any
+				Block* halfDequeuedBlock = nullptr;
+				if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
+					// (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
+					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);
+					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
+						i = (i + 1) & (pr_blockIndexSize - 1);
+					}
+					assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed)));
+					halfDequeuedBlock = pr_blockIndexEntries[i].block;
+				}
+				
+				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+				auto block = this->tailBlock;
+				do {
+					block = block->next;
+					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+						continue;
+					}
+					
+					size_t i = 0;	// Offset into block
+					if (block == halfDequeuedBlock) {
+						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					}
+					
+					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+						(*block)[i++]->~T();
+					}
+				} while (block != this->tailBlock);
+			}
+			
+			// Hand every block that we own back to the parent queue (via add_block_to_free_list)
+			if (this->tailBlock != nullptr) {
+				auto block = this->tailBlock;
+				do {
+					auto nextBlock = block->next;
+					this->parent->add_block_to_free_list(block);
+					block = nextBlock;
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy the block indices (walk the prev chain; run each destructor explicitly, then release it with Traits::free)
+			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+			while (header != nullptr) {
+				auto prev = static_cast<BlockIndexHeader*>(header->prev);
+				header->~BlockIndexHeader();
+				(Traits::free)(header);
+				header = prev;
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)	// Constructs one element at the tail; returns false when there is no room or no block/index could be obtained
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto startBlock = this->tailBlock;
+				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					// We can re-use the block ahead of us, it's empty!					
+					this->tailBlock = this->tailBlock->next;
+					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					
+					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+					// last block from it first -- except instead of removing then adding, we can just overwrite).
+					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
+					// it would have been re-attempted when adding the first block to the queue; since there is such
+					// a block, a block index must have been successfully allocated.
+				}
+				else {
+					// Whatever head value we see here is >= the last value we saw here (relatively),
+					// and <= its current value. Since we have the most recent tail, the head must be
+					// <= to it.
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+						// We can't enqueue in another block because there's not enough leeway -- the
+						// tail could surpass the head by the time the block fills up! (Or we'll exceed
+						// the size limit, if the second part of the condition was true.)
+						return false;
+					}
+					// We're going to need a new block; check that the block index has room
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+						// Hmm, the circular block index is already full -- we'll need
+						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+						// the initial allocation failed in the constructor.
+						
+						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+							return false;
+						}
+						else if (!new_block_index(pr_blockIndexSlotsUsed)) {
+							return false;
+						}
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						return false;
+					}
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					++pr_blockIndexSlotsUsed;
+				}
+
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					// The constructor may throw. We want the element not to appear in the queue in
+					// that case (without corrupting the queue):
+					MOODYCAMEL_TRY {
+						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Revert change to the current block, but leave the new block available
+						// for next time
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				else {
+					(void)startBlock;
+					(void)originalBlockIndexSlotsUsed;
+				}
+				
+				// Add block to block index
+				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+				entry.base = currentTailIndex;
+				entry.block = this->tailBlock;
+				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);	// Element was already constructed in the try block above; only the tail needs publishing
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)	// Move-assigns the next item into element; returns false if nothing was available
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				// Might be something to dequeue, let's give it a try
+				
+				// Note that this if is purely for performance purposes in the common case when the queue is
+				// empty and the values are eventually consistent -- we may enter here spuriously.
+				
+				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+				// change them) and must be the same value at this point (inside the if) as when the if condition was
+				// evaluated.
+
+				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+				// This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptimisticCount in
+				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+				// unfortunately that can't be shown to be correct using only the C++11 standard.
+				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				// Increment optimistic counter, then check if it went over the boundary
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				
+				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+				// However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently)
+				// overflow; in such a case, though, the logic still holds since the difference between the two is maintained.
+				
+				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					// Guaranteed to be at least one element to dequeue!
+					
+					// Get the index. Note that since there's guaranteed to be at least one element, this
+					// will never exceed tail. We need to do an acquire-release fence here since it's possible
+					// that whatever condition got us to this point was for an earlier enqueued element (that
+					// we already see the memory effects for), but that by the time we increment somebody else
+					// has incremented it, and we need to see the memory effects for *that* element, which
+					// in such a case is necessarily visible on the thread that incremented it in the first
+					// place with the more current condition (they must have acquired a tail that is at least
+					// as recent).
+					auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					
+					// Determine which block the element is in
+					
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					// We need to be careful here about subtracting and dividing because of index wrap-around.
+					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
+					// block size (in order to get a correct signed block count offset in all cases):
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+					
+					// Dequeue
+					auto& el = *((*block)[index]);
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+						// Make sure the element is still fully dequeued and destroyed even if the assignment
+						// throws
+						struct Guard {
+							Block* block;
+							index_t index;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+							}
+						} guard = { block, index };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+					}
+					
+					return true;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count)
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			auto originalBlockIndexFront = pr_blockIndexFront;
+			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+			
+			Block* firstAllocatedBlock = nullptr;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+				// Allocate as many blocks as possible from ahead
+				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					this->tailBlock = this->tailBlock->next;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Now allocate as many blocks as necessary from the block pool
+				while (blockBaseDiff > 0) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+						MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						
+						// pr_blockIndexFront is updated inside new_block_index, so we need to
+						// update our fallback value too (since we keep the new index even if we
+						// later fail)
+						originalBlockIndexFront = originalBlockIndexSlotsUsed;
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					++pr_blockIndexSlotsUsed;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+				// publish the new block index front
+				auto block = firstAllocatedBlock;
+				while (true) {
+					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (block == this->tailBlock) {
+						break;
+					}
+					block = block->next;
+				}
+				
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+				}
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			auto endBlock = this->tailBlock;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							// Must use copy constructor even if move constructor is available
+							// because we may have to revert if there's an exception.
+							// Sorry about the horrible templated next line, but it was the only way
+							// to disable moving *at compile time*, which is important because a type
+							// may only define a (noexcept) move constructor, and so calls to the
+							// cctor will not compile, even if they are in an if branch that will never
+							// be executed
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Oh dear, an exception's been thrown -- destroy the elements that
+						// were enqueued so far and revert the entire bulk operation (we'll keep
+						// any allocated blocks in our linked list for later, though).
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			
+			MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+				if (firstAllocatedBlock != nullptr)
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+			}
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename It>		// Bulk dequeue for the explicit producer's sub-queue
+		size_t dequeue_bulk(It& itemFirst, size_t max)		// moves up to max elements out through itemFirst (advancing it); returns the count dequeued, 0 if none
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));		// optimistic estimate of how many elements are available
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Claim desiredCount elements up front; any shortfall is credited back through dequeueOvercommit below
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Determine which block the first element is in
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					// The block offset is computed with signed division so a negative distance from the front entry stays negative before masking
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					do {
+						auto firstIndexInBlock = index;		// start of the range consumed from this block (passed to set_many_empty)
+						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);		// one past the last slot of the current block...
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;		// ...clamped to the end of the claimed range
+						auto block = localBlockIndex->entries[indexIndex].block;
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {		// presumably true when the move-assignment cannot throw: no try/catch needed
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								// It's too late to revert the dequeue, but we can make sure that all
+								// the dequeued objects are properly destroyed and the block index
+								// (and empty count) are properly updated before we propagate the exception
+								do {
+									block = localBlockIndex->entries[indexIndex].block;
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+									// Recompute the range for the next block that still holds claimed elements
+									firstIndexInBlock = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));		// mark the consumed slots of this block as empty
+						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);		// advance to the next index entry, wrapping around
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		struct BlockIndexEntry		// One slot of the explicit producer's block index
+		{
+			index_t base;		// element index that dequeue_bulk compares against a block-aligned index to locate a block
+			Block* block;		// the block registered in this slot
+		};
+		
+		struct BlockIndexHeader		// Header of one block index allocation (see new_block_index)
+		{
+			size_t size;		// number of entries; (size - 1) is used as a wrap-around mask
+			std::atomic<size_t> front;		// Current slot (not next, like pr_blockIndexFront)
+			BlockIndexEntry* entries;		// entry array, placed after the header in the same allocation
+			void* prev;		// raw pointer to the previous index allocation, kept so it can be freed later
+		};
+		
+		
+		bool new_block_index(size_t numberOfFilledSlotsToExpose)		// Allocates a block index of twice the size, copies the used entries over and publishes it; returns false if allocation fails
+		{
+			auto prevBlockSizeMask = pr_blockIndexSize - 1;		// wrap-around mask for the old index
+			
+			// Create the new block
+			pr_blockIndexSize <<= 1;
+			auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));		// one allocation: header + alignment slack + entries
+			if (newRawPtr == nullptr) {
+				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
+				return false;
+			}
+			
+			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
+			
+			// Copy in all the old indices, if any
+			size_t j = 0;
+			if (pr_blockIndexSlotsUsed != 0) {
+				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;		// first used slot in the old index
+				do {
+					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+					i = (i + 1) & prevBlockSizeMask;
+				} while (i != pr_blockIndexFront);
+			}
+			
+			// Update everything
+			auto header = new (newRawPtr) BlockIndexHeader;
+			header->size = pr_blockIndexSize;
+			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);		// front holds the current slot rather than the next one, hence the -1
+			header->entries = newBlockIndexEntries;
+			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
+			// Switch the producer-side bookkeeping to the new index, then publish the header with a release store
+			pr_blockIndexFront = j;
+			pr_blockIndexEntries = newBlockIndexEntries;
+			pr_blockIndexRaw = newRawPtr;
+			blockIndex.store(header, std::memory_order_release);
+			
+			return true;
+		}
+		
+	private:
+		std::atomic<BlockIndexHeader*> blockIndex;
+		
+		// To be used by producer only -- consumers must use the ones referenced by blockIndex
+		size_t pr_blockIndexSlotsUsed;
+		size_t pr_blockIndexSize;
+		size_t pr_blockIndexFront;		// Next slot (not current)
+		BlockIndexEntry* pr_blockIndexEntries;
+		void* pr_blockIndexRaw;
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ExplicitProducer* nextExplicitProducer;
+	private:
+#endif
+		
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Implicit queue
+	//////////////////////////////////
+	
+	struct ImplicitProducer : public ProducerBase
+	{			
+		ImplicitProducer(ConcurrentQueue* parent_) :		// Creates an implicit producer attached to parent_
+			ProducerBase(parent_, false),
+			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+			blockIndex(nullptr)
+		{
+			new_block_index();		// result is ignored: on failure blockIndex stays null, which insert_block_index_entry checks for
+		}
+		
+		~ImplicitProducer()		// Destroys undequeued elements, hands blocks to the parent's free list and frees the block index
+		{
+			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+			// completed already; this means that all undequeued elements are placed contiguously across
+			// contiguous blocks, and that only the first and last remaining blocks can be only partially
+			// empty (all other remaining blocks must be completely full).
+			
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+			// Unregister ourselves for thread termination notification
+			if (!this->inactive.load(std::memory_order_relaxed)) {
+				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+			}
+#endif
+			
+			// Destroy all remaining elements!
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto index = this->headIndex.load(std::memory_order_relaxed);
+			Block* block = nullptr;
+			assert(index == tail || details::circular_less_than(index, tail));
+			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
+			while (index != tail) {
+				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {		// first iteration, or index just crossed into the next block
+					if (block != nullptr) {
+						// Free the old block
+						this->parent->add_block_to_free_list(block);
+					}
+					// Look up the block that holds this index through the block index
+					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
+				}
+				
+				((*block)[index])->~T();
+				++index;
+			}
+			// Even if the queue is empty, there's still one block that's not on the free list
+			// (unless the head index reached the end of it, in which case the tail will be poised
+			// to create a new block).
+			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+				this->parent->add_block_to_free_list(this->tailBlock);
+			}
+			
+			// Destroy block index
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			if (localBlockIndex != nullptr) {
+				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {		// destroy every entry reachable through the newest header's index array
+					localBlockIndex->index[i]->~BlockIndexEntry();
+				}
+				do {		// then free each header allocation, following the prev links
+					auto prev = localBlockIndex->prev;
+					localBlockIndex->~BlockIndexHeader();
+					(Traits::free)(localBlockIndex);
+					localBlockIndex = prev;
+				} while (localBlockIndex != nullptr);
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>		// Enqueues one element at the tail of this producer's sub-queue
+		inline bool enqueue(U&& element)		// returns true on success; false (nothing enqueued) if a new block was needed but could not be set up
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto head = this->headIndex.load(std::memory_order_relaxed);
+				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+					return false;		// presumably: the new block's index range would wrap onto head, or MAX_SUBQUEUE_SIZE would be exceeded
+				}
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				// Find out where we'll be inserting this block in the block index
+				BlockIndexEntry* idxEntry;
+				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+					return false;
+				}
+				
+				// Get ahold of a new block
+				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+				if (newBlock == nullptr) {
+					rewind_block_index_tail();		// undo the index insertion made above
+					idxEntry->value.store(nullptr, std::memory_order_relaxed);
+					return false;
+				}
+#ifdef MCDBGQ_TRACKMEM
+				newBlock->owner = this;
+#endif
+				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					// May throw, try to insert now before we publish the fact that we have this new block
+					MOODYCAMEL_TRY {
+						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						rewind_block_index_tail();		// roll back: index slot, entry value, and the unused block
+						idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						this->parent->add_block_to_free_list(newBlock);
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				// Insert the new block into the index
+				idxEntry->value.store(newBlock, std::memory_order_relaxed);
+				
+				this->tailBlock = newBlock;
+				// In the may-throw case the element was already constructed above, so only the new tail remains to be published
+				MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);		// release store publishes the new tail after the element is constructed
+			return true;
+		}
+		
+		template<typename U>		// Dequeues one element from this producer's sub-queue into `element`
+		bool dequeue(U& element)		// returns true if an element was moved into `element`, false if none could be claimed
+		{
+			// See ExplicitProducer::dequeue for rationale and explanation
+			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);		// optimistically claim one element
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);		// the slot this call owns
+					
+					// Determine which block the element is in
+					auto entry = get_block_index_entry_for_index(index);
+					
+					// Dequeue
+					auto block = entry->value.load(std::memory_order_relaxed);
+					auto& el = *((*block)[index]);
+					// If the move-assignment may throw, a scope guard destroys the slot and marks it empty whether or not it throws
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+						// Note: Acquiring the mutex with every dequeue instead of only when a block
+						// is released is very sub-optimal, but it is, after all, purely debug code.
+						debug::DebugLock lock(mutex);		// this producer's own debug mutex, as at the other debug-lock sites; declared before the guard so it outlives it
+#endif
+						struct Guard {		// its destructor runs at scope exit, including when the assignment below throws
+							Block* block;
+							index_t index;
+							BlockIndexEntry* entry;
+							ConcurrentQueue* parent;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {		// block became completely empty: unlink it from the index and recycle it
+									entry->value.store(nullptr, std::memory_order_relaxed);
+									parent->add_block_to_free_list(block);
+								}
+							}
+						} guard = { block, index, entry, this->parent };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+
+						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Add the block back into the global free pool (and remove from block index)
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+					}
+					
+					return true;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// nothing was there after all: credit the optimistic claim back
+				}
+			}
+		
+			return false;
+		}
+		
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4706)  // assignment within conditional expression
+#endif
+		template<AllocationMode allocMode, typename It>		// Enqueues count elements read from itemFirst, all or nothing
+		bool enqueue_bulk(It itemFirst, size_t count)		// returns false (everything rolled back) if index/block setup fails; rethrows after rollback if a constructor throws
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			
+			// Note that the tailBlock we start off with may not be owned by us any more;
+			// this happens if it was filled up exactly to the top (setting tailIndex to
+			// the first index of the next block which is not yet allocated), then dequeued
+			// completely (putting it on the free list) before we enqueue again.
+			
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			Block* firstAllocatedBlock = nullptr;
+			auto endBlock = this->tailBlock;
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));		// distance between the base of the last block written and the base of the block holding startTailIndex - 1
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				do {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);		// base index of the block being allocated in this iteration
+					
+					// Find out where we'll be inserting this block in the block index
+					BlockIndexEntry* idxEntry = nullptr;  // initialization here unnecessary but compiler can't always tell
+					Block* newBlock;
+					bool indexInserted = false;
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+
+					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+						// Index allocation or block allocation failed; revert any other allocations
+						// and index insertions done so far for this operation
+						if (indexInserted) {
+							rewind_block_index_tail();
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						}
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);		// restart from the beginning and undo the index entry of every block allocated so far
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+					newBlock->next = nullptr;
+					
+					// Insert the new block into the index
+					idxEntry->value.store(newBlock, std::memory_order_relaxed);
+					
+					// Store the chain of blocks so that we can undo if later allocations fail,
+					// and so that we can find the blocks when we do the actual enqueueing
+					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+						assert(this->tailBlock != nullptr);
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					endBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+				} while (blockBaseDiff > 0);
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			this->tailBlock = startBlock;		// go back to where writing starts; the allocation loop above advanced tailBlock
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;		// starting exactly on a block boundary: the first write goes into the first new block
+			}
+			while (true) {
+				index_t stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast<T*>(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {		// a constructor threw: destroy what was built, undo this operation's index entries and blocks, then rethrow
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						// Clear the index entries added for this operation and hand the newly allocated blocks back to the parent
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			this->tailIndex.store(newTailIndex, std::memory_order_release);		// release store publishes all count elements at once
+			return true;
+		}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+		
+		template<typename It>		// Bulk dequeue for the implicit producer's sub-queue
+		size_t dequeue_bulk(It& itemFirst, size_t max)		// moves up to max elements out through itemFirst (advancing it); returns the count dequeued, 0 if none
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));		// optimistic estimate of how many elements are available
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Claim desiredCount elements up front; any shortfall is credited back through dequeueOvercommit below
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					BlockIndexHeader* localBlockIndex;
+					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);		// slot of the first block; localBlockIndex receives the header that was consulted
+					do {
+						auto blockStartIndex = index;		// start of the range consumed from this block (passed to set_many_empty)
+						index_t endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);		// one past the last slot of the current block...
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;		// ...clamped to the end of the claimed range
+						// Fetch the block for this stretch of indices from the header obtained above
+						auto entry = localBlockIndex->index[indexIndex];
+						auto block = entry->value.load(std::memory_order_relaxed);
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {		// presumably true when the move-assignment cannot throw: no try/catch needed
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {		// the elements are already claimed: destroy the rest, update the blocks and index, then rethrow
+								do {
+									entry = localBlockIndex->index[indexIndex];
+									block = entry->value.load(std::memory_order_relaxed);
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									
+									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+										debug::DebugLock lock(mutex);
+#endif
+										entry->value.store(nullptr, std::memory_order_relaxed);
+										this->parent->add_block_to_free_list(block);
+									}
+									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+									// Recompute the range for the next block that still holds claimed elements
+									blockStartIndex = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {		// true result: clear the index entry and recycle the block
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);		// advance to the next index slot, wrapping around
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);		// nothing was there after all: credit the whole optimistic claim back
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		// The block size must be > 1, so any number with the low bit set is an invalid block base index
+		static const index_t INVALID_BLOCK_BASE = 1;
+		
+		struct BlockIndexEntry		// One slot of the implicit producer's block index
+		{
+			std::atomic<index_t> key;		// base index of the block stored here; INVALID_BLOCK_BASE until the slot is first used
+			std::atomic<Block*> value;		// the block itself; reset to nullptr when the block is released
+		};
+		
+		struct BlockIndexHeader		// Header of one block index allocation of the implicit producer
+		{
+			size_t capacity;		// number of slots in index; (capacity - 1) is used as a wrap-around mask
+			std::atomic<size_t> tail;		// slot of the most recently inserted entry
+			BlockIndexEntry* entries;		// entries allocated together with this header
+			BlockIndexEntry** index;		// circular array of entry pointers; new_block_index copies the previous header's pointers into it
+			BlockIndexHeader* prev;		// previous header; the destructor follows this chain to free them all
+		};
+		
+		template<AllocationMode allocMode>		// Claims the next block index slot for a block starting at blockStartIndex, growing the index unless allocMode is CannotAlloc
+		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)		// on success idxEntry points at the claimed slot (its value is left for the caller to set); false on failure
+		{
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// We're the only writer thread, relaxed is OK
+			if (localBlockIndex == nullptr) {
+				return false;  // this can happen if new_block_index failed in the constructor
+			}
+			size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+			idxEntry = localBlockIndex->index[newTail];
+			if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+				idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+				// The next slot was never used or no longer holds a block: record the key, then advance tail with a release store
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+			
+			// No room in the old block index, try to allocate another one!
+			MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+				return false;
+			}
+			else if (!new_block_index()) {
+				return false;
+			}
+			else {
+				localBlockIndex = blockIndex.load(std::memory_order_relaxed);		// re-read: new_block_index is expected to have installed a new header
+				newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+				idxEntry = localBlockIndex->index[newTail];
+				assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+				idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+				localBlockIndex->tail.store(newTail, std::memory_order_release);
+				return true;
+			}
+		}
+		
+		inline void rewind_block_index_tail()		// Steps the block index tail back by one slot (with wrap-around), undoing the latest insertion
+		{
+			BlockIndexHeader* const header = blockIndex.load(std::memory_order_relaxed);
+			header->tail.store((header->tail.load(std::memory_order_relaxed) - 1) & (header->capacity - 1), std::memory_order_relaxed);
+		}
+		
+		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const		// Returns the block index entry for the block that contains element `index`
+		{
+			BlockIndexHeader* header = nullptr;		// filled in by the lookup below
+			const size_t slot = get_block_index_index_for_index(index, header);
+			return header->index[slot];
+		}
+		
+		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const		// Maps an element index to its slot in the circular block index; localBlockIndex receives the header that was consulted
+		{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+			debug::DebugLock lock(mutex);
+#endif
+			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);		// round down to the base index of the containing block
+			localBlockIndex = blockIndex.load(std::memory_order_acquire);
+			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);		// base index of the most recently inserted block
+			assert(tailBase != INVALID_BLOCK_BASE);
+			// Note: Must use division instead of shift because the index may wrap around, causing a negative
+			// offset, whose negativity we want to preserve
+			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);		// offset is in blocks relative to tail; the mask wraps it into the index (capacity presumably a power of two)
+			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+			return idx;
+		}
+		
+		// Allocates and publishes a new block index header whose capacity is
+		// `nextBlockIndexCapacity`, carrying over the entry pointers of the previous
+		// index (if any). Returns false, leaving everything untouched, when
+		// Traits::malloc fails; returns true after the new header has been stored
+		// (release) and `nextBlockIndexCapacity` has been doubled.
+		bool new_block_index()
+		{
+			auto prev = blockIndex.load(std::memory_order_relaxed);
+			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+			// Number of brand-new entries to create; the rest of the pointer array re-uses prev's entries
+			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
+			// One allocation holds: the header, `entryCount` entries, and `nextBlockIndexCapacity`
+			// entry pointers (plus slack so each array can be aligned)
+			auto raw = static_cast<char*>((Traits::malloc)(
+				sizeof(BlockIndexHeader) +
+				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+			if (raw == nullptr) {
+				return false;
+			}
+			
+			auto header = new (raw) BlockIndexHeader;
+			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+			if (prev != nullptr) {
+				// Copy the old entry pointers, starting with the slot just after the old tail
+				// and finishing with the old tail itself, into positions 0..prevCapacity-1
+				auto prevTail = prev->tail.load(std::memory_order_relaxed);
+				auto prevPos = prevTail;
+				size_t i = 0;
+				do {
+					prevPos = (prevPos + 1) & (prev->capacity - 1);
+					index[i++] = prev->index[prevPos];
+				} while (prevPos != prevTail);
+				assert(i == prevCapacity);
+			}
+			// Construct the new entries (marked as unused) and append pointers to them after the copied ones
+			for (size_t i = 0; i != entryCount; ++i) {
+				new (entries + i) BlockIndexEntry;
+				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
+				index[prevCapacity + i] = entries + i;
+			}
+			header->prev = prev;
+			header->entries = entries;
+			header->index = index;
+			header->capacity = nextBlockIndexCapacity;
+			// The old tail entry was copied to slot prevCapacity - 1 (this wraps to the last slot when there was no previous index)
+			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);
+			
+			blockIndex.store(header, std::memory_order_release);
+			
+			nextBlockIndexCapacity <<= 1;
+			
+			return true;
+		}
+		
+	private:
+		size_t nextBlockIndexCapacity;
+		std::atomic<BlockIndexHeader*> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	public:
+		details::ThreadExitListener threadExitListener;
+	private:
+#endif
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ImplicitProducer* nextImplicitProducer;
+	private:
+#endif
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+		mutable debug::DebugMutex mutex;
+#endif
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Block pool manipulation
+	//////////////////////////////////
+	
+	// Sets up the pre-allocated block pool with `blockCount` blocks. A count of
+	// zero leaves the pool pointer null; if the array allocation fails the pool
+	// size is reset to zero. Every pooled block is flagged as not dynamically
+	// allocated.
+	void populate_initial_block_list(size_t blockCount)
+	{
+		initialBlockPoolSize = blockCount;
+		if (blockCount == 0) {
+			initialBlockPool = nullptr;
+			return;
+		}
+		
+		initialBlockPool = create_array<Block>(blockCount);
+		if (initialBlockPool == nullptr) {
+			initialBlockPoolSize = 0;
+			return;
+		}
+		
+		size_t remaining = blockCount;
+		while (remaining != 0) {
+			initialBlockPool[--remaining].dynamicallyAllocated = false;
+		}
+	}
+	
+	// Claims the next unused block of the initial pool, or returns nullptr when
+	// the pool has been exhausted.
+	inline Block* try_get_block_from_initial_pool()
+	{
+		// Pre-check with a plain load so an exhausted pool does not keep bumping the counter
+		if (initialBlockPoolIndex.load(std::memory_order_relaxed) < initialBlockPoolSize) {
+			auto const claimed = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+			if (claimed < initialBlockPoolSize) {
+				return initialBlockPool + claimed;
+			}
+		}
+		return nullptr;
+	}
+	
+	// Hands a block back: it is pushed onto the free list unless it was
+	// dynamically allocated and Traits::RECYCLE_ALLOCATED_BLOCKS is false, in
+	// which case it is destroyed instead.
+	inline void add_block_to_free_list(Block* block)
+	{
+#ifdef MCDBGQ_TRACKMEM
+		block->owner = nullptr;
+#endif
+		bool const keepForReuse = Traits::RECYCLE_ALLOCATED_BLOCKS || !block->dynamicallyAllocated;
+		if (keepForReuse) {
+			freeList.add(block);
+		}
+		else {
+			destroy(block);
+		}
+	}
+	
+	// Returns every block of a `next`-linked chain via add_block_to_free_list().
+	inline void add_blocks_to_free_list(Block* block)
+	{
+		for (Block* following = nullptr; block != nullptr; block = following) {
+			// Read the link first: the block may be destroyed or re-linked by the call below
+			following = block->next;
+			add_block_to_free_list(block);
+		}
+	}
+	
+	// Takes one block from the free list; the result is whatever freeList.try_get() yields.
+	inline Block* try_get_block_from_free_list()
+	{
+		auto recycled = freeList.try_get();
+		return recycled;
+	}
+	
+	// Obtains a block, trying the initial pool first and the free list second.
+	// If both are empty, a new block is created when canAlloc == CanAlloc;
+	// otherwise nullptr is returned.
+	template<AllocationMode canAlloc>
+	Block* requisition_block()
+	{
+		if (auto pooled = try_get_block_from_initial_pool()) {
+			return pooled;
+		}
+		if (auto recycled = try_get_block_from_free_list()) {
+			return recycled;
+		}
+		
+		MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) {
+			return create<Block>();
+		}
+		else {
+			return nullptr;
+		}
+	}
+	
+
+#ifdef MCDBGQ_TRACKMEM
+	public:
+		// Snapshot of block/producer counts and byte totals for a queue, only
+		// compiled when MCDBGQ_TRACKMEM is defined. Filled in by getFor().
+		struct MemStats {
+			size_t allocatedBlocks;
+			size_t usedBlocks;
+			size_t freeBlocks;
+			size_t ownedBlocksExplicit;
+			size_t ownedBlocksImplicit;
+			size_t implicitProducers;
+			size_t explicitProducers;
+			size_t elementsEnqueued;
+			size_t blockClassBytes;
+			size_t queueClassBytes;
+			size_t implicitBlockIndexBytes;
+			size_t explicitBlockIndexBytes;
+			
+			friend class ConcurrentQueue;
+			
+		private:
+			// Walks the free list, every producer and the initial pool of `q` and
+			// tallies the counters above. Uses relaxed loads and head_unsafe().
+			static MemStats getFor(ConcurrentQueue* q)
+			{
+				MemStats stats = { 0 };
+				
+				stats.elementsEnqueued = q->size_approx();
+			
+				// Blocks sitting on the free list count as allocated and free
+				auto block = q->freeList.head_unsafe();
+				while (block != nullptr) {
+					++stats.allocatedBlocks;
+					++stats.freeBlocks;
+					block = block->freeListNext.load(std::memory_order_relaxed);
+				}
+				
+				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+					stats.implicitProducers += implicit ? 1 : 0;
+					stats.explicitProducers += implicit ? 0 : 1;
+					
+					if (implicit) {
+						auto prod = static_cast<ImplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ImplicitProducer);
+						auto head = prod->headIndex.load(std::memory_order_relaxed);
+						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+						if (hash != nullptr) {
+							// An index slot with a valid key and a non-null value refers to a block owned by this producer
+							for (size_t i = 0; i != hash->capacity; ++i) {
+								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+									++stats.allocatedBlocks;
+									++stats.ownedBlocksImplicit;
+								}
+							}
+							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+							// Add the header and pointer-array bytes of this index and of every older one reachable via `prev`
+							for (; hash != nullptr; hash = hash->prev) {
+								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+							}
+						}
+						// One used block per BLOCK_SIZE step between head and tail
+						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {
+							//auto block = prod->get_block_index_entry_for_index(head);
+							++stats.usedBlocks;
+						}
+					}
+					else {
+						auto prod = static_cast<ExplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ExplicitProducer);
+						auto tailBlock = prod->tailBlock;
+						bool wasNonEmpty = false;
+						if (tailBlock != nullptr) {
+							// Walk the `next`-linked blocks starting at tailBlock until it is reached again
+							auto block = tailBlock;
+							do {
+								++stats.allocatedBlocks;
+								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+									++stats.usedBlocks;
+									wasNonEmpty = wasNonEmpty || block != tailBlock;
+								}
+								++stats.ownedBlocksExplicit;
+								block = block->next;
+							} while (block != tailBlock);
+						}
+						auto index = prod->blockIndex.load(std::memory_order_relaxed);
+						while (index != nullptr) {
+							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
+							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+						}
+					}
+				}
+				
+				// Initial-pool blocks that were never handed out are both allocated and free
+				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+				stats.allocatedBlocks += freeOnInitialPool;
+				stats.freeBlocks += freeOnInitialPool;
+				
+				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+				stats.queueClassBytes += sizeof(ConcurrentQueue);
+				
+				return stats;
+			}
+		};
+		
+		// For debugging only. Not thread-safe.
+		// Returns the statistics computed by MemStats::getFor() for this queue.
+		MemStats getMemStats()
+		{
+			return MemStats::getFor(this);
+		}
+	private:
+		friend struct MemStats;
+#endif
+	
+	
+	//////////////////////////////////
+	// Producer list manipulation
+	//////////////////////////////////	
+	
+	// Returns a producer of the requested kind (explicit or implicit). An
+	// inactive producer already on the producer list is claimed if one can be
+	// found; otherwise a new one is created and appended via add_producer().
+	// The result is nullptr when creating the new producer failed.
+	ProducerBase* recycle_or_create_producer(bool isExplicit)
+	{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// Try to re-use one first
+		for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+			// Relaxed peek first; only candidates of the right kind go through the compare-exchange
+			if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) {
+				bool expected = true;
+				if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// We caught one! It's been marked as activated, the caller can have it
+					return ptr;
+				}
+			}
+		}
+
+		return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
+	}
+	
+	// Pushes `producer` onto the front of the lock-free producer list and bumps
+	// producerCount. A null `producer` (failed allocation by the caller) is
+	// passed straight back as nullptr. Returns `producer` otherwise.
+	ProducerBase* add_producer(ProducerBase* producer)
+	{
+		// Handle failed memory allocation
+		if (producer == nullptr) {
+			return nullptr;
+		}
+		
+		producerCount.fetch_add(1, std::memory_order_relaxed);
+		
+		// Add it to the lock-free list
+		auto prevTail = producerListTail.load(std::memory_order_relaxed);
+		do {
+			// On a failed exchange prevTail is refreshed, so the link is re-written before retrying
+			producer->next = prevTail;
+		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		// Debug builds additionally keep per-kind lists of producers, maintained the same way
+		if (producer->isExplicit) {
+			auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
+			} while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+		else {
+			auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
+			} while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+#endif
+		
+		return producer;
+	}
+	
+	// Points the `parent` of every producer on our list back at this queue.
+	// Needed after a move or swap, because producers taken over from the other
+	// instance still refer to that instance as their parent.
+	void reown_producers()
+	{
+		auto current = producerListTail.load(std::memory_order_relaxed);
+		while (current != nullptr) {
+			current->parent = this;
+			current = current->next_prod();
+		}
+	}
+	
+	
+	//////////////////////////////////
+	// Implicit producer hash
+	//////////////////////////////////
+	
+	// One slot of the implicit producer hash: a thread id key paired with the
+	// implicit producer registered for that thread.
+	struct ImplicitProducerKVP
+	{
+		std::atomic<details::thread_id_t> key;
+		ImplicitProducer* value;		// Plain pointer: it is only read by the thread that stored it
+		
+		ImplicitProducerKVP() : value(nullptr) { }
+		
+		// Copies the other slot's key (relaxed) and value; `other` is left as it was
+		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+			: value(other.value)
+		{
+			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		}
+		
+		// Move assignment is implemented as a swap with `other`
+		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			swap(other);
+			return *this;
+		}
+		
+		// Exchanges key and value with `other`; swapping with itself does nothing
+		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+		{
+			if (this == &other) {
+				return;
+			}
+			details::swap_relaxed(key, other.key);
+			std::swap(value, other.value);
+		}
+	};
+	
+	template<typename XT, typename XTraits>
+	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+	
+	// One table of the implicit producer hash. Tables are chained through `prev`.
+	struct ImplicitProducerHash
+	{
+		size_t capacity;				// Number of slots in `entries`
+		ImplicitProducerKVP* entries;	// Slot array of this table
+		ImplicitProducerHash* prev;		// Previous (older) table, or nullptr for the first one
+	};
+	
+	// Initialises the built-in first hash table: resets the used-slot count,
+	// marks every initial entry as empty and installs the table as the current
+	// hash. Does nothing when INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0.
+	inline void populate_initial_implicit_producer_hash()
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+			return;
+		}
+		else {
+			implicitProducerHashCount.store(0, std::memory_order_relaxed);
+			for (auto& slot : initialImplicitProducerHashEntries) {
+				slot.key.store(details::invalid_thread_id, std::memory_order_relaxed);
+			}
+			initialImplicitProducerHash.capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+			initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+			initialImplicitProducerHash.prev = nullptr;
+			implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+	}
+	
+	// Exchanges the implicit producer hash state of this queue and `other`:
+	// the embedded initial entries, the used-slot counts and the current-table
+	// pointers. Afterwards any pointer that still refers to the *other* queue's
+	// embedded initial table is redirected to the owner's own embedded table.
+	// Does nothing when INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0.
+	void swap_implicit_producer_hashes(ConcurrentQueue& other)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
+			return;
+		}
+		else {
+			// Swap (assumes our implicit producer hash is initialized)
+			initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+			initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+			other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+			
+			details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+			
+			details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+			// Our chain came from `other`: either its head or the `prev` of its last
+			// dynamically created table still points at other's embedded table
+			if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {
+				implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+			}
+			else {
+				ImplicitProducerHash* hash;
+				for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+					continue;
+				}
+				hash->prev = &initialImplicitProducerHash;
+			}
+			// Same fix-up, mirrored, for the chain that `other` received from us
+			if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {
+				other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+			}
+			else {
+				ImplicitProducerHash* hash;
+				for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+					continue;
+				}
+				hash->prev = &other.initialImplicitProducerHash;
+			}
+		}
+	}
+	
+	// Returns the implicit producer registered for the calling thread, adding
+	// one if the thread's id is in none of the hash tables yet. The lookup
+	// probes linearly through the current table and then each `prev` table; a
+	// hit in an older table is copied into the current one. On a miss the
+	// used-slot count is incremented, the table is grown if needed, a producer
+	// is obtained from recycle_or_create_producer(false) and inserted.
+	// Only fails (returns nullptr) if memory allocation fails
+	ImplicitProducer* get_or_add_implicit_producer()
+	{
+		// Note that since the data is essentially thread-local (key is thread ID),
+		// there's a reduced need for fences (memory ordering is already consistent
+		// for any individual thread), except for the current table itself.
+		
+		// Start by looking for the thread ID in the current and all previous hash tables.
+		// If it's not found, it must not be in there yet, since this same thread would
+		// have added it previously to one of the tables that we traversed.
+		
+		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+		
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		
+		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(mainHash != nullptr);  // silence clang-tidy and MSVC warnings (hash cannot be null)
+		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+			// Look for the id in this hash
+			auto index = hashedId;
+			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
+				index &= hash->capacity - 1u;
+				
+				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					// Found it! If we had to search several hashes deep, though, we should lazily add it
+					// to the current main hash table to avoid the extended search next time.
+					// Note there's guaranteed to be room in the current hash table since every subsequent
+					// table implicitly reserves space for all previous tables (there's only one
+					// implicitProducerHashCount).
+					auto value = hash->entries[index].value;
+					if (hash != mainHash) {
+						index = hashedId;
+						while (true) {
+							index &= mainHash->capacity - 1u;
+							auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+							auto reusable = details::invalid_thread_id2;
+							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed) ||
+								mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+#else
+							if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+#endif
+								mainHash->entries[index].value = value;
+								break;
+							}
+							++index;
+						}
+					}
+					
+					return value;
+				}
+				if (probedKey == details::invalid_thread_id) {
+					break;		// Not in this hash table
+				}
+				++index;
+			}
+		}
+		
+		// Insert!
+		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
+		while (true) {
+			// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+				// We've acquired the resize lock, try to allocate a bigger hash table.
+				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+				// locked block).
+				mainHash = implicitProducerHash.load(std::memory_order_acquire);
+				if (newCount >= (mainHash->capacity >> 1)) {
+					// Keep doubling until the new count is below half of the new capacity
+					size_t newCapacity = mainHash->capacity << 1;
+					while (newCount >= (newCapacity >> 1)) {
+						newCapacity <<= 1;
+					}
+					auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
+					if (raw == nullptr) {
+						// Allocation failed
+						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+						implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+						return nullptr;
+					}
+					
+					auto newHash = new (raw) ImplicitProducerHash;
+					newHash->capacity = static_cast<size_t>(newCapacity);
+					newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
+					for (size_t i = 0; i != newCapacity; ++i) {
+						new (newHash->entries + i) ImplicitProducerKVP;
+						newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+					}
+					// The old table stays reachable through `prev`; it is not freed here
+					newHash->prev = mainHash;
+					implicitProducerHash.store(newHash, std::memory_order_release);
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+					mainHash = newHash;
+				}
+				else {
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+				}
+			}
+			
+			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
+			// always be true)
+			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
+				if (producer == nullptr) {
+					implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+					return nullptr;
+				}
+				
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+				producer->threadExitListener.userData = producer;
+				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+				
+				auto index = hashedId;
+				while (true) {
+					index &= mainHash->capacity - 1u;
+					auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+					auto reusable = details::invalid_thread_id2;
+					if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);  // already counted as a used slot
+						mainHash->entries[index].value = producer;
+						break;
+					}
+#endif
+					if (mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+						mainHash->entries[index].value = producer;
+						break;
+					}
+					++index;
+				}
+				return producer;
+			}
+			
+			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+			// we try to allocate ourselves).
+			mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		}
+	}
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	// Called when a thread that owns an implicit producer exits. Replaces the
+	// calling thread's id with invalid_thread_id2 in every hash table where it
+	// is found, then marks `producer` as inactive so it can be recycled.
+	void implicit_producer_thread_exited(ImplicitProducer* producer)
+	{
+		// Remove from hash
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		auto hash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		details::thread_id_t probedKey;
+		
+		// We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+		// trying to add an entry thinking there's a free slot (because they reused a producer)
+		for (; hash != nullptr; hash = hash->prev) {
+			auto index = hashedId;
+			do {
+				index &= hash->capacity - 1u;
+				// compare_exchange_strong overwrites probedKey with the slot's actual key on failure
+				probedKey = id;
+				if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+					break;
+				}
+				++index;
+			} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+		}
+		
+		// Mark the queue as being recyclable
+		producer->inactive.store(true, std::memory_order_release);
+	}
+	
+	// Thread-exit callback: `userData` is the ImplicitProducer that was stored
+	// in the listener; the call is forwarded to that producer's parent queue.
+	static void implicit_producer_thread_exited_callback(void* userData)
+	{
+		auto exiting = static_cast<ImplicitProducer*>(userData);
+		exiting->parent->implicit_producer_thread_exited(exiting);
+	}
+#endif
+	
+	//////////////////////////////////
+	// Utility functions
+	//////////////////////////////////
+
+	// Allocates `size` bytes through Traits::malloc with alignment suitable for
+	// TAlign. When TAlign needs more than max_align_t alignment, extra room is
+	// requested and the raw pointer is stashed in the void* slot just before
+	// the aligned address that is returned. Returns nullptr if malloc fails.
+	template<typename TAlign>
+	static inline void* aligned_malloc(size_t size)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value) {
+			return (Traits::malloc)(size);
+		}
+		else {
+			const size_t padding = std::alignment_of<TAlign>::value - 1 + sizeof(void*);
+			void* base = (Traits::malloc)(size + padding);
+			if (base == nullptr) {
+				return nullptr;
+			}
+			char* aligned = details::align_for<TAlign>(reinterpret_cast<char*>(base) + sizeof(void*));
+			reinterpret_cast<void**>(aligned)[-1] = base;
+			return aligned;
+		}
+	}
+
+	// Releases memory obtained from aligned_malloc<TAlign>. For over-aligned
+	// types the raw pointer is read back from the slot just before `ptr`; a
+	// null `ptr` results in Traits::free(nullptr).
+	template<typename TAlign>
+	static inline void aligned_free(void* ptr)
+	{
+		MOODYCAMEL_CONSTEXPR_IF (std::alignment_of<TAlign>::value <= std::alignment_of<details::max_align_t>::value) {
+			(Traits::free)(ptr);
+		}
+		else {
+			void* base = (ptr != nullptr) ? reinterpret_cast<void**>(ptr)[-1] : nullptr;
+			(Traits::free)(base);
+		}
+	}
+
+	// Allocates storage for `count` objects of type U and value-initialises
+	// each one in place. Returns nullptr if the allocation fails.
+	template<typename U>
+	static inline U* create_array(size_t count)
+	{
+		assert(count > 0);
+		U* storage = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
+		if (storage != nullptr) {
+			for (U* slot = storage; slot != storage + count; ++slot) {
+				new (slot) U();
+			}
+		}
+		return storage;
+	}
+
+	// Destroys the `count` objects of an array made by create_array (last
+	// element first) and frees the storage. A null `p` is only passed on to
+	// aligned_free.
+	template<typename U>
+	static inline void destroy_array(U* p, size_t count)
+	{
+		if (p != nullptr) {
+			assert(count > 0);
+			U* cursor = p + count;
+			while (cursor != p) {
+				--cursor;
+				cursor->~U();
+			}
+		}
+		aligned_free<U>(p);
+	}
+
+	// Allocates aligned storage for one U and default-initialises it in place.
+	// Returns nullptr if the allocation fails.
+	template<typename U>
+	static inline U* create()
+	{
+		void* storage = aligned_malloc<U>(sizeof(U));
+		if (storage == nullptr) {
+			return nullptr;
+		}
+		return new (storage) U;
+	}
+
+	// Allocates aligned storage for one U and constructs it in place from the
+	// forwarded argument `a1`. Returns nullptr if the allocation fails.
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		void* storage = aligned_malloc<U>(sizeof(U));
+		if (storage == nullptr) {
+			return nullptr;
+		}
+		return new (storage) U(std::forward<A1>(a1));
+	}
+
+	// Runs the destructor of `*p` (when `p` is non-null) and then releases the
+	// storage through aligned_free<U>.
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		if (p != nullptr) {
+			p->~U();
+		}
+		aligned_free<U>(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+	
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+	
+#ifndef MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+	
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+	
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+	
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+	
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+// Binds the token to an explicit producer obtained from `queue` (recycled or
+// newly created). `producer` stays null when none could be obtained.
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	if (producer == nullptr) {
+		return;
+	}
+	producer->token = this;
+}
+
+// Same as the ConcurrentQueue overload, but for a BlockingConcurrentQueue:
+// the queue is reinterpreted as a ConcurrentQueue<T, Traits> to obtain the
+// explicit producer. `producer` stays null when none could be obtained.
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+	if (producer == nullptr) {
+		return;
+	}
+	producer->token = this;
+}
+
+// Creates a consumer token for `queue`. The token starts with no current or
+// desired producer and zero items consumed.
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	// Each token takes the next value of the queue's consumer id counter as its initial offset
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	// All-ones value (uint32 max) as the starting "last known" global offset
+	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+}
+
+// Creates a consumer token for a BlockingConcurrentQueue. The queue is
+// reinterpreted as a ConcurrentQueue<T, Traits> to reach its consumer id counter.
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	// Each token takes the next value of the queue's consumer id counter as its initial offset
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
+	// All-ones value (uint32 max) as the starting "last known" global offset
+	lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
+}
+
+// Free-function swap for queues; forwards to the member a.swap(b).
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+// Free-function swap for producer tokens; forwards to the member a.swap(b).
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+// Free-function swap for consumer tokens; forwards to the member a.swap(b).
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+// Free-function swap for implicit producer hash slots; forwards to the member a.swap(b).
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);
+}
+
+}
+
+#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
+#pragma warning(pop)
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic pop
+#endif
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_atomic.h b/duix-sdk/src/main/cpp/dhcore/dh_atomic.h
new file mode 100644
index 0000000..4164930
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_atomic.h
@@ -0,0 +1,1198 @@
+/**
+ * @file dh_atomic.h  Atomic support
+ *
+ * Copyright (C) 2022 Sebastian Reimers
+ */
+
+#ifndef DH_H_ATOMIC__
+#define DH_H_ATOMIC__
+
+/* C11 <stdatomic.h> backend: every dh_atomic_* macro forwards to the matching atomic_*_explicit function. */
+#if defined(HAVE_ATOMIC) && __STDC_VERSION__ >= 201112L &&                    \
+	!defined(__STDC_NO_ATOMICS__)
+
+#include <stdatomic.h>
+
+#define DH_ATOMIC _Atomic
+
+#define DH_ATOMIC_BOOL_LOCK_FREE ATOMIC_BOOL_LOCK_FREE
+#define DH_ATOMIC_CHAR_LOCK_FREE ATOMIC_CHAR_LOCK_FREE
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE ATOMIC_WCHAR_T_LOCK_FREE
+#define DH_ATOMIC_SHORT_LOCK_FREE ATOMIC_SHORT_LOCK_FREE
+#define DH_ATOMIC_INT_LOCK_FREE ATOMIC_INT_LOCK_FREE
+#define DH_ATOMIC_LONG_LOCK_FREE ATOMIC_LONG_LOCK_FREE
+#define DH_ATOMIC_LLONG_LOCK_FREE ATOMIC_LLONG_LOCK_FREE
+#define DH_ATOMIC_POINTER_LOCK_FREE ATOMIC_POINTER_LOCK_FREE
+
+#define dh_memory_order_relaxed memory_order_relaxed
+#define dh_memory_order_acquire memory_order_acquire
+#define dh_memory_order_release memory_order_release
+#define dh_memory_order_acq_rel memory_order_acq_rel
+#define dh_memory_order_seq_cst memory_order_seq_cst
+
+#define dh_atomic_store(_a, _v, _mo) \
+	atomic_store_explicit(_a, _v, _mo)
+
+#define dh_atomic_load(_a, _mo) \
+	atomic_load_explicit(_a, _mo)
+
+#define dh_atomic_exchange(_a, _v, _mo) \
+	atomic_exchange_explicit(_a, _v, _mo)
+/* NOTE: the two macro names below keep their "compadh" spelling so the header's public names stay unchanged. */
+#define dh_atomic_compadh_exchange_strong(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	atomic_compare_exchange_strong_explicit(\
+		_a, _expected, _desired, _success_mo, _fail_mo)
+
+#define dh_atomic_compadh_exchange_weak(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	atomic_compare_exchange_weak_explicit(\
+		_a, _expected, _desired, _success_mo, _fail_mo)
+
+#define dh_atomic_fetch_add(_a, _v, _mo) \
+	atomic_fetch_add_explicit(_a, _v, _mo)
+
+#define dh_atomic_fetch_sub(_a, _v, _mo) \
+	atomic_fetch_sub_explicit(_a, _v, _mo)
+
+#define dh_atomic_fetch_or(_a, _v, _mo) \
+	atomic_fetch_or_explicit(_a, _v, _mo)
+
+#define dh_atomic_fetch_xor(_a, _v, _mo) \
+	atomic_fetch_xor_explicit(_a, _v, _mo)
+
+#define dh_atomic_fetch_and(_a, _v, _mo) \
+	atomic_fetch_and_explicit(_a, _v, _mo)
+
+/* gcc-style __atomic* intrinsics.
+ * Note: clang-cl also supports these, even though it impersonates MSVC. */
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+	defined(__GCC_ATOMIC_BOOL_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_CHAR_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_WCHAR_T_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_SHORT_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_INT_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_LONG_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_LLONG_LOCK_FREE) && \
+	defined(__GCC_ATOMIC_POINTER_LOCK_FREE) && \
+	defined(__ATOMIC_RELAXED) && defined(__ATOMIC_ACQUIRE) && \
+	defined(__ATOMIC_RELEASE) && defined(__ATOMIC_ACQ_REL) && \
+	defined(__ATOMIC_SEQ_CST)
+
+#define DH_ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE
+#define DH_ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE
+#define DH_ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE
+#define DH_ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE
+#define DH_ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE
+#define DH_ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE
+#define DH_ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE
+
+#define dh_memory_order_relaxed __ATOMIC_RELAXED
+#define dh_memory_order_acquire __ATOMIC_ACQUIRE
+#define dh_memory_order_release __ATOMIC_RELEASE
+#define dh_memory_order_acq_rel __ATOMIC_ACQ_REL
+#define dh_memory_order_seq_cst __ATOMIC_SEQ_CST
+
+#define dh_atomic_store(_a, _v, _mo) \
+	__atomic_store_n(_a, _v, _mo)
+
+#define dh_atomic_load(_a, _mo) \
+	__atomic_load_n(_a, _mo)
+
+#define dh_atomic_exchange(_a, _v, _mo) \
+	__atomic_exchange_n(_a, _v, _mo)
+/* NOTE: the two macro names below keep their "compadh" spelling so the header's public names stay unchanged. */
+#define dh_atomic_compadh_exchange_strong(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	__atomic_compare_exchange_n(\
+		_a, _expected, _desired, 0, _success_mo, _fail_mo) /* 4th argument 0 = strong */
+
+#define dh_atomic_compadh_exchange_weak(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	__atomic_compare_exchange_n(\
+		_a, _expected, _desired, 1, _success_mo, _fail_mo) /* 4th argument 1 = weak */
+
+#define dh_atomic_fetch_add(_a, _v, _mo) \
+	__atomic_fetch_add(_a, _v, _mo)
+
+#define dh_atomic_fetch_sub(_a, _v, _mo) \
+	__atomic_fetch_sub(_a, _v, _mo)
+
+#define dh_atomic_fetch_or(_a, _v, _mo) \
+	__atomic_fetch_or(_a, _v, _mo)
+
+#define dh_atomic_fetch_xor(_a, _v, _mo) \
+	__atomic_fetch_xor(_a, _v, _mo)
+
+#define dh_atomic_fetch_and(_a, _v, _mo) \
+	__atomic_fetch_and(_a, _v, _mo)
+
+/* gcc-style __sync* intrinsics: fallback for GNU compilers that did not match the __atomic* branch above. */
+#elif defined(__GNUC__) && \
+	(defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1) || \
+	defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2) || \
+	defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \
+	defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8))
+
+#if !defined(__SIZEOF_SHORT__) || !defined(__SIZEOF_INT__) || \
+	!defined(__SIZEOF_LONG__) || !defined(__SIZEOF_LONG_LONG__)
+#include <limits.h>
+#endif
+#if !defined(__SIZEOF_POINTER__)
+#include <stdint.h>
+#endif
+#if !defined(__SIZEOF_WCHAR_T__)
+#include <wchar.h>
+#endif
+
+#if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1)
+#define DH_ATOMIC_CHAR_LOCK_FREE 2
+#endif
+
+#if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2)
+#if (defined(__SIZEOF_SHORT__) && __SIZEOF_SHORT__ == 2) || \
+	(defined(USHRT_MAX) && USHRT_MAX == 0xffffu)
+#define DH_ATOMIC_SHORT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_INT__) && __SIZEOF_INT__ == 2) || \
+	(defined(UINT_MAX) && UINT_MAX == 0xffffu)
+#define DH_ATOMIC_INT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 2) || \
+	(defined(ULONG_MAX) && ULONG_MAX == 0xffffu)
+#define DH_ATOMIC_LONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 2) || \
+	(defined(ULLONG_MAX) && ULLONG_MAX == 0xffffu)
+#define DH_ATOMIC_LLONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 2) || \
+	(defined(UINTPTR_MAX) && UINTPTR_MAX == 0xffffu)
+#define DH_ATOMIC_POINTER_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 2) || \
+	(defined(WCHAR_MAX) && (WCHAR_MAX == 0xffff || WCHAR_MAX == 0x7fff))
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE 2
+#endif
+#endif
+
+#if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4)
+#if (defined(__SIZEOF_SHORT__) && __SIZEOF_SHORT__ == 4) || \
+	(defined(USHRT_MAX) && USHRT_MAX == 0xffffffffu)
+#define DH_ATOMIC_SHORT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_INT__) && __SIZEOF_INT__ == 4) || \
+	(defined(UINT_MAX) && UINT_MAX == 0xffffffffu)
+#define DH_ATOMIC_INT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 4) || \
+	(defined(ULONG_MAX) && ULONG_MAX == 0xffffffffu)
+#define DH_ATOMIC_LONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 4) || \
+	(defined(ULLONG_MAX) && ULLONG_MAX == 0xffffffffu)
+#define DH_ATOMIC_LLONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4) || \
+	(defined(UINTPTR_MAX) && UINTPTR_MAX == 0xffffffffu)
+#define DH_ATOMIC_POINTER_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 4) || \
+	(defined(WCHAR_MAX) && (WCHAR_MAX == 0xffffffff || \
+		WCHAR_MAX == 0x7fffffff))
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE 2
+#endif
+#endif
+
+#if defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)
+#if (defined(__SIZEOF_SHORT__) && __SIZEOF_SHORT__ == 8) || \
+	(defined(USHRT_MAX) && USHRT_MAX == 0xffffffffffffffffu)
+#define DH_ATOMIC_SHORT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_INT__) && __SIZEOF_INT__ == 8) || \
+	(defined(UINT_MAX) && UINT_MAX == 0xffffffffffffffffu)
+#define DH_ATOMIC_INT_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 8) || \
+	(defined(ULONG_MAX) && ULONG_MAX == 0xffffffffffffffffu)
+#define DH_ATOMIC_LONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \
+	(defined(ULLONG_MAX) && ULLONG_MAX == 0xffffffffffffffffu)
+#define DH_ATOMIC_LLONG_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8) || \
+	(defined(UINTPTR_MAX) && UINTPTR_MAX == 0xffffffffffffffffu)
+#define DH_ATOMIC_POINTER_LOCK_FREE 2
+#endif
+#if (defined(__SIZEOF_WCHAR_T__) && __SIZEOF_WCHAR_T__ == 8) || \
+	(defined(WCHAR_MAX) && (WCHAR_MAX == 0xffffffffffffffff || \
+		WCHAR_MAX == 0x7fffffffffffffff))
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE 2
+#endif
+#endif
+
+#if !defined(DH_ATOMIC_CHAR_LOCK_FREE)
+#define DH_ATOMIC_CHAR_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_SHORT_LOCK_FREE)
+#define DH_ATOMIC_SHORT_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_INT_LOCK_FREE)
+#define DH_ATOMIC_INT_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_LONG_LOCK_FREE)
+#define DH_ATOMIC_LONG_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_LLONG_LOCK_FREE)
+#define DH_ATOMIC_LLONG_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_POINTER_LOCK_FREE)
+#define DH_ATOMIC_POINTER_LOCK_FREE 0
+#endif
+#if !defined(DH_ATOMIC_WCHAR_T_LOCK_FREE)
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE 0
+#endif
+
+/* Assume bool is always 1 byte. Add platform-specific exceptions,
+ * if needed. */
+#define DH_ATOMIC_BOOL_LOCK_FREE DH_ATOMIC_CHAR_LOCK_FREE
+
+/* These constants match __ATOMIC_* predefined macros on
+ * gcc versions that support __atomic intrinsics. */
+#define dh_memory_order_relaxed 0
+#define dh_memory_order_acquire 2
+#define dh_memory_order_release 3
+#define dh_memory_order_acq_rel 4
+#define dh_memory_order_seq_cst 5
+
+#if defined(__x86_64__)
+
+#define dh_atomic_store(_a, _v, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val = (_v);\
+		if ((_mo) != dh_memory_order_seq_cst) {\
+			__asm__ __volatile__ ("mov %1, %0"\
+				: "=m" (*(_a))\
+				: "q" (_val)\
+				: "memory");\
+		}\
+		else {\
+			__asm__ __volatile__ ("xchg %1, %0"\
+				: "=m" (*(_a)), "+q" (_val)\
+				: \
+				: "memory");\
+		}\
+	})
+
+#define dh_atomic_load(_a, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val;\
+		__asm__ __volatile__ ("mov %1, %0"\
+			: "=q" (_val)\
+			: "m" (*(_a))\
+			: "memory");\
+		_val;\
+	})
+
+#define dh_atomic_exchange(_a, _v, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val = (_v);\
+		__asm__ __volatile__ ("xchg %1, %0"\
+			: "+m" (*(_a)), "+q" (_val)\
+			: \
+			: "memory");\
+		_val;\
+	})
+
+#elif defined(__i386__)
+
+#define dh_atomic_store(_a, _v, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val = (_v);\
+		if (sizeof(_val) < 8) {\
+			if ((_mo) != dh_memory_order_seq_cst) {\
+				__asm__ __volatile__ ("mov %1, %0"\
+					: "=m" (*(_a))\
+					: "q" (_val)\
+					: "memory");\
+			}\
+			else {\
+				__asm__ __volatile__ ("xchg %1, %0"\
+					: "=m" (*(_a)), "+q" (_val)\
+					: \
+					: "memory");\
+			}\
+		}\
+		else {\
+			__typeof__(*(_a)) _expected = *(_a);\
+			while (1) {\
+				__typeof__(*(_a)) _prev_val =\
+					__sync_val_compare_and_swap(\
+						_a, _expected, _val);\
+				if (_prev_val == _expected)\
+					break;\
+				_expected = _prev_val;\
+			}\
+		}\
+	})
+
+#define dh_atomic_load(_a, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val;\
+		if (sizeof(_val) < 8) {\
+			__asm__ __volatile__ ("mov %1, %0"\
+				: "=q" (_val)\
+				: "m" (*(_a))\
+				: "memory");\
+		}\
+		else {\
+			_val = __sync_val_compare_and_swap(\
+				_a,\
+				(__typeof__(*(_a)))0,\
+				(__typeof__(*(_a)))0);\
+		}\
+		_val;\
+	})
+
+#define dh_atomic_exchange(_a, _v, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val = (_v);\
+		if (sizeof(_val) < 8) {\
+			__asm__ __volatile__ ("xchg %1, %0"\
+				: "+m" (*(_a)), "+q" (_val)\
+				: \
+				: "memory");\
+		}\
+		else {\
+			__typeof__(*(_a)) _expected = *(_a);\
+			while (1) {\
+				__typeof__(*(_a)) _prev_val =\
+					__sync_val_compare_and_swap(\
+						_a, _expected, _val);\
+				if (_prev_val == _expected)\
+					break;\
+				_expected = _prev_val;\
+			}\
+			_val = _expected;\
+		}\
+		_val;\
+	})
+
+#else
+
+#define dh_atomic_store(_a, _v, _mo) \
+	(void)dh_atomic_exchange(_a, _v, _mo)
+
+#define dh_atomic_load(_a, _mo) \
+	__sync_val_compare_and_swap(\
+		_a, (__typeof__(*(_a)))0, (__typeof__(*(_a)))0)
+
+#define dh_atomic_exchange(_a, _v, _mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _val = (_v);\
+		__typeof__(*(_a)) _expected = *(_a);\
+		while (1) {\
+			__typeof__(*(_a)) _prev_val =\
+				__sync_val_compare_and_swap(\
+					_a, _expected, _val);\
+			if (_prev_val == _expected)\
+				break;\
+			_expected = _prev_val;\
+		}\
+		_expected;\
+	})
+
+#endif
+/* NOTE: the macro names keep their "compadh" spelling for source compatibility; they wrap __sync_val_compare_and_swap. */
+#define dh_atomic_compadh_exchange_strong(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	__extension__\
+	({\
+		__typeof__(*(_a)) _exp_val = *(_expected);\
+		__typeof__(*(_a)) _prev_val =\
+			__sync_val_compare_and_swap(_a, _exp_val,\
+				(__typeof__(*(_a)))(_desired));\
+		*(_expected) = _prev_val;\
+		_prev_val == _exp_val;\
+	})
+
+#define dh_atomic_compadh_exchange_weak(\
+	_a, _expected, _desired, _success_mo, _fail_mo) \
+	dh_atomic_compadh_exchange_strong(\
+		_a, _expected, _desired, _success_mo, _fail_mo)
+/* The __sync_fetch_and_* builtins below take no memory-order argument, so _mo is accepted but ignored. */
+#define dh_atomic_fetch_add(_a, _v, _mo) \
+	__sync_fetch_and_add(_a, (__typeof__(*(_a)))(_v))
+
+#define dh_atomic_fetch_sub(_a, _v, _mo) \
+	__sync_fetch_and_sub(_a, (__typeof__(*(_a)))(_v))
+
+#define dh_atomic_fetch_or(_a, _v, _mo) \
+	__sync_fetch_and_or(_a, (__typeof__(*(_a)))(_v))
+
+#define dh_atomic_fetch_xor(_a, _v, _mo) \
+	__sync_fetch_and_xor(_a, (__typeof__(*(_a)))(_v))
+
+#define dh_atomic_fetch_and(_a, _v, _mo) \
+	__sync_fetch_and_and(_a, (__typeof__(*(_a)))(_v))
+
+/* MSVC Interlocked* intrinsics. This needs to go after clang to let clang-cl
+ * get handled above. */
+#elif defined(_MSC_VER)
+
+#include <assert.h>
+#include <intrin.h>
+#include <stdbool.h> /* bool, used by _dh_atomic_compadh_exchange_strong() when this header is built as C */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DH_ATOMIC_BOOL_LOCK_FREE 2
+#define DH_ATOMIC_CHAR_LOCK_FREE 2
+#define DH_ATOMIC_WCHAR_T_LOCK_FREE 2
+#define DH_ATOMIC_SHORT_LOCK_FREE 2
+#define DH_ATOMIC_INT_LOCK_FREE 2
+#define DH_ATOMIC_LONG_LOCK_FREE 2
+#define DH_ATOMIC_LLONG_LOCK_FREE 2
+#define DH_ATOMIC_POINTER_LOCK_FREE 2
+
+/* These constants don't matter but for consistency they match
+ * values in std::memory_order from <atomic> in C++.
+ * There are specialized intrinsics for ARM and ARM64
+ * for different memory ordering types, but they are not used (yet) below. */
+#define dh_memory_order_relaxed 0
+#define dh_memory_order_acquire 2
+#define dh_memory_order_release 3
+#define dh_memory_order_acq_rel 4
+#define dh_memory_order_seq_cst 5
+
+static unsigned __int64 _dh_atomic_exchange(	/* forward declaration: the x86 store helper calls it before its definition */
+	size_t size, void *a, unsigned __int64 v);
+
+#if defined(_M_IX86) || defined(_M_AMD64)
+
+static __forceinline void _dh_atomic_store(
+	size_t size, void *a, unsigned __int64 v, unsigned int mo)	/* x86/x64: store the low `size` bytes of v into *a */
+{
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	if (mo != dh_memory_order_seq_cst) {	/* weaker orders: plain volatile store between compiler barriers */
+		_ReadWriteBarrier();
+		switch (size) {
+		case 1u:
+			*(volatile unsigned __int8*)a = (unsigned __int8)v;
+			break;
+		case 2u:
+			*(volatile unsigned __int16*)a = (unsigned __int16)v;
+			break;
+		case 4u:
+			*(volatile unsigned __int32*)a = (unsigned __int32)v;
+			break;
+		default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte store with a compare-exchange retry loop */
+			{
+				__int64 prev_val =
+					*(const volatile __int64*)(a);
+				while (1) {
+					__int64 prev_val2 =
+						_InterlockedCompareExchange64(
+							(__int64*)a,
+							(__int64)v,
+							prev_val);
+					if (prev_val2 == prev_val)	/* our guess matched, so v is now stored */
+						break;
+					prev_val = prev_val2;	/* retry with the value actually observed */
+				}
+			}
+#else
+			*(volatile unsigned __int64*)a = v;
+#endif
+			break;
+		}
+		_ReadWriteBarrier();
+	}
+	else {
+		_dh_atomic_exchange(size, a, v);	/* seq_cst: use the interlocked exchange and drop its result */
+	}
+}
+
+static __forceinline unsigned __int64 _dh_atomic_load(
+	size_t size, const void *a, unsigned int mo)	/* x86/x64: read `size` bytes from *a; mo is not consulted here */
+{
+	unsigned __int64 v;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	_ReadWriteBarrier();
+	switch (size) {
+	case 1u:
+		v = *(const volatile unsigned __int8*)a;
+		break;
+	case 2u:
+		v = *(const volatile unsigned __int16*)a;
+		break;
+	case 4u:
+		v = *(const volatile unsigned __int32*)a;
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: read 8 bytes via a compare-exchange of 0 with 0, which returns the current value */
+		v = _InterlockedCompareExchange64((__int64*)a, 0, 0);
+#else
+		v = *(const volatile unsigned __int64*)a;
+#endif
+		break;
+	}
+	_ReadWriteBarrier();
+
+	return v;
+}
+
+#elif defined(_M_ARM) || defined(_M_ARM64)
+
+static __forceinline void _dh_atomic_store(
+	size_t size, void *a, unsigned __int64 v, unsigned int mo)	/* ARM/ARM64: store the low `size` bytes of v into *a */
+{
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	_ReadWriteBarrier();
+
+	if (mo >= dh_memory_order_release)	/* release, acq_rel and seq_cst get a barrier before the store */
+		__dmb(0x0b); /* dmb ish */
+
+	_ReadWriteBarrier();
+
+	switch (size) {
+	case 1u:
+		__iso_volatile_store8((__int8*)a, (__int8)v);
+		break;
+	case 2u:
+		__iso_volatile_store16((__int16*)a, (__int16)v);
+		break;
+	case 4u:
+		__iso_volatile_store32((__int32*)a, (__int32)v);
+		break;
+	default:
+		__iso_volatile_store64((__int64*)a, (__int64)v);
+		break;
+	}
+
+	_ReadWriteBarrier();
+
+	if (mo == dh_memory_order_seq_cst)	/* seq_cst additionally gets a barrier after the store */
+		__dmb(0x0b); /* dmb ish */
+
+	_ReadWriteBarrier();
+}
+
+static __forceinline unsigned __int64 _dh_atomic_load(
+	size_t size, const void *a, unsigned int mo)	/* ARM/ARM64: read `size` bytes from *a, then fence according to mo */
+{
+	unsigned __int64 v;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	_ReadWriteBarrier();
+
+	switch (size) {
+	case 1u:
+		v = __iso_volatile_load8((const unsigned __int8*)a);
+		break;
+	case 2u:
+		v = __iso_volatile_load16((const unsigned __int16*)a);
+		break;
+	case 4u:
+		v = __iso_volatile_load32((const unsigned __int32*)a);
+		break;
+	default:
+		v = __iso_volatile_load64((const unsigned __int64*)a);
+		break;
+	}
+
+	_ReadWriteBarrier();
+	/* Fence after every ordered load (acquire, acq_rel, seq_cst); only relaxed and release skip it. */
+	if (mo != dh_memory_order_relaxed && mo != dh_memory_order_release)
+		__dmb(0x0b); /* dmb ish */
+
+	_ReadWriteBarrier();
+
+	return v;
+}
+
+#else
+
+static __forceinline void _dh_atomic_store(
+	size_t size, void *a, unsigned __int64 v, unsigned int mo)	/* other MSVC targets: store via compare-exchange loops; mo is not consulted */
+{
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	_ReadWriteBarrier();
+	switch (size) {
+	case 1u:
+		{
+			char prev_val = *(const volatile char*)(a);	/* initial guess of the current value */
+			while (1) {
+				char prev_val2 =
+					_InterlockedCompareExchange8(
+						(char*)a,
+						(char)v,
+						prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so v is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+		}
+		break;
+	case 2u:
+		{
+			short prev_val = *(const volatile short*)(a);
+			while (1) {
+				short prev_val2 =
+					_InterlockedCompareExchange16(
+						(short*)a,
+						(short)v,
+						prev_val);
+				if (prev_val2 == prev_val)
+					break;
+				prev_val = prev_val2;
+			}
+		}
+		break;
+	case 4u:
+		{
+			long prev_val = *(const volatile long*)(a);
+			while (1) {
+				long prev_val2 =
+					_InterlockedCompareExchange(
+						(long*)a,
+						(long)v,
+						prev_val);
+				if (prev_val2 == prev_val)
+					break;
+				prev_val = prev_val2;
+			}
+		}
+		break;
+	default:
+		{
+			__int64 prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 prev_val2 =
+					_InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)v,
+						prev_val);
+				if (prev_val2 == prev_val)
+					break;
+				prev_val = prev_val2;
+			}
+		}
+		break;
+	}
+	_ReadWriteBarrier();
+}
+
+static __forceinline unsigned __int64 _dh_atomic_load(
+	size_t size, const void *a, unsigned int mo)	/* other MSVC targets: read via compare-exchange of 0 with 0; mo is not consulted */
+{
+	unsigned __int64 v;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		v = _InterlockedCompareExchange8((char*)a, 0, 0);	/* the call returns the value it found at *a */
+		break;
+	case 2u:
+		v = _InterlockedCompareExchange16((short*)a, 0, 0);
+		break;
+	case 4u:
+		v = _InterlockedCompareExchange((long*)a, 0, 0);
+		break;
+	default:
+		v = _InterlockedCompareExchange64((__int64*)a, 0, 0);
+		break;
+	}
+
+	return v;
+}
+
+#endif
+
+#define dh_atomic_store(_a, _v, _mo) /* no trailing ';' so it is usable in unbraced if/else */ \
+	_dh_atomic_store(sizeof(*(_a)), _a, _v, _mo)
+
+#define dh_atomic_load(_a, _mo) /* yields the loaded value widened to unsigned __int64 */ \
+	_dh_atomic_load(sizeof(*(_a)), _a, _mo)
+
+static __forceinline unsigned __int64 _dh_atomic_exchange(
+	size_t size, void *a, unsigned __int64 v)	/* swap the low `size` bytes of v into *a and return the previous value */
+{
+	unsigned __int64 prev_val;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		prev_val = _InterlockedExchange8((char*)a, (char)v);
+		break;
+	case 2u:
+		prev_val = _InterlockedExchange16((short*)a, (short)v);
+		break;
+	case 4u:
+		prev_val = _InterlockedExchange((long*)a, (long)v);
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte exchange with a compare-exchange retry loop */
+		{
+			_ReadWriteBarrier();
+			prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 prev_val2 =
+					_InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)v,
+						(__int64)prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so v is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+			_ReadWriteBarrier();
+		}
+#else
+		prev_val = _InterlockedExchange64((__int64*)a, (__int64)v);
+#endif
+		break;
+	}
+
+	return prev_val;
+}
+
+#define dh_atomic_exchange(_a, _v, _mo) /* _mo is accepted but not forwarded */ \
+	_dh_atomic_exchange(sizeof(*(_a)), _a, _v)
+
+static __forceinline bool _dh_atomic_compadh_exchange_strong(
+	size_t size, void *a, void *expected, unsigned __int64 desired)	/* compare-and-swap: true when *a equalled *expected and was replaced */
+{
+	bool res;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		{
+			char expected_val = *(char*)expected;
+			char prev_val =
+				 _InterlockedCompareExchange8(
+					(char*)a,
+					(char)desired,
+					expected_val);
+			*(char*)expected = prev_val;	/* always report the value that was observed in *a */
+			res = prev_val == expected_val;
+		}
+		break;
+	case 2u:
+		{
+			short expected_val = *(short*)expected;
+			short prev_val =
+				_InterlockedCompareExchange16(
+					(short*)a,
+					(short)desired,
+					expected_val);
+			*(short*)expected = prev_val;
+			res = prev_val == expected_val;
+		}
+		break;
+	case 4u:
+		{
+			long expected_val = *(long*)expected;
+			long prev_val =
+				 _InterlockedCompareExchange(
+					(long*)a,
+					(long)desired,
+					expected_val);
+			*(long*)expected = prev_val;
+			res = prev_val == expected_val;
+		}
+		break;
+	default:
+		{
+			__int64 expected_val = *(__int64*)expected;
+			__int64 prev_val =
+				_InterlockedCompareExchange64(
+					(__int64*)a,
+					(__int64)desired,
+					expected_val);
+			*(__int64*)expected = prev_val;
+			res = prev_val == expected_val;
+		}
+		break;
+	}
+
+	return res;
+}
+
+#define dh_atomic_compadh_exchange_strong(\
+	_a, _expected, _desired, _success_mo, _fail_mo) /* both order arguments are accepted but not forwarded */ \
+	_dh_atomic_compadh_exchange_strong(\
+		sizeof(*(_a)), _a, _expected, _desired)
+
+#define dh_atomic_compadh_exchange_weak(\
+	_a, _expected, _desired, _success_mo, _fail_mo) /* the weak form simply reuses the strong one */ \
+	dh_atomic_compadh_exchange_strong(\
+		_a, _expected, _desired, _success_mo, _fail_mo)
+
+static __forceinline unsigned __int64 _dh_atomic_fetch_add(
+	size_t size, void *a, unsigned __int64 v)	/* add v to the `size`-byte object at *a and return the previous value */
+{
+	unsigned __int64 prev_val;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		prev_val = _InterlockedExchangeAdd8((char*)a, (char)v);
+		break;
+	case 2u:
+		prev_val = _InterlockedExchangeAdd16((short*)a, (short)v);
+		break;
+	case 4u:
+		prev_val = _InterlockedExchangeAdd((long*)a, (long)v);
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte add with a compare-exchange retry loop */
+		{
+			_ReadWriteBarrier();
+			prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 new_val = prev_val + v;
+				__int64 prev_val2 =
+					 _InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)new_val,
+						(__int64)prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so the sum is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+			_ReadWriteBarrier();
+		}
+#else
+		prev_val = _InterlockedExchangeAdd64((__int64*)a, (__int64)v);
+#endif
+		break;
+	}
+
+	return prev_val;
+}
+
+#define dh_atomic_fetch_add(_a, _v, _mo) /* _mo is accepted but not forwarded */ \
+	_dh_atomic_fetch_add(sizeof(*(_a)), _a, _v)
+
+#define dh_atomic_fetch_sub(_a, _v, _mo) /* subtraction is an add of the negated value */ \
+	dh_atomic_fetch_add(_a, -(__int64)(_v), _mo)
+
+static __forceinline unsigned __int64 _dh_atomic_fetch_or(
+	size_t size, void *a, unsigned __int64 v)	/* OR v into the `size`-byte object at *a and return the previous value */
+{
+	unsigned __int64 prev_val;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		prev_val = _InterlockedOr8((char*)a, (char)v);
+		break;
+	case 2u:
+		prev_val = _InterlockedOr16((short*)a, (short)v);
+		break;
+	case 4u:
+		prev_val = _InterlockedOr((long*)a, (long)v);
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte OR with a compare-exchange retry loop */
+		{
+			_ReadWriteBarrier();
+			prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 new_val = prev_val | v;
+				__int64 prev_val2 =
+					_InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)new_val,
+						(__int64)prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so the result is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+			_ReadWriteBarrier();
+		}
+#else
+		prev_val = _InterlockedOr64((__int64*)a, (__int64)v);
+#endif
+		break;
+	}
+
+	return prev_val;
+}
+
+#define dh_atomic_fetch_or(_a, _v, _mo) /* _mo is accepted but not forwarded */ \
+	_dh_atomic_fetch_or(sizeof(*(_a)), _a, _v)
+
+static __forceinline unsigned __int64 _dh_atomic_fetch_xor(
+	size_t size, void *a, unsigned __int64 v)	/* XOR v into the `size`-byte object at *a and return the previous value */
+{
+	unsigned __int64 prev_val;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		prev_val = _InterlockedXor8((char*)a, (char)v);
+		break;
+	case 2u:
+		prev_val = _InterlockedXor16((short*)a, (short)v);
+		break;
+	case 4u:
+		prev_val = _InterlockedXor((long*)a, (long)v);
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte XOR with a compare-exchange retry loop */
+		{
+			_ReadWriteBarrier();
+			prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 new_val = prev_val ^ v;
+				__int64 prev_val2 =
+					_InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)new_val,
+						(__int64)prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so the result is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+			_ReadWriteBarrier();
+		}
+#else
+		prev_val = _InterlockedXor64((__int64*)a, (__int64)v);
+#endif
+		break;
+	}
+
+	return prev_val;
+}
+
+#define dh_atomic_fetch_xor(_a, _v, _mo) /* _mo is accepted but not forwarded */ \
+	_dh_atomic_fetch_xor(sizeof(*(_a)), _a, _v)
+
+static __forceinline unsigned __int64 _dh_atomic_fetch_and(
+	size_t size, void *a, unsigned __int64 v)	/* AND v into the `size`-byte object at *a and return the previous value */
+{
+	unsigned __int64 prev_val;
+	assert(size == 1u || size == 2u || size == 4u || size == 8u);
+	switch (size) {
+	case 1u:
+		prev_val = _InterlockedAnd8((char*)a, (char)v);
+		break;
+	case 2u:
+		prev_val = _InterlockedAnd16((short*)a, (short)v);
+		break;
+	case 4u:
+		prev_val = _InterlockedAnd((long*)a, (long)v);
+		break;
+	default:
+#if defined(_M_IX86)	/* 32-bit x86: emulate the 8-byte AND with a compare-exchange retry loop */
+		{
+			_ReadWriteBarrier();
+			prev_val = *(const volatile __int64*)(a);
+			while (1) {
+				__int64 new_val = prev_val & v;
+				__int64 prev_val2 =
+					_InterlockedCompareExchange64(
+						(__int64*)a,
+						(__int64)new_val,
+						(__int64)prev_val);
+				if (prev_val2 == prev_val)	/* guess matched, so the result is now stored */
+					break;
+				prev_val = prev_val2;	/* retry with the value actually observed */
+			}
+			_ReadWriteBarrier();
+		}
+#else
+		prev_val = _InterlockedAnd64((__int64*)a, (__int64)v);
+#endif
+		break;
+	}
+
+	return prev_val;
+}
+
+#define dh_atomic_fetch_and(_a, _v, _mo) /* _mo is accepted but not forwarded */ \
+	_dh_atomic_fetch_and(sizeof(*(_a)), _a, _v)
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#else
+#error "Compiler does not support atomics"
+#endif /* HAVE_ATOMIC */
+
+#ifndef DH_ATOMIC
+#define DH_ATOMIC
+#endif
+
+
+/* --- Some short alias helpers --- */
+
+/**
+ * @def dh_atomic_rlx(_a)
+ *
+ * Load the value of an atomic object with relaxed memory order
+ *
+ * @param _a  pointer to the atomic object
+ *
+ * @return current value of the atomic object
+ */
+#define dh_atomic_rlx(_a) dh_atomic_load(_a, dh_memory_order_relaxed)
+
+
+/**
+ * @def dh_atomic_rlx_set(_a, _v)
+ *
+ * Store a value into an atomic object with relaxed memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  new value
+ */
+#define dh_atomic_rlx_set(_a, _v)                                            \
+	dh_atomic_store(_a, _v, dh_memory_order_relaxed)
+
+
+/**
+ * @def dh_atomic_rlx_add(_a, _v)
+ *
+ * Atomically add a value to an atomic object with relaxed memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to add
+ *
+ * @return value held by the atomic object before the addition
+ */
+#define dh_atomic_rlx_add(_a, _v)                                            \
+	dh_atomic_fetch_add(_a, _v, dh_memory_order_relaxed)
+
+
+/**
+ * @def dh_atomic_rlx_sub(_a, _v)
+ *
+ * Atomically subtract a value from an atomic object with relaxed memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to subtract
+ *
+ * @return value held by the atomic object before the subtraction
+ */
+#define dh_atomic_rlx_sub(_a, _v)                                            \
+	dh_atomic_fetch_sub(_a, _v, dh_memory_order_relaxed)
+
+
+/**
+ * @def dh_atomic_acq(_a)
+ *
+ * Load the value of an atomic object with acquire memory order
+ *
+ * @param _a  pointer to the atomic object
+ *
+ * @return current value of the atomic object
+ */
+#define dh_atomic_acq(_a) dh_atomic_load(_a, dh_memory_order_acquire)
+
+
+/**
+ * @def dh_atomic_rls_set(_a, _v)
+ *
+ * Store a value into an atomic object with release memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  new value
+ */
+#define dh_atomic_rls_set(_a, _v)                                          \
+	dh_atomic_store(_a, _v, dh_memory_order_release)
+
+
+/**
+ * @def dh_atomic_acq_add(_a, _v)
+ *
+ * Atomically add a value to an atomic object with acquire-release memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to add
+ *
+ * @return value held by the atomic object before the addition
+ */
+#define dh_atomic_acq_add(_a, _v)                                          \
+	dh_atomic_fetch_add(_a, _v, dh_memory_order_acq_rel)
+
+
+/**
+ * @def dh_atomic_acq_sub(_a, _v)
+ *
+ * Atomically subtract a value from an atomic object with acquire-release
+ * memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to subtract
+ *
+ * @return value held by the atomic object before the subtraction
+ */
+#define dh_atomic_acq_sub(_a, _v)                                          \
+	dh_atomic_fetch_sub(_a, _v, dh_memory_order_acq_rel)
+
+
+/**
+ * @def dh_atomic_seq(_a)
+ *
+ * Load the value of an atomic object with sequentially-consistent memory order
+ *
+ * @param _a  pointer to the atomic object
+ *
+ * @return current value of the atomic object
+ */
+#define dh_atomic_seq(_a) dh_atomic_load(_a, dh_memory_order_seq_cst)
+
+
+/**
+ * @def dh_atomic_seq_set(_a, _v)
+ *
+ * Store a value into an atomic object with sequentially-consistent memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  new value
+ */
+#define dh_atomic_seq_set(_a, _v)                                            \
+	dh_atomic_store(_a, _v, dh_memory_order_seq_cst)
+
+
+/**
+ * @def dh_atomic_seq_add(_a, _v)
+ *
+ * Atomically add a value to an atomic object with
+ * sequentially-consistent memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to add
+ *
+ * @return value held by the atomic object before the addition
+ */
+#define dh_atomic_seq_add(_a, _v)                                            \
+	dh_atomic_fetch_add(_a, _v, dh_memory_order_seq_cst)
+
+
+/**
+ * @def dh_atomic_seq_sub(_a, _v)
+ *
+ * Atomically subtract a value from an atomic object with
+ * sequentially-consistent memory order
+ *
+ * @param _a  pointer to the atomic object
+ * @param _v  value to subtract
+ *
+ * @return value held by the atomic object before the subtraction
+ */
+#define dh_atomic_seq_sub(_a, _v)                                            \
+	dh_atomic_fetch_sub(_a, _v, dh_memory_order_seq_cst)
+
+
+#endif /* DH_H_ATOMIC__ */
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_data.cpp b/duix-sdk/src/main/cpp/dhcore/dh_data.cpp
new file mode 100644
index 0000000..533946c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_data.cpp
@@ -0,0 +1,391 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "dh_data.h"
+#ifdef WIN32
+#include <windows.h>
+#else
+//#include <sys/timeb.h>
+#include <unistd.h>
+#endif
+#include <time.h>
+#include "dh_mem.h"
+
+/* Take one more reference on a matrix. A NULL matrix is passed through
+ * untouched. Returns the same pointer it was given. */
+jmat_t* jmat_addref(jmat_t* mat){
+  if(mat != NULL){
+    dhmem_ref(mat);
+  }
+  return mat;
+}
+
+/* Drop one reference on a matrix. Returns whatever dhmem_deref() returns,
+ * or NULL when no matrix was supplied. */
+jmat_t* jmat_deref(jmat_t* mat){
+  return mat ? (jmat_t*)dhmem_deref(mat) : NULL;
+}
+
+/* Take one more reference on any dhmem-allocated object; NULL yields NULL. */
+void* jdata_addref(void* data){
+  return data ? dhmem_ref(data) : NULL;
+}
+
+/* Drop one reference on any dhmem-allocated object; NULL yields NULL. */
+void* jdata_deref(void* data){
+  return data ? dhmem_deref(data) : NULL;
+}
+
+/* Destroy handler installed by jbuf_alloc(). It has nothing to release:
+ * the payload of such a buffer lives in the same allocation as its header. */
+static void my_jbuf_destroy(void* arg){
+  (void)arg;
+}
+
+/*
+ * Allocate a zeroed jbuf_t header that is released through a caller-supplied
+ * destroy handler.
+ *
+ * NOTE(review): `mem` and `size` are accepted but never stored, so the
+ * returned buffer has data == NULL and size == 0. Confirm whether callers
+ * expect the memory to be attached here.
+ *
+ * Returns NULL when the allocation fails (previously this crashed in memset).
+ */
+jbuf_t* jbuf_allocex(char* mem,int size,dhmem_destroy_h fndestroy){
+  (void)mem;
+  (void)size;
+  /* dhmem_zalloc() zeroes the header and reports failure as NULL */
+  return (jbuf_t*)dhmem_zalloc(sizeof(jbuf_t),fndestroy);
+}
+
+/*
+ * Allocate a reference-counted buffer with `size` zeroed payload bytes stored
+ * directly behind the header.
+ *
+ * size <= 0 gives a header-only buffer with data == NULL. The `size` field
+ * always records the value passed in.
+ *
+ * Returns NULL when the allocation fails.
+ */
+jbuf_t* jbuf_alloc(int size){
+  size_t payload = size>0?(size_t)size:0;
+  jbuf_t* buf = (jbuf_t*)dhmem_zalloc(sizeof(jbuf_t)+payload,my_jbuf_destroy);
+  if(!buf)return NULL;
+  //payload (if any) starts right after the header
+  buf->data = payload?((char*)buf + sizeof(jbuf_t)):NULL;
+  buf->size = size;
+  return buf;
+}
+
+/*
+ * Copy a C string into a new buffer and NUL-terminate it.
+ *
+ * txt  source string
+ * pos  when 0 < pos < strlen(txt) only the first `pos` characters are
+ *      copied; any other value copies the whole string
+ *
+ * Returns the new buffer, or NULL when txt is NULL or allocation fails.
+ */
+jbuf_t* jbuf_strdup(char* txt,int pos){
+  if(!txt)return NULL;
+  int len = (int)strlen(txt);
+  if((pos>0)&&(pos<len))len = pos;
+  jbuf_t* buf = jbuf_alloc(len+1);
+  if(!buf)return NULL;
+  memcpy(buf->data,txt,len);
+  buf->data[len]=0;
+  return buf;
+}
+
+/*
+ * Allocate a buffer of `size` bytes and copy `mem` into it.
+ *
+ * The copy is skipped when size <= 0 or mem is NULL; the buffer stays zeroed
+ * in that case. A negative size used to reach memcpy with a NULL destination.
+ *
+ * Returns NULL when the allocation fails.
+ */
+jbuf_t* jbuf_dupmem(char* mem,int size){
+  jbuf_t* buf = jbuf_alloc(size);
+  if(!buf)return NULL;
+  if((size>0)&&mem) memcpy(buf->data,mem,size);
+  return buf;
+}
+
+/*
+ * Wrap caller-owned memory in a buffer header without copying it.
+ *
+ * The header is marked ref = 1 so the memory is recognised as borrowed.
+ * Returns NULL when the header cannot be allocated.
+ */
+jbuf_t* jbuf_refmem(char* mem,int size){
+  jbuf_t* buf = jbuf_alloc(0);
+  if(!buf)return NULL;
+  buf->data = mem;
+  buf->size = size;
+  buf->ref = 1;
+  return buf;
+}
+
+/*
+ * Create an empty buffer (no data, size 0) that only carries a session id.
+ *
+ * Returns NULL when the header cannot be allocated.
+ */
+jbuf_t* jbuf_null(uint64_t sessid){
+  jbuf_t* buf = jbuf_alloc(0);
+  if(!buf)return NULL;
+  buf->sessid = sessid;
+  buf->data = NULL;
+  buf->size = 0;
+  buf->ref = 0;
+  return buf;
+}
+
+/*
+ * Fill a buffer's payload with zero bytes.
+ *
+ * Nothing is written when buf is NULL, has no data pointer, or has a
+ * non-positive size. Always returns 0.
+ */
+int       jbuf_zeros(jbuf_t* buf){
+  if(buf&&buf->data&&(buf->size>0)){
+    memset(buf->data,0,buf->size);
+  }
+  return 0;
+}
+
+/* Drop one reference on a buffer. Always reports 0. */
+int       jbuf_free(jbuf_t* buf){
+  (void)dhmem_deref(buf);
+  return 0;
+}
+
+/*
+ * Copy the payload of `src` into `dst`, truncated to the smaller of the two
+ * sizes.
+ *
+ * Returns 0 on success (including when there is nothing to copy) and -1 when
+ * either buffer pointer is NULL. A negative size or NULL data pointer used to
+ * be handed straight to memcpy.
+ */
+int       jbuf_copy(jbuf_t* dst,jbuf_t* src){
+  if(!dst||!src)return -1;
+  int size = src->size;
+  if(size>dst->size)size = dst->size;
+  if((size>0)&&dst->data&&src->data){
+    memcpy(dst->data,src->data,size);
+  }
+  return 0;
+}
+
+/*
+ * Debug helper: print a matrix header and a sample of its corners to stdout.
+ *
+ * When mat->gpu is set only the header line is printed. Otherwise the first
+ * three and last three rows are printed, and for each row the first three
+ * and last three entries.
+ *
+ * bit == 4 is read as float data (RGB triples when channel == 3, single
+ * values otherwise). Any other bit value is read as uint8_t and always
+ * printed as triples.
+ *
+ * NOTE(review): there are no bounds checks here. The code appears to assume
+ * at least 3 rows, enough columns for the samples, and 3 channels in the
+ * uint8_t branch -- confirm before calling on small or 1-channel matrices.
+ *
+ * Always returns 0.
+ */
+int       jmat_dump(jmat_t* mat){
+  if(mat->gpu){
+    //header only when the gpu flag is set
+    printf("===w %d h %d c %d d %d b %d p %p \n",
+        mat->width,mat->height,mat->channel,mat->stride,mat->bit,mat->data);
+    return 0;
+  }
+  printf("===w %d h %d c %d d %d b %d p %p [\n",
+      mat->width,mat->height,mat->channel,mat->stride,mat->bit,mat->data);
+  int rgb = (mat->channel==3)?1:0;
+  if(mat->bit == 4){
+    //float data: first three rows
+    for(int m=0;m<3;m++){
+      printf("[");
+      float* pa = (float*)jmat_row(mat,m);
+      for(int k=0;k<3;k++){
+        if(rgb){
+          printf("[%f %f %f]",pa[0],pa[1],pa[2]);
+          pa+=3;
+        }else{
+          printf(" %f ",*pa++);
+        }
+      }
+      //jump to the last three entries of the row
+      if(rgb){
+        pa = (float*)jmat_row(mat,m) + mat->width*mat->channel - 9;
+      }else{
+        pa = (float*)jmat_row(mat,m) + mat->width*mat->channel - 3;
+      }
+      //printf("\n====offset %ld\n",(char*)pa - mat->data);
+      printf("====");
+      for(int k=0;k<3;k++){
+        if(rgb){
+          printf("[%f %f %f]",pa[0],pa[1],pa[2]);
+          pa+=3;
+        }else{
+          printf(" %f ",*pa++);
+        }
+      }
+      printf("]\n");
+    }
+    //float data: last three rows
+    for(int m=3;m>0;m--){
+      printf("[");
+      float* pa = (float*)jmat_row(mat,mat->height - m);
+      for(int k=0;k<3;k++){
+        if(rgb){
+          printf("[%f %f %f]",pa[0],pa[1],pa[2]);
+          pa+=3;
+        }else{
+          printf(" %f ",*pa++);
+        }
+      }
+      if(rgb){
+        pa = (float*)jmat_row(mat,mat->height - m) + mat->width*mat->channel - 9;
+      }else{
+        pa = (float*)jmat_row(mat,mat->height - m) + mat->width*mat->channel - 3;
+      }
+      printf("====");
+      for(int k=0;k<3;k++){
+        if(rgb){
+          printf("[%f %f %f]",pa[0],pa[1],pa[2]);
+          pa+=3;
+        }else{
+          printf(" %f ",*pa++);
+        }
+      }
+      printf("]\n");
+    }
+  }else{
+    //byte data: first three rows, always printed as triples
+    for(int m=0;m<3;m++){
+      printf("[");
+      uint8_t* pa = (uint8_t*)jmat_row(mat,m);
+      for(int k=0;k<3;k++){
+        printf("[%d %d %d]",pa[0],pa[1],pa[2]);
+        pa+=3;
+      }
+      pa = (uint8_t*)jmat_row(mat,m) + mat->width*mat->channel - 9;
+      printf("====");
+      for(int k=0;k<3;k++){
+        printf("[%d %d %d]",pa[0],pa[1],pa[2]);
+        pa+=3;
+      }
+      printf("]\n");
+    }
+    //byte data: last three rows
+    for(int m=3;m>0;m--){
+      printf("[");
+      uint8_t* pa = (uint8_t*)jmat_row(mat,mat->height - m);
+      for(int k=0;k<3;k++){
+        printf("[%d %d %d]",pa[0],pa[1],pa[2]);
+        pa+=3;
+      }
+      pa = (uint8_t*)jmat_row(mat,mat->height - m) + mat->width*mat->channel - 9;
+      printf("====");
+      for(int k=0;k<3;k++){
+        printf("[%d %d %d]",pa[0],pa[1],pa[2]);
+        pa+=3;
+      }
+      printf("]\n");
+    }
+  }
+  printf("]=====\n");
+  return 0;
+}
+
+/*
+ * Destroy handler installed by jmat_null() and jmat_alloc().
+ *
+ * Releases the pixel memory unless the embedded buffer is flagged as borrowed
+ * (buf.ref != 0), then drops one reference on every buffer chained through
+ * buf.next.
+ */
+static void my_jmat_destroy(void* arg){
+  jmat_t* self = (jmat_t*)arg;
+  const int borrowed = self->buf.ref;
+  if(borrowed == 0){
+    dhmem_deref(self->data);
+    self->data = NULL;
+  }
+  for(jbuf_t* node = self->buf.next; node != NULL; ){
+    //fetch the successor first: the deref may free this node
+    jbuf_t* following = node->next;
+    dhmem_deref(node);
+    node = following;
+  }
+}
+
+/*
+ * Allocate a matrix header around caller-provided memory, released through a
+ * caller-supplied destroy handler.
+ *
+ * w,h,c  width, height and channel count
+ * d      stride in elements per row; 0 means w*c
+ * b      bytes per element; 0 means 1
+ * mem    pixel memory, stored as-is (no copy, no allocation)
+ *
+ * buf.ref is left at 0, so the supplied handler decides who owns `mem`.
+ *
+ * Returns NULL when the header cannot be allocated (previously this crashed
+ * in memset).
+ */
+jmat_t* jmat_allocex(int w,int h,int c ,int d, int b,void* mem,dhmem_destroy_h fndestroy){
+  int bit = b?b:1;
+  int stride = d?d:w*c;
+  jmat_t* mat = (jmat_t*)dhmem_zalloc(sizeof(jmat_t),fndestroy);
+  if(!mat)return NULL;
+  mat->width = w;
+  mat->height = h;
+  mat->channel = c;
+  mat->bit = bit;
+  mat->stride = stride;
+  mat->buf.data = (char*)mem;
+  mat->buf.size = bit*stride*h;
+  mat->data = mat->buf.data;
+  return mat;
+}
+
+/* Allocate an all-zero matrix header with no pixel memory attached. */
+jmat_t* jmat_null(){
+  return (jmat_t*)dhmem_zalloc(sizeof(jmat_t),my_jmat_destroy);
+}
+
+/*
+ * Allocate a matrix.
+ *
+ * w,h,c  width, height and channel count
+ * d      stride in elements per row; 0 means w*c
+ * b      bytes per element; 0 means 1
+ * mem    optional pixel memory. When given it is borrowed (buf.ref = 1) and
+ *        never freed by the matrix; when NULL a zeroed block of
+ *        b*stride*h bytes is allocated and owned by the matrix.
+ *
+ * Returns NULL when the byte size is negative or an allocation fails. The
+ * header is released again if only the pixel allocation fails.
+ */
+jmat_t* jmat_alloc(int w,int h,int c ,int d, int b,void* mem){
+  int bit = b?b:1;
+  int stride = d?d:w*c;
+  int size = bit*stride*h;
+  if(size<0)return NULL;
+  jmat_t* mat = (jmat_t*)dhmem_zalloc(sizeof(jmat_t),my_jmat_destroy);
+  if(!mat)return NULL;
+  jbuf_t* buf = (jbuf_t*)&mat->buf;
+  mat->width = w;
+  mat->height = h;
+  mat->channel = c;
+  mat->bit = bit;
+  mat->stride = stride;
+  if(mem){
+    buf->data = (char*)mem;
+    buf->ref = 1;
+  }else{
+    buf->data = (char*)dhmem_zalloc(size,NULL);
+    if(!buf->data){
+      //the destroy handler copes with data == NULL
+      dhmem_deref(mat);
+      return NULL;
+    }
+  }
+  buf->size = size;
+  mat->data = buf->data;
+  return mat;
+}
+
+/* Shorthand for a 3-channel, 1-byte-per-element matrix with default stride;
+ * `mem` is forwarded to jmat_alloc() unchanged. */
+jmat_t* jmat_crgb(int w,int h,uint8_t *mem){
+  return jmat_alloc(w,h,3,0,1,mem);
+}
+
+/*
+ * Address of the first byte of row `row`.
+ *
+ * Returns NULL when the matrix is NULL, has no data, or `row` is outside
+ * [0, height). Negative rows used to produce a pointer before the buffer.
+ */
+char*   jmat_row(jmat_t* mat,int row){
+  if(!mat||!mat->data)return NULL;
+  if((row<0)||(row>=mat->height))return NULL;
+  int offset =  row*mat->stride*mat->bit;
+  return mat->data + offset;
+}
+
+/*
+ * Address of the element at (col,row).
+ *
+ * Returns NULL when the matrix is NULL, has no data, or either index is
+ * outside its range (negative indices are now rejected as well).
+ *
+ * NOTE(review): the column offset is col*bit, while jmat_roi()/jmat_reroi()
+ * use l*channel*bit. For multi-channel matrices this looks inconsistent --
+ * confirm against callers before changing it.
+ */
+char*   jmat_item(jmat_t* mat,int col,int row){
+  if(!mat||!mat->data)return NULL;
+  if((row<0)||(row>=mat->height))return NULL;
+  if((col<0)||(col>=mat->width))return NULL;
+  int offset =  row*mat->stride*mat->bit + col*mat->bit;
+  return mat->data + offset;
+}
+
+/* Zero the matrix payload through its embedded buffer. */
+int       jmat_zero(jmat_t* src){
+  jbuf_t* payload = &src->buf;
+  return jbuf_zeros(payload);
+}
+
+/*
+ * Deep-copy a matrix: same geometry, freshly allocated pixel memory.
+ *
+ * Returns NULL when `mat` is NULL or the copy cannot be allocated. The pixel
+ * copy is skipped when the source has no data.
+ */
+jmat_t*   jmat_clone(jmat_t* mat){
+  if(!mat)return NULL;
+  jmat_t* dst = jmat_alloc(mat->width,mat->height,mat->channel,mat->stride,mat->bit,NULL);
+  if(!dst)return NULL;
+  if(mat->data&&(mat->buf.size>0)){
+    memcpy(dst->data,mat->data,mat->buf.size);
+  }
+  return dst;
+}
+
+/* Drop one reference on a matrix (NULL is ignored). Always reports 0. */
+int     jmat_free(jmat_t* mat){
+  if(mat != NULL){
+    dhmem_deref(mat);
+  }
+  return 0;
+}
+
+/* Copy pixel bytes from src to dst. The two byte sizes must match exactly;
+ * returns 0 after copying, -1 on a size mismatch. */
+int       jmat_copy(jmat_t* dst,jmat_t* src){
+  const int nbytes = dst->buf.size;
+  if(nbytes == src->buf.size){
+    memcpy(dst->data,src->data,nbytes);
+    return 0;
+  }
+  return -1;
+}
+
+/* Overwrite width and height and recompute stride as w*channel. The data
+ * pointer and the recorded buffer size are left untouched. Always returns 0. */
+int jmat_reshape(jmat_t* mat,int w,int h){
+  const int row_elems = w*mat->channel;
+  mat->stride = row_elems;
+  mat->height = h;
+  mat->width = w;
+  return 0;
+}
+
+/*
+ * Re-point an existing matrix header at a w x h window of `src` whose
+ * top-left corner is (l,t).
+ *
+ * Channel count, element size, stride and gpu flag are taken from `src`.
+ * The window shares src's memory and is marked as borrowed (buf.ref = 1).
+ * Always returns 0.
+ */
+int jmat_reroi(jmat_t* mat,jmat_t* src,int w,int h,int l,int t){
+  //read everything from src first so that mat == src stays well defined
+  const int src_stride = src->stride;
+  const int src_chan = src->channel;
+  const int src_bit = src->bit;
+  const int src_gpu = src->gpu;
+  char* window = src->data + t*src_stride*src_bit + l*src_chan*src_bit;
+
+  mat->buf.data = window;
+  mat->buf.size = src_bit*src_stride*h;
+  mat->buf.ref = 1;
+  mat->data = window;
+  mat->width = w;
+  mat->height = h;
+  mat->channel = src_chan;
+  mat->bit = src_bit;
+  mat->stride = src_stride;
+  mat->gpu = src_gpu;
+  return 0;
+}
+
+/*
+ * Create a new matrix header viewing a w x h window of `mat` whose top-left
+ * corner is (l,t). The view keeps the parent's stride, channel count, element
+ * size and gpu flag, and borrows the parent's memory.
+ */
+jmat_t* jmat_roi(jmat_t* mat,int w,int h,int l,int t){
+  const int stride = mat->stride;
+  const int chan = mat->channel;
+  const int bit = mat->bit;
+  char* origin = mat->data + t*stride*bit + l*chan*bit;
+  jmat_t* view = jmat_alloc(w,h,chan,stride,bit,origin);
+  view->gpu = mat->gpu;
+  return view;
+}
+
+
+/*
+ * Current timestamp in milliseconds, read with clock_gettime()
+ * (CLOCK_MONOTONIC outside WIN32, clock id 0 on WIN32).
+ *
+ * The conversion is done in 64-bit arithmetic: tv_sec*1000 overflowed where
+ * `long` is 32 bits. Nanoseconds are divided by a fixed 1000000 instead of
+ * CLOCKS_PER_SEC, which is unrelated to timespec and only gives the right
+ * answer on platforms where it happens to be 1000000.
+ */
+uint64_t jtimer_msstamp(){
+  struct timespec ts;
+#ifdef WIN32
+  //return clock();
+  clock_gettime(0, &ts);
+#else
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+#endif
+  return ((uint64_t)ts.tv_sec*1000u) + ((uint64_t)ts.tv_nsec/1000000u);
+}
+
+
+/*
+ * Sleep for roughly `ms` milliseconds.
+ *
+ * Negative values return immediately; they used to turn into a very long
+ * unsigned sleep. Outside WIN32 the wait uses nanosleep() with separate
+ * second/nanosecond fields, so large values no longer overflow `ms*1000`.
+ * The sleep may end early if a signal is delivered.
+ */
+void jtimer_mssleep(int ms) {
+  if(ms<0)return;
+#ifdef WIN32
+  Sleep(ms);
+#else
+  struct timespec req;
+  req.tv_sec = ms/1000;
+  req.tv_nsec = (long)(ms%1000)*1000000L;
+  nanosleep(&req,NULL);
+#endif
+}
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_data.h b/duix-sdk/src/main/cpp/dhcore/dh_data.h
new file mode 100644
index 0000000..24502ac
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_data.h
@@ -0,0 +1,77 @@
+#ifndef GJ_MEDDATA_H
+#define GJ_MEDDATA_H
+#include <stdint.h>
+#include "dh_mem.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  typedef struct jbuf_s jbuf_t;
+
+  /* Reference-counted byte buffer; can also be linked into a chain/queue. */
+  struct jbuf_s{
+    char    *data;      /* payload, or NULL for an empty buffer */
+    int     size;       /* payload size in bytes */
+    uint64_t  sessid;   /* session id; jque_push()/jque_pop() compare it */
+    int64_t   tag;      /* NOTE(review): not used in this file -- presumably caller-defined */
+    int     ref;        /* non-zero when `data` is borrowed memory */
+    jbuf_t  *next;      /* next buffer in a chain, or NULL */
+  };
+
+  jbuf_t* jbuf_alloc(int size);
+  jbuf_t* jbuf_strdup(char* txt,int size);
+  jbuf_t* jbuf_dupmem(char* mem,int size);
+  jbuf_t* jbuf_refmem(char* mem,int size);
+  jbuf_t* jbuf_null(uint64_t sessid);
+  int       jbuf_zeros(jbuf_t* buf);
+  int       jbuf_free(jbuf_t* buf);
+  int       jbuf_copy(jbuf_t* dst,jbuf_t* src);
+
+  typedef struct jmat_s jmat_t;
+  /* 2-D matrix/image descriptor layered on top of a jbuf_t. */
+  struct jmat_s{
+    jbuf_t    buf;      /* embedded buffer header (size, borrowed flag, chain) */
+    char      *data;    /* pixel memory; mirrors buf.data */
+    int       width;    /* columns */
+    int       height;   /* rows */
+    int       channel;  /* channels per pixel */
+    int       stride;   /* elements per row (w*channel by default) */
+    int       bit;      /* bytes per element; jmat_dump() reads 4 as float */
+    int       gpu;      /* flag copied between views; jmat_dump() skips pixel output when set */
+    //jmat_t    *rmat;
+    //jmat_t    *bmat;
+  };
+
+  jmat_t* jmat_null();
+  jmat_t* jmat_allocex(int w,int h,int c ,int d, int b,void* mem,dhmem_destroy_h fndestroy);
+  jmat_t* jmat_alloc(int w,int h,int c ,int d, int b,void* mem);
+  jmat_t* jmat_crgb(int w,int h,uint8_t *mem); 
+  char*   jmat_row(jmat_t* mat,int row);
+  char*   jmat_item(jmat_t* mat,int col,int row);
+  int     jmat_free(jmat_t* mat);
+  int jmat_reshape(jmat_t* mat,int w,int h);
+  jmat_t* jmat_roi(jmat_t* mat,int w,int h,int l,int t);
+  int       jmat_reroi(jmat_t* mat,jmat_t* src,int w,int h,int l,int t);
+  int       jmat_copy(jmat_t* dst,jmat_t* src);
+  int       jmat_dump(jmat_t* mat);
+
+  jmat_t*   jmat_clone(jmat_t* mat);
+  //jmat_t* jmat_roi(jmat_t* src,int left,int top,int width,int height);
+  int       jmat_zero(jmat_t* src);
+
+  jmat_t* jmat_addref(jmat_t* mat);
+  jmat_t* jmat_deref(jmat_t* mat);
+
+
+
+
+  void* jdata_addref(void* data);
+  void* jdata_deref(void* data);
+  uint64_t jtimer_msstamp();  
+  void jtimer_mssleep(int ms) ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_mem.c b/duix-sdk/src/main/cpp/dhcore/dh_mem.c
new file mode 100644
index 0000000..5872226
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_mem.c
@@ -0,0 +1,300 @@
+/**
+ * @file dh_mem.c  Memory management with reference counting
+ *
+ * Copyright (C) 2010 Creytiv.com
+ */
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dh_atomic.h"
+#include "dh_mem.h"
+
+
+
+
+
+/**
+ * Defines a reference-counting memory object.
+ *
+ * This header is stored immediately in front of the user-visible payload;
+ * get_mem() and get_dhmem_data() convert between the two addresses.
+ */
+struct dhmem {
+	DH_ATOMIC uint32_t nrefs; /**< Number of references  */
+	uint32_t size;         /**< Size of memory object */
+	dhmem_destroy_h *dh;     /**< Destroy handler       */
+};
+
+
+/*
+ * Bookkeeping hooks. In this build STAT_ALLOC/STAT_REALLOC only record the
+ * payload size in the header; STAT_DEREF and MAGIC_CHECK expand to nothing.
+ */
+#define STAT_ALLOC(_m, _size) (_m)->size = (uint32_t)(_size);
+#define STAT_REALLOC(_m, _size) (_m)->size = (uint32_t)(_size);
+#define STAT_DEREF(_m)
+#define MAGIC_CHECK(_m)
+
+
+/*
+ * Layout constants: payload alignment (16 bytes on x86_64 or when pointers
+ * are at least 8 bytes wide, otherwise 8) and the header size rounded up to
+ * a multiple of that alignment.
+ */
+enum {
+#if defined(__x86_64__)
+	/* Use 16-byte alignment on x86-x32 as well */
+	dhmem_alignment = 16u,
+#else
+	dhmem_alignment = sizeof(void*) >= 8u ? 16u : 8u,
+#endif
+	alignment_mask = dhmem_alignment - 1u,
+	dhmem_header_size = (sizeof(struct dhmem) + alignment_mask) &
+		(~(size_t)alignment_mask)
+};
+
+/*
+ * Largest payload size accepted by the allocator: the uint32_t maximum when
+ * size_t is wider than uint32_t (the header stores the size in 32 bits),
+ * otherwise the size_t maximum minus the header size.
+ */
+#define MEM_SIZE_MAX \
+	(size_t)(sizeof(size_t) > sizeof(uint32_t) ? \
+		(~(uint32_t)0u) : (~(size_t)0u) - dhmem_header_size)
+
+
+/* Map a payload pointer back to the header stored in front of it. */
+static inline struct dhmem *get_mem(void *p)
+{
+	unsigned char *payload = (unsigned char *)p;
+
+	return (struct dhmem *)(void *)(payload - dhmem_header_size);
+}
+
+
+/* Map a header pointer to the payload that follows it. */
+static inline void *get_dhmem_data(struct dhmem *m)
+{
+	unsigned char *base = (unsigned char *)m;
+
+	return (void *)(base + dhmem_header_size);
+}
+
+/**
+ * Duplicate a C string into a reference-counted memory object
+ *
+ * @param txt String to copy
+ *
+ * @return NUL-terminated copy (release with dhmem_deref), or NULL if txt is
+ *         NULL or the allocation fails
+ */
+char    *dhstr_dup(char* txt){
+  size_t len;
+  char* str;
+
+  if(!txt)return NULL;
+  len = strlen(txt);
+  /* zalloc zeroes len+1 bytes, which supplies the terminating NUL */
+  str = (char*)dhmem_zalloc(len+1,NULL);
+  if(!str)return NULL;
+  memcpy(str,txt,len);
+  return str;
+}
+
+/**
+ * Allocate a new reference-counted memory object. The object starts with
+ * one reference and its contents are not initialised.
+ *
+ * @param size Size of memory object
+ * @param dh   Optional destructor, called when destroyed
+ *
+ * @return Pointer to allocated object, NULL if size exceeds MEM_SIZE_MAX or
+ *         malloc fails
+ */
+void *dhmem_alloc(size_t size, dhmem_destroy_h *dh)
+{
+	struct dhmem *hdr = NULL;
+
+	if (size <= MEM_SIZE_MAX)
+		hdr = (struct dhmem*)malloc(dhmem_header_size + size);
+
+	if (hdr == NULL)
+		return NULL;
+
+	hdr->dh = dh;
+	dh_atomic_rlx_set(&hdr->nrefs, 1u);
+
+	STAT_ALLOC(hdr, size);
+
+	return get_dhmem_data(hdr);
+}
+
+
+/**
+ * Allocate a new reference-counted memory object whose payload is
+ * zero-filled.
+ *
+ * @param size Size of memory object
+ * @param dh   Optional destructor, called when destroyed
+ *
+ * @return Pointer to allocated object, NULL on failure
+ */
+void *dhmem_zalloc(size_t size, dhmem_destroy_h *dh)
+{
+	void *obj = dhmem_alloc(size, dh);
+
+	if (obj != NULL)
+		memset(obj, 0, size);
+
+	return obj;
+}
+
+
+/**
+ * Re-allocate a reference-counted memory object
+ *
+ * If the object has more than one reference it cannot be resized in place:
+ * a new object is allocated, the old contents are copied into it and one
+ * reference on the old object is dropped. Otherwise the underlying block is
+ * resized with realloc().
+ *
+ * @param data Memory object
+ * @param size New size of memory object
+ *
+ * @return New pointer to allocated object, NULL on failure (the original
+ *         object is left untouched in that case)
+ *
+ * @note Realloc NULL pointer is not supported
+ */
+void *dhmem_realloc(void *data, size_t size)
+{
+	struct dhmem *m, *m2;
+
+	if (!data)
+		return NULL;
+
+	if (size > MEM_SIZE_MAX)
+		return NULL;
+
+	m = get_mem(data);
+
+	MAGIC_CHECK(m);
+
+	if (dh_atomic_acq(&m->nrefs) > 1u) {
+		void* p = dhmem_alloc(size, m->dh);
+		if (p) {
+			/* Copy no more than the new object can hold:
+			 * copying m->size bytes overflowed it on shrink */
+			size_t ncopy = m->size < size ? (size_t)m->size : size;
+
+			memcpy(p, data, ncopy);
+			dhmem_deref(data);
+		}
+		return p;
+	}
+
+
+	m2 = (struct dhmem*)realloc(m, dhmem_header_size + size);
+
+	if (!m2) {
+		return NULL;
+	}
+
+	STAT_REALLOC(m2, size);
+
+	return get_dhmem_data(m2);
+}
+
+
+/**
+ * Re-allocate a reference-counted array, rejecting element counts whose
+ * total byte size would exceed MEM_SIZE_MAX
+ *
+ * @param ptr      Pointer to existing array, NULL to allocate a new array
+ * @param nmemb    Number of members in array
+ * @param membsize Number of bytes in each member
+ * @param dh       Optional destructor, only used when ptr is NULL
+ *
+ * @return New pointer to allocated array
+ */
+void *dhmem_reallocarray(void *ptr, size_t nmemb, size_t membsize,
+		       dhmem_destroy_h *dh)
+{
+	size_t total;
+
+	if (membsize && nmemb > MEM_SIZE_MAX / membsize)
+		return NULL;
+
+	total = nmemb * membsize;
+
+	return ptr ? dhmem_realloc(ptr, total) : dhmem_alloc(total, dh);
+}
+
+
+/**
+ * Install, replace or remove the destructor of a memory object. A NULL
+ * object is ignored.
+ *
+ * @param data Memory object
+ * @param dh   called when destroyed, NULL for remove
+ */
+void dhmem_destructor(void *data, dhmem_destroy_h *dh)
+{
+	if (data) {
+		struct dhmem *hdr = get_mem(data);
+
+		MAGIC_CHECK(hdr);
+
+		hdr->dh = dh;
+	}
+}
+
+
+/**
+ * Add one reference to a reference-counted memory object
+ *
+ * @param data Memory object
+ *
+ * @return Memory object (same as data), NULL if data is NULL
+ */
+void *dhmem_ref(void *data)
+{
+	if (data) {
+		struct dhmem *hdr = get_mem(data);
+
+		MAGIC_CHECK(hdr);
+
+		dh_atomic_rlx_add(&hdr->nrefs, 1u);
+	}
+
+	return data;
+}
+
+
+/**
+ * Dereference a reference-counted memory object. When the reference count
+ * is zero, the destroy handler will be called (if present) and the memory
+ * will be freed
+ *
+ * The destroy handler may take a new reference on the object; in that case
+ * the object is kept alive and not freed.
+ *
+ * @param data Memory object (NULL is ignored)
+ *
+ * @return Always NULL
+ */
+/* coverity[-tainted_data_sink: arg-0] */
+void *dhmem_deref(void *data)
+{
+	struct dhmem *m;
+
+	if (!data)
+		return NULL;
+
+	m = get_mem(data);
+
+	MAGIC_CHECK(m);
+
+	/* the macro yields the count held before the decrement; a value
+	 * above 1 means other references remain */
+	if (dh_atomic_acq_sub(&m->nrefs, 1u) > 1u) {
+		return NULL;
+	}
+
+	if (m->dh)
+		m->dh(data);
+
+	/* NOTE: check if the destructor called dhmem_ref() */
+	if (dh_atomic_rlx(&m->nrefs) > 0u)
+		return NULL;
+
+
+	STAT_DEREF(m);
+
+	/* release header and payload together (single malloc block) */
+	free(m);
+
+	return NULL;
+}
+
+
+/**
+ * Read the current reference count of a memory object
+ *
+ * @param data Memory object
+ *
+ * @return Number of references, 0 if data is NULL
+ */
+uint32_t dhmem_nrefs(const void *data)
+{
+	uint32_t count = 0;
+
+	if (data) {
+		struct dhmem *hdr = get_mem((void*)data);
+
+		MAGIC_CHECK(hdr);
+
+		count = (uint32_t)dh_atomic_acq(&hdr->nrefs);
+	}
+
+	return count;
+}
+
+
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_mem.h b/duix-sdk/src/main/cpp/dhcore/dh_mem.h
new file mode 100644
index 0000000..5571c3e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_mem.h
@@ -0,0 +1,28 @@
+#ifndef GJ_DHMEM_H
+#define GJ_DHMEM_H
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (dhmem_destroy_h)(void *data);
+
+char    *dhstr_dup(char* txt);
+void    *dhmem_alloc(size_t size, dhmem_destroy_h *dh);
+void    *dhmem_zalloc(size_t size, dhmem_destroy_h *dh);
+void    *dhmem_realloc(void *data, size_t size);
+void    *dhmem_reallocarray(void *ptr, size_t nmemb,
+			  size_t membsize, dhmem_destroy_h *dh);
+void     dhmem_destructor(void *data, dhmem_destroy_h *dh);
+void    *dhmem_ref(void *data);
+void    *dhmem_deref(void *data);
+uint32_t dhmem_nrefs(const void *data);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_que.cpp b/duix-sdk/src/main/cpp/dhcore/dh_que.cpp
new file mode 100644
index 0000000..7c51cb6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_que.cpp
@@ -0,0 +1,241 @@
+#include "dh_que.h"
+
+#include "readerwriterqueue.h"
+#include "concurrentqueue.h"
+#include "blockingconcurrentqueue.h"
+#include "dh_atomic.h"
+
+typedef moodycamel::ReaderWriterQueue<jbuf_t*>  ReaderWriterQueue;
+typedef moodycamel::ConcurrentQueue<jbuf_t*> ConcurrentQueue;
+typedef moodycamel::BlockingConcurrentQueue<jbuf_t*> BlockingConcurrentQueue;
+
+typedef int (*jqfn_pop)(jqueue_t* que,int flush,jbuf_t** pbuf);
+typedef int (*jqfn_push)(jqueue_t* que,int flush,jbuf_t* buf);
+
+/* Queue handle: a thin dispatch layer over one of the moodycamel queues. */
+struct jqueue_s{
+  void        *m_obj;       /* backing queue object, cast according to m_kind */
+  int         m_kind;       /* GQUE_* value passed to jque_alloc() */
+	DH_ATOMIC uint32_t nrefs; /**< Number of buffers currently queued (see jque_size) */
+  int         m_cache;      /* `cache` argument of jque_alloc() */
+  //jbuf_t      *m_readcache;
+  //jbuf_t      *m_writecache;
+  jqfn_push   fn_push;      /* back-end specific enqueue */
+  jqfn_pop    fn_pop;       /* back-end specific dequeue */
+  uint64_t    m_lastsess;   /* highest session id pushed so far */
+};
+
+/* Mutex-protected singly linked queue variant. jque_alloc() currently never
+ * creates it (that branch is disabled with if(0)). */
+typedef struct{
+  jqueue_t  que;            /* must stay first: handle is cast to/from jqueue_t* */
+  jbuf_t    *m_head;        /* oldest queued buffer, NULL when empty */
+  jbuf_t    *m_tail;        /* newest queued buffer, NULL when empty */
+  pthread_mutex_t  m_lock; 
+}jlockque_t;
+
+/* Enqueue on the ReaderWriterQueue back-end. `flush` is ignored: the
+ * allocating enqueue() is always used. Returns enqueue()'s result. */
+static int simp_push(jqueue_t* que,int flush,jbuf_t* buf){
+  ReaderWriterQueue* rwq = reinterpret_cast<ReaderWriterQueue*>(que->m_obj);
+  return rwq->enqueue(buf);
+}
+
+/* Non-blocking dequeue on the ReaderWriterQueue back-end. *pbuf is written
+ * only when an element was available; returns try_dequeue()'s result. */
+static int simp_pop(jqueue_t* que,int flush,jbuf_t** pbuf){
+  ReaderWriterQueue* rwq = reinterpret_cast<ReaderWriterQueue*>(que->m_obj);
+  return rwq->try_dequeue(*pbuf);
+}
+
+/* Enqueue on the BlockingConcurrentQueue back-end. `flush` is ignored: the
+ * allocating enqueue() is always used. Returns enqueue()'s result. */
+static int muti_push(jqueue_t* que,int flush,jbuf_t* buf){
+  BlockingConcurrentQueue* bcq = reinterpret_cast<BlockingConcurrentQueue*>(que->m_obj);
+  return bcq->enqueue(buf);
+}
+
+/* Non-blocking dequeue on the BlockingConcurrentQueue back-end. *pbuf is
+ * written only when an element was available. */
+static int muti_pop(jqueue_t* que,int flush,jbuf_t** pbuf){
+  BlockingConcurrentQueue* bcq = reinterpret_cast<BlockingConcurrentQueue*>(que->m_obj);
+  return bcq->try_dequeue(*pbuf);
+}
+
+/* Append a buffer to the tail of the mutex-protected list. `flush` is
+ * unused. Always returns 1. */
+static int lock_push(jqueue_t* que,int flush,jbuf_t* buf){
+  jlockque_t* lq = reinterpret_cast<jlockque_t*>(que);
+  buf->next = NULL;
+  pthread_mutex_lock(&lq->m_lock);
+  if(lq->m_tail == NULL){
+    //empty list: the new node is also the head
+    lq->m_head = buf;
+  }else{
+    lq->m_tail->next = buf;
+  }
+  lq->m_tail = buf;
+  pthread_mutex_unlock(&lq->m_lock);
+  return 1;
+}
+
+/*
+ * Detach the head of the mutex-protected list.
+ *
+ * *pbuf receives the detached buffer, or NULL when the list is empty.
+ * `flush` is unused.
+ *
+ * Returns 1 when a buffer was popped and 0 otherwise, like the other pop
+ * back-ends. It used to return 0 unconditionally.
+ */
+static int lock_pop(jqueue_t* que,int flush,jbuf_t** pbuf){
+  jlockque_t* exque = reinterpret_cast<jlockque_t*>(que);
+  jbuf_t* buf = NULL;
+  pthread_mutex_lock(&exque->m_lock);
+  buf = exque->m_head;
+  if(buf){
+    if(exque->m_tail==buf){
+      //last element: the list becomes empty
+      exque->m_head = NULL;
+      exque->m_tail = NULL;
+    }else{
+      exque->m_head = buf->next;
+    }
+    buf->next = NULL;
+  }
+  pthread_mutex_unlock(&exque->m_lock);
+  *pbuf = buf;
+  return buf?1:0;
+}
+
+
+/*
+ * Destroy handler for queue handles: frees every buffer still queued, then
+ * deletes the backing queue object according to m_kind.
+ *
+ * NOTE(review): every kind other than GQUE_SIMP is deleted as a
+ * BlockingConcurrentQueue; the disabled lock variant has no cleanup here.
+ */
+void my_jque_destroy(void* arg){
+  jqueue_t* self = (jqueue_t*)arg;
+
+  for(;;){
+    jbuf_t* item = NULL;
+    self->fn_pop(self,1,&item);
+    if(!item)break;
+    jbuf_free(item);
+  }
+
+  switch(self->m_kind){
+    case GQUE_SIMP:
+      delete reinterpret_cast<ReaderWriterQueue*>(self->m_obj);
+      break;
+    default:
+      delete reinterpret_cast<BlockingConcurrentQueue*>(self->m_obj);
+      break;
+  }
+}
+
+/*
+ * Create a queue.
+ *
+ * size   currently unused
+ * cache  stored in the handle; its negation is passed as the `flush`
+ *        argument of the push back-end
+ * kind   GQUE_SIMP selects a ReaderWriterQueue; any other value selects a
+ *        BlockingConcurrentQueue (the GQUE_LOCK variant is disabled)
+ *
+ * Returns the new queue handle (release with jque_free), or NULL when the
+ * handle cannot be allocated. The old code called memset before checking
+ * the allocation.
+ */
+jqueue_t*  jque_alloc(int size,int cache,int kind){
+  jqueue_t* que = NULL;
+  (void)size;
+  //if(kind==GQUE_LOCK){
+  if(0){
+    jlockque_t* exq = (jlockque_t*)dhmem_zalloc(sizeof(jlockque_t),my_jque_destroy);
+    if(!exq)return NULL;
+    pthread_mutex_init(&exq->m_lock,NULL);
+    que = reinterpret_cast<jqueue_t*>(exq);
+    que->fn_pop = lock_pop;
+    que->fn_push = lock_push;
+  }else{
+    que = (jqueue_t*)dhmem_zalloc(sizeof(jqueue_t),my_jque_destroy);
+    if(!que)return NULL;
+    if(kind==GQUE_SIMP){
+      que->m_obj = new ReaderWriterQueue();
+      que->fn_push = simp_push;
+      que->fn_pop = simp_pop;
+    }else {
+      que->m_obj = new BlockingConcurrentQueue();
+      que->fn_push = muti_push;
+      que->fn_pop = muti_pop;
+    }
+  }
+  que->m_cache = cache;
+  que->m_kind = kind;
+  return que;
+}
+
+/*
+ * Push a buffer onto the queue.
+ *
+ * The highest session id seen so far is remembered so that jque_pop() can
+ * reject stale consumers.
+ *
+ * The element counter is raised before the buffer becomes visible and rolled
+ * back if the push fails. Previously it was raised after the push, even on
+ * failure, so a concurrent pop could decrement it first and make
+ * jque_size() wrap.
+ *
+ * NOTE(review): m_lastsess is a plain field updated without synchronisation
+ * -- confirm that only one producer updates it.
+ *
+ * Returns the back-end's push result (non-zero on success), 0 when buf is
+ * NULL.
+ */
+int jque_push(jqueue_t* que,jbuf_t* buf){
+  if(!buf)return 0;
+  if(buf->sessid>que->m_lastsess) que->m_lastsess = buf->sessid;
+  dh_atomic_rlx_add(&que->nrefs, 1u);
+  int rst = que->fn_push(que,!que->m_cache,buf);
+  if(!rst){
+    //nothing was queued: undo the optimistic increment
+    dh_atomic_acq_sub(&que->nrefs, 1u);
+  }
+  return rst;
+}
+
+/*
+ * Pop one buffer without blocking.
+ *
+ * A non-zero sessid older than the newest session pushed so far yields NULL
+ * without touching the queue. Otherwise the back-end is asked for one
+ * element and the element counter is lowered when one was obtained.
+ *
+ * Returns the popped buffer, or NULL when nothing was available.
+ */
+jbuf_t* jque_pop(jqueue_t* que,uint64_t sessid){
+  const bool stale = (sessid != 0) && (sessid < que->m_lastsess);
+  if(stale){
+    return NULL;
+  }
+
+  jbuf_t* item = NULL;
+  (void)que->fn_pop(que,0,&item);
+  if(item != NULL){
+    dh_atomic_acq_sub(&que->nrefs, 1u);
+  }
+  return item;
+}
+
+/* Placeholder: nothing is dequeued, the result is always NULL. */
+jbuf_t* jque_popall(jqueue_t* que){
+  (void)que;
+  return NULL;
+}
+
+/* Number of buffers currently counted as queued (pushes minus pops). */
+int jque_size(jqueue_t* que){
+  const int queued = que->nrefs;
+  return queued;
+}
+
+/* Drop one reference on the queue handle. Always reports 0. */
+int jque_free(jqueue_t* que){
+  (void)dhmem_deref(que);
+  return 0;
+}
+
+
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_que.h b/duix-sdk/src/main/cpp/dhcore/dh_que.h
new file mode 100644
index 0000000..07e21e9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_que.h
@@ -0,0 +1,29 @@
+#ifndef GJ_MEDQUE_H
+#define GJ_MEDQUE_H
+#include "dh_data.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct jqueue_s jqueue_t;
+/* Queue kinds for jque_alloc(). */
+#define GQUE_SIMP  1001  /* backed by moodycamel::ReaderWriterQueue */
+#define GQUE_MUTI  1003  /* backed by moodycamel::BlockingConcurrentQueue */
+#define GQUE_LOCK  1005  /* mutex variant; currently disabled in jque_alloc(), falls back to the GQUE_MUTI back-end */
+
+  jqueue_t*   jque_alloc(int size,int cache,int kind);
+  int         jque_push(jqueue_t* que,jbuf_t* buf);
+  jbuf_t*     jque_pop(jqueue_t* que,uint64_t sessid);
+  jbuf_t*     jque_popall(jqueue_t* que);
+  int         jque_size(jqueue_t* que);
+  int         jque_free(jqueue_t* que);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/dhcore/dh_types.h b/duix-sdk/src/main/cpp/dhcore/dh_types.h
new file mode 100644
index 0000000..7a47e49
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/dh_types.h
@@ -0,0 +1,389 @@
+/**
+ * @file dh_types.h  Defines basic types
+ *
+ * Copyright (C) 2010 Creytiv.com
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+#define restrict
+#endif
+
+#ifdef _MSC_VER
+#include <stdlib.h>
+
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+
+#endif
+
+/*
+ * Basic integral types and boolean from C99
+ */
+#include <inttypes.h>
+#include <stdbool.h>
+
+
+/* Needed for MS compiler */
+#ifdef _MSC_VER
+#ifndef __cplusplus
+#define inline _inline
+#endif
+#endif
+
+
+/*
+ * Misc macros
+ */
+
+/** Defines the NULL pointer */
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+/** Get number of elements in an array */
+#define DH_ARRAY_SIZE(a) ((sizeof(a))/(sizeof((a)[0])))
+
+
+/** Align a value to the boundary of mask */
+#define DH_ALIGN_MASK(x, mask)    (((x)+(mask))&~(mask))
+
+/** Check alignment of pointer (p) and byte count (c) **/
+#define re_is_aligned(p, c) (((uintptr_t)(const void *)(p)) % (c) == 0)
+
+/** Get the minimal value (arguments may be evaluated more than once) */
+#undef MIN
+#define MIN(a,b) (((a)<(b)) ? (a) : (b))
+
+/** Get the maximal value (arguments may be evaluated more than once) */
+#undef MAX
+#define MAX(a,b) (((a)>(b)) ? (a) : (b))
+
+#ifndef __cplusplus
+
+/** Get the minimal value */
+#undef min
+#define min(x,y) MIN(x, y)
+
+/** Get the maximal value */
+#undef max
+#define max(x,y) MAX(x, y)
+
+#endif
+
+/** Defines a soft breakpoint */
+#if (defined(__i386__) || defined(__x86_64__))
+#define DH_BREAKPOINT __asm__("int $0x03")
+#elif defined(__has_builtin)
+#if __has_builtin(__builtin_debugtrap)
+#define DH_BREAKPOINT __builtin_debugtrap()
+#endif
+#endif
+
+#ifndef DH_BREAKPOINT
+#define DH_BREAKPOINT
+#endif
+
+/* Backwards compat */
+#define BREAKPOINT DH_BREAKPOINT
+
+
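+/*
+ * Error return/goto debug helpers.
+ *
+ * NOTE: the IF_ERR_* / IF_RETURN_EINVAL macros expand to a bare `if`
+ * statement and RETURN_ERR expands to two statements, so do not use them as
+ * the unbraced body of an if/else.
+ */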
+#ifdef TRACE_ERR
+#define PRINT_TRACE_ERR(err)						\
+		(void)re_fprintf(stderr, "TRACE_ERR: %s:%u: %s():"	\
+			      " %m (%d)\n",				\
+			      __FILE__, __LINE__, __func__,		\
+			      (err), (err));
+#else
+#define PRINT_TRACE_ERR(err)
+#endif
+
+#define IF_ERR_GOTO_OUT(err)		\
+	if ((err)) {			\
+		PRINT_TRACE_ERR((err))	\
+		goto out;		\
+	}
+
+#define IF_ERR_GOTO_OUT1(err)		\
+	if ((err)) {			\
+		PRINT_TRACE_ERR((err))	\
+		goto out1;		\
+	}
+
+#define IF_ERR_GOTO_OUT2(err)		\
+	if ((err)) {			\
+		PRINT_TRACE_ERR((err))	\
+		goto out2;		\
+	}
+
+#define IF_ERR_RETURN(err)		\
+	if ((err)) {			\
+		PRINT_TRACE_ERR((err))	\
+		return (err);		\
+	}
+
+#define IF_RETURN_EINVAL(exp)		\
+	if ((exp)) {			\
+		PRINT_TRACE_ERR(EINVAL)	\
+		return (EINVAL);	\
+	}
+
+#define RETURN_ERR(err)			\
+	if ((err)) {			\
+		PRINT_TRACE_ERR((err))	\
+	}				\
+	return (err);
+
+
+/* Error codes */
+#include <errno.h>
+
+/* Duplication of error codes. Values are from linux asm-generic/errno.h */
+
+/** No data available */
+#ifndef ENODATA
+#define ENODATA 200
+#endif
+
+/** Accessing a corrupted shared library */
+#ifndef ELIBBAD
+#define ELIBBAD 204
+#endif
+
+/** Destination address required */
+#ifndef EDESTADDRREQ
+#define EDESTADDRREQ 205
+#endif
+
+/** Protocol not supported */
+#ifndef EPROTONOSUPPORT
+#define EPROTONOSUPPORT 206
+#endif
+
+/** Operation not supported */
+#ifndef ENOTSUP
+#define ENOTSUP 207
+#endif
+
+/** Address family not supported by protocol */
+#ifndef EAFNOSUPPORT
+#define EAFNOSUPPORT 208
+#endif
+
+/** Cannot assign requested address */
+#ifndef EADDRNOTAVAIL
+#define EADDRNOTAVAIL 209
+#endif
+
+/** Software caused connection abort */
+#ifndef ECONNABORTED
+#define ECONNABORTED 210
+#endif
+
+/** Connection reset by peer */
+#ifndef ECONNRESET
+#define ECONNRESET 211
+#endif
+
+/** Transport endpoint is not connected */
+#ifndef ENOTCONN
+#define ENOTCONN 212
+#endif
+
+/** Connection timed out */
+#ifndef ETIMEDOUT
+#define ETIMEDOUT 213
+#endif
+
+/** Connection refused */
+#ifndef ECONNREFUSED
+#define ECONNREFUSED 214
+#endif
+
+/** Operation already in progress */
+#ifndef EALREADY
+#define EALREADY 215
+#endif
+
+/** Operation now in progress */
+#ifndef EINPROGRESS
+#define EINPROGRESS 216
+#endif
+
+/** Authentication error */
+#ifndef EAUTH
+#define EAUTH 217
+#endif
+
+/** No STREAM resources */
+#ifndef ENOSR
+#define ENOSR 218
+#endif
+
+/** Key was rejected by service */
+#ifndef EKEYREJECTED
+#define EKEYREJECTED 129
+#endif
+
+/* Cannot send after transport endpoint shutdown */
+#ifndef ESHUTDOWN
+#define ESHUTDOWN 108
+#endif
+
+/*
+ * Give the compiler a hint which branch is "likely" or "unlikely" (inspired
+ * by linux kernel and C++20/C2X)
+ *
+ * Without __GNUC__ the macros expand to the parenthesised argument. The
+ * parentheses keep an expression argument such as `a ? b : c` intact when it
+ * is combined with surrounding operators.
+ */
+#ifdef __GNUC__
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
+
+#ifdef WIN32
+#define re_restrict __restrict
+#else
+#define re_restrict restrict
+#endif
+
+/* Socket helpers */
+#ifdef WIN32
+#define DH_ERRNO_SOCK WSAGetLastError()
+#define DH_BAD_SOCK INVALID_SOCKET
+typedef size_t re_sock_t;
+#else
+#define DH_ERRNO_SOCK errno
+#define DH_BAD_SOCK -1
+typedef int re_sock_t;
+#endif
+
+
+/* re_assert helpers */
+
+/**
+ * @def re_assert(expr)
+ *
+ * If expression is false, prints error and calls abort() (not in
+ * RELEASE/NDEBUG builds)
+ *
+ * @param expr   expression
+ */
+
+
+/**
+ * @def re_assert_se(expr)
+ *
+ * If expression is false, prints error and calls abort(),
+ * in RELEASE/NDEBUG builds expression is always executed and keeps side effect
+ *
+ * @param expr   expression
+ */
+
+#if defined(RELEASE) || defined(NDEBUG)
+#define re_assert(expr) (void)0
+#define re_assert_se(expr) do{(void)(expr);} while(false)
+#else
+#define re_assert(expr) assert(expr)
+#define re_assert_se(expr) assert(expr)
+#endif
+
+
+/* DH_VA_ARG SIZE helpers */
+#if !defined(DISABLE_DH_ARG) &&                                               \
+	!defined(__STRICT_ANSI__) && /* Needs ## trailing comma fix, with C23 \
+					we can use __VA_OPT__ */              \
+	__STDC_VERSION__ >= 201112L  /* _Generic C11 support required */
+
+#define HAVE_DH_ARG 1
+
+#define DH_ARG_SIZE(type) /* size tag for one argument; despite the name it receives an expression (see DH_ARG_1) */ \
+	_Generic((0)?(type):(type),                                           \
+	bool:			sizeof(int),  /* sub-int types are tagged with sizeof(int); presumably mirrors variadic promotion - confirm */ \
+	char:			sizeof(int),                                  \
+	unsigned char:		sizeof(unsigned int),                         \
+	short:			sizeof(int),                                  \
+	unsigned short:		sizeof(unsigned int),	                      \
+	int:			sizeof(int),                                  \
+	unsigned int:		sizeof(unsigned int),                         \
+	long:			sizeof(long),                                 \
+	unsigned long:		sizeof(unsigned long),                        \
+	long long:		sizeof(long long),                            \
+	unsigned long long:	sizeof(unsigned long long),                   \
+	float:			sizeof(double), /* float is tagged with sizeof(double) */ \
+	double:			sizeof(double),                               \
+	char const*:		sizeof(char const *),                         \
+	char*:			sizeof(char *),                               \
+	void const*:		sizeof(void const *),                         \
+	void*:			sizeof(void *),                               \
+	struct pl:		sizeof(struct pl),                            \
+	default: sizeof(void*) /* every type not listed above is tagged with pointer size */ \
+)
+
+#define DH_ARG_0() 0
+#define DH_ARG_1(expr) DH_ARG_SIZE(expr), (expr), 0
+#define DH_ARG_2(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_1(__VA_ARGS__)
+#define DH_ARG_3(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_2(__VA_ARGS__)
+#define DH_ARG_4(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_3(__VA_ARGS__)
+#define DH_ARG_5(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_4(__VA_ARGS__)
+#define DH_ARG_6(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_5(__VA_ARGS__)
+#define DH_ARG_7(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_6(__VA_ARGS__)
+#define DH_ARG_8(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_7(__VA_ARGS__)
+#define DH_ARG_9(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_8(__VA_ARGS__)
+#define DH_ARG_10(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_9(__VA_ARGS__)
+#define DH_ARG_11(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_10(__VA_ARGS__)
+#define DH_ARG_12(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_11(__VA_ARGS__)
+#define DH_ARG_13(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_12(__VA_ARGS__)
+#define DH_ARG_14(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_13(__VA_ARGS__)
+#define DH_ARG_15(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_14(__VA_ARGS__)
+#define DH_ARG_16(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_15(__VA_ARGS__)
+#define DH_ARG_17(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_16(__VA_ARGS__)
+#define DH_ARG_18(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_17(__VA_ARGS__)
+#define DH_ARG_19(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_18(__VA_ARGS__)
+#define DH_ARG_20(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_19(__VA_ARGS__)
+#define DH_ARG_21(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_20(__VA_ARGS__)
+#define DH_ARG_22(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_21(__VA_ARGS__)
+#define DH_ARG_23(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_22(__VA_ARGS__)
+#define DH_ARG_24(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_23(__VA_ARGS__)
+#define DH_ARG_25(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_24(__VA_ARGS__)
+#define DH_ARG_26(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_25(__VA_ARGS__)
+#define DH_ARG_27(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_26(__VA_ARGS__)
+#define DH_ARG_28(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_27(__VA_ARGS__)
+#define DH_ARG_29(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_28(__VA_ARGS__)
+#define DH_ARG_30(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_29(__VA_ARGS__)
+#define DH_ARG_31(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_30(__VA_ARGS__)
+#define DH_ARG_32(expr, ...) DH_ARG_SIZE(expr), (expr), DH_ARG_31(__VA_ARGS__)
+
+#define DH_ARG_VA_NUM_2(X, X32, X31, X30, X29, X28, X27, X26, X25, X24, X23,  \
+			X22, X21, X20, X19, X18, X17, X16, X15, X14, X13,     \
+			X12, X11, X10, X9, X8, X7, X6, X5, X4, X3, X2, X1, N, \
+			...)                                                  \
+	N
+#define DH_ARG_VA_NUM(...)                                                    \
+	DH_ARG_VA_NUM_2(0, ##__VA_ARGS__, 32, 31, 30, 29, 28, 27, 26, 25, 24, \
+			23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11,   \
+			10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+#define DH_ARG_N3(N, ...) DH_ARG_##N(__VA_ARGS__)
+#define DH_ARG_N2(N, ...) DH_ARG_N3(N, __VA_ARGS__)
+#define DH_VA_ARGS(...) DH_ARG_N2(DH_ARG_VA_NUM(__VA_ARGS__), __VA_ARGS__)
+#endif /* End DH_VA_ARG SIZE helpers */
+
+#define DH_VA_ARG(ap, val, type, safe) /* caller must provide a variable `err` and a label `out:` */ \
+	if (likely((safe))) {          /* safe mode: every value is preceded by a size_t size tag */ \
+		size_t sz = va_arg((ap), size_t);                             \
+		if (unlikely(!sz)) {   /* tag 0 is the terminator emitted by DH_ARG_0 / DH_ARG_1 */ \
+			err = ENODATA;                                        \
+			goto out;                                             \
+		}                                                             \
+		if (unlikely(sz != sizeof(type))) { /* tag does not match the type being read */ \
+			err = EOVERFLOW;                                      \
+			goto out;                                             \
+		}                                                             \
+	}                                                                     \
+	(val) = va_arg((ap), type) /* two statements, not do/while-wrapped: never use as an unbraced if/else body */
diff --git a/duix-sdk/src/main/cpp/dhcore/lightweightsemaphore.h b/duix-sdk/src/main/cpp/dhcore/lightweightsemaphore.h
new file mode 100644
index 0000000..a041475
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/lightweightsemaphore.h
@@ -0,0 +1,427 @@
+// Provides an efficient implementation of a semaphore (LightweightSemaphore).
+// This is an extension of Jeff Preshing's semaphore implementation (licensed
+// under the terms of its separate zlib license) that has been adapted and
+// extended by Cameron Desrochers.
+
+#pragma once
+
+#include <cstddef> // For std::size_t
+#include <atomic>
+#include <type_traits> // For std::make_signed<T>
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__MVS__)
+#include <zos-semaphore.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+
+#if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE)
+#if __GLIBC_PREREQ(2,30)
+#define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+#endif
+#endif
+#endif
+
+namespace moodycamel
+{
+namespace details
+{
+
+// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
+// portable + lightweight semaphore implementations, originally from
+// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+// LICENSE:
+// Copyright (c) 2015 Jeff Preshing
+//
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+//
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+//
+// 1. The origin of this software must not be misrepresented; you must not
+//	claim that you wrote the original software. If you use this software
+//	in a product, an acknowledgement in the product documentation would be
+//	appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//	misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+#if defined(_WIN32)
+class Semaphore	// thin wrapper around a Win32 semaphore handle
+{
+private:
+	void* m_hSema;	// handle returned by CreateSemaphoreW
+	
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		const long maxLong = 0x7fffffff;	// maximum count passed to CreateSemaphoreW
+		m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		assert(m_hSema);
+	}
+
+	~Semaphore()
+	{
+		CloseHandle(m_hSema);
+	}
+
+	bool wait()	// blocks without a deadline; true when the wait returned 0 (WAIT_OBJECT_0)
+	{
+		const unsigned long infinite = 0xffffffff;
+		return WaitForSingleObject(m_hSema, infinite) == 0;
+	}
+	
+	bool try_wait()	// zero-millisecond wait: never blocks
+	{
+		return WaitForSingleObject(m_hSema, 0) == 0;
+	}
+	
+	bool timed_wait(std::uint64_t usecs)	// millisecond timeout is clamped to 0xfffffffe so it can neither wrap nor equal INFINITE
+	{
+		return WaitForSingleObject(m_hSema, usecs / 1000 < 0xfffffffeULL ? (unsigned long)(usecs / 1000) : 0xfffffffeUL) == 0;
+	}
+
+	void signal(int count = 1)	// no-op for count <= 0 (ReleaseSemaphore rejects it, which used to spin forever)
+	{
+		while (count > 0 && !ReleaseSemaphore(m_hSema, count, nullptr));
+	}
+};
+#elif defined(__MACH__)
+//---------------------------------------------------------
+// Semaphore (Apple iOS and OSX)
+// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+//---------------------------------------------------------
+class Semaphore	// thin wrapper around a Mach kernel semaphore (semaphore_t)
+{
+private:
+	semaphore_t m_sema;
+
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		assert(rc == KERN_SUCCESS);
+		(void)rc;	// rc is otherwise unused when assert() compiles to nothing
+	}
+
+	~Semaphore()
+	{
+		semaphore_destroy(mach_task_self(), m_sema);
+	}
+
+	bool wait()	// true only when semaphore_wait reports KERN_SUCCESS
+	{
+		return semaphore_wait(m_sema) == KERN_SUCCESS;
+	}
+	
+	bool try_wait()	// implemented as a timed wait with a zero timeout
+	{
+		return timed_wait(0);
+	}
+	
+	bool timed_wait(std::uint64_t timeout_usecs)
+	{
+		mach_timespec_t ts;
+		ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);	// whole seconds
+		ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);	// leftover microseconds expressed in nanoseconds
+
+		// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+		kern_return_t rc = semaphore_timedwait(m_sema, ts);
+		return rc == KERN_SUCCESS;
+	}
+
+	void signal()	// retries until semaphore_signal succeeds
+	{
+		while (semaphore_signal(m_sema) != KERN_SUCCESS);
+	}
+
+	void signal(int count)	// signals `count` times; does nothing when count <= 0
+	{
+		while (count-- > 0)
+		{
+			while (semaphore_signal(m_sema) != KERN_SUCCESS);
+		}
+	}
+};
+#elif defined(__unix__) || defined(__MVS__)
+//---------------------------------------------------------
+// Semaphore (POSIX, Linux, zOS)
+//---------------------------------------------------------
+class Semaphore	// thin wrapper around an unnamed POSIX semaphore (sem_t)
+{
+private:
+	sem_t m_sema;
+
+	Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+	Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
+
+public:
+	Semaphore(int initialCount = 0)
+	{
+		assert(initialCount >= 0);
+		int rc = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));	// second argument (pshared) is 0
+		assert(rc == 0);
+		(void)rc;	// rc is otherwise unused when assert() compiles to nothing
+	}
+
+	~Semaphore()
+	{
+		sem_destroy(&m_sema);
+	}
+
+	bool wait()
+	{
+		// http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		int rc;
+		do {
+			rc = sem_wait(&m_sema);
+		} while (rc == -1 && errno == EINTR);	// retry when the call failed with EINTR
+		return rc == 0;
+	}
+
+	bool try_wait()	// non-blocking attempt; false when sem_trywait fails for any reason other than EINTR
+	{
+		int rc;
+		do {
+			rc = sem_trywait(&m_sema);
+		} while (rc == -1 && errno == EINTR);
+		return rc == 0;
+	}
+
+	bool timed_wait(std::uint64_t usecs)
+	{
+		struct timespec ts;	// absolute deadline: current clock reading plus usecs
+		const int usecs_in_1_sec = 1000000;
+		const int nsecs_in_1_sec = 1000000000;
+#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+		clock_gettime(CLOCK_MONOTONIC, &ts);
+#else
+		clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+		ts.tv_sec += (time_t)(usecs / usecs_in_1_sec);
+		ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000;
+		// sem_timedwait bombs if you have more than 1e9 in tv_nsec
+		// so we have to clean things up before passing it in
+		if (ts.tv_nsec >= nsecs_in_1_sec) {
+			ts.tv_nsec -= nsecs_in_1_sec;	// carry the overflowing second into tv_sec
+			++ts.tv_sec;
+		}
+
+		int rc;
+		do {
+#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
+			rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts);
+#else
+			rc = sem_timedwait(&m_sema, &ts);
+#endif
+		} while (rc == -1 && errno == EINTR);	// same deadline is reused on every retry
+		return rc == 0;
+	}
+
+	void signal()	// retries until sem_post succeeds
+	{
+		while (sem_post(&m_sema) == -1);
+	}
+
+	void signal(int count)	// posts `count` times; does nothing when count <= 0
+	{
+		while (count-- > 0)
+		{
+			while (sem_post(&m_sema) == -1);
+		}
+	}
+};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+}	// end namespace details
+
+
+//---------------------------------------------------------
+// LightweightSemaphore
+//---------------------------------------------------------
+class LightweightSemaphore	// counting semaphore: atomic counter with bounded spinning, backed by details::Semaphore for blocking
+{
+public:
+	typedef std::make_signed<std::size_t>::type ssize_t;
+
+private:
+	std::atomic<ssize_t> m_count;	// > 0: permits available; < 0: waiters that decremented it and block on m_sema (see signal())
+	details::Semaphore m_sema;	// only used once a waiter has to block
+	int m_maxSpins;	// spin iterations attempted before falling back to m_sema
+
+	bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)	// negative: no deadline; 0: do not block; positive: timed wait
+	{
+		ssize_t oldCount;
+		int spin = m_maxSpins;
+		while (--spin >= 0)
+		{
+			oldCount = m_count.load(std::memory_order_relaxed);
+			if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
+				return true;
+			std::atomic_signal_fence(std::memory_order_acquire);	 // Prevent the compiler from collapsing the loop.
+		}
+		oldCount = m_count.fetch_sub(1, std::memory_order_acquire);	// unconditional decrement: either takes a permit or registers a waiter
+		if (oldCount > 0)
+			return true;
+		if (timeout_usecs < 0)
+		{
+			if (m_sema.wait())
+				return true;
+		}
+		if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs))
+			return true;
+		// At this point, we've timed out waiting for the semaphore, but the
+		// count is still decremented indicating we may still be waiting on
+		// it. So we have to re-adjust the count, but only if the semaphore
+		// wasn't signaled enough times for us too since then. If it was, we
+		// need to release the semaphore too.
+		while (true)
+		{
+			oldCount = m_count.load(std::memory_order_acquire);
+			if (oldCount >= 0 && m_sema.try_wait())	// a signal arrived for us after all: consume it and succeed
+				return true;
+			if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))	// undo our decrement
+				return false;
+		}
+	}
+
+	ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1)	// returns the number acquired (0 on timeout)
+	{
+		assert(max > 0);
+		ssize_t oldCount;
+		int spin = m_maxSpins;
+		while (--spin >= 0)
+		{
+			oldCount = m_count.load(std::memory_order_relaxed);
+			if (oldCount > 0)
+			{
+				ssize_t newCount = oldCount > max ? oldCount - max : 0;	// take min(oldCount, max)
+				if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+					return oldCount - newCount;
+			}
+			std::atomic_signal_fence(std::memory_order_acquire);
+		}
+		oldCount = m_count.fetch_sub(1, std::memory_order_acquire);	// from here on exactly one permit is being waited for
+		if (oldCount <= 0)
+		{
+			if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs)))
+			{
+				while (true)	// same count re-adjustment as in waitWithPartialSpinning
+				{
+					oldCount = m_count.load(std::memory_order_acquire);
+					if (oldCount >= 0 && m_sema.try_wait())
+						break;
+					if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
+						return 0;
+				}
+			}
+		}
+		if (max > 1)
+			return 1 + tryWaitMany(max - 1);	// one is held; take up to max - 1 more without blocking
+		return 1;
+	}
+
+public:
+	LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins)
+	{
+		assert(initialCount >= 0);
+		assert(maxSpins >= 0);
+	}
+
+	bool tryWait()	// takes one permit if the count is positive; never blocks
+	{
+		ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+		while (oldCount > 0)
+		{
+			if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
+				return true;
+		}
+		return false;
+	}
+
+	bool wait()	// fast path first, then spin/block with no deadline
+	{
+		return tryWait() || waitWithPartialSpinning();
+	}
+
+	bool wait(std::int64_t timeout_usecs)	// false when the timeout expires without acquiring
+	{
+		return tryWait() || waitWithPartialSpinning(timeout_usecs);
+	}
+
+	// Acquires between 0 and (greedily) max, inclusive
+	ssize_t tryWaitMany(ssize_t max)
+	{
+		assert(max >= 0);
+		ssize_t oldCount = m_count.load(std::memory_order_relaxed);
+		while (oldCount > 0)
+		{
+			ssize_t newCount = oldCount > max ? oldCount - max : 0;	// take min(oldCount, max)
+			if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
+				return oldCount - newCount;
+		}
+		return 0;
+	}
+
+	// Acquires at least one, and (greedily) at most max
+	ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
+	{
+		assert(max >= 0);
+		ssize_t result = tryWaitMany(max);
+		if (result == 0 && max > 0)	// max == 0 returns 0 immediately without waiting
+			result = waitManyWithPartialSpinning(max, timeout_usecs);
+		return result;
+	}
+	
+	ssize_t waitMany(ssize_t max)	// no-deadline variant
+	{
+		ssize_t result = waitMany(max, -1);
+		assert(result > 0);
+		return result;
+	}
+
+	void signal(ssize_t count = 1)
+	{
+		assert(count >= 0);
+		ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
+		ssize_t toRelease = -oldCount < count ? -oldCount : count;	// min(-oldCount, count): positive only when the old count was negative
+		if (toRelease > 0)
+		{
+			m_sema.signal((int)toRelease);
+		}
+	}
+	
+	std::size_t availableApprox() const	// relaxed snapshot; negative counts are reported as 0
+	{
+		ssize_t count = m_count.load(std::memory_order_relaxed);
+		return count > 0 ? static_cast<std::size_t>(count) : 0;
+	}
+};
+
+}   // end namespace moodycamel
diff --git a/duix-sdk/src/main/cpp/dhcore/readerwritercircularbuffer.h b/duix-sdk/src/main/cpp/dhcore/readerwritercircularbuffer.h
new file mode 100644
index 0000000..a1946ae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/readerwritercircularbuffer.h
@@ -0,0 +1,321 @@
+// ©2020 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+// Provides a C++11 implementation of a single-producer, single-consumer wait-free concurrent
+// circular buffer (fixed-size queue).
+
+#pragma once
+
+#include <utility>
+#include <chrono>
+#include <memory>
+#include <cstdlib>
+#include <cstdint>
+#include <cassert>
+
+// Note that this implementation is fully modern C++11 (not compatible with old MSVC versions)
+// but we still include atomicops.h for its LightweightSemaphore implementation.
+#include "atomicops.h"
+
+#ifndef MOODYCAMEL_CACHE_LINE_SIZE
+#define MOODYCAMEL_CACHE_LINE_SIZE 64
+#endif
+
+namespace moodycamel {
+
+template<typename T>
+class BlockingReaderWriterCircularBuffer	// fixed-capacity ring buffer; free slots and queued items are each counted by a semaphore
+{
+public:
+	typedef T value_type;
+
+public:
+	explicit BlockingReaderWriterCircularBuffer(std::size_t capacity)
+		: maxcap(capacity), mask(), rawData(), data(),
+		slots_(new spsc_sema::LightweightSemaphore(static_cast<spsc_sema::LightweightSemaphore::ssize_t>(capacity))),
+		items(new spsc_sema::LightweightSemaphore(0)),
+		nextSlot(0), nextItem(0)
+	{
+		// Round capacity up to power of two to compute modulo mask.
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--capacity;
+		capacity |= capacity >> 1;
+		capacity |= capacity >> 2;
+		capacity |= capacity >> 4;
+		for (std::size_t i = 1; i < sizeof(std::size_t); i <<= 1)
+			capacity |= capacity >> (i << 3);
+		mask = capacity++;	// mask keeps the pre-increment value; capacity becomes the rounded-up slot count
+		rawData = static_cast<char*>(std::malloc(capacity * sizeof(T) + std::alignment_of<T>::value - 1));	// padded so `data` can be aligned; NOTE(review): result is not checked for nullptr
+		data = align_for<T>(rawData);
+	}
+
+	BlockingReaderWriterCircularBuffer(BlockingReaderWriterCircularBuffer&& other)
+		: maxcap(0), mask(0), rawData(nullptr), data(nullptr),
+		slots_(new spsc_sema::LightweightSemaphore(0)),
+		items(new spsc_sema::LightweightSemaphore(0)),
+		nextSlot(), nextItem()
+	{
+		swap(other);	// this starts empty, so `other` is left as an empty zero-capacity buffer
+	}
+
+	BlockingReaderWriterCircularBuffer(BlockingReaderWriterCircularBuffer const&) = delete;
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	~BlockingReaderWriterCircularBuffer()
+	{
+		for (std::size_t i = 0, n = items->availableApprox(); i != n; ++i)	// destroy the elements still enqueued
+			reinterpret_cast<T*>(data)[(nextItem + i) & mask].~T();
+		std::free(rawData);
+	}
+
+	BlockingReaderWriterCircularBuffer& operator=(BlockingReaderWriterCircularBuffer&& other) noexcept
+	{
+		swap(other);	// previous contents of *this end up in `other`
+		return *this;
+	}
+
+	BlockingReaderWriterCircularBuffer& operator=(BlockingReaderWriterCircularBuffer const&) = delete;
+
+	// Swaps the contents of this buffer with the contents of another.
+	// Not thread-safe.
+	void swap(BlockingReaderWriterCircularBuffer& other) noexcept
+	{
+		std::swap(maxcap, other.maxcap);
+		std::swap(mask, other.mask);
+		std::swap(rawData, other.rawData);
+		std::swap(data, other.data);
+		std::swap(slots_, other.slots_);
+		std::swap(items, other.items);
+		std::swap(nextSlot, other.nextSlot);
+		std::swap(nextItem, other.nextItem);
+	}
+
+	// Enqueues a single item (by copying it).
+	// Fails if not enough room to enqueue.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	bool try_enqueue(T const& item)
+	{
+		if (!slots_->tryWait())
+			return false;
+		inner_enqueue(item);
+		return true;
+	}
+
+	// Enqueues a single item (by moving it, if possible).
+	// Fails if not enough room to enqueue.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	bool try_enqueue(T&& item)
+	{
+		if (!slots_->tryWait())
+			return false;
+		inner_enqueue(std::move(item));
+		return true;
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// then enqueues it (via copy).
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	void wait_enqueue(T const& item)
+	{
+		while (!slots_->wait());	// keep waiting until a free slot has actually been acquired
+		inner_enqueue(item);
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// then enqueues it (via move, if possible).
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	void wait_enqueue(T&& item)
+	{
+		while (!slots_->wait());
+		inner_enqueue(std::move(item));
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// or the timeout expires. Returns false without enqueueing the item if the timeout
+	// expires, otherwise enqueues the item (via copy) and returns true.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	bool wait_enqueue_timed(T const& item, std::int64_t timeout_usecs)
+	{
+		if (!slots_->wait(timeout_usecs))
+			return false;
+		inner_enqueue(item);
+		return true;
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// or the timeout expires. Returns false without enqueueing the item if the timeout
+	// expires, otherwise enqueues the item (via move, if possible) and returns true.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	bool wait_enqueue_timed(T&& item, std::int64_t timeout_usecs)
+	{
+		if (!slots_->wait(timeout_usecs))
+			return false;
+		inner_enqueue(std::move(item));
+		return true;
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// or the timeout expires. Returns false without enqueueing the item if the timeout
+	// expires, otherwise enqueues the item (via copy) and returns true.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	template<typename Rep, typename Period>
+	inline bool wait_enqueue_timed(T const& item, std::chrono::duration<Rep, Period> const& timeout)
+	{
+		return wait_enqueue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());	// forwards the timeout as a microsecond count
+	}
+
+	// Blocks the current thread until there's enough space to enqueue the given item,
+	// or the timeout expires. Returns false without enqueueing the item if the timeout
+	// expires, otherwise enqueues the item (via move, if possible) and returns true.
+	// Thread-safe when called by producer thread.
+	// No exception guarantee (state will be corrupted) if constructor of T throws.
+	template<typename Rep, typename Period>
+	inline bool wait_enqueue_timed(T&& item, std::chrono::duration<Rep, Period> const& timeout)
+	{
+		return wait_enqueue_timed(std::move(item), std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+	}
+
+	// Attempts to dequeue a single item.
+	// Returns false if the buffer is empty.
+	// Thread-safe when called by consumer thread.
+	// No exception guarantee (state will be corrupted) if assignment operator of U throws.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		if (!items->tryWait())
+			return false;
+		inner_dequeue(item);
+		return true;
+	}
+
+	// Blocks the current thread until there's something to dequeue, then dequeues it.
+	// Thread-safe when called by consumer thread.
+	// No exception guarantee (state will be corrupted) if assignment operator of U throws.
+	template<typename U>
+	void wait_dequeue(U& item)
+	{
+		while (!items->wait());	// keep waiting until an item has actually been acquired
+		inner_dequeue(item);
+	}
+
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+	// timeout expires, otherwise assigns to `item` and returns true.
+	// Thread-safe when called by consumer thread.
+	// No exception guarantee (state will be corrupted) if assignment operator of U throws.
+	template<typename U>
+	bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs)
+	{
+		if (!items->wait(timeout_usecs))
+			return false;
+		inner_dequeue(item);
+		return true;
+	}
+
+	// Blocks the current thread until either there's something to dequeue
+	// or the timeout expires. Returns false without setting `item` if the
+	// timeout expires, otherwise assigns to `item` and returns true.
+	// Thread-safe when called by consumer thread.
+	// No exception guarantee (state will be corrupted) if assignment operator of U throws.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& item, std::chrono::duration<Rep, Period> const& timeout)
+	{
+		return wait_dequeue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+	}
+
+	// Returns a pointer to the next element in the queue (the one that would
+	// be removed next by a call to `try_dequeue` or `try_pop`). If the queue
+	// appears empty at the time the method is called, returns nullptr instead.
+	// Thread-safe when called by consumer thread.
+	inline T* peek()
+	{
+		if (!items->availableApprox())
+			return nullptr;
+		return inner_peek();
+	}
+
+	// Pops the next element from the queue, if there is one.
+	// Thread-safe when called by consumer thread.
+	inline bool try_pop()
+	{
+		if (!items->tryWait())
+			return false;
+		inner_pop();
+		return true;
+	}
+
+	// Returns a (possibly outdated) snapshot of the total number of elements currently in the buffer.
+	// Thread-safe.
+	inline std::size_t size_approx() const
+	{
+		return items->availableApprox();
+	}
+
+	// Returns the maximum number of elements that this circular buffer can hold at once.
+	// Thread-safe.
+	inline std::size_t max_capacity() const
+	{
+		return maxcap;
+	}
+
+private:
+	template<typename U>
+	void inner_enqueue(U&& item)	// caller must already hold a slot from slots_
+	{
+		std::size_t i = nextSlot++;
+		new (reinterpret_cast<T*>(data) + (i & mask)) T(std::forward<U>(item));	// construct in place, then publish via items
+		items->signal();
+	}
+
+	template<typename U>
+	void inner_dequeue(U& item)	// caller must already hold an item from items
+	{
+		std::size_t i = nextItem++;
+		T& element = reinterpret_cast<T*>(data)[i & mask];
+		item = std::move(element);
+		element.~T();	// slot memory is reused, so the moved-from element is destroyed explicitly
+		slots_->signal();
+	}
+
+	T* inner_peek()	// address of the element at nextItem; does not consume it
+	{
+		return reinterpret_cast<T*>(data) + (nextItem & mask);
+	}
+
+	void inner_pop()	// destroys the front element without copying it out, then frees its slot
+	{
+		std::size_t i = nextItem++;
+		reinterpret_cast<T*>(data)[i & mask].~T();
+		slots_->signal();
+	}
+
+	template<typename U>
+	static inline char* align_for(char* ptr)	// advances ptr to the next address that is a multiple of U's alignment
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+
+private:
+	std::size_t maxcap;                           // actual (non-power-of-two) capacity
+	std::size_t mask;                             // circular buffer capacity mask (for cheap modulo)
+	char* rawData;                                // raw circular buffer memory
+	char* data;                                   // circular buffer memory aligned to element alignment
+	std::unique_ptr<spsc_sema::LightweightSemaphore> slots_;  // number of slots currently free (named with underscore to accommodate Qt's 'slots' macro)
+	std::unique_ptr<spsc_sema::LightweightSemaphore> items;   // number of elements currently enqueued
+	char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(char*) * 2 - sizeof(std::size_t) * 2 - sizeof(std::unique_ptr<spsc_sema::LightweightSemaphore>) * 2];
+	std::size_t nextSlot;                         // index of next free slot to enqueue into
+	char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(std::size_t)];
+	std::size_t nextItem;                         // index of next element to dequeue from
+};
+
+}
diff --git a/duix-sdk/src/main/cpp/dhcore/readerwriterqueue.h b/duix-sdk/src/main/cpp/dhcore/readerwriterqueue.h
new file mode 100644
index 0000000..d461141
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhcore/readerwriterqueue.h
@@ -0,0 +1,979 @@
+// ©2013-2020 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+#include "atomicops.h"
+#include <new>
+#include <type_traits>
+#include <utility>
+#include <cassert>
+#include <stdexcept>
+#include <new>
+#include <cstdint>
+#include <cstdlib>		// For malloc/free/abort & size_t
+#include <memory>
+#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012
+#include <chrono>
+#endif
+
+
+// A lock-free queue for a single-consumer, single-producer architecture.
+// The queue is also wait-free in the common path (except if more memory
+// needs to be allocated, in which case malloc is called).
+// Allocates memory sparingly, and only once if the original maximum size
+// estimate is never exceeded.
+// Tested on x86/x64 processors, but semantics should be correct for all
+// architectures (given the right implementations in atomicops.h), provided
+// that aligned integer and pointer accesses are naturally atomic.
+// Note that there should only be one consumer thread and producer thread;
+// Switching roles of the threads, or using multiple consecutive threads for
+// one role, is not safe unless properly synchronized.
+// Using the queue exclusively from one thread is fine, though a bit silly.
+
+#ifndef MOODYCAMEL_CACHE_LINE_SIZE
+#define MOODYCAMEL_CACHE_LINE_SIZE 64
+#endif
+
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+//#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+
+#ifndef MOODYCAMEL_HAS_EMPLACE
+#if !defined(_MSC_VER) || _MSC_VER >= 1800 // variadic templates: either a non-MS compiler or VS >= 2013
+#define MOODYCAMEL_HAS_EMPLACE    1
+#endif
+#endif
+
+#ifndef MOODYCAMEL_MAYBE_ALIGN_TO_CACHELINE
+#if defined (__APPLE__) && defined (__MACH__) && __cplusplus >= 201703L
+// This is required to find out what deployment target we are using
+#include <AvailabilityMacros.h>
+#if !defined(MAC_OS_X_VERSION_MIN_REQUIRED) || !defined(MAC_OS_X_VERSION_10_14) || MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_14
+// C++17 new(size_t, align_val_t) is not backwards-compatible with older versions of macOS, so we can't support over-alignment in this case
+#define MOODYCAMEL_MAYBE_ALIGN_TO_CACHELINE
+#endif
+#endif
+#endif
+
+#ifndef MOODYCAMEL_MAYBE_ALIGN_TO_CACHELINE
+#define MOODYCAMEL_MAYBE_ALIGN_TO_CACHELINE AE_ALIGN(MOODYCAMEL_CACHE_LINE_SIZE)
+#endif
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
+#pragma warning(disable: 4820)	// padding was added
+#pragma warning(disable: 4127)	// conditional expression is constant
+#endif
+
+namespace moodycamel {
+
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class MOODYCAMEL_MAYBE_ALIGN_TO_CACHELINE ReaderWriterQueue
+{
+	// Design: Based on a queue-of-queues. The low-level queues are just
+	// circular buffers with front and tail indices indicating where the
+	// next element to dequeue is and where the next element can be enqueued,
+	// respectively. Each low-level queue is called a "block". Each block
+	// wastes exactly one element's worth of space to keep the design simple
+	// (if front == tail then the queue is empty, and can't be full).
+	// The high-level queue is a circular linked list of blocks; again there
+	// is a front and tail, but this time they are pointers to the blocks.
+	// The front block is where the next element to be dequeued is, provided
+	// the block is not empty. The back block is where elements are to be
+	// enqueued, provided the block is not full.
+	// The producer thread owns all the tail indices/pointers. The consumer
+	// thread owns all the front indices/pointers. Both threads read each
+	// other's variables, but only the owning thread updates them. E.g. After
+	// the consumer reads the producer's tail, the tail may change before the
+	// consumer is done dequeuing an object, but the consumer knows the tail
+	// will never go backwards, only forwards.
+	// If there is no room to enqueue an object, an additional block (of
+	// equal size to the last block) is added. Blocks are never removed.
+
+public:
+	typedef T value_type;
+
+	// Constructs a queue that can hold at least `size` elements without further
+	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
+	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
+	// at least one extra buffer block).
+	AE_NO_TSAN explicit ReaderWriterQueue(size_t size = 15)
+#ifndef NDEBUG
+		: enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
+		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
+		
+		Block* firstBlock = nullptr;
+		
+		largestBlockSize = ceilToPow2(size + 1);		// We need a spare slot to fit size elements in the block
+		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {	// Too large for one block: pre-allocate a ring of MAX_BLOCK_SIZE blocks
+			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
+			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
+			// between front == tail meaning "empty" and "full".
+			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
+			// number of blocks - 1. Solving for size and applying a ceiling to the division gives us (after simplifying):
+			size_t initialBlockCount = (size + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
+			largestBlockSize = MAX_BLOCK_SIZE;
+			Block* lastBlock = nullptr;
+			for (size_t i = 0; i != initialBlockCount; ++i) {
+				auto block = make_block(largestBlockSize);
+				if (block == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+					throw std::bad_alloc();
+#else
+					abort();	// Without exceptions an allocation failure during construction is fatal
+#endif
+				}
+				if (firstBlock == nullptr) {
+					firstBlock = block;
+				}
+				else {
+					lastBlock->next = block;
+				}
+				lastBlock = block;
+				block->next = firstBlock;	// Close the ring after every append so the list is always circular
+			}
+		}
+		else {	// Small request: a single block whose next pointer refers to itself
+			firstBlock = make_block(largestBlockSize);
+			if (firstBlock == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+				throw std::bad_alloc();
+#else
+				abort();
+#endif
+			}
+			firstBlock->next = firstBlock;
+		}
+		frontBlock = firstBlock;	// Consumer and producer both start on the first block
+		tailBlock = firstBlock;
+		
+		// Make sure the reader/writer threads will have the initialized memory setup above:
+		fence(memory_order_sync);
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being moved. It's up to the user to synchronize this.
+	AE_NO_TSAN ReaderWriterQueue(ReaderWriterQueue&& other)
+		: frontBlock(other.frontBlock.load()),	// Take over other's entire block ring (no elements are copied or moved)
+		tailBlock(other.tailBlock.load()),
+		largestBlockSize(other.largestBlockSize)
+#ifndef NDEBUG
+		,enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		other.largestBlockSize = 32;	// `other` gets a fresh, self-linked block of this size so it stays a valid empty queue
+		Block* b = other.make_block(other.largestBlockSize);
+		if (b == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+			throw std::bad_alloc();
+#else
+			abort();
+#endif
+		}
+		b->next = b;
+		other.frontBlock = b;
+		other.tailBlock = b;
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being moved. It's up to the user to synchronize this.
+	ReaderWriterQueue& operator=(ReaderWriterQueue&& other) AE_NO_TSAN
+	{
+		Block* b = frontBlock.load();	// Implemented as a swap: *this and other exchange front block, ...
+		frontBlock = other.frontBlock.load();
+		other.frontBlock = b;
+		b = tailBlock.load();		// ... tail block, ...
+		tailBlock = other.tailBlock.load();
+		other.tailBlock = b;
+		std::swap(largestBlockSize, other.largestBlockSize);	// ... and block-size bookkeeping; nothing is freed here
+		return *this;
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	AE_NO_TSAN ~ReaderWriterQueue()
+	{
+		// Make sure we get the latest version of all variables from other CPUs:
+		fence(memory_order_sync);
+
+		// Destroy any remaining objects in queue and free memory
+		Block* frontBlock_ = frontBlock;
+		Block* block = frontBlock_;
+		do {	// Walk the circular block list exactly once, starting at the front block
+			Block* nextBlock = block->next;	// Read the link before the block's memory is released
+			size_t blockFront = block->front;
+			size_t blockTail = block->tail;
+
+			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {	// Live elements occupy [front, tail), with wrap-around
+				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
+				element->~T();
+				(void)element;
+			}
+			
+			auto rawBlock = block->rawThis;	// Original malloc pointer; the Block object itself sits at an aligned offset inside it
+			block->~Block();
+			std::free(rawBlock);
+			block = nextBlock;
+		} while (block != frontBlock_);
+	}
+
+
+	// Copy-enqueues `element` only if an already-allocated block has room.
+	// Returns true if the element was enqueued, false if the queue was full.
+	// Never allocates memory (runs inner_enqueue in CannotAlloc mode).
+	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(element);
+	}
+
+	// Move-enqueues `element` only if an already-allocated block has room.
+	// Returns true if the element was enqueued; returns false (without
+	// moving from `element`) if the queue was full. Never allocates memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like try_enqueue(), but constructs the element in place from `args`; never allocates, returns false if full.
+	template<typename... Args>
+	AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<Args>(args)...);
+	}
+#endif
+
+	// Copy-enqueues `element`, allocating an additional block if every
+	// usable block is full (runs inner_enqueue in CanAlloc mode).
+	// Returns false only if that allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(element);
+	}
+
+	// Move-enqueues `element`, allocating an additional block if every
+	// usable block is full (runs inner_enqueue in CanAlloc mode).
+	// Returns false only if that allocation fails; `element` is then left untouched.
+	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(std::forward<T>(element));
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like enqueue(), but constructs the element in place from `args`; may allocate, returns false only if allocation fails.
+	template<typename... Args>
+	AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(std::forward<Args>(args)...);
+	}
+#endif
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result) AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+
+		// High-level pseudocode:
+		// Remember where the tail block is
+		// If the front block has an element in it, dequeue it
+		// Else
+		//     If front block was the tail block when we entered the function, return false
+		//     Else advance to next block and dequeue the item there
+
+		// Note that we have to use the value of the tail block from before we check if the front
+		// block is empty or not, in case the front block is empty and then, before we check if the
+		// tail block is at the front block or not, the producer fills up the front block *and
+		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
+		// reproducible in practice.
+		// In order to avoid overhead in the common case, though, we do a double-checked pattern
+		// where we have the fast path if the front block is not empty, then read the tail block,
+		// then re-read the front block and check if it's not empty again, then check if the tail
+		// block has advanced.
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;	// Consumer's cached copy of tail; refreshed below only if the block looks empty
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			// Front block not empty, dequeue from here
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			result = std::move(*element);
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;	// Advance with wrap-around
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;	// Publish the freed slot to the producer
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				// Oh look, the front block isn't empty after all
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
+			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
+			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
+
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			// Since the tailBlock is only ever advanced after being written to,
+			// we know there's for sure an element to dequeue on it
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			// We're done with this block, let the producer use it if it needs
+			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);	// Not strictly needed
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			
+			result = std::move(*element);
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;	// Publish the freed slot in the new front block
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	T* peek() const AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;	// Cached tail first; the real tail is loaded only if the block looks empty
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+		non_empty_front_block:
+			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));	// Front element lives in the current front block
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {	// Re-check: the producer may have filled the front block in the meantime
+				goto non_empty_front_block;
+			}
+			
+			Block* nextBlock = frontBlock_->next;	// Front block is empty: the front element is in the next block
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlock->tail.load());
+			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));	// Unlike try_dequeue(), frontBlock itself is not advanced
+		}
+		
+		return nullptr;	// Queue appeared empty
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	bool pop() AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;	// Cached tail first; the real tail is loaded only if the block looks empty
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			element->~T();	// Destroy in place; the value is discarded rather than moved out
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;	// Advance with wrap-around
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;	// Publish the freed slot to the producer
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {	// Re-check: the producer may have filled the front block in the meantime
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			fence(memory_order_release);
+			frontBlock = frontBlock_ = nextBlock;	// Hand the drained block back to the producer by moving the front on
+
+			compiler_fence(memory_order_release);
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;	// Publish the freed slot in the new front block
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	inline size_t size_approx() const AE_NO_TSAN
+	{
+		Block* const start = frontBlock.load();
+		size_t count = 0;				// running total over every block in the ring
+		Block* cur = start;
+		do {
+			fence(memory_order_acquire);
+			const size_t head = cur->front.load();
+			const size_t tail = cur->tail.load();
+			count += (tail - head) & cur->sizeMask;	// masked difference also covers a wrapped-around tail
+			cur = cur->next.load();
+		} while (cur != start);				// the block list is circular
+		return count;
+	}
+
+	// Returns the total number of items that could be enqueued without incurring
+	// an allocation when this queue is empty.
+	// Safe to call from both the producer and consumer threads.
+	//
+	// NOTE: The actual capacity during usage may be different depending on the consumer.
+	//       If the consumer is removing elements concurrently, the producer cannot add to
+	//       the block the consumer is removing from until it's completely empty, except in
+	//       the case where the producer was writing to the same block the consumer was
+	//       reading from the whole time.
+	inline size_t max_capacity() const {
+		Block* const start = frontBlock.load();
+		size_t capacity = 0;			// sum of the per-block capacities
+		Block* cur = start;
+		do {
+			fence(memory_order_acquire);
+			capacity += cur->sizeMask;	// sizeMask == block size - 1, i.e. the block minus its one spare slot
+			cur = cur->next.load();
+		} while (cur != start);			// the block list is circular
+		return capacity;
+	}
+
+
+private:
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+#if MOODYCAMEL_HAS_EMPLACE
+	template<AllocationMode canAlloc, typename... Args>
+	bool inner_enqueue(Args&&... args) AE_NO_TSAN
+#else
+	template<AllocationMode canAlloc, typename U>
+	bool inner_enqueue(U&& element) AE_NO_TSAN
+#endif
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->enqueuing);
+#endif
+
+		// High-level pseudocode (assuming we're allowed to alloc a new block):
+		// If room in tail block, add to tail
+		// Else check next block
+		//     If next block is not the head block, enqueue on next block
+		//     Else create a new block and enqueue there
+		//     Advance tail to the block we just enqueued to
+
+		Block* tailBlock_ = tailBlock.load();
+		size_t blockFront = tailBlock_->localFront;	// Producer's cached copy of front; refreshed below only if the block looks full
+		size_t blockTail = tailBlock_->tail.load();
+
+		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;	// Slot after tail, with wrap-around
+		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
+			fence(memory_order_acquire);
+			// This block has room for at least one more element
+			char* location = tailBlock_->data + blockTail * sizeof(T);
+#if MOODYCAMEL_HAS_EMPLACE
+			new (location) T(std::forward<Args>(args)...);
+#else
+			new (location) T(std::forward<U>(element));
+#endif
+
+			fence(memory_order_release);
+			tailBlock_->tail = nextBlockTail;	// Publish the new element to the consumer
+		}
+		else {
+			fence(memory_order_acquire);
+			if (tailBlock_->next.load() != frontBlock) {
+				// Note that the reason we can't advance to the frontBlock and start adding new entries there
+				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
+				// instead of advancing to the next full block (whose values were enqueued first and so should be
+				// consumed first).
+
+				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
+
+				// tailBlock is full, but there's a free block ahead, use it
+				Block* tailBlockNext = tailBlock_->next.load();
+				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
+				nextBlockTail = tailBlockNext->tail.load();
+				fence(memory_order_acquire);
+
+				// This block must be empty since it's not the head block and we
+				// go through the blocks in a circle
+				assert(nextBlockFront == nextBlockTail);
+				tailBlockNext->localFront = nextBlockFront;
+
+				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
+#if MOODYCAMEL_HAS_EMPLACE
+				new (location) T(std::forward<Args>(args)...);
+#else
+				new (location) T(std::forward<U>(element));
+#endif
+
+				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
+
+				fence(memory_order_release);
+				tailBlock = tailBlockNext;	// Only now may the consumer advance into this block
+			}
+			else if (canAlloc == CanAlloc) {
+				// tailBlock is full and there's no free block ahead; create a new block
+				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;	// Double the size until MAX_BLOCK_SIZE is reached
+				auto newBlock = make_block(newBlockSize);
+				if (newBlock == nullptr) {
+					// Could not allocate a block!
+					return false;
+				}
+				largestBlockSize = newBlockSize;
+
+#if MOODYCAMEL_HAS_EMPLACE
+				new (newBlock->data) T(std::forward<Args>(args)...);
+#else
+				new (newBlock->data) T(std::forward<U>(element));
+#endif
+				assert(newBlock->front == 0);
+				newBlock->tail = newBlock->localTail = 1;	// The new block starts out holding exactly this one element, in slot 0
+
+				newBlock->next = tailBlock_->next.load();	// Splice the new block into the ring right after the current tail block
+				tailBlock_->next = newBlock;
+
+				// Might be possible for the dequeue thread to see the new tailBlock->next
+				// *without* seeing the new tailBlock value, but this is OK since it can't
+				// advance to the next block until tailBlock is set anyway (because the only
+				// case where it could try to read the next is if it's already at the tailBlock,
+				// and it won't advance past tailBlock in any circumstance).
+
+				fence(memory_order_release);
+				tailBlock = newBlock;
+			}
+			else if (canAlloc == CannotAlloc) {
+				// Would have had to allocate a new block to enqueue, but not allowed
+				return false;
+			}
+			else {
+				assert(false && "Should be unreachable code");
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+
+	// Non-copyable: a queue exclusively owns its ring of blocks (move it instead).
+	ReaderWriterQueue(ReaderWriterQueue const&) = delete;
+
+	// Non-copy-assignable for the same reason; any attempt is rejected at compile time.
+	ReaderWriterQueue& operator=(ReaderWriterQueue const&) = delete;
+
+
+	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
+	{
+		// Rounds x up to the nearest power of two (a power of two maps to itself).
+		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		// Smear the highest set bit of x - 1 into every lower bit position by
+		// OR-ing in right-shifts of 1, 2, 4, ... bits, then add one so the
+		// all-ones pattern carries into the next power of two.
+		--x;
+		for (size_t shift = 1; shift < sizeof(size_t) * 8; shift <<= 1) {
+			x |= x >> shift;
+		}
+		return x + 1;
+	}
+	
+	template<typename U>
+	static AE_FORCEINLINE char* align_for(char* ptr) AE_NO_TSAN	// rounds ptr up to the nearest address suitably aligned for U
+	{
+		const std::size_t misalign = reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of<U>::value;
+		return misalign == 0 ? ptr : ptr + (std::alignment_of<U>::value - misalign);
+	}
+private:
+#ifndef NDEBUG
+	struct ReentrantGuard	// Debug-only scope guard: marks an enqueue/dequeue section as occupied for its lifetime
+	{
+		AE_NO_TSAN ReentrantGuard(weak_atomic<bool>& _inSection)
+			: inSection(_inSection)
+		{
+			assert(!inSection && "Concurrent (or re-entrant) enqueue or dequeue operation detected (only one thread at a time may hold the producer or consumer role)");
+			inSection = true;
+		}
+		AE_NO_TSAN ~ReentrantGuard()
+		{
+			inSection = false;	// Section left: the next operation may enter
+		}
+
+	private:
+		ReentrantGuard& operator=(ReentrantGuard const&);	// Declared but never defined: guards are not assignable
+		weak_atomic<bool>& inSection;				// Flag owned by the queue, shared by all guards of one role
+	};
+#endif
+
+	struct Block	// One fixed-size circular buffer in the queue's ring; header and element storage share one allocation
+	{
+		// Avoid false-sharing by putting highly contended variables on their own cache lines
+		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
+		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
+		
+		char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
+		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
+		size_t localFront;			// An uncontended shadow copy of front, owned by the producer
+		
+		char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
+		weak_atomic<Block*> next;	// (Atomic)
+		
+		char* data;		// Contents (on heap) are aligned to T's alignment
+
+		const size_t sizeMask;	// Block size - 1: wrap-around mask for indices, and the most elements the block can hold
+
+
+		// size must be a power of two (and greater than 0)
+		AE_NO_TSAN Block(size_t const& _size, char* _rawThis, char* _data)
+			: front(0UL), localTail(0), tail(0UL), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
+		{
+		}
+
+	private:
+		// C4512 - Assignment operator could not be generated
+		Block& operator=(Block const&);
+
+	public:
+		char* rawThis;	// Unaligned pointer returned by malloc for this block; this is what gets passed to free
+	};
+	
+	
+	static Block* make_block(size_t capacity) AE_NO_TSAN
+	{
+		// A single malloc holds the Block header followed by storage for `capacity` elements;
+		// each part gets (alignment - 1) bytes of slack so it can be aligned by hand.
+		const size_t headerBytes = sizeof(Block) + std::alignment_of<Block>::value - 1;
+		const size_t elementBytes = sizeof(T) * capacity + std::alignment_of<T>::value - 1;
+		char* const raw = static_cast<char*>(std::malloc(headerBytes + elementBytes));
+		if (!raw) {
+			return nullptr;		// Out of memory; the caller decides how to react
+		}
+		char* const header = align_for<Block>(raw);
+		char* const elements = align_for<T>(header + sizeof(Block));
+		return new (header) Block(capacity, raw, elements);	// raw is stored so the allocation can be freed later
+	}
+
+private:
+	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are dequeued from this block
+	
+	char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];
+	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are enqueued to this block
+
+	size_t largestBlockSize;
+
+#ifndef NDEBUG
+	weak_atomic<bool> enqueuing;
+	mutable weak_atomic<bool> dequeuing;
+#endif
+};
+
+// Like ReaderWriterQueue, but also provides blocking operations
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class BlockingReaderWriterQueue
+{
+private:
+	typedef ::moodycamel::ReaderWriterQueue<T, MAX_BLOCK_SIZE> ReaderWriterQueue;
+	
+public:
+	explicit BlockingReaderWriterQueue(size_t size = 15) AE_NO_TSAN	// `size` is forwarded unchanged to the inner lock-free queue
+		: inner(size), sema(new spsc_sema::LightweightSemaphore())	// The semaphore is signalled once per successfully enqueued element
+	{ }
+
+	BlockingReaderWriterQueue(BlockingReaderWriterQueue&& other) AE_NO_TSAN	// Takes over other's queue contents and semaphore
+		: inner(std::move(other.inner)), sema(std::move(other.sema))	// NOTE(review): other.sema is left null here, so `other` looks unusable until re-assigned - confirm no caller touches it
+	{ }
+
+	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue&& other) AE_NO_TSAN
+	{
+		sema.swap(other.sema);			// Move-assignment is a swap: exchange the semaphores ...
+		std::swap(inner, other.inner);	// ... and the underlying queues, so `other` ends up holding our previous state
+		return *this;
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
+	{
+		if (!inner.try_enqueue(element)) {
+			return false;		// No room: nothing was queued, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
+	{
+		if (!inner.try_enqueue(std::forward<T>(element))) {
+			return false;		// No room: nothing was queued, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like try_enqueue() but with emplace semantics (i.e. construct-in-place).
+	template<typename... Args>
+	AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN
+	{
+		if (!inner.try_emplace(std::forward<Args>(args)...)) {
+			return false;		// No room: nothing was constructed, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+#endif
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
+	{
+		if (!inner.enqueue(element)) {
+			return false;		// Inner enqueue failed: nothing was queued, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
+	{
+		if (!inner.enqueue(std::forward<T>(element))) {
+			return false;		// Inner enqueue failed: nothing was queued, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like enqueue() but with emplace semantics (i.e. construct-in-place).
+	template<typename... Args>
+	AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN
+	{
+		if (!inner.emplace(std::forward<Args>(args)...)) {
+			return false;		// Inner emplace failed: nothing was constructed, so the semaphore is left alone
+		}
+		sema->signal();			// One signal per element that was actually enqueued
+		return true;
+	}
+#endif
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result) AE_NO_TSAN
+	{
+		if (!sema->tryWait()) {
+			return false;		// Could not claim an item without waiting: report the queue as empty
+		}
+		bool dequeued = inner.try_dequeue(result);	// An item was claimed, so the inner dequeue is expected to succeed
+		assert(dequeued);
+		AE_UNUSED(dequeued);
+		return true;
+	}
+	
+	
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available, then dequeues it.
+	template<typename U>
+	void wait_dequeue(U& result) AE_NO_TSAN
+	{
+		for (;;) { if (sema->wait()) break; }		// Retry until wait() reports that an item was claimed
+		bool dequeued = inner.try_dequeue(result);	// An item was claimed, so the inner dequeue is expected to succeed
+		AE_UNUSED(result);
+		assert(dequeued);
+		AE_UNUSED(dequeued);
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U>
+	bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs) AE_NO_TSAN
+	{
+		const bool claimed = sema->wait(timeout_usecs);	// false means the wait ended without an item being claimed
+		if (claimed) {
+			bool dequeued = inner.try_dequeue(result);	// An item was claimed, so the inner dequeue is expected to succeed
+			AE_UNUSED(result);
+			assert(dequeued);
+			AE_UNUSED(dequeued);
+		}
+		return claimed;
+	}
+
+
+#if __cplusplus > 199711L || _MSC_VER >= 1700
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& result, std::chrono::duration<Rep, Period> const& timeout) AE_NO_TSAN
+	{
+        return wait_dequeue_timed(result, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());	// Converted to a microsecond count and passed to the integer-timeout overload
+	}
+#endif
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	AE_FORCEINLINE T* peek() const AE_NO_TSAN	// Forwards to the inner queue; the semaphore is neither read nor modified
+	{
+		return inner.peek();
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	AE_FORCEINLINE bool pop() AE_NO_TSAN
+	{
+		if (!sema->tryWait()) {
+			return false;		// Could not claim an item without waiting: report the queue as empty
+		}
+		bool popped = inner.pop();	// An item was claimed, so the inner pop is expected to succeed
+		assert(popped);
+		AE_UNUSED(popped);
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	AE_FORCEINLINE size_t size_approx() const AE_NO_TSAN	// Answered from the semaphore (availableApprox()); the inner queue's blocks are not walked
+	{
+		return sema->availableApprox();
+	}
+
+	// Returns the total number of items that could be enqueued without incurring
+	// an allocation when this queue is empty.
+	// Safe to call from both the producer and consumer threads.
+	//
+	// NOTE: The actual capacity during usage may be different depending on the consumer.
+	//       If the consumer is removing elements concurrently, the producer cannot add to
+	//       the block the consumer is removing from until it's completely empty, except in
+	//       the case where the producer was writing to the same block the consumer was
+	//       reading from the whole time.
+	AE_FORCEINLINE size_t max_capacity() const {	// Forwards to the inner queue's max_capacity()
+		return inner.max_capacity();
+	}
+
+private:
+	// Non-copyable and non-copy-assignable (the semaphore and queue are uniquely owned); misuse fails at compile time.
+	BlockingReaderWriterQueue(BlockingReaderWriterQueue const&) = delete;
+	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue const&) = delete;
+	
+private:
+	ReaderWriterQueue inner;
+	std::unique_ptr<spsc_sema::LightweightSemaphore> sema;
+};
+
+}    // end namespace moodycamel
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
diff --git a/duix-sdk/src/main/cpp/dhmfcc/AudioFFT.cpp b/duix-sdk/src/main/cpp/dhmfcc/AudioFFT.cpp
new file mode 100644
index 0000000..ad10b3d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/AudioFFT.cpp
@@ -0,0 +1,1146 @@
+#include "mfcc/AudioFFT.hpp"
+
+
+namespace audiofft
+{
+
+    namespace detail
+    {
+
+
+        constexpr bool IsPowerOf2(size_t val)  // true when at most one bit of val is set (so 0 is accepted as well as 1, 2, 4, ...)
+        {
+          return ((val - 1) & val) == 0;  // val - 1 clears the lowest set bit's position; for val == 0 it wraps to all-ones and the AND is still 0
+        }
+
+
+        template<typename TypeDest, typename TypeSrc>
+        void ConvertBuffer(TypeDest* dest, const TypeSrc* src, size_t len)
+        {
+          for (size_t i=0; i<len; ++i)
+          {
+            dest[i] = static_cast<TypeDest>(src[i]);
+          }
+        }
+
+
+        template<typename TypeDest, typename TypeSrc, typename TypeFactor>
+        void ScaleBuffer(TypeDest* dest, const TypeSrc* src, const TypeFactor factor, size_t len)
+        {
+          for (size_t i=0; i<len; ++i)
+          {
+            dest[i] = static_cast<TypeDest>(static_cast<TypeFactor>(src[i]) * factor);
+          }
+        }
+
+    } // End of namespace detail
+
+
+    // ================================================================
+
+
+#ifdef AUDIOFFT_OOURA_USED
+
+    /**
+     * @internal
+     * @class OouraFFT
+     * @brief FFT implementation based on the great radix-4 routines by Takuya Ooura
+     */
+    class OouraFFT : public detail::AudioFFTImpl
+    {
+    public:
+        OouraFFT() :
+                detail::AudioFFTImpl(),
+                _size(0),
+                _ip(),
+                _w(),
+                _buffer()
+        {
+        }
+
+        OouraFFT(const OouraFFT&) = delete;
+        OouraFFT& operator=(const OouraFFT&) = delete;
+
+        virtual void init(size_t size) override
+        {
+          if (_size != size)
+          {
+            _ip.resize(2 + static_cast<int>(std::sqrt(static_cast<double>(size))));
+            _w.resize(size / 2);
+            _buffer.resize(size);
+            _size = size;
+
+            const int size4 = static_cast<int>(_size) / 4;
+            makewt(size4, _ip.data(), _w.data());
+            makect(size4, _ip.data(), _w.data() + size4);
+          }
+        }
+
+        virtual void fft(const float* data, float* re, float* im) override
+        {
+          // Convert into the format as required by the Ooura FFT
+          detail::ConvertBuffer(_buffer.data(), data, _size);
+
+          rdft(static_cast<int>(_size), +1, _buffer.data(), _ip.data(), _w.data());
+
+          // Convert back to split-complex
+          {
+            double* b = _buffer.data();
+            double* bEnd = b + _size;
+            float *r = re;
+            float *i = im;
+            while (b != bEnd)
+            {
+              *(r++) = static_cast<float>(*(b++));
+              *(i++) = static_cast<float>(-(*(b++)));
+            }
+          }
+          const size_t size2 = _size / 2;
+          re[size2] = -im[0];
+          im[0] = 0.0;
+          im[size2] = 0.0;
+        }
+
+        virtual void ifft(float* data, const float* re, const float* im) override
+        {
+          // Convert into the format as required by the Ooura FFT
+          {
+            double* b = _buffer.data();
+            double* bEnd = b + _size;
+            const float *r = re;
+            const float *i = im;
+            while (b != bEnd)
+            {
+              *(b++) = static_cast<double>(*(r++));
+              *(b++) = -static_cast<double>(*(i++));
+            }
+            _buffer[1] = re[_size / 2];
+          }
+
+          rdft(static_cast<int>(_size), -1, _buffer.data(), _ip.data(), _w.data());
+
+          // Convert back to split-complex
+          detail::ScaleBuffer(data, _buffer.data(), 2.0 / static_cast<double>(_size), _size);
+        }
+
+    private:
+        size_t _size;
+        std::vector<int> _ip;
+        std::vector<double> _w;
+        std::vector<double> _buffer;
+
+        void rdft(int n, int isgn, double *a, int *ip, double *w)
+        {
+          int nw = ip[0];
+          int nc = ip[1];
+
+          if (isgn >= 0)
+          {
+            if (n > 4)
+            {
+              bitrv2(n, ip + 2, a);
+              cftfsub(n, a, w);
+              rftfsub(n, a, nc, w + nw);
+            }
+            else if (n == 4)
+            {
+              cftfsub(n, a, w);
+            }
+            double xi = a[0] - a[1];
+            a[0] += a[1];
+            a[1] = xi;
+          }
+          else
+          {
+            a[1] = 0.5 * (a[0] - a[1]);
+            a[0] -= a[1];
+            if (n > 4)
+            {
+              rftbsub(n, a, nc, w + nw);
+              bitrv2(n, ip + 2, a);
+              cftbsub(n, a, w);
+            }
+            else if (n == 4)
+            {
+              cftfsub(n, a, w);
+            }
+          }
+        }
+
+
+        /* -------- initializing routines -------- */
+
+        void makewt(int nw, int *ip, double *w)
+        {
+          int j, nwh;
+          double delta, x, y;
+
+          ip[0] = nw;
+          ip[1] = 1;
+          if (nw > 2) {
+            nwh = nw >> 1;
+            delta = atan(1.0) / nwh;
+            w[0] = 1;
+            w[1] = 0;
+            w[nwh] = cos(delta * nwh);
+            w[nwh + 1] = w[nwh];
+            if (nwh > 2) {
+              for (j = 2; j < nwh; j += 2) {
+                x = cos(delta * j);
+                y = sin(delta * j);
+                w[j] = x;
+                w[j + 1] = y;
+                w[nw - j] = y;
+                w[nw - j + 1] = x;
+              }
+              bitrv2(nw, ip + 2, w);
+            }
+          }
+        }
+
+
+        void makect(int nc, int *ip, double *c)
+        {
+          int j, nch;
+          double delta;
+
+          ip[1] = nc;
+          if (nc > 1) {
+            nch = nc >> 1;
+            delta = atan(1.0) / nch;
+            c[0] = cos(delta * nch);
+            c[nch] = 0.5 * c[0];
+            for (j = 1; j < nch; j++) {
+              c[j] = 0.5 * cos(delta * j);
+              c[nc - j] = 0.5 * sin(delta * j);
+            }
+          }
+        }
+
+
+        /* -------- child routines -------- */
+
+
+        void bitrv2(int n, int *ip, double *a)
+        {
+          int j, j1, k, k1, l, m, m2;
+          double xr, xi, yr, yi;
+
+          ip[0] = 0;
+          l = n;
+          m = 1;
+          while ((m << 3) < l) {
+            l >>= 1;
+            for (j = 0; j < m; j++) {
+              ip[m + j] = ip[j] + l;
+            }
+            m <<= 1;
+          }
+          m2 = 2 * m;
+          if ((m << 3) == l) {
+            for (k = 0; k < m; k++) {
+              for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+              }
+              j1 = 2 * k + m2 + ip[k];
+              k1 = j1 + m2;
+              xr = a[j1];
+              xi = a[j1 + 1];
+              yr = a[k1];
+              yi = a[k1 + 1];
+              a[j1] = yr;
+              a[j1 + 1] = yi;
+              a[k1] = xr;
+              a[k1 + 1] = xi;
+            }
+          } else {
+            for (k = 1; k < m; k++) {
+              for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+              }
+            }
+          }
+        }
+
+
+        void cftfsub(int n, double *a, double *w)
+        {
+          int j, j1, j2, j3, l;
+          double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+          l = 2;
+          if (n > 8) {
+            cft1st(n, a, w);
+            l = 8;
+            while ((l << 2) < n) {
+              cftmdl(n, l, a, w);
+              l <<= 2;
+            }
+          }
+          if ((l << 2) == n) {
+            for (j = 0; j < l; j += 2) {
+              j1 = j + l;
+              j2 = j1 + l;
+              j3 = j2 + l;
+              x0r = a[j] + a[j1];
+              x0i = a[j + 1] + a[j1 + 1];
+              x1r = a[j] - a[j1];
+              x1i = a[j + 1] - a[j1 + 1];
+              x2r = a[j2] + a[j3];
+              x2i = a[j2 + 1] + a[j3 + 1];
+              x3r = a[j2] - a[j3];
+              x3i = a[j2 + 1] - a[j3 + 1];
+              a[j] = x0r + x2r;
+              a[j + 1] = x0i + x2i;
+              a[j2] = x0r - x2r;
+              a[j2 + 1] = x0i - x2i;
+              a[j1] = x1r - x3i;
+              a[j1 + 1] = x1i + x3r;
+              a[j3] = x1r + x3i;
+              a[j3 + 1] = x1i - x3r;
+            }
+          } else {
+            for (j = 0; j < l; j += 2) {
+              j1 = j + l;
+              x0r = a[j] - a[j1];
+              x0i = a[j + 1] - a[j1 + 1];
+              a[j] += a[j1];
+              a[j + 1] += a[j1 + 1];
+              a[j1] = x0r;
+              a[j1 + 1] = x0i;
+            }
+          }
+        }
+
+
+        void cftbsub(int n, double *a, double *w)
+        {
+          int j, j1, j2, j3, l;
+          double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+          l = 2;
+          if (n > 8) {
+            cft1st(n, a, w);
+            l = 8;
+            while ((l << 2) < n) {
+              cftmdl(n, l, a, w);
+              l <<= 2;
+            }
+          }
+          if ((l << 2) == n) {
+            for (j = 0; j < l; j += 2) {
+              j1 = j + l;
+              j2 = j1 + l;
+              j3 = j2 + l;
+              x0r = a[j] + a[j1];
+              x0i = -a[j + 1] - a[j1 + 1];
+              x1r = a[j] - a[j1];
+              x1i = -a[j + 1] + a[j1 + 1];
+              x2r = a[j2] + a[j3];
+              x2i = a[j2 + 1] + a[j3 + 1];
+              x3r = a[j2] - a[j3];
+              x3i = a[j2 + 1] - a[j3 + 1];
+              a[j] = x0r + x2r;
+              a[j + 1] = x0i - x2i;
+              a[j2] = x0r - x2r;
+              a[j2 + 1] = x0i + x2i;
+              a[j1] = x1r - x3i;
+              a[j1 + 1] = x1i - x3r;
+              a[j3] = x1r + x3i;
+              a[j3 + 1] = x1i + x3r;
+            }
+          } else {
+            for (j = 0; j < l; j += 2) {
+              j1 = j + l;
+              x0r = a[j] - a[j1];
+              x0i = -a[j + 1] + a[j1 + 1];
+              a[j] += a[j1];
+              a[j + 1] = -a[j + 1] - a[j1 + 1];
+              a[j1] = x0r;
+              a[j1 + 1] = x0i;
+            }
+          }
+        }
+
+
+        void cft1st(int n, double *a, double *w)
+        {
+          int j, k1, k2;
+          double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+          double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+          x0r = a[0] + a[2];
+          x0i = a[1] + a[3];
+          x1r = a[0] - a[2];
+          x1i = a[1] - a[3];
+          x2r = a[4] + a[6];
+          x2i = a[5] + a[7];
+          x3r = a[4] - a[6];
+          x3i = a[5] - a[7];
+          a[0] = x0r + x2r;
+          a[1] = x0i + x2i;
+          a[4] = x0r - x2r;
+          a[5] = x0i - x2i;
+          a[2] = x1r - x3i;
+          a[3] = x1i + x3r;
+          a[6] = x1r + x3i;
+          a[7] = x1i - x3r;
+          wk1r = w[2];
+          x0r = a[8] + a[10];
+          x0i = a[9] + a[11];
+          x1r = a[8] - a[10];
+          x1i = a[9] - a[11];
+          x2r = a[12] + a[14];
+          x2i = a[13] + a[15];
+          x3r = a[12] - a[14];
+          x3i = a[13] - a[15];
+          a[8] = x0r + x2r;
+          a[9] = x0i + x2i;
+          a[12] = x2i - x0i;
+          a[13] = x0r - x2r;
+          x0r = x1r - x3i;
+          x0i = x1i + x3r;
+          a[10] = wk1r * (x0r - x0i);
+          a[11] = wk1r * (x0r + x0i);
+          x0r = x3i + x1r;
+          x0i = x3r - x1i;
+          a[14] = wk1r * (x0i - x0r);
+          a[15] = wk1r * (x0i + x0r);
+          k1 = 0;
+          for (j = 16; j < n; j += 16) {
+            k1 += 2;
+            k2 = 2 * k1;
+            wk2r = w[k1];
+            wk2i = w[k1 + 1];
+            wk1r = w[k2];
+            wk1i = w[k2 + 1];
+            wk3r = wk1r - 2 * wk2i * wk1i;
+            wk3i = 2 * wk2i * wk1r - wk1i;
+            x0r = a[j] + a[j + 2];
+            x0i = a[j + 1] + a[j + 3];
+            x1r = a[j] - a[j + 2];
+            x1i = a[j + 1] - a[j + 3];
+            x2r = a[j + 4] + a[j + 6];
+            x2i = a[j + 5] + a[j + 7];
+            x3r = a[j + 4] - a[j + 6];
+            x3i = a[j + 5] - a[j + 7];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j + 4] = wk2r * x0r - wk2i * x0i;
+            a[j + 5] = wk2r * x0i + wk2i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j + 2] = wk1r * x0r - wk1i * x0i;
+            a[j + 3] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j + 6] = wk3r * x0r - wk3i * x0i;
+            a[j + 7] = wk3r * x0i + wk3i * x0r;
+            wk1r = w[k2 + 2];
+            wk1i = w[k2 + 3];
+            wk3r = wk1r - 2 * wk2r * wk1i;
+            wk3i = 2 * wk2r * wk1r - wk1i;
+            x0r = a[j + 8] + a[j + 10];
+            x0i = a[j + 9] + a[j + 11];
+            x1r = a[j + 8] - a[j + 10];
+            x1i = a[j + 9] - a[j + 11];
+            x2r = a[j + 12] + a[j + 14];
+            x2i = a[j + 13] + a[j + 15];
+            x3r = a[j + 12] - a[j + 14];
+            x3i = a[j + 13] - a[j + 15];
+            a[j + 8] = x0r + x2r;
+            a[j + 9] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j + 12] = -wk2i * x0r - wk2r * x0i;
+            a[j + 13] = -wk2i * x0i + wk2r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j + 10] = wk1r * x0r - wk1i * x0i;
+            a[j + 11] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j + 14] = wk3r * x0r - wk3i * x0i;
+            a[j + 15] = wk3r * x0i + wk3i * x0r;
+          }
+        }
+
+
+        void cftmdl(int n, int l, double *a, double *w)
+        {
+          int j, j1, j2, j3, k, k1, k2, m, m2;
+          double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+          double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+          m = l << 2;
+          for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+          }
+          wk1r = w[2];
+          for (j = m; j < l + m; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x2i - x0i;
+            a[j2 + 1] = x0r - x2r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * (x0r - x0i);
+            a[j1 + 1] = wk1r * (x0r + x0i);
+            x0r = x3i + x1r;
+            x0i = x3r - x1i;
+            a[j3] = wk1r * (x0i - x0r);
+            a[j3 + 1] = wk1r * (x0i + x0r);
+          }
+          k1 = 0;
+          m2 = 2 * m;
+          for (k = m2; k < n; k += m2) {
+            k1 += 2;
+            k2 = 2 * k1;
+            wk2r = w[k1];
+            wk2i = w[k1 + 1];
+            wk1r = w[k2];
+            wk1i = w[k2 + 1];
+            wk3r = wk1r - 2 * wk2i * wk1i;
+            wk3i = 2 * wk2i * wk1r - wk1i;
+            for (j = k; j < l + k; j += 2) {
+              j1 = j + l;
+              j2 = j1 + l;
+              j3 = j2 + l;
+              x0r = a[j] + a[j1];
+              x0i = a[j + 1] + a[j1 + 1];
+              x1r = a[j] - a[j1];
+              x1i = a[j + 1] - a[j1 + 1];
+              x2r = a[j2] + a[j3];
+              x2i = a[j2 + 1] + a[j3 + 1];
+              x3r = a[j2] - a[j3];
+              x3i = a[j2 + 1] - a[j3 + 1];
+              a[j] = x0r + x2r;
+              a[j + 1] = x0i + x2i;
+              x0r -= x2r;
+              x0i -= x2i;
+              a[j2] = wk2r * x0r - wk2i * x0i;
+              a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+              x0r = x1r - x3i;
+              x0i = x1i + x3r;
+              a[j1] = wk1r * x0r - wk1i * x0i;
+              a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+              x0r = x1r + x3i;
+              x0i = x1i - x3r;
+              a[j3] = wk3r * x0r - wk3i * x0i;
+              a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+            }
+            wk1r = w[k2 + 2];
+            wk1i = w[k2 + 3];
+            wk3r = wk1r - 2 * wk2r * wk1i;
+            wk3i = 2 * wk2r * wk1r - wk1i;
+            for (j = k + m; j < l + (k + m); j += 2) {
+              j1 = j + l;
+              j2 = j1 + l;
+              j3 = j2 + l;
+              x0r = a[j] + a[j1];
+              x0i = a[j + 1] + a[j1 + 1];
+              x1r = a[j] - a[j1];
+              x1i = a[j + 1] - a[j1 + 1];
+              x2r = a[j2] + a[j3];
+              x2i = a[j2 + 1] + a[j3 + 1];
+              x3r = a[j2] - a[j3];
+              x3i = a[j2 + 1] - a[j3 + 1];
+              a[j] = x0r + x2r;
+              a[j + 1] = x0i + x2i;
+              x0r -= x2r;
+              x0i -= x2i;
+              a[j2] = -wk2i * x0r - wk2r * x0i;
+              a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+              x0r = x1r - x3i;
+              x0i = x1i + x3r;
+              a[j1] = wk1r * x0r - wk1i * x0i;
+              a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+              x0r = x1r + x3i;
+              x0i = x1i - x3r;
+              a[j3] = wk3r * x0r - wk3i * x0i;
+              a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+            }
+          }
+        }
+
+
+        void rftfsub(int n, double *a, int nc, double *c)
+        {
+          int j, k, kk, ks, m;
+          double wkr, wki, xr, xi, yr, yi;
+
+          m = n >> 1;
+          ks = 2 * nc / m;
+          kk = 0;
+          for (j = 2; j < m; j += 2) {
+            k = n - j;
+            kk += ks;
+            wkr = 0.5 - c[nc - kk];
+            wki = c[kk];
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr - wki * xi;
+            yi = wkr * xi + wki * xr;
+            a[j] -= yr;
+            a[j + 1] -= yi;
+            a[k] += yr;
+            a[k + 1] -= yi;
+          }
+        }
+
+
+        void rftbsub(int n, double *a, int nc, double *c)
+        {
+          int j, k, kk, ks, m;
+          double wkr, wki, xr, xi, yr, yi;
+
+          a[1] = -a[1];
+          m = n >> 1;
+          ks = 2 * nc / m;
+          kk = 0;
+          for (j = 2; j < m; j += 2) {
+            k = n - j;
+            kk += ks;
+            wkr = 0.5 - c[nc - kk];
+            wki = c[kk];
+            xr = a[j] - a[k];
+            xi = a[j + 1] + a[k + 1];
+            yr = wkr * xr + wki * xi;
+            yi = wkr * xi - wki * xr;
+            a[j] -= yr;
+            a[j + 1] = yi - a[j + 1];
+            a[k] += yr;
+            a[k + 1] = yi - a[k + 1];
+          }
+          a[m + 1] = -a[m + 1];
+        }
+    };
+
+
+    /**
+     * @internal
+     * @brief Concrete FFT implementation
+     */
+    typedef OouraFFT AudioFFTImplementation;
+
+
+#endif // AUDIOFFT_OOURA_USED
+
+
+    // ================================================================
+
+
+#ifdef AUDIOFFT_INTEL_IPP_USED
+
+
+    /**
+   * @internal
+   * @class IntelIppFFT
+   * @brief FFT implementation using the Intel Integrated Performance Primitives
+   */
+  class IntelIppFFT : public detail::AudioFFTImpl
+  {
+  public:
+    IntelIppFFT() :
+      detail::AudioFFTImpl(),
+      _size(0),
+      _operationalBufferSize(0),
+      _powerOf2(0),
+      _fftSpec(nullptr),
+      _fftSpecBuf(0),
+      _fftWorkBuf(0),
+      _operationalBuffer(nullptr)
+    {
+      ippInit();
+    }
+
+    IntelIppFFT(const IntelIppFFT&) = delete;
+    IntelIppFFT& operator=(const IntelIppFFT&) = delete;
+
+    virtual ~IntelIppFFT()
+    {
+      init(0);
+    }
+
+    virtual void init(size_t size) override
+    {
+      if (_fftSpec)
+      {
+        if (_fftWorkBuf) ippFree(_fftWorkBuf);
+        if (_fftSpecBuf) ippFree(_fftSpecBuf);
+        ippFree(_operationalBuffer);
+
+        _size = 0;
+        _operationalBufferSize = 0;
+        _powerOf2 = 0;
+        _fftSpec = 0;
+      }
+
+      if (size > 0)
+      {
+        _size = size;
+        _operationalBufferSize = _size + 2;
+        _powerOf2 = (int)(log((double)_size)/log(2.0));
+
+        // Query to get buffer sizes
+        int sizeFFTSpec,
+          sizeFFTInitBuf,
+          sizeFFTWorkBuf;
+        ippsFFTGetSize_R_32f(
+          _powerOf2,
+          IPP_FFT_NODIV_BY_ANY,
+          ippAlgHintAccurate,
+          &sizeFFTSpec,
+          &sizeFFTInitBuf,
+          &sizeFFTWorkBuf
+        );
+
+        Ipp8u* fftInitBuf;
+
+        // init buffers
+        _fftSpecBuf = ippsMalloc_8u(sizeFFTSpec);
+        _fftWorkBuf = ippsMalloc_8u(sizeFFTWorkBuf);
+        fftInitBuf = ippsMalloc_8u(sizeFFTInitBuf);
+
+        // Initialize FFT
+        ippsFFTInit_R_32f(
+          &_fftSpec,
+          _powerOf2,
+          IPP_FFT_NODIV_BY_ANY,
+          ippAlgHintAccurate,
+          _fftSpecBuf,
+          fftInitBuf
+        );
+        if (fftInitBuf) ippFree(fftInitBuf);
+
+        // init operational buffer
+        _operationalBuffer = ippsMalloc_32f(
+          _operationalBufferSize
+        );
+      }
+    }
+
+    virtual void fft(const float* data, float* re, float* im) override
+    {
+      size_t complexNumbersCount = _operationalBufferSize / 2;
+      ippsFFTFwd_RToCCS_32f(
+        data,
+        _operationalBuffer,
+        _fftSpec,
+        _fftWorkBuf
+      );
+
+      // no need to scale
+
+      size_t complexCounter = 0;
+      for (int i = 0; i < complexNumbersCount; ++i)
+      {
+        re[i] = _operationalBuffer[complexCounter++];
+        im[i] = _operationalBuffer[complexCounter++];
+      }
+    }
+
+    virtual void ifft(float* data, const float* re, const float* im) override
+    {
+      size_t complexNumbersCount = _operationalBufferSize / 2;
+
+      size_t complexCounter = 0;
+      for (int i = 0; i < complexNumbersCount; ++i)
+      {
+        _operationalBuffer[complexCounter++] = re[i];
+        _operationalBuffer[complexCounter++] = im[i];
+      }
+
+      ippsFFTInv_CCSToR_32f(
+        _operationalBuffer,
+        data,
+        _fftSpec,
+        _fftWorkBuf
+      );
+
+      // scaling
+      const float factor = 1.0f / static_cast<float>(_size);
+      ippsMulC_32f_I(factor, data, _size);
+    }
+
+  private:
+    size_t _size;
+    size_t _operationalBufferSize;
+    size_t _powerOf2;
+    IppsFFTSpec_R_32f* _fftSpec;
+    Ipp8u* _fftSpecBuf;
+    Ipp8u* _fftWorkBuf;
+    Ipp32f* _operationalBuffer;
+  };
+
+
+  /**
+   * @internal
+   * @brief Concrete FFT implementation
+   */
+  typedef IntelIppFFT AudioFFTImplementation;
+
+
+#endif // AUDIOFFT_INTEL_IPP_USED
+
+
+    // ================================================================
+
+
+#ifdef AUDIOFFT_APPLE_ACCELERATE_USED
+
+
+    /**
+   * @internal
+   * @class AppleAccelerateFFT
+   * @brief FFT implementation using the Apple Accelerate framework internally
+   */
+  class AppleAccelerateFFT : public detail::AudioFFTImpl
+  {
+  public:
+    AppleAccelerateFFT() :
+      detail::AudioFFTImpl(),
+      _size(0),
+      _powerOf2(0),
+      _fftSetup(0),
+      _re(),
+      _im()
+    {
+    }
+
+    AppleAccelerateFFT(const AppleAccelerateFFT&) = delete;
+    AppleAccelerateFFT& operator=(const AppleAccelerateFFT&) = delete;
+
+    virtual ~AppleAccelerateFFT()
+    {
+      init(0);
+    }
+
+    virtual void init(size_t size) override
+    {
+      if (_fftSetup)
+      {
+        vDSP_destroy_fftsetup(_fftSetup);
+        _size = 0;
+        _powerOf2 = 0;
+        _fftSetup = 0;
+        _re.clear();
+        _im.clear();
+      }
+
+      if (size > 0)
+      {
+        _size = size;
+        _powerOf2 = 0;
+        while ((1 << _powerOf2) < _size)
+        {
+          ++_powerOf2;
+        }
+        _fftSetup = vDSP_create_fftsetup(_powerOf2, FFT_RADIX2);
+        _re.resize(_size / 2);
+        _im.resize(_size / 2);
+      }
+    }
+
+    virtual void fft(const float* data, float* re, float* im) override
+    {
+      const size_t size2 = _size / 2;
+      DSPSplitComplex splitComplex;
+      splitComplex.realp = re;
+      splitComplex.imagp = im;
+      vDSP_ctoz(reinterpret_cast<const COMPLEX*>(data), 2, &splitComplex, 1, size2);
+      vDSP_fft_zrip(_fftSetup, &splitComplex, 1, _powerOf2, FFT_FORWARD);
+      const float factor = 0.5f;
+      vDSP_vsmul(re, 1, &factor, re, 1, size2);
+      vDSP_vsmul(im, 1, &factor, im, 1, size2);
+      re[size2] = im[0];
+      im[0] = 0.0f;
+      im[size2] = 0.0f;
+    }
+
+    virtual void ifft(float* data, const float* re, const float* im) override
+    {
+      const size_t size2 = _size / 2;
+      ::memcpy(_re.data(), re, size2 * sizeof(float));
+      ::memcpy(_im.data(), im, size2 * sizeof(float));
+      _im[0] = re[size2];
+      DSPSplitComplex splitComplex;
+      splitComplex.realp = _re.data();
+      splitComplex.imagp = _im.data();
+      vDSP_fft_zrip(_fftSetup, &splitComplex, 1, _powerOf2, FFT_INVERSE);
+      vDSP_ztoc(&splitComplex, 1, reinterpret_cast<COMPLEX*>(data), 2, size2);
+      const float factor = 1.0f / static_cast<float>(_size);
+      vDSP_vsmul(data, 1, &factor, data, 1, _size);
+    }
+
+  private:
+    size_t _size;
+    size_t _powerOf2;
+    FFTSetup _fftSetup;
+    std::vector<float> _re;
+    std::vector<float> _im;
+  };
+
+
+  /**
+   * @internal
+   * @brief Concrete FFT implementation
+   */
+  typedef AppleAccelerateFFT AudioFFTImplementation;
+
+
+#endif // AUDIOFFT_APPLE_ACCELERATE_USED
+
+
+    // ================================================================
+
+
+#ifdef AUDIOFFT_FFTW3_USED
+
+
+    /**
+   * @internal
+   * @class FFTW3FFT
+   * @brief FFT implementation using FFTW3 internally (see fftw.org)
+   */
+  class FFTW3FFT : public detail::AudioFFTImpl
+  {
+  public:
+    FFTW3FFT() :
+      detail::AudioFFTImpl(),
+      _size(0),
+      _complexSize(0),
+      _planForward(0),
+      _planBackward(0),
+      _data(0),
+      _re(0),
+      _im(0)
+    {
+    }
+
+    FFTW3FFT(const FFTW3FFT&) = delete;
+    FFTW3FFT& operator=(const FFTW3FFT&) = delete;
+
+    virtual ~FFTW3FFT()
+    {
+      init(0);
+    }
+
+    virtual void init(size_t size) override
+    {
+      if (_size != size)
+      {
+        if (_size > 0)
+        {
+          fftwf_destroy_plan(_planForward);
+          fftwf_destroy_plan(_planBackward);
+          _planForward = 0;
+          _planBackward = 0;
+          _size = 0;
+          _complexSize = 0;
+
+          if (_data)
+          {
+            fftwf_free(_data);
+            _data = 0;
+          }
+
+          if (_re)
+          {
+            fftwf_free(_re);
+            _re = 0;
+          }
+
+          if (_im)
+          {
+            fftwf_free(_im);
+            _im = 0;
+          }
+        }
+
+        if (size > 0)
+        {
+          _size = size;
+          _complexSize = AudioFFT::ComplexSize(_size);
+          const size_t complexSize = AudioFFT::ComplexSize(_size);
+          _data = reinterpret_cast<float*>(fftwf_malloc(_size * sizeof(float)));
+          _re = reinterpret_cast<float*>(fftwf_malloc(complexSize * sizeof(float)));
+          _im = reinterpret_cast<float*>(fftwf_malloc(complexSize * sizeof(float)));
+
+          fftw_iodim dim;
+          dim.n = static_cast<int>(size);
+          dim.is = 1;
+          dim.os = 1;
+          _planForward = fftwf_plan_guru_split_dft_r2c(1, &dim, 0, 0, _data, _re, _im, FFTW_MEASURE);
+          _planBackward = fftwf_plan_guru_split_dft_c2r(1, &dim, 0, 0, _re, _im, _data, FFTW_MEASURE);
+        }
+      }
+    }
+
+    virtual void fft(const float* data, float* re, float* im) override
+    {
+      ::memcpy(_data, data, _size * sizeof(float));
+      fftwf_execute_split_dft_r2c(_planForward, _data, _re, _im);
+      ::memcpy(re, _re, _complexSize * sizeof(float));
+      ::memcpy(im, _im, _complexSize * sizeof(float));
+    }
+
+    virtual void ifft(float* data, const float* re, const float* im) override
+    {
+      ::memcpy(_re, re, _complexSize * sizeof(float));
+      ::memcpy(_im, im, _complexSize * sizeof(float));
+      fftwf_execute_split_dft_c2r(_planBackward, _re, _im, _data);
+      detail::ScaleBuffer(data, _data, 1.0f / static_cast<float>(_size), _size);
+    }
+
+  private:
+    size_t _size;
+    size_t _complexSize;
+    fftwf_plan _planForward;
+    fftwf_plan _planBackward;
+    float* _data;
+    float* _re;
+    float* _im;
+  };
+
+
+  /**
+   * @internal
+   * @brief Concrete FFT implementation
+   */
+  typedef FFTW3FFT AudioFFTImplementation;
+
+
+#endif // AUDIOFFT_FFTW3_USED
+
+
+    // =============================================================
+
+
+    AudioFFT::AudioFFT() :
+            _impl(new AudioFFTImplementation())
+    {
+    }
+
+
+    AudioFFT::~AudioFFT()
+    {
+    }
+
+
+    void AudioFFT::init(size_t size)  // Forwards `size` to the selected backend's init(); must be called before fft()/ifft().
+    {
+      assert(detail::IsPowerOf2(size));  // debug-only check: with NDEBUG defined a non-power-of-2 size reaches the backend unchecked
+      _impl->init(size);
+    }
+
+
+    void AudioFFT::fft(const float* data, float* re, float* im)
+    {
+      _impl->fft(data, re, im);
+    }
+
+
+    void AudioFFT::ifft(float* data, const float* re, const float* im)
+    {
+      _impl->ifft(data, re, im);
+    }
+
+
+    size_t AudioFFT::ComplexSize(size_t size)
+    {
+      return (size / 2) + 1;
+    }
+
+} // End of namespace
+
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/dhpcm.cpp b/duix-sdk/src/main/cpp/dhmfcc/dhpcm.cpp
new file mode 100644
index 0000000..3069d42
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/dhpcm.cpp
@@ -0,0 +1,803 @@
+#include "dhpcm.h"
+#include "mfcc/mfcc.hpp"
+#include <stdio.h>
+#include "aicommon.h"
+#include <vector>
+#include <string>
+#include "opencv2/core.hpp"
+#ifdef USE_HELPER    
+#include "dhdatahelper.h"
+#endif
+
+
+
+/**
+ * Allocates the work buffers of one PCM item.
+ * @param sentid   id returned by itemsentid()
+ * @param minoff   number of look-back blocks reserved in front of the payload
+ * @param maxblock largest payload (in blocks) the buffers are sized for
+ * @param flip     stored in m_flip; selects the flipped path in readblock(inx,...)
+ * @param inx      stored in m_inx (start index reported by startinx())
+ */
+PcmItem::PcmItem(int sentid,int minoff,int maxblock,int flip,int inx){
+  // Identity and configuration.
+  m_sentid = sentid;
+  m_minoff = minoff;
+  m_maxblock = maxblock;
+  m_flip = flip;
+  m_inx = inx;
+
+  // Worst-case geometry: look-back rows + payload rows + 2*STREAM_MFCC_FILL
+  // padding rows, expressed in samples, mel frames and BNF rows.
+  const int rows = m_minoff + maxblock + 2*STREAM_MFCC_FILL;
+  pcm_allsamp = rows*STREAM_BASE_SAMP;
+  mel_allcnt = pcm_allsamp/160+1;
+  bnf_allcnt = mel_allcnt*0.25f - 0.75f;
+
+  // All work matrices use 4-byte elements.
+  m_pcm = jmat_alloc(STREAM_BASE_SAMP,rows,1,0,4,NULL);
+  m_mel = jmat_alloc(STREAM_BASE_MEL,mel_allcnt,1,0,4,NULL);
+  m_bnf = jmat_alloc(STREAM_BASE_BNF,bnf_allcnt,1,0,4,NULL);
+  m_bnfflip = jmat_alloc(STREAM_BASE_BNF,bnf_allcnt,1,0,4,NULL);
+  mat_flip = jmat_null();
+}
+
+/** Frees every matrix this item holds (including the m_wav payload handed to fillPcm) and drops mat_flip. */
+PcmItem::~PcmItem(){
+  jmat_t* owned[] = {m_pcm,m_mel,m_bnf,m_wav,m_mfcc,m_bnfflip};
+  for(jmat_t* mat : owned){
+    if(mat) jmat_free(mat);
+  }
+  jmat_deref(mat_flip);
+}
+/**
+ * Clears the m_pcm, m_mel and m_bnf buffers with jbuf_zeros().
+ * m_bnfflip, m_mfcc and the read/ready counters are not touched.
+ * @return always 0.
+ */
+int PcmItem::reset(){
+  jbuf_zeros((jbuf_t*)m_pcm);
+  jbuf_zeros((jbuf_t*)m_mel);
+  jbuf_zeros((jbuf_t*)m_bnf);
+  return 0;
+}
+
+/**
+ * Takes ownership of `mat` (16-bit PCM, one block per row) and converts it,
+ * together with optional look-back audio, into the float buffer m_pcm.
+ *
+ * Layout written to m_pcm, in rows of STREAM_BASE_SAMP floats:
+ *   [zero padding][tail of premat] | payload from mat | STREAM_MFCC_FILL zero rows
+ * where the part before the payload is m_minoff + STREAM_MFCC_FILL rows long.
+ *
+ * @param sessid  stored in m_sessid
+ * @param tickinx stored in m_pcminx
+ * @param premat  look-back audio from the previous item, may be NULL (not owned)
+ * @param mat     payload; kept as m_wav and freed by the destructor
+ * @return 0 on success, -1 if mat is NULL or holds more blocks than the
+ *         buffers were sized for in the constructor.
+ */
+int PcmItem::fillPcm(uint64_t sessid,uint64_t tickinx,jmat_t* premat,jmat_t* mat){
+  if(!mat)return -1;
+  // Ownership is taken before the size check so a rejected payload is still
+  // released by the destructor (callers ignore the return value).
+  m_wav = mat;
+  pcm_block = mat->height;
+  // m_pcm has m_minoff + m_maxblock + 2*STREAM_MFCC_FILL rows and the layout
+  // below uses m_minoff + pcm_block + 2*STREAM_MFCC_FILL, so anything above
+  // m_maxblock would write past the end of m_pcm.
+  if(pcm_block>m_maxblock)return -1;
+  pre_block = premat?premat->height:0;
+
+  int allcnt = m_minoff + pcm_block + 2*STREAM_MFCC_FILL;
+  pcm_allsamp = allcnt*STREAM_BASE_SAMP;
+  mel_allcnt = pcm_allsamp/160+1;
+  bnf_allcnt = mel_allcnt*0.25f - 0.75f;
+
+  m_sessid = sessid;
+  m_pcminx = tickinx;
+
+  // Number of rows in front of the payload.
+  const int offset = m_minoff + STREAM_MFCC_FILL;
+  {
+    // Lead-in: zero rows first, then as much look-back audio as fits.
+    // If premat is taller than the lead-in only its last `offset` rows are used.
+    const int usepre = pre_block<offset?pre_block:offset;
+    const int blank = offset - usepre;
+    if(blank>0){
+      memset(m_pcm->data,0,(size_t)blank*STREAM_BASE_SAMP*sizeof(float));
+    }
+    if(usepre>0){
+      const short* ps = (const short*)premat->data + (size_t)(pre_block-usepre)*STREAM_BASE_SAMP;
+      for(int k=blank;k<offset;k++){
+        float* pbuf = (float*)jmat_row(m_pcm,k);
+        for(int m=0;m<STREAM_BASE_SAMP;m++){
+          *pbuf++ = *ps++/32768.f;
+        }
+      }
+    }
+  }
+  {
+    // Payload: int16 -> float in [-1,1).
+    const short* ps = (const short*)mat->data;
+    for(int k=0;k<pcm_block;k++){
+      float* pbuf = (float*)jmat_row(m_pcm,k+offset);
+      for(int m=0;m<STREAM_BASE_SAMP;m++){
+        *pbuf++ = *ps++/32768.f;
+      }
+    }
+    // Trailing padding.
+    const int tail = STREAM_MFCC_FILL;
+    if(tail>0){
+      float* pbuf = (float*)jmat_row(m_pcm,offset+pcm_block);
+      memset(pbuf,0,(size_t)tail*STREAM_BASE_SAMP*sizeof(float));
+    }
+  }
+  return 0;
+}
+
+/** Returns 1 when tickinx is 0 or not beyond m_pcminx + pcm_block, otherwise 0. */
+int PcmItem::checkValid(uint64_t tickinx){
+  if(tickinx==0)return 1;
+  const uint64_t lasttick = m_pcminx+pcm_block;
+  return tickinx<=lasttick;
+}
+
+/**
+ * Returns a view on the last `minoff` blocks of this item's payload, used as
+ * look-back audio for the following item.
+ *
+ * The returned matrix wraps m_wav's storage (no copy); the caller releases the
+ * wrapper with jmat_free() and must not use it after this item is destroyed.
+ *
+ * @return the view, or NULL if there is no payload, minoff is negative, or
+ *         minoff exceeds the number of stored blocks.
+ */
+jmat_t* PcmItem::readlast(int minoff){
+  if(!m_wav)return NULL;
+  if(minoff<0||minoff>pcm_block)return NULL;
+  int start = pcm_block - minoff;
+  // `start` is a row index: address the row through jmat_row() instead of
+  // adding it to the data pointer as if it were a byte offset.
+  jmat_t* mpre = jmat_alloc(STREAM_BASE_PCM,minoff,1,0,1, jmat_row(m_wav,start));
+  return mpre;
+}
+/** Returns pcm_read, the counter advanced by the sequential readblock(pcm,mfcc) overload. */
+int PcmItem::readblock(){
+  return  pcm_read;
+}
+/** Returns pcm_block, the number of payload blocks set by fillPcm(). */
+int PcmItem::numblock(){
+  return pcm_block;
+}
+
+/**
+ * Copies this item's features into `buf`, one row of STREAM_BASE_BNF floats
+ * per payload block (pcm_block rows in total).
+ *
+ * m_mfcc holds pcm_block+19 rows, so copying a full STREAM_ALL_BNF window per
+ * block would read past it and overrun the per-block destination that
+ * PcmFile::readbnf() provides.
+ *
+ * @param buf destination with room for pcm_block*STREAM_BASE_BNF floats
+ * @return always 0; nothing is written when features are not ready or buf is NULL.
+ */
+int PcmItem::readbnf(char* buf){
+  if(!m_ready||!m_mfcc||!buf)return 0;
+  char* mdata = jmat_row(m_mfcc,0);
+  const size_t rowbytes = STREAM_BASE_BNF*sizeof(float);
+  memcpy(buf,mdata,rowbytes*pcm_block);
+  return 0;
+}
+
+/**
+ * Sequential read: copies the next payload block into `pcm` (optional) and the
+ * STREAM_ALL_BNF-byte feature window for it into `mfcc`, then advances pcm_read.
+ * The feature window starts one row before the block, clamped at row 0.
+ * @return 1 if a block was delivered, 0 if not ready or all blocks were read.
+ */
+int PcmItem::readblock(jmat_t* pcm,jmat_t* mfcc){
+  const bool exhausted = pcm_read>=pcm_block;
+  if(!m_ready||exhausted)return 0;
+
+  if(pcm){
+    memcpy(pcm->data,jmat_row(m_wav,pcm_read),STREAM_BASE_PCM);
+  }
+
+  int featrow = 0;
+  if(pcm_read){
+    featrow = pcm_read-1;
+  }
+  memcpy(mfcc->data,jmat_row(m_mfcc,featrow),STREAM_ALL_BNF);
+
+  ++pcm_read;
+  return 1;
+}
+/**
+ * Random-access read of block `inx`; does not touch pcm_read.
+ * Copies one STREAM_BASE_PCM-byte row of m_wav into `pcm` (when given) and the
+ * feature window starting at row max(inx-1,0) of m_mfcc into `mfcc`.
+ * @return 1 on success, 0 if features are not ready or inx >= pcm_block.
+ * NOTE(review): a negative inx is not rejected here.
+ */
+int PcmItem::readblock(int inx,jmat_t* pcm,jmat_t* mfcc){
+  if(!m_ready)return 0;
+  if(inx>=pcm_block)return 0;
+  if(pcm){
+    char* rdata = jmat_row(m_wav,inx);
+    memcpy(pcm->data,rdata,STREAM_BASE_PCM);
+  }
+  int newinx =  inx?inx-1:0; // window starts one row early, clamped at 0
+  if(m_flip){
+    jmat_reroi(mat_flip,m_mfcc,STREAM_BASE_BNF,20,0,newinx); // re-point mat_flip at a 20-row window of m_mfcc
+#ifdef USE_HELPER    //jmat_dump(mat_flip);
+    cv::Mat sm = dh2cvmat(mat_flip);
+    jmat_reshape(mfcc,20,STREAM_BASE_BNF);
+    cv::Mat dm = dh2cvmat(mfcc);
+    cv::transpose(sm,dm); // write the transposed window into mfcc
+#endif
+    // Without USE_HELPER nothing is written to mfcc on this path, yet 1 is still returned.
+  }else{
+    char* mdata = jmat_row(m_mfcc,newinx);
+  //printf("===inx %d mfcc %d\n",inx,m_mfcc->height);
+    memcpy(mfcc->data,mdata,STREAM_ALL_BNF);
+  }
+  return 1;
+}
+
+/** Debug helper: prints the block count and appends every payload row of m_wav to `dumpfile`. */
+void PcmItem::dump(FILE* dumpfile){
+  printf("===dumpone %d\n",pcm_block);
+  int row = 0;
+  while(row<pcm_block){
+    char* rdata = jmat_row(m_wav,row);
+    fwrite(rdata,1,STREAM_BASE_PCM,dumpfile);
+    ++row;
+  }
+}
+
+/**
+ * Runs feature extraction for this item: log-mel over m_pcm, then (when
+ * AIRUN_FLAG is defined) the network `weai` to produce BNF rows in m_bnf.
+ * The rows starting at m_minoff are copied into a fresh matrix of
+ * pcm_block+19 rows that becomes m_mfcc, and the item is marked ready.
+ *
+ * @param weai network wrapper; only used when AIRUN_FLAG is defined
+ * @return always 0 (the status codes of the two stages are not propagated).
+ */
+int PcmItem::runWenet(WeAI* weai){
+  float* fwav = (float*)m_pcm->data;
+  float* mel = (float*)m_mel->data;
+  int rst = DhWenet::calcmfcc(fwav,pcm_allsamp,mel,mel_allcnt);
+
+  float* bnf = (float*)m_bnf->data;
+  uint64_t tick = jtimer_msstamp();
+#ifdef AIRUN_FLAG
+  rst =  weai->run(mel,mel_allcnt,bnf,bnf_allcnt);
+#endif
+  (void)rst;
+  int dist = (int)(jtimer_msstamp()-tick);
+
+  // tick is 64 bits wide on every ABI; %ld is only 32 bits on 32-bit targets
+  // and would corrupt the remaining arguments.
+  printf("===pcm %llu %d  mel %d bnf %d dist %d \n",(unsigned long long)tick,m_pcm->height,mel_allcnt,bnf_allcnt,dist);
+
+  // Keep only the rows that belong to this item's payload (skip the look-back
+  // rows), plus 19 extra rows so a 20-row window can start at any block.
+  float* rbnf = (float*)jmat_row(m_bnf,m_minoff);
+  jmat_t* matbnf = jmat_alloc(STREAM_BASE_BNF,pcm_block+19,1,0,4,NULL);
+  memcpy(matbnf->data,rbnf,matbnf->buf.size);
+  // A repeated run replaces the previous result instead of leaking it.
+  if(m_mfcc)jmat_free(m_mfcc);
+  m_mfcc = matbnf;
+  m_ready = 1;
+  return 0;
+}
+
+/**
+ * @param fps    caller frame rate; m_scale = fps/25 is used to convert indices
+ * @param minoff look-back blocks given to every item after the first
+ * @param mincnt minimum blocks per item
+ * @param maxcnt maximum blocks per item
+ */
+PcmFile::PcmFile(int fps,int minoff,int mincnt,int maxcnt){
+  // Frame-rate handling.
+  m_fps = fps;
+  m_adj = (fps!=25);
+  m_scale = fps*1.0f/25.0f;
+
+  // Chunking limits, in blocks and in bytes.
+  m_minoff = minoff;
+  m_mincnt = mincnt;
+  m_maxcnt = maxcnt;
+  m_minsize = mincnt* STREAM_BASE_PCM;
+  m_maxsize = maxcnt* STREAM_BASE_PCM;
+
+  // Per-item [min,max) block ranges, zero-initialised, 1024 slots each.
+  m_arrmax = (int*)calloc(1024,sizeof(int));
+  m_arrmin = (int*)calloc(1024,sizeof(int));
+}
+
+/** Destroys every item, the optional pre-roll item, and the two range tables. */
+PcmFile::~PcmFile(){
+  for(PcmItem* item : vec_pcm){
+    delete item;
+  }
+  if(m_preitem){
+    delete m_preitem;
+    m_preitem = NULL;
+  }
+  free(m_arrmax);
+  free(m_arrmin);
+}
+/** Returns the number of items currently stored in vec_pcm. */
+int PcmFile::itemSize(){
+  return vec_pcm.size();
+}
+
+/**
+ * Runs feature extraction on one item, or on all items when inx is negative.
+ * Each processed item adds its block count to m_calcblock and bumps m_calccnt.
+ * @return for a single item: the runWenet() result, or -1 if inx is out of
+ *         range; for "all items": always 0.
+ */
+int PcmFile::process(int inx,WeAI* weai){
+  if(inx>=0){
+    if(inx>=vec_pcm.size())return -1;
+    PcmItem* one = vec_pcm[inx];
+    const int rst = one->runWenet(weai);
+    m_calcblock += one->numblock();
+    m_calccnt += 1;
+    return rst;
+  }
+
+  for(size_t pos=0;pos<vec_pcm.size();++pos){
+    PcmItem* each = vec_pcm[pos];
+    each->runWenet(weai);
+    m_calcblock += each->numblock();
+    m_calccnt += 1;
+  }
+  return 0;
+}
+
+/**
+ * Wraps `mat` (PCM payload, one block per row) in a new PcmItem, gives it the
+ * tail of the previous item as look-back audio, and records the item's block
+ * range in m_arrmin/m_arrmax.
+ *
+ * Ownership of `mat` passes to this function in every case.
+ *
+ * @param mat   payload matrix
+ * @param noone unused
+ * @return 0 on success, -1 if mat is NULL or the range tables are full.
+ */
+int PcmFile::appenditem(jmat_t* mat,int noone){
+  // m_arrmin/m_arrmax are allocated with this many slots in the constructor.
+  const size_t kMaxItems = 1024;
+  (void)noone;
+  if(!mat)return -1;
+  if(vec_pcm.size()>=kMaxItems){
+    // Dropping the chunk is preferable to writing past the range tables.
+    jmat_free(mat);
+    return -1;
+  }
+
+  int chkblock = mat->height;
+  // The first item has no predecessor, hence no look-back audio.
+  int chkmin = m_lastitem?m_minoff:0;
+  int inx = m_fileblock ;
+  PcmItem* item = new PcmItem(0,chkmin,chkblock,m_flip,inx);
+  jmat_t* mpre = NULL;
+  if(m_lastitem){
+    mpre = m_lastitem->readlast(chkmin);
+  }
+  item->fillPcm(0,0,mpre,mat);
+  vec_pcm.push_back(item);
+  m_lastitem = item;
+
+  const size_t slot = vec_pcm.size()-1;
+  m_arrmin[slot] = m_fileblock;
+  m_fileblock += item->numblock();
+  m_arrmax[slot] = m_fileblock;
+
+  if(mpre)jmat_free(mpre);
+  return 0;
+}
+
+/**
+ * Splits a complete PCM buffer into items.
+ *
+ * An optional pre-roll buffer becomes m_preitem; it is never processed itself
+ * and only supplies look-back audio for the first real item. The main buffer
+ * is cut into chunks of m_maxcnt blocks; the remainder forms a last item of at
+ * least m_mincnt blocks, zero-padded where the input runs out. A trailing
+ * fragment shorter than one block beyond those whole blocks is discarded.
+ *
+ * @param buf     PCM bytes (not retained; data is copied)
+ * @param size    number of bytes in buf
+ * @param prebuf  optional pre-roll PCM; wrapped, not copied, so it must
+ *                outlive this object
+ * @param presize number of bytes in prebuf
+ * @return always 0.
+ */
+int PcmFile::prepare(char* buf,int size,char* prebuf,int presize){
+  int rst = 0;
+  m_presize = presize;
+  m_preblock = presize/STREAM_BASE_PCM;
+  int cursize = buf?size:0;
+  char* curhead = buf;
+  if(m_preblock>0&&prebuf){
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,m_preblock,1,0,1,prebuf);
+    int chkblock = mat->height;
+    int chkmin = m_lastitem?m_minoff:0;
+    int inx = 0;
+    PcmItem* item = new PcmItem(0,chkmin,chkblock,m_flip,inx);
+    m_preitem = item;
+    m_lastitem = item;
+    rst = item->fillPcm(0,0,NULL,mat);
+  }
+  // m_maxsize>0 keeps a zero maxcnt from looping forever.
+  while(m_maxsize>0&&cursize >= m_maxsize){
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,m_maxcnt,1,0,1,NULL);
+    memcpy(mat->data ,curhead,m_maxsize);
+    rst += appenditem(mat);
+    cursize -= m_maxsize;
+    curhead += m_maxsize;
+  }
+  if(cursize>0){
+    int block = cursize / STREAM_BASE_PCM;
+    if(block<m_mincnt)block = m_mincnt;
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,block,1,0,1,NULL);
+    // The item may be larger than what is left in buf (rounded up to
+    // m_mincnt): copy only the bytes that exist and leave the rest silent.
+    const int matsize = block*STREAM_BASE_PCM;
+    const int cpsize = cursize<matsize?cursize:matsize;
+    memset(mat->data,0,matsize);
+    memcpy(mat->data ,curhead,cpsize);
+    rst += appenditem(mat);
+  }
+  (void)rst;
+  return 0;
+}
+/** Stores `flip`; it is passed to every PcmItem created afterwards. Returns 0. */
+int PcmFile::setflip(int flip){
+  m_flip = flip;
+  return 0;
+}
+/**
+ * Stub: the WAV-file loader below is commented out, so this overload ignores
+ * `pcmfn`, loads nothing and returns 0.
+ */
+int PcmFile::prepare(std::string& pcmfn){
+  /*
+  void* fhnd = wav_read_open(pcmfn.c_str());
+  if(!fhnd)return -1;
+  int format, channels, sr, bits_per_sample;
+  unsigned int data_length;
+  int res = wav_get_header(fhnd, &format, &channels, &sr, &bits_per_sample, &data_length);
+  if(data_length<1) return -2;
+  int sample = data_length/2;
+  jbuf_t* pcmbuf = jbuf_alloc(data_length);
+  int rst = wav_read_data(fhnd,(unsigned char*)pcmbuf->data,data_length);
+  wav_read_close(fhnd);
+  int cursize = data_length;
+  char* curhead = pcmbuf->data;
+
+  rst =  prepare(curhead,cursize);
+  dhmem_deref(pcmbuf);
+  return rst;
+  */
+  return 0;
+}
+/**
+ * Allocates and returns a matrix of m_fileblock rows by STREAM_BASE_BNF 4-byte
+ * elements. No feature data is copied into it and `sinx` is not used.
+ * NOTE(review): the caller presumably has to release the result - confirm.
+ */
+jmat_t* PcmFile::readbnf(int sinx){
+  jmat_t* bnf = jmat_alloc(STREAM_BASE_BNF,m_fileblock,1,0,4,NULL);
+
+  return bnf;
+}
+
+/**
+ * Concatenates the features of all items into the caller's buffer, one row of
+ * STREAM_BASE_BNF floats per block, in item order.
+ *
+ * @param bnf     destination buffer
+ * @param bnfsize capacity of bnf in bytes
+ * @return fileBlock() on success, -1 if bnf is NULL or too small.
+ */
+int PcmFile::readbnf(char* bnf,int bnfsize){
+  int block = fileBlock();
+  // Items write m_fileblock rows regardless of the fps scaling applied by
+  // fileBlock(), so the buffer must hold the larger of the two.
+  const int rows = m_fileblock>block?m_fileblock:block;
+  const size_t allsize = (size_t)rows*STREAM_BASE_BNF*sizeof(float);
+  if(!bnf||bnfsize<0||(size_t)bnfsize<allsize)return -1;
+
+  // Temporary view on the caller's memory, used only for row addressing.
+  jmat_t* mbnf = jmat_alloc(STREAM_BASE_BNF,rows,1,0,4,bnf);
+  int inx = 0;
+  for(size_t k=0;k<vec_pcm.size();k++){
+    PcmItem* item = vec_pcm[k];
+    char* buf = jmat_row(mbnf,inx);
+    item->readbnf(buf);
+    inx += item->numblock();
+  }
+  // Release the wrapper; the caller's buffer stays untouched.
+  jmat_free(mbnf);
+  return block;
+}
+/**
+ * Reads the block for caller index `sinx` (divided by m_scale to get the
+ * internal block index) into `pcm` (optional) and `feat`.
+ * Only the first m_calccnt items are searched for the owning range.
+ * @return -1 if the index is at or past the end of the file, 0 if the block
+ *         has not been computed yet or no item owns it, otherwise the result
+ *         of PcmItem::readblock(); on success buf.sessid of the outputs is
+ *         set to the internal block index.
+ */
+int PcmFile::readblock(int sinx,jmat_t* pcm,jmat_t* feat){
+  //if(pcm->width!=STREAM_BASE_PCM)return -2001; 
+  //if(feat->width!=STREAM_BASE_BNF)return -2002; 
+  int inx = sinx/m_scale;
+  if(inx>=m_fileblock)return -1;
+  printf("===inx %d calc %d\n",inx,m_calccnt);
+  if(inx>=m_calcblock)return 0;
+  int rst = 0;
+  PcmItem* curitem = NULL;
+  int newinx = 0;
+  for(int k=0;k<m_calccnt;k++){
+    if((inx<m_arrmax[k])&&(inx>=m_arrmin[k])){ // item k covers [m_arrmin[k], m_arrmax[k])
+      curitem = vec_pcm[k];
+      newinx = inx - m_arrmin[k]; // index relative to the item
+      break;
+    }
+  }
+  if(curitem){
+    rst = curitem->readblock(newinx,pcm,feat);
+    if(rst){
+      if(pcm)pcm->buf.sessid = inx;
+      feat->buf.sessid = inx;
+    }
+    return rst;
+  }
+  return 0;
+}
+
+/**
+ * @param sessid id that every later call must present
+ * @param minoff look-back blocks given to every item after the first
+ * @param mincnt minimum blocks per item
+ * @param maxcnt maximum blocks per item
+ */
+PcmSession::PcmSession(uint64_t sessid,int minoff,int mincnt,int maxcnt){
+  m_sessid = sessid;
+
+  // Chunking limits, in blocks and in bytes.
+  m_minoff = minoff;
+  m_mincnt = mincnt;
+  m_maxcnt = maxcnt;
+  m_checkcnt = (mincnt+maxcnt)/2;
+  m_minsize = mincnt* STREAM_BASE_PCM;
+  m_maxsize = maxcnt* STREAM_BASE_PCM;
+
+  // Staging cache for incoming bytes: room for ten maximum-size items.
+  m_cachemax = STREAM_BASE_PCM*maxcnt*10;
+  m_pcmcache = (uint8_t*)malloc(m_cachemax);
+  m_cachepos = 0;
+  m_lastitem = NULL;
+
+  // Zero-initialised bookkeeping tables.
+  m_arrflag = (int*)calloc(1024,sizeof(int));
+  m_arrmax = (int*)calloc(1024000,sizeof(int));
+  m_arrmin = (int*)calloc(1024000,sizeof(int));
+}
+
+/** Destroys the remaining items (slots may already be NULL) and frees the cache and tables. */
+PcmSession::~PcmSession(){
+  for(size_t pos=0;pos<vec_pcm.size();++pos){
+    if(vec_pcm[pos]) delete vec_pcm[pos];
+    vec_pcm[pos] = NULL;
+  }
+  free(m_pcmcache);
+  free(m_arrflag);
+  free(m_arrmin);
+  free(m_arrmax);
+}
+/** Stores `flip`; it is passed to every PcmItem created afterwards. Returns 0. */
+int PcmSession::setflip(int flip){
+  m_flip = flip;
+  return 0;
+}
+
+/**
+ * Wraps `mat` in a new PcmItem tagged with the current sentence flag, feeds it
+ * the tail of the previous item as look-back audio, and records its block
+ * range. Ownership of `mat` passes to the item.
+ * @param noone unused
+ * @return always 1 (one item appended).
+ */
+int PcmSession::appenditem(jmat_t* mat,int noone){
+  const int nblocks = mat->height;
+  // The first item of a session has no predecessor, hence no look-back audio.
+  const int lookback = m_lastitem?m_minoff:0;
+  PcmItem* item = new PcmItem(m_curflag,lookback,nblocks,m_flip,m_fileblock);
+
+  jmat_t* context = m_lastitem?m_lastitem->readlast(lookback):NULL;
+  item->fillPcm(m_sessid,0,context,mat);
+
+  vec_pcm.push_back(item);
+  const size_t slot = vec_pcm.size()-1;
+  m_arrmin[slot] = m_fileblock;
+  m_fileblock += item->numblock();
+  m_arrmax[slot] = m_fileblock;
+
+  m_lastitem = item;
+  m_numpush += nblocks;
+  m_workcnt ++;
+  if(context)jmat_free(context);
+  return 1;
+}
+/**
+ * Cuts the bytes accumulated in m_pcmcache into items.
+ * Nothing happens until at least m_minsize bytes are cached. The first item
+ * of a session takes exactly m_mincnt blocks; after that, items of m_maxcnt
+ * blocks are emitted while enough bytes remain. If fewer than two computed
+ * items are waiting to be read, a shorter item (at least m_minsize bytes,
+ * whole blocks only) is emitted as well. Unconsumed bytes are moved to the
+ * front of the cache.
+ * @param flush unused
+ * @return sum of the appenditem() results.
+ */
+int PcmSession::checkpcmcache(int flush){
+  if(m_cachepos<m_minsize)return 0;
+  // curhead/cursize track the part of the cache that is not consumed yet.
+  uint8_t* curhead = m_pcmcache;
+  int cursize =  m_cachepos;
+  int rst = 0;
+  if(!m_lastitem){
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,m_mincnt,1,0,1,NULL);
+    memcpy(mat->data ,curhead,m_minsize);
+    rst += appenditem(mat);
+    cursize -= m_minsize;
+    curhead += m_minsize;
+  }
+  while(cursize >= m_maxsize){
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,m_maxcnt,1,0,1,NULL);
+    memcpy(mat->data ,curhead,m_maxsize);
+    rst += appenditem(mat);
+    cursize -= m_maxsize;
+    curhead += m_maxsize;
+  }
+  int dist =  m_calccnt - m_readcnt;
+  int force = dist<2;//distwait()<m_checkcnt;
+                     //printf("===dist %d cal %d read %d\n",dist,m_calccnt,m_readcnt);
+                     //printf("===force cnt %d\n",force);
+  if(force){
+    if(cursize >=m_minsize){
+      int chkblock = cursize / STREAM_BASE_PCM;
+      int chksize = chkblock * STREAM_BASE_PCM;
+      jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,chkblock,1,0,1,NULL);
+      memcpy(mat->data ,curhead,chksize);
+      curhead += chksize;
+      cursize -= chksize;
+      rst += appenditem(mat);
+    }
+  }
+  if(curhead!=m_pcmcache){ // something was consumed: compact the cache
+    m_cachepos = cursize;
+    memmove(m_pcmcache,curhead,cursize);
+  }
+  return rst;
+}
+
+/**
+ * Appends PCM bytes to the session through the staging cache and lets
+ * checkpcmcache() cut items from it.
+ *
+ * Input is copied in pieces no larger than the free space of the cache, and
+ * the free space is re-read after every checkpcmcache() call. Tracking the
+ * total independently of m_cachepos could copy past m_cachemax when the cache
+ * kept a remainder.
+ *
+ * @param sessid must match the session id
+ * @param buf    PCM bytes
+ * @param len    number of bytes in buf
+ * @return number of items appended (>= 0), -1 if the session is finished,
+ *         -2 on session id mismatch.
+ */
+int PcmSession::pushpcm(uint64_t sessid,uint8_t* buf,int len){
+  if(m_finished)return -1;
+  if(m_sessid!=sessid)return -2;
+  if(!buf||len<=0)return 0;
+
+  int rst = 0;
+  uint8_t* curhead = buf;
+  int cursize = len;
+
+  while(cursize>0){
+    const int room = m_cachemax - m_cachepos;
+    if(room<=0){
+      // The cache is full and checkpcmcache() could not drain it; stop rather
+      // than spin. The remaining bytes are not counted as pushed.
+      break;
+    }
+    const int cpsize = cursize<room?cursize:room;
+    memcpy(m_pcmcache + m_cachepos,curhead,cpsize);
+    m_cachepos += cpsize;
+    m_totalpush += cpsize;
+    curhead += cpsize;
+    cursize -= cpsize;
+    rst += checkpcmcache();
+  }
+  return rst;
+}
+
+/**
+ * Variant of pushpcm() that builds items directly from the caller's buffer
+ * and uses m_pcmcache only for leftovers shorter than m_minsize.
+ * Cached bytes from an earlier call are joined with the start of `buf` once
+ * the two together reach m_minsize; afterwards items of up to m_maxcnt blocks
+ * are cut from `buf` while at least m_minsize bytes remain, and the rest is
+ * cached for the next call.
+ * @return result of the last appenditem() call (0 if none was made),
+ *         -1 if the session is finished, -2 on session id mismatch.
+ */
+int PcmSession::simppcm(uint64_t sessid,uint8_t* buf,int len){
+  if(m_finished)return -1;
+  if(m_sessid!=sessid)return -2;
+  int rst = 0;
+  //printf("==curpos %d len %d\n",m_cachepos,len);
+  uint8_t* curhead = buf;
+  int cursize = len;
+  m_totalpush += len;
+  //int chkblock = m_first&&!m_lastitem?m_mincnt:m_maxcnt;
+  //int chksize = m_first&&!m_lastitem?m_firstsize:m_basesize;
+  //int chkfirst = m_first&&!m_lastitem;
+  if(m_cachepos){
+    int cnt = m_cachepos + len;
+    if(cnt>=m_minsize){
+      int chkblock = cnt / STREAM_BASE_PCM;
+      if(chkblock>m_maxcnt)chkblock = m_maxcnt;
+      int chksize = chkblock * STREAM_BASE_PCM;
+      jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,chkblock,1,0,1,NULL);
+      int cpsize = (m_cachepos > chksize)?chksize:m_cachepos; // bytes taken from the cache
+      memcpy(mat->data,m_pcmcache,cpsize);
+      int left = chksize - cpsize; // bytes still needed from buf
+      if(left>0) memcpy(mat->data + cpsize,buf,left);
+      //printf("append a %d\n",left);
+      m_cachepos -= cpsize;
+      cursize -= left;
+      curhead += left;
+      rst = appenditem(mat);
+    }else{
+      memcpy(m_pcmcache+ m_cachepos,buf,len); // still too little: keep accumulating
+      m_cachepos += len;
+      return 0;
+    }
+  }
+  while(cursize>=m_minsize){
+    //printf("pbbb\n");
+    int chkblock = cursize / STREAM_BASE_PCM;
+    if(chkblock>m_maxcnt)chkblock = m_maxcnt;
+    int chksize = chkblock * STREAM_BASE_PCM;
+
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,chkblock,1,0,1,NULL);
+    memcpy(mat->data ,curhead,chksize);
+    curhead += chksize;
+    cursize -= chksize;
+    rst = appenditem(mat);
+  }
+  if(cursize>0){
+    //printf("==cursize %d\n",cursize);
+    memcpy(m_pcmcache,curhead,cursize); // stash the leftover at the start of the cache
+    m_cachepos = cursize;
+  }
+  return rst;
+}
+/**
+ * Re-opens the session for more input: discards any cached bytes, clears
+ * m_finished and increments m_curflag, the id given to items appended later.
+ * @return 0, or -2 on session id mismatch.
+ */
+int PcmSession::conpcm(uint64_t sessid){
+  //if(m_finished)return -1;
+  if(m_sessid!=sessid)return -2;
+  m_cachepos = 0;
+  m_finished = 0;
+  m_curflag ++;
+  return 0;
+}
+/**
+ * Marks the end of input. Runs checkpcmcache() once more, then turns whatever
+ * is still cached into a final item, rounded up to whole blocks and
+ * zero-padded.
+ * @return 0, -1 if already finished, -2 on session id mismatch.
+ */
+int PcmSession::finpcm(uint64_t sessid){
+  if(m_finished)return -1;
+  if(m_sessid!=sessid)return -2;
+  checkpcmcache();
+  if(m_cachepos){
+    int block = m_cachepos / STREAM_BASE_PCM;
+    int left = m_cachepos % STREAM_BASE_PCM;
+    if(left)block++; // partial block: round up
+    jmat_t* mat = jmat_alloc(STREAM_BASE_PCM,block,1,0,1,NULL);
+    memset(mat->data,0,STREAM_BASE_PCM*block);
+    memcpy(mat->data,m_pcmcache,m_cachepos);
+    appenditem(mat);
+  }
+  m_finished = 1;
+  return 0;
+}
+
+/**
+ * Computes the features of the very first item of the session. Has an effect
+ * only once; afterwards runcalc() takes over.
+ *
+ * @return 0 in all cases except -2 on session id mismatch. When no item has
+ *         been appended yet nothing is done and the call can be repeated.
+ */
+int PcmSession::runfirst(uint64_t sessid,WeAI* weai){
+  if(m_sessid!=sessid)return -2;
+  if(!m_first)return 0;
+  if(m_calccnt)return 0;
+  // Nothing has been pushed yet: indexing the empty vector would be undefined
+  // behaviour. Leave m_first set so a later call can do the work.
+  if(vec_pcm.empty())return 0;
+  PcmItem* item = vec_pcm[m_calccnt];
+  if(item){
+    item->runWenet(weai);
+    m_numcalc += item->numblock();
+  }
+  m_calccnt ++;
+  m_first = 0;
+  return 0;
+}
+
+/**
+ * Computes the features of the next pending item, unless the reader is
+ * already at least `mincalc` items behind the calculator. When no item was
+ * computed, the call is used to release items the reader has long passed.
+ *
+ * @param mincalc maximum lead (in items) of computed over read items
+ * @return 1 if an item was computed, 0 if there was nothing to do,
+ *         -1 if runfirst() has not run yet or the session is finished and
+ *         fully computed, -2 on session id mismatch.
+ */
+int PcmSession::runcalc(uint64_t sessid,WeAI* weai,int mincalc){
+  if(m_sessid!=sessid)return -2;
+  if(m_first)return -1;
+  int rst = 0;
+  if(m_calccnt<m_workcnt){
+    int dist = m_calccnt - m_readcnt;
+    if(dist<mincalc){
+      PcmItem* item = vec_pcm[m_calccnt];
+      if(item){
+        item->runWenet(weai);
+        m_numcalc += item->numblock();
+      }
+      m_calccnt ++;
+      rst = 1;
+    }
+  }else if(m_finished){
+    rst = -1;
+  }
+  if(rst<1){
+    // Keep the five items before the read position; everything older is
+    // released. m_clrcnt is the first slot not yet released, so each slot is
+    // visited once instead of rescanning from index 0 on every idle call.
+    const int keepfrom = m_readcnt - 5;
+    if(keepfrom>m_clrcnt){
+      for(int k=m_clrcnt;k<keepfrom;k++){
+        PcmItem* item = vec_pcm[k];
+        vec_pcm[k] = NULL;
+        if(item) delete item;
+      }
+      m_clrcnt = keepfrom;
+    }
+  }
+  return rst;
+}
+
+/**
+ * Debug helper: writes the raw PCM of every item still held by the session
+ * to the file `dumpfn`. Items already released by runcalc() are skipped.
+ * Does nothing (apart from a message) if the file cannot be opened.
+ */
+void PcmSession::dump(char* dumpfn){
+  FILE* dumpfile = dumpfn?fopen(dumpfn,"wb"):NULL;
+  if(!dumpfile){
+    printf("===dump open failed\n");
+    return;
+  }
+  printf("===dump %zu\n",vec_pcm.size());
+  for(size_t k=0;k<vec_pcm.size();k++){
+    PcmItem* item = vec_pcm[k];
+    if(item) item->dump(dumpfile); // released slots are NULL
+  }
+  fclose(dumpfile);
+}
+/** Prints m_numcalc and m_numread, then returns m_numpush - m_numread (blocks pushed but not yet read). */
+int PcmSession::distwait(){
+  printf("===calc %d read %d \n",m_numcalc,m_numread);
+  return m_numpush - m_numread;
+}
+/**
+ * Raw-buffer front end of readnext(sessid, mpcm, mbnf): wraps the caller's
+ * buffers in temporary matrices, delegates, and frees the wrappers.
+ * @return -2 on session id mismatch or if bnflen != STREAM_ALL_BNF,
+ *         -1 if pcmlen != STREAM_BASE_PCM, otherwise the delegate's result.
+ */
+int PcmSession::readnext(uint64_t sessid,uint8_t* pcmbuf,int pcmlen,uint8_t* bnfbuf,int bnflen){
+  if(m_sessid!=sessid)return -2;
+  if(pcmlen!=STREAM_BASE_PCM)return -1;
+  if(bnflen!=STREAM_ALL_BNF)return -2;
+  jmat_t* mpcm = jmat_alloc(STREAM_BASE_PCM,1,1,0,1,pcmbuf);
+  jmat_t* mbnf = jmat_alloc(STREAM_BASE_BNF,20,1,0,4,bnfbuf);
+  int rst = readnext(sessid,mpcm,mbnf);
+  jmat_free(mpcm);
+  jmat_free(mbnf);
+  return rst;
+}
+/**
+ * Raw-buffer front end of readblock(sessid, mbnf, inx): wraps `bnfbuf` in a
+ * temporary 20-row matrix, delegates, and frees the wrapper.
+ * @return -2 on session id mismatch or if bnflen != STREAM_ALL_BNF,
+ *         otherwise the delegate's result.
+ */
+int PcmSession::readblock(uint64_t sessid,uint8_t* bnfbuf,int bnflen,int inx){
+  if(m_sessid!=sessid)return -2;
+  if(bnflen!=STREAM_ALL_BNF)return -2;
+  jmat_t* mbnf = jmat_alloc(STREAM_BASE_BNF,20,1,0,4,bnfbuf);
+  int rst = readblock(sessid,mbnf,inx);
+  jmat_free(mbnf);
+  return rst;
+
+}
+/**
+ * Random-access read of the feature window for session-wide block `inx`.
+ * The item at m_readcnt is tried first; otherwise the first m_calccnt items
+ * are searched and m_readcnt is moved to the item that owns the block.
+ * @return -2 on session id mismatch, -2002 if mbnf has the wrong width,
+ *         -99 if the block has not been computed yet, 0 if no live item owns
+ *         the block, otherwise the result of PcmItem::readblock(); on success
+ *         mbnf->buf.sessid is set to inx.
+ */
+int PcmSession::readblock(uint64_t sessid,jmat_t* mbnf,int inx){
+  if(m_sessid!=sessid)return -2;
+  if(mbnf->width!=STREAM_BASE_BNF)return -2002; 
+  //if(inx>=m_calccnt)return -99;
+  //printf("===inx %d num %d\n",inx,m_numcalc);
+  if(inx>=m_numcalc)return -99;
+  int rst = 0;
+  PcmItem* curitem = NULL;
+  int newinx = 0;
+  if((inx<m_arrmax[m_readcnt])&&(inx>=m_arrmin[m_readcnt])){ // fast path: same item as last time
+    curitem = vec_pcm[m_readcnt];
+    newinx = inx - m_arrmin[m_readcnt];
+  }else{
+    for(int k=0;k<m_calccnt;k++){
+      //printf("==k %d max %d min %d\n",k,m_arrmax[k],m_arrmin[k]);
+      if((inx<m_arrmax[k])&&(inx>=m_arrmin[k])){
+        curitem = vec_pcm[k]; // may be NULL if runcalc() already released it
+        m_readcnt = k;
+        newinx = inx - m_arrmin[k];
+        break;
+      }
+    }
+  }
+  //printf("===curitem %p inx %d new %d\n",curitem,inx ,newinx);
+  if(curitem){
+    rst = curitem->readblock(newinx,NULL,mbnf);
+    if(rst){
+      mbnf->buf.sessid = inx;
+    }
+    return rst;
+  }
+  return 0;
+}
+/**
+ * Sequential read of the next block (PCM plus feature window).
+ * NOTE(review): unlike the other entry points, `sessid` is not compared with
+ * m_sessid here.
+ * @return -2001 / -2002 if mpcm / mbnf has the wrong width;
+ *         the item's sentence id (itemsentid()) when a block was delivered;
+ *         0 when nothing is available right now (no computed item, or the
+ *         current item was just exhausted and the read position moved on);
+ *         -1 when everything pushed has been read and the session is finished.
+ */
+int PcmSession::readnext(uint64_t sessid,jmat_t* mpcm,jmat_t* mbnf){
+  if(mpcm->width!=STREAM_BASE_PCM)return -2001; 
+  if(mbnf->width!=STREAM_BASE_BNF)return -2002; 
+  //printf("===p %d r %d\n",m_totalpush,m_totalread);
+  if(m_totalread<m_totalpush){
+    //printf("===q %d r %d\n",m_readcnt,m_calccnt);
+    if(m_readcnt<m_calccnt){
+      PcmItem* item = vec_pcm[m_readcnt];      
+      int rst = item->readblock(mpcm,mbnf);
+      if(!rst){
+        m_readcnt++; // item exhausted: advance, the caller retries
+        return 0;
+      }else{
+#ifdef PCMDEBUG
+        if(1){
+          char fn[255];
+          sprintf(fn,"out_%d.data",++m_debugout);
+          FILE* df = fopen(fn,"wb");
+          fwrite(mpcm->data,1,STREAM_BASE_PCM,df);
+          fclose(df);
+        }
+#endif
+        m_numread += 1;
+        m_totalread+=STREAM_BASE_PCM;
+        return item->itemsentid();
+      }
+    }else{
+      return 0;
+    }
+  }else{
+    return m_finished?-1:0;
+  }
+}
+
+
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/dhpcm.h b/duix-sdk/src/main/cpp/dhmfcc/dhpcm.h
new file mode 100644
index 0000000..b98150b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/dhpcm.h
@@ -0,0 +1,168 @@
+                                                                                        ///*
+#pragma once
+#include "dh_data.h"
+#include "aicommon.h"
+#include <mutex>
+#include <vector>
+#include "dhwenet.h"
+#include "wenetai.h"
+
+//#define PCMDEBUG 1
+#define AIRUN_FLAG 1
+class PcmItem{ // One chunk of PCM audio together with the buffers used to derive its features.
+  private:
+    uint64_t m_sessid = 0; // set by fillPcm()
+    int   m_minoff = 0; // look-back blocks placed in front of the payload
+    int   m_maxblock = 0; // payload capacity (blocks) the buffers were sized for
+
+    int   pcm_allsamp = 0; // samples in m_pcm used for the current payload
+    int   bnf_allcnt = 0; // BNF rows for the current payload
+    int   mel_allcnt = 0; // mel frames for the current payload
+    jmat_t* m_wav = NULL; // payload as passed to fillPcm(); freed by the destructor
+    jmat_t* m_pcm = NULL; // float samples: look-back + payload + padding
+    jmat_t* m_mel = NULL; // mel output of runWenet()
+    jmat_t* m_bnf = NULL; // network output of runWenet()
+    jmat_t* m_bnfflip = NULL; // allocated like m_bnf; only referenced by commented-out code in dhpcm.cpp
+    jmat_t* m_mfcc = NULL; // payload rows of m_bnf (+19), created by runWenet()
+    int   pcm_block = 0; // payload blocks
+    int   pcm_read = 0; // blocks handed out by readblock(pcm,mfcc)
+    int   pre_block = 0; // rows of the look-back matrix given to fillPcm()
+    uint64_t m_pcminx = 0; // tick index given to fillPcm()
+    //gjvad_t* m_vad = NULL;
+    int   m_ready = 0; // set to 1 by runWenet()
+    int m_sentid = 1; // returned by itemsentid()
+    int m_flip = 0; // selects the flipped path in readblock(inx,...)
+    int m_inx = 0; // start index reported by startinx()
+    jmat_t* mat_flip = NULL; // window view used on the flipped path
+  public:
+    int itemsentid(){return m_sentid;};
+    int blocks(){return pcm_block;};
+    int ready(){return m_ready;};
+    int finished(){return pcm_read>=pcm_block;}; // all blocks read sequentially
+    int reset();
+    PcmItem(int sentid,int minoff  ,int maxblock ,int flip,int inx);
+    int fillPcm(uint64_t sessid,uint64_t tickinx,jmat_t* premat,jmat_t* mat);
+    int checkValid(uint64_t tickinx);
+    jmat_t* readlast(int minoff);
+    int runWenet(WeAI* weai);
+    int readblock(jmat_t* pcm,jmat_t* mfcc); // sequential read, advances pcm_read
+    int readblock(int inx,jmat_t* pcm,jmat_t* mfcc); // random-access read
+    int readbnf(char* buf);
+    int numblock();
+    int startinx(){return m_inx;};
+    int endinx(){return m_inx+pcm_block;};
+    int readblock(); // returns pcm_read
+    void dump(FILE* dumpfile);
+    ~PcmItem();
+};
+/**
+ * Offline counterpart of PcmSession: a complete PCM buffer is split into
+ * PcmItem chunks by prepare(), processed by process(), and read back by block
+ * index with readblock() / readbnf(). Caller-side indices are scaled by
+ * m_scale = fps/25.
+ */
+class PcmFile{
+  private:
+    int         m_fps = 25;
+    int         m_adj = 0; // 1 when fps != 25
+    float       m_scale = 1.0f; // fps / 25
+    int         m_minoff = 0; // look-back blocks per item
+    int         m_mincnt = 0; // minimum blocks per item
+    int         m_maxcnt = 0; // maximum blocks per item
+    int         m_minsize = 0; // m_mincnt in bytes
+    int         m_maxsize = 0; // m_maxcnt in bytes
+
+    int         m_fileblock = 0; // blocks appended so far
+    int         m_calcblock = 0; // blocks processed so far
+    int         m_clrcnt = 0;
+    int         m_readcnt = 0;
+    int         m_calccnt = 0; // items processed so far
+    int         *m_arrmax = NULL; // per item: end of its block range (exclusive)
+    int         *m_arrmin = NULL; // per item: start of its block range
+    std::vector<PcmItem*>  vec_pcm ;
+    int       appenditem(jmat_t* mat,int noone=0);
+    PcmItem     *m_lastitem = NULL; // most recently created item (look-back source)
+    PcmItem     *m_lastread = NULL;
+    int         m_presize = 0; // pre-roll size in bytes
+    int         m_preblock = 0; // pre-roll size in blocks
+    PcmItem     *m_preitem = NULL;; // pre-roll item; only supplies look-back audio
+    int         m_flip = 0;
+  public:
+    PcmFile(int fps = 25,int minoff = STREAM_BASE_MINOFF,int mincnt = STREAM_BASE_MINBLOCK,int maxcnt = STREAM_BASE_MAXBLOCK);
+    int setflip(int flip);
+    int prepare(std::string& pcmfn); // stub, always returns 0
+    int prepare(char* buf,int size,char* prebuf = NULL,int presize = 0);
+    int itemSize();
+    int process(int inx,WeAI* ai); // inx < 0 processes every item
+    int readblock(int sinx,jmat_t* pcm,jmat_t* feat);
+    jmat_t* readbnf(int sinx);
+    int readbnf(char* bnf,int bnfsize);
+    int fileBlock(){return m_fileblock*m_scale;}; // appended blocks in caller units
+    int calcBlock(){return m_calcblock*m_scale;}; // processed blocks in caller units
+    virtual ~PcmFile();
+};
+
+/**
+ * Streaming PCM pipeline for one session: bytes are pushed in (pushpcm /
+ * simppcm / finpcm), cut into PcmItem chunks, turned into features
+ * (runfirst / runcalc) and read back block by block (readnext / readblock).
+ * The code takes no lock; m_lock is declared but every use in dhpcm.cpp is
+ * commented out.
+ */
+class PcmSession{
+  private:
+    // 64-bit like every sessid parameter of the public API; an int member
+    // would truncate the id and make all comparisons fail for ids above INT_MAX.
+    uint64_t    m_sessid = 0;
+
+    int         m_minoff = 0; // look-back blocks per item
+    int         m_mincnt = 0; // minimum blocks per item
+    int         m_maxcnt = 0; // maximum blocks per item
+    int         m_minsize = 0; // m_mincnt in bytes
+    int         m_maxsize = 0; // m_maxcnt in bytes
+    //int         m_basesize = 0;
+    //int         m_firstsize = 0;
+
+    int         m_cachepos = 0; // bytes currently held in m_pcmcache
+    int         m_cachemax = 0; // capacity of m_pcmcache in bytes
+    uint8_t      *m_pcmcache = NULL;
+
+    std::mutex  m_lock;
+    int         *m_arrflag = NULL;
+    int         m_curflag = 1; // sentence id given to new items; bumped by conpcm()
+
+    std::vector<PcmItem*>  vec_pcm ; // slots are set to NULL once released
+    PcmItem     *m_lastitem = NULL; // most recently appended item
+
+    volatile int         m_clrcnt = 0; // cleanup progress of runcalc()
+    volatile int         m_workcnt = 0; // items appended
+    volatile int         m_readcnt = 0; // index of the item being read
+    volatile int         m_calccnt = 0; // items computed
+    int       appenditem(jmat_t* mat,int noone=0);
+
+    volatile int       m_totalpush = 0; // bytes pushed
+    volatile int       m_totalread = 0; // bytes read
+    volatile int       m_finished = 0;
+    int       m_first = 1; // cleared by runfirst()
+    int     m_debuginx = 0;
+    int     m_debugout = 0;
+    int     checkpcmcache(int flash=0);
+    int     m_numcalc = 0; // blocks computed
+    int     m_numread = 0; // blocks read
+    int     m_numpush = 0; // blocks appended
+    int     distwait();
+    int     m_checkcnt = 0;
+    int     m_flip = 0;
+    int         *m_arrmax = NULL; // per item: end of its block range (exclusive)
+    int         *m_arrmin = NULL; // per item: start of its block range
+    int         m_fileblock = 0; // blocks appended, used as running block index
+    int         m_calcblock = 0;
+  public:
+    int setflip(int flip);
+    uint64_t sessid(){return m_sessid;};
+    int simppcm(uint64_t sessid,uint8_t* buf,int len);
+    int pushpcm(uint64_t sessid,uint8_t* buf,int len);
+    int finpcm(uint64_t sessid);
+    int conpcm(uint64_t sessid);
+    int runcalc(uint64_t sessid,WeAI* weai,int mincalc=1);
+    int runfirst(uint64_t sessid,WeAI* weai);
+    int readnext(uint64_t sessid,jmat_t* mpcm,jmat_t* mbnf);
+    int readnext(uint64_t sessid,uint8_t* pcmbuf,int pcmlen,uint8_t* bnfbuf,int bnflen);
+    int readblock(uint64_t sessid,jmat_t* mbnf,int index);
+    int readblock(uint64_t sessid,uint8_t* bnfbuf,int bnflen,int inx);
+    PcmSession(uint64_t sessid,int minoff = STREAM_BASE_MINOFF,int mincnt = STREAM_BASE_MINBLOCK,int maxcnt = STREAM_BASE_MAXBLOCK);
+    ~PcmSession();
+    void dump(char* dumpfn);
+    int first(){return m_first;};
+    int fileBlock(){return m_fileblock;};
+    //int calcBlock(){return m_calcblock;};
+    int calcBlock(){return m_numcalc;};
+};
+
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/dhwenet.cpp b/duix-sdk/src/main/cpp/dhmfcc/dhwenet.cpp
new file mode 100644
index 0000000..4ef27a5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/dhwenet.cpp
@@ -0,0 +1,44 @@
+#include "dhwenet.h"
+#include <stdio.h>
+#include <vector>
+#include <string>
+#include "aicommon.h"
+#include "mfcc/mfcc.hpp"
+
+
+/** Number of mel frames for `pcmblock` blocks: pad by 2*STREAM_MFCC_FILL blocks, convert to samples, divide by 160 and add one. */
+int DhWenet::cntmel(int pcmblock){
+  const int padded = pcmblock + 2*STREAM_MFCC_FILL;
+  const int samples = padded*STREAM_BASE_SAMP;
+  return samples/160+1;
+}
+
+/** Number of BNF rows for `melblock` mel frames: melblock*0.25 - 0.75, truncated to int. */
+int DhWenet::cntbnf(int melblock){
+  return (int)(melblock*0.25f - 0.75f);
+}
+
+/** Runs log_mel() over one fixed-size chunk of MFCC_WAVCHUNK samples at 16000 Hz and returns its result. */
+int DhWenet::calcmfcc(float* fwav,float* mel2){
+    const int status = log_mel(fwav,MFCC_WAVCHUNK, 16000,mel2);
+    return status;
+}
+
+/** Runs log_mel() over `fsample` samples at 16000 Hz and returns its result; `melcnt` is not used. */
+int DhWenet::calcmfcc(float* fwav,int fsample,float* mel2,int melcnt){
+    const int status = log_mel(fwav,fsample, 16000,mel2);
+    return status;
+}
+
+/**
+ * Row-wise variant: runs log_mel() on every row of `mwav` (MFCC_WAVCHUNK
+ * samples each), writing to the same row of `mmel`.
+ * @return the result of the last row, or 0 if mwav has no rows.
+ */
+int DhWenet::calcmfcc(jmat_t* mwav,jmat_t* mmel){
+    int last = 0;
+    size_t row = 0;
+    while(row<mwav->height){
+        float* src = (float*)jmat_row(mwav,row);
+        float* dst = (float*)jmat_row(mmel,row);
+        last = log_mel(src,MFCC_WAVCHUNK, 16000,dst);
+        ++row;
+    }
+    return last;
+}
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/dhwenet.h b/duix-sdk/src/main/cpp/dhmfcc/dhwenet.h
new file mode 100644
index 0000000..667a669
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/dhwenet.h
@@ -0,0 +1,14 @@
+#pragma once
+#include "dh_data.h"
+#include "wenetai.h"
+#include <mutex> 
+/** Static helpers around log_mel(): mel computation for raw or matrix input, plus frame-count arithmetic. */
+class DhWenet{
+    public:
+        static int calcmfcc(jmat_t* mwav,jmat_t* mmel); // one log_mel() call per row
+        static int calcmfcc(float* fwav,float* mel2); // one chunk of MFCC_WAVCHUNK samples
+        static int calcmfcc(float* fwav,int fsample,float* mel2,int melcnt); // fsample samples; melcnt unused
+        static int cntmel(int pcmblock); // mel frames for pcmblock blocks (with padding)
+        static int cntbnf(int melblock); // BNF rows for melblock mel frames
+
+};
diff --git a/duix-sdk/src/main/cpp/dhmfcc/iir_filter.cpp b/duix-sdk/src/main/cpp/dhmfcc/iir_filter.cpp
new file mode 100644
index 0000000..11e5e7b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/iir_filter.cpp
@@ -0,0 +1,310 @@
+#include "mfcc/iir_filter.hpp"
+#include <stdio.h>
+
+#ifdef UES_IIR_I
+
+// Zeroes the coefficient arrays m_pNum[0..m_num_order] and m_pDen[0..m_den_order].
+// NOTE(review): the history buffers m_px/m_py are not touched, so this clears the
+// filter's coefficients rather than its state -- confirm this is intended.
+void IIR_I::reset()
+{
+    for(int i = 0; i <= m_num_order; i++)
+    {
+        m_pNum[i] = 0.0;
+    }
+    for(int i = 0; i <= m_den_order; i++)
+    {
+        m_pDen[i] = 0.0;
+    }
+}
+// Creates an unconfigured filter: no buffers allocated, both orders set to -1.
+IIR_I::IIR_I()
+    : m_pNum(NULL),
+      m_pDen(NULL),
+      m_px(NULL),
+      m_py(NULL),
+      m_num_order(-1),
+      m_den_order(-1)
+{
+}
+// Frees the coefficient and history buffers and clears the pointers.
+IIR_I::~IIR_I()
+{
+    delete[] m_pNum;
+    m_pNum = NULL;
+    delete[] m_pDen;
+    m_pDen = NULL;
+    delete[] m_px;
+    m_px = NULL;
+    delete[] m_py;
+    m_py = NULL;
+}
+
+/** \brief Installs new coefficients and clears the input/output history.
+ *
+ * The "order" arguments are used as element counts here: exactly num_order
+ * values are read from num and den_order values from den. Every internal
+ * buffer has one extra trailing slot, which is set to 0.
+ *
+ * \param num numerator coefficients in ascending order, num[0] is the constant term
+ * \param num_order number of entries read from num
+ * \param den denominator coefficients in ascending order, den[0] is the constant term
+ * \param den_order number of entries read from den
+ * \return
+ */
+void IIR_I::setPara(double num[], int num_order, double den[], int den_order)
+{
+    // Release whatever a previous call allocated.
+    delete[] m_pNum;
+    delete[] m_pDen;
+    delete[] m_px;
+    delete[] m_py;
+
+    m_pNum = new double[num_order + 1];
+    m_pDen = new double[den_order + 1];
+    m_num_order = num_order;
+    m_den_order = den_order;
+    m_px = new double[num_order + 1];
+    m_py = new double[den_order + 1];
+
+    // Numerator side: copy the coefficients, zero the input history.
+    for(int n = 0; n < m_num_order; ++n)
+    {
+        m_px[n] = 0.0;
+        m_pNum[n] = num[n];
+    }
+    m_px[m_num_order] = 0.0;
+    m_pNum[m_num_order] = 0.0;
+
+    // Denominator side: copy the coefficients, zero the output history.
+    for(int d = 0; d < m_den_order; ++d)
+    {
+        m_py[d] = 0.0;
+        m_pDen[d] = den[d];
+    }
+    m_py[m_den_order] = 0.0;
+    m_pDen[m_den_order] = 0.0;
+}
+
+/** \brief Computes the time-domain response of the filter without touching its internal state.
+ * \param data_in filter input; input before time 0 is treated as 0, and input at index M or later is treated as data_in[M-1]
+ * \param data_out filter output
+ * \param M length of the input data
+ * \param N length of the output data
+ * \return
+ */
+void IIR_I::resp(double data_in[], int M, double data_out[], int N)
+{
+    for(int n = 0; n < N; n++)
+    {
+        data_out[n] = 0.0;
+        // Feed-forward taps; taps that would reach before time 0 are skipped.
+        for(int tap = 0; tap <= m_num_order && tap <= n; tap++)
+        {
+            int src = n - tap;
+            if(src >= M)
+            {
+                src = M - 1;   // hold the last input sample
+            }
+            data_out[n] += m_pNum[tap] * data_in[src];
+        }
+        // Feedback taps over already computed outputs.
+        for(int tap = 1; tap <= m_den_order && tap <= n; tap++)
+        {
+            data_out[n] -= m_pDen[tap] * data_out[n - tap];
+        }
+    }
+}
+
+/** \brief Filters a buffer using a direct form I structure.
+ * Note: this function was modified from the generic version to follow the
+ * design of scipy.signal.lfilter while porting librosa.pcen.
+ *
+ * The input/output history (m_px, m_py) is kept across calls, except that
+ * m_py[1] is overwritten with 1 at the start of every call.
+ *
+ * \param data_in[] input samples
+ * \param data_out[] receives the filtered samples
+ * \param len number of samples in both arrays
+ * \return
+ */
+void IIR_I::filter(double data_in[], double data_out[], int len)
+{
+    int i, k;
+    m_py[1] = 1; // Modification: the formula uses y[n-k], so the first sample needs y[-1]; pcen treats y[-1] as 1.
+    for(k = 0; k < len; k++)
+    {
+        // Slot 0 holds the current sample; higher slots hold older samples.
+        m_px[0] = data_in[k];
+        m_py[0] = 0.0;
+        // Feed-forward part: sum of m_pNum[i] * x[n-i].
+        for(i = 0; i <= m_num_order; i++)
+        {
+            m_py[0] = m_py[0] + m_pNum[i] * m_px[i];
+        }
+        // Feedback part: subtract m_pDen[i] * y[n-i].
+        for(i = 1; i <= m_den_order; i++)
+        {
+            m_py[0] = m_py[0] - m_pDen[i] * m_py[i];
+        }
+        // Age both histories by one sample.
+        for(i = m_num_order; i >= 1; i--)
+        {
+            m_px[i] = m_px[i-1];
+        }
+        for(i = m_den_order; i >= 1; i--)
+        {
+            m_py[i] = m_py[i-1];
+        }
+        data_out[k] = m_py[0];
+    }
+}
+
+#endif
+
+#ifdef UES_IIR_II
+
+
+// Creates an unconfigured filter: no buffers, orders -1, state length 0.
+IIR_II::IIR_II()
+    : m_pNum(NULL),
+      m_pDen(NULL),
+      m_pW(NULL),
+      m_num_order(-1),
+      m_den_order(-1),
+      m_N(0)
+{
+}
+
+// Clears the m_N entries of the state buffer m_pW.
+void IIR_II::reset()
+{
+    for(int idx = m_N - 1; idx >= 0; --idx)
+    {
+        m_pW[idx] = 0.0;
+    }
+}
+/** \brief Installs new coefficients and clears the filter state.
+ *
+ * All three internal buffers get max(num_order, den_order) + 1 entries;
+ * coefficients beyond the supplied order are left at 0.
+ *
+ * \param num numerator coefficients in ascending order, num[0] is the constant term
+ * \param num_order order of the numerator polynomial (num holds num_order + 1 values)
+ * \param den denominator coefficients in ascending order, den[0] is the constant term
+ * \param den_order order of the denominator polynomial (den holds den_order + 1 values)
+ * \return
+ */
+void IIR_II::setPara(double num[], int num_order, double den[], int den_order)
+{
+    delete[] m_pNum;
+    delete[] m_pDen;
+    delete[] m_pW;
+    m_num_order = num_order;
+    m_den_order = den_order;
+    // Plain integer comparison: avoids the floating-point fmax() and its
+    // dependency on <cmath>, which this file does not include.
+    m_N = (num_order > den_order ? num_order : den_order) + 1;
+    m_pNum = new double[m_N];
+    m_pDen = new double[m_N];
+    m_pW = new double[m_N];
+    for(int i = 0; i < m_N; i++)
+    {
+        m_pNum[i] = 0.0;
+        m_pDen[i] = 0.0;
+        m_pW[i] = 0.0;
+    }
+    for(int i = 0; i <= num_order; i++)
+    {
+        m_pNum[i] = num[i];
+    }
+    for(int i = 0; i <= den_order; i++)
+    {
+        m_pDen[i] = den[i];
+    }
+}
+/** \brief Computes the time-domain response of the filter without touching its internal state.
+ * \param data_in filter input; input before time 0 is treated as 0, and input at index M or later is treated as data_in[M-1]
+ * \param data_out filter output
+ * \param M length of the input data
+ * \param N length of the output data
+ * \return
+ */
+void IIR_II::resp(double data_in[], int M, double data_out[], int N)
+{
+    for(int n = 0; n < N; n++)
+    {
+        data_out[n] = 0.0;
+        // Feed-forward taps; taps that would reach before time 0 are skipped.
+        for(int tap = 0; tap <= m_num_order && tap <= n; tap++)
+        {
+            int src = n - tap;
+            if(src >= M)
+            {
+                src = M - 1;   // hold the last input sample
+            }
+            data_out[n] += m_pNum[tap] * data_in[src];
+        }
+        // Feedback taps over already computed outputs.
+        for(int tap = 1; tap <= m_den_order && tap <= n; tap++)
+        {
+            data_out[n] -= m_pDen[tap] * data_out[n - tap];
+        }
+    }
+}
+/** \brief Filters one sample using a direct form II structure.
+ *
+ * \param data input sample
+ * \return filtered sample
+ */
+double IIR_II::filter(double data)
+{
+    // Update the state w[n] first: w[n] = x[n] - sum(den[i] * w[n-i]).
+    m_pW[0] = data;
+    for(int i = 1; i <= m_den_order; i++)
+    {
+        m_pW[0] -= m_pDen[i] * m_pW[i];
+    }
+
+    // Output: y[n] = sum(num[i] * w[n-i]).
+    double out = 0.0;
+    for(int i = 0; i <= m_num_order; i++)
+    {
+        out += m_pNum[i] * m_pW[i];
+    }
+
+    // Age the state by one sample.
+    for(int i = m_N - 1; i > 0; --i)
+    {
+        m_pW[i] = m_pW[i-1];
+    }
+    return out;
+}
+/** \brief Filters a buffer in place using a direct form II structure.
+ *
+ * \param data[] input samples on entry, filtered samples on return
+ * \param len length of data[]
+ * \return
+ */
+void IIR_II::filter(double data[], int len)
+{
+    // Each sample goes through the single-sample overload, which performs the
+    // same state update, output sum and state shift.
+    for(int pos = 0; pos < len; pos++)
+    {
+        data[pos] = filter(data[pos]);
+    }
+}
+/** \brief Filters a buffer using a direct form II structure.
+ *
+ * \param data_in[] input samples
+ * \param data_out[] receives the filtered samples
+ * \param len length of both arrays
+ * \return
+ */
+void IIR_II::filter(double data_in[], double data_out[], int len)
+{
+    // Each sample goes through the single-sample overload, which performs the
+    // same state update, output sum and state shift.
+    for(int pos = 0; pos < len; pos++)
+    {
+        data_out[pos] = filter(data_in[pos]);
+    }
+}
+
+#endif
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/mfcc.cpp b/duix-sdk/src/main/cpp/dhmfcc/mfcc.cpp
new file mode 100644
index 0000000..fc93f89
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/mfcc.cpp
@@ -0,0 +1,369 @@
+#include "mfcc/mfcc.hpp"
+#include "mfcc/AudioFFT.hpp"
+#include "mfcc/iir_filter.hpp"
+#include "opencv2/core.hpp"
+
+// Feature-extraction settings shared by log_mel() and pcen().
+static int nSamplesPerSec = 16000;   // the only sample rate log_mel()/pcen() accept
+static int length_DFT = 1024;        // FFT size (commented-out alternative: 2048)
+static int hop_length = 160;         // samples between frames (commented-out alternative: int(0.05 * nSamplesPerSec))
+static int win_length = 800;         // window length in samples (commented-out alternative: int(0.1 * nSamplesPerSec))
+static int number_filterbanks = 80;  // number of mel bands
+static float preemphasis = 0.97;     // pre-emphasis coefficient applied in log_mel()
+static int max_db = 100;             // not referenced by the functions in this file
+static int ref_db = 20;              // subtracted from the dB-scaled mel values in log_mel()
+static int r = 1;                    // only compared against 1 in log_mel(); no effect on the output
+static double pi = 3.14159265358979323846;
+
+// Built on first use and then reused by every later call.
+static cv::Mat_<double> mel_basis;
+static cv::Mat_<float> hannWindow;
+
+// Smoothing filter used by pcen(), created on first use.
+static std::shared_ptr<IIR_I> filter;
+
+// Converts a frequency in Hz to mels (ported from the Python librosa library,
+// scalar case only). With htk == true the HTK formula 2595*log10(1 + f/700) is
+// used; otherwise the scale is linear below 1000 Hz and logarithmic from
+// 1000 Hz upwards.
+static double hz_to_mel(double frequencies, bool htk = false) {
+    if (htk) {
+        return 2595.0 * log10(1.0 + frequencies / 700.0);
+    }
+
+    const double f_min = 0.0;
+    const double f_sp = 200.0 / 3;
+    const double min_log_hz = 1000.0;   // beginning of log region (Hz)
+
+    // Linear region
+    if (!(frequencies >= min_log_hz)) {
+        return (frequencies - f_min) / f_sp;
+    }
+
+    // Log-scale region
+    const double min_log_mel = (min_log_hz - f_min) / f_sp;   // same boundary in mels
+    const double logstep = log(6.4) / 27.0;                   // step size for log region
+    return min_log_mel + log(frequencies / min_log_hz) / logstep;
+}
+
+// Converts a row of mel values to frequencies in Hz (ported from the Python
+// librosa library). Only row 0 of `mels` gets the log-region correction, and
+// the `htk` flag is accepted but not used.
+static cv::Mat_<double> mel_to_hz(cv::Mat_<double> mels, bool htk = false) {
+    const double f_min = 0.0;
+    const double f_sp = 200.0 / 3;
+    const double min_log_hz = 1000.0;                         // beginning of log region (Hz)
+    const double min_log_mel = (min_log_hz - f_min) / f_sp;   // same (Mels)
+    const double logstep = log(6.4) / 27.0;                   // step size for log region
+
+    // Linear scale for every element first ...
+    cv::Mat_<double> freqs = mels * f_sp + f_min;
+
+    // ... then overwrite the entries that fall into the log region.
+    for (int col = 0; col < mels.cols; col++) {
+        const double mel = mels(0, col);
+        if (mel >= min_log_mel) {
+            freqs(0, col) = cv::exp((mel - min_log_mel) * logstep) * min_log_hz;
+        }
+    }
+    return freqs;
+}
+
+// Returns a 1 x length row of evenly spaced values from min_ to max_ inclusive.
+// A single-element request yields {min_}; the general formula would divide by
+// zero there and produce NaN.
+static cv::Mat_<double> cvlinspace(double min_, double max_, int length) {
+    auto cvmat = cv::Mat_<double>(1, length);
+    if (length == 1) {
+        cvmat(0, 0) = min_;
+        return cvmat;
+    }
+    for (int i = 0; i < length; i++) {
+        cvmat(0, i) = ((max_ - min_) / (length - 1) * i) + min_;
+    }
+    return cvmat;
+}
+
+//"""Create a Filterbank matrix to combine FFT bins into Mel-frequency bins"""
+// nps: sample rate in Hz; n_fft: FFT size; n_mels: number of mel bands.
+// Returns an n_mels x (1 + n_fft/2) matrix of triangular filter weights.
+static cv::Mat_<double> mel_spectrogram_create(int nps, int n_fft, int n_mels) {
+    double f_max = nps / 2.0;
+    double f_min = 0;
+    int n_fft_2 = 1 + n_fft / 2;
+    // Initialize the weights
+    //auto weights = nc::zeros<double>(nc::uint32(n_mels), nc::uint32(n_fft_2));
+    auto weights = cv::Mat_<double>(n_mels, n_fft_2, 0.0);
+    // Center freqs of each FFT bin
+    //auto fftfreqs_ = nc::linspace<double>(f_min, f_max, nc::uint32(n_fft_2), true);
+    auto fftfreqs = cvlinspace(f_min, f_max, n_fft_2);
+
+    // 'Center freqs' of mel bands - uniformly spaced between limits
+    double min_mel = hz_to_mel(f_min, false);
+    double max_mel = hz_to_mel(f_max, false);
+    //auto mels_ = nc::linspace(min_mel, max_mel, nc::uint32(n_mels + 2));
+    auto mels = cvlinspace(min_mel, max_mel, n_mels + 2);
+    auto mel_f = mel_to_hz(mels, false);
+
+    //auto fdiff_ = nc::diff(mel_f_); // discrete difference along the axis (each element minus the previous one)
+    // d1 and d2 are headers over mel_f's own data, offset by one element, so d1 - d2 is the first difference.
+    cv::Mat_<double> d1(1, mel_f.cols * mel_f.rows - 1, (double *) (mel_f.data) + 1);
+    cv::Mat_<double> d2(1, mel_f.cols * mel_f.rows - 1, (double *) (mel_f.data));
+    cv::Mat_<double> fdiff = d1 - d2;
+
+    //auto ramps = nc::subtract.outer(mel_f, fftfreqs); // nc has no subtract.outer
+    //nc::NdArray<double> ramps = nc::zeros<double>(mel_f.cols, fftfreqs.cols);
+    // Outer difference: ramps(i, j) = mel_f[i] - fftfreqs[j].
+    auto ramps = cv::Mat_<double>(mel_f.cols, fftfreqs.cols);
+    for (int i = 0; i < mel_f.cols; i++) {
+        for (int j = 0; j < fftfreqs.cols; j++) {
+            ramps(i, j) = mel_f(0, i) - fftfreqs(0, j);
+        }
+    }
+
+    for (int i = 0; i < n_mels; i++) {
+        // lower and upper slopes for all bins
+        //auto ramps_1 = nc::NdArray<double>(1, ramps.cols);
+        auto ramps_1 = cv::Mat_<double>(1, ramps.cols);
+        for (int j = 0; j < ramps.cols; j++) {
+            ramps_1(0, j) = ramps(i, j);
+        }
+        //auto ramps_2 = nc::NdArray<double>(1, ramps.cols);
+        auto ramps_2 = cv::Mat_<double>(1, ramps.cols);
+        for (int j = 0; j < ramps.cols; j++) {
+            ramps_2(0, j) = ramps(i + 2, j);
+        }
+        cv::Mat_<double> lower = ramps_1 * -1 / fdiff(0, i);
+        cv::Mat_<double> upper = ramps_2 / fdiff(0, i + 1);
+        // .. then intersect them with each other and zero
+        //auto weights_1 = nc::maximum(nc::zeros<double>(1, ramps.cols), nc::minimum(lower, upper));
+        cv::Mat weights_1 = cv::Mat_<double>(1, lower.cols);
+
+        cv::Mat c1 = lower;//(cv::Mat_<double>(1,5) << 1,2,-3,4,-5);
+        cv::Mat c2 = upper;
+        // weights_1 = max(0, min(lower, upper))
+        cv::min(c1, c2, weights_1);
+        cv::max(weights_1, 0, weights_1);
+
+        for (int j = 0; j < n_fft_2; j++) {
+            /*
+            double da = lower(0,j);
+            double db = upper(0,j);
+            double dc = da>db?db:da;
+            if(dc<0)dc = 0;
+            weights(i, j) = dc;//weights_1.at<double_t>(0, j);
+            */
+            weights(i, j) = weights_1.at<double_t>(0, j);
+        }
+    }
+
+    // Slaney-style mel is scaled to be approx constant energy per channel
+    auto enorm = cv::Mat_<double>(1, n_mels);
+    for (int j = 0; j < n_mels; j++) {
+        enorm(0, j) = 2.0 / (mel_f(0, j + 2) - mel_f(0, j));
+    }
+    for (int j = 0; j < n_mels; j++) {
+        for (int k = 0; k < n_fft_2; k++) {
+            weights(j, k) *= enorm(0, j);
+        }
+    }
+    return weights;
+}
+
+//"""Short-time Fourier transform (STFT)""": defaults center=True, window='hann', pad_mode='reflect'
+// emphasis_data: 1 x N row of samples.
+// n_fft: FFT size; hop_length: samples between frames (0 -> win_length / 4);
+// win_length: window length (0 -> n_fft), must be smaller than n_fft.
+// Returns the magnitude spectrogram as an (n_fft/2 + 1) x frames matrix, or an
+// empty matrix when the hop/window configuration is not usable.
+// The window is cached in the file-level hannWindow and keyed on n_fft only.
+static cv::Mat_<double> MagnitudeSpectrogram(const cv::Mat_<float> *emphasis_data, int n_fft = 2048, int hop_length = 0, int win_length = 0) {
+    if (win_length == 0) {
+        win_length = n_fft;
+    }
+    if (hop_length == 0) {
+        hop_length = win_length / 4;
+    }
+    if (hop_length <= 0) {
+        // A non-positive hop would divide by zero or never advance the frame loop.
+        return cv::Mat_<double>(0, 0);
+    }
+
+    int pad_lenght = n_fft / 2;
+    cv::Mat_<float> cv_padbuffer;
+    cv::copyMakeBorder(*emphasis_data, cv_padbuffer, 0, 0, pad_lenght, pad_lenght, cv::BORDER_REFLECT_101);
+
+    if (hannWindow.empty() || hannWindow.cols != n_fft) {
+        if (n_fft <= win_length) {
+            // Unsupported configuration. Return before touching the cache so a
+            // failed call cannot leave an all-zero window behind for later calls.
+            return cv::Mat_<double>(0, 0);
+        }
+        // Build the window locally and publish it only once it is complete.
+        cv::Mat_<float> window(1, n_fft, 0.0f);
+        int insert_cnt = (n_fft - win_length) / 2;   // centers the window inside the FFT frame
+        for (int k = 1; k <= win_length; k++) {
+            window(0, k - 1 + insert_cnt) = float(0.5 * (1 - cos(2 * pi * k / (win_length + 1))));
+        }
+        hannWindow = window;
+    }
+    int size = cv_padbuffer.rows * cv_padbuffer.cols;//padbuffer.size()
+    int number_feature_vectors = (size - n_fft) / hop_length + 1;
+    int number_coefficients = n_fft / 2 + 1;
+    cv::Mat_<float> feature_vector(number_feature_vectors, number_coefficients, 0.0f);
+
+    audiofft::AudioFFT fft;
+    fft.init(size_t(n_fft));
+    for (int i = 0; i <= size - n_fft; i += hop_length) {
+        // Copy the frame so that windowing does not modify the padded signal.
+        cv::Mat_<float> framef = cv::Mat_<float>(1, n_fft, (float *) (cv_padbuffer.data) + i).clone();
+        framef = framef.mul(hannWindow);
+
+        cv::Mat_<float> Xrf(1, number_coefficients);
+        cv::Mat_<float> Xif(1, number_coefficients);
+        fft.fft((float *) (framef.data), (float *) (Xrf.data), (float *) (Xif.data));
+
+        // |X| = sqrt(re^2 + im^2), written straight into this frame's row.
+        cv::pow(Xrf, 2, Xrf);
+        cv::pow(Xif, 2, Xif);
+        cv::Mat_<float> cv_feature(1, number_coefficients, &(feature_vector[i / hop_length][0]));
+        cv::sqrt(Xrf + Xif, cv_feature);
+    }
+    cv::Mat_<float> cv_mag;
+    cv::transpose(feature_vector, cv_mag);
+    cv::Mat_<double> mag;
+    cv_mag.convertTo(mag, CV_64FC1);
+
+    return mag;
+}
+
+// Computes log-mel features of a mono float signal.
+// ifile_data: ifile_length input samples.
+// nSamples_per_sec: must equal nSamplesPerSec (16000).
+// ofile_data: caller-provided output buffer; receives (ifile_length / hop_length + 1)
+//   frames of number_filterbanks floats each, stored frame after frame.
+// Returns 0 on success, -1 on an unsupported sample rate, null buffers, an empty
+// input, or when no spectrogram could be computed.
+int log_mel(float* ifile_data, int ifile_length,int nSamples_per_sec,float* ofile_data) {
+    if (nSamples_per_sec != nSamplesPerSec) {
+        return -1;//cv::Mat_<double>(0, 0);
+    }
+    if (ifile_data == nullptr || ofile_data == nullptr || ifile_length < 1) {
+        // The matrix headers below would otherwise wrap a null pointer or get a negative width.
+        return -1;
+    }
+    // Pre-emphasis: y[0] = 0, y[n] = x[n] - preemphasis * x[n-1]. d1/d2 are
+    // headers over the caller's buffer shifted by one sample.
+    cv::Mat_<float> d1(1, ifile_length - 1, (float *) (ifile_data) + 1);
+    cv::Mat_<float> d2(1, ifile_length-1 , (float *) (ifile_data));
+
+    cv::Mat_<float> cv_emphasis_data;
+
+    cv::hconcat(cv::Mat_<float>::zeros(1, 1), d1 - d2 * preemphasis, cv_emphasis_data);
+    auto mag = MagnitudeSpectrogram(&cv_emphasis_data, length_DFT, hop_length, win_length);
+    if (mag.empty()) {
+        // An empty spectrogram cannot be multiplied with the mel basis.
+        return -1;
+    }
+    // Power spectrum.
+    auto magb = cv::abs(mag);
+    cv::pow(magb,2,mag);
+
+    // Build the mel filterbank once and reuse it afterwards.
+    if (mel_basis.empty()) {
+        mel_basis = mel_spectrogram_create(nSamplesPerSec, length_DFT, number_filterbanks);
+    }
+
+    // 10 * log10(mel + 1e-5) - ref_db
+    cv::Mat cv_mel = mel_basis * mag;
+    cv::log(cv_mel+ 1e-5, cv_mel);
+    cv_mel = cv_mel / 2.3025850929940459 * 10; // 2.3025850929940459=log(10)
+
+    cv_mel = cv_mel - ref_db;
+    cv::Mat cv_mel_r;//(cv_mel.cols,cv_mel.rows,CV_64FC1,ofile_data);
+    cv::transpose(cv_mel, cv_mel_r);
+    //cv::Mat rcv(cv_mel_r.cols,cv_mel_r.rows, CV_32FC1,ofile_data);
+    // rrr is a float header over the caller's buffer with the transposed shape,
+    // so the conversion writes directly into ofile_data.
+    cv::Mat rrr(cv_mel.cols,cv_mel.rows,CV_32FC1,ofile_data);
+    cv_mel_r.convertTo(rrr, CV_32FC1);
+
+    if (r == 1) {
+        // The original formula is:
+        // mel = mel[:len(mel) // hp.r * hp.r].reshape([len(mel) // hp.r, hp.r * hp.n_mels])
+        // With r = 1 it leaves every value unchanged.
+    } else {
+        //std::cout << R"(the "r" is not 1.)" << std::endl;
+    }
+    return 0;
+}
+
+/**--------------------------------- 以下是pcen运算方法 ---------------------------------**/
+
+// Partial port of scipy.signal.lfilter_zi(): normalizes the coefficients so
+// that a[0] == 1 and zero-pads the shorter of a/b to a common length. The
+// initial-state computation itself is not implemented; an empty matrix is
+// always returned.
+static cv::Mat_<double> cvlfilter_zi(cv::Mat_<double> b, cv::Mat_<double> a) {
+    if ((b.rows != 1) || (a.rows != 1)) {
+        //std::cout << "Numerator b and Denominator a must be 1-D." << std::endl;
+    }
+    if (a(0, 0) != 1) {
+        // Normalize the coefficients so a[0] == 1.
+        b = b / a(0, 0);
+        a = a / a(0, 0);
+    }
+    int len_a = a.cols * a.rows;
+    int len_b = b.cols * b.rows;
+    int n = len_a > len_b ? len_a : len_b;
+    // Pad with double zeros: hconcat needs all inputs to have the same element
+    // type, so float padding next to a double matrix would be rejected.
+    if (len_a < n) {
+        cv::hconcat(a, cv::Mat_<double>::zeros(1, n - len_a), a);
+    } else if (len_b < n) {
+        cv::hconcat(b, cv::Mat_<double>::zeros(1, n - len_b), b);
+    }
+    return cv::Mat_<double>(0, 0);
+}
+/*
+// scipy.signal.lfilter()
+// Filter data along one-dimension with an IIR or FIR filter.
+cv::Mat_<double> cvlfilter(cv::Mat_<double> &b, cv::Mat_<double> &a, cv::Mat_<double> &x,
+                           cv::Mat_<double> &zi, int axis = -1) {
+    if (a.rows * a.cols == 1) {
+        // This path only supports types fdgFDGO to mirror _linear_filter below.
+        // Any of b, a, x, or zi can set the dtype, but there is no default
+        // casting of other types; instead a NotImplementedError is raised.
+        // 后续如果需要，则进行补充
+    } else {
+        // return sigtools._linear_filter(b, a, x, axis, zi)
+        // sigtools._linear_filter()
+        // (y,Vf) = _linear_filter(b,a,X,Dim=-1,Vi=None)  implemented using Direct Form II transposed flow diagram.
+        // If Vi is not given, Vf is not returned.
+        ;
+    }
+}
+*/
+/*********************************************
+ * Name:    pcen
+ * Purpose: extracts PCEN features from the given audio data.
+ * Params:  @ifile_data        audio data; the bytes are reinterpreted as
+ *                             32-bit floats (size / 4 samples)
+ *          @nSamples_per_sec  sample rate, must equal nSamplesPerSec
+ * Returns: cv::Mat_<double>   feature matrix (frames x mel bands), or an empty
+ *                             matrix on invalid input
+*********************************************/
+static cv::Mat_<double> pcen(std::vector<uint8_t> &ifile_data, int nSamples_per_sec) {
+    //if (!(&ifile_data) || ifile_data.empty()) {
+    if (ifile_data.empty()) {
+        //std::cout << "error: invalid paramter: ifile_data" << std::endl;
+        return cv::Mat_<double>(0, 0);
+    }
+    if (nSamples_per_sec != nSamplesPerSec) {
+//        std::cout << R"(error: the "nSamples_per_sec" is not 16000.)" << std::endl;
+        return cv::Mat_<double>(0, 0);
+    }
+    // Header over the caller's bytes, viewed as a 1 x N row of floats (no copy).
+    int ifile_length = int(ifile_data.size() / 4);
+    cv::Mat_<float> cv_emphasis_data(1, ifile_length, (float *) (ifile_data.data()));
+//    std::cout<<ifile_length<<"====="<<cv_emphasis_data[0][960000-1]<<std::endl;
+    //getchar();
+
+    // magnitude spectrogram, scaled by 2^31
+    auto mag = MagnitudeSpectrogram(&cv_emphasis_data, length_DFT, hop_length, win_length);
+    mag = cv::abs(mag) * std::pow(2, 31);
+
+    // build the mel filterbank on first use (original note: about 3 ms)
+    if (mel_basis.empty()) {
+        mel_basis = mel_spectrogram_create(nSamplesPerSec, length_DFT, number_filterbanks);
+    }
+
+    // mel spectrogram
+    cv::Mat_<double> mel = mel_basis * mag;
+
+#if 1 
+    // The smoothing filter is created once and shared by all later calls.
+    if (!filter) {
+        filter = std::make_shared<IIR_I>();
+        double iir_b[1] = {0.05638943879134889};
+        double iir_a[2] = {1.0, -0.9436105612086512};
+        //filter.reset();
+        filter->setPara(iir_b, 1, iir_a, 2);
+    }
+    // Smooth every mel band (row) along time.
+    cv::Mat_<double> S_smooth = cv::Mat_<double>(mel.rows, mel.cols);
+    for (int i = 0; i < mel.rows; i++) {
+        filter->filter(mel[i], S_smooth[i], mel.cols);
+    }
+
+#endif
+    double gain = 0.98;
+    double bias = 2.0;
+    double power = 0.5;
+    double eps = 1e-6;
+    //python: smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
+    cv::Mat_<double> S_smooth_log1p;
+    cv::log(S_smooth / eps + 1, S_smooth_log1p);
+    cv::Mat_<double> smooth;
+    cv::exp((S_smooth_log1p + cv::log(eps)) * (-gain), smooth);
+    //python: S_out = (bias ** power) * np.expm1(power * np.log1p(ref * smooth / bias))
+    cv::Mat_<double> smooth_log1p;
+    cv::Mat_<double> smooth_log1p_exp;
+    cv::log(mel.mul(smooth) / bias + 1, smooth_log1p);
+    cv::exp(power * smooth_log1p, smooth_log1p_exp);
+    cv::Mat_<double> S_out = (smooth_log1p_exp - 1) * pow(bias, power);
+    // transpose
+    cv::Mat_<double> pcen;
+    cv::transpose(S_out, pcen);
+
+    return pcen;
+}
diff --git a/duix-sdk/src/main/cpp/dhmfcc/mfcc/AudioFFT.hpp b/duix-sdk/src/main/cpp/dhmfcc/mfcc/AudioFFT.hpp
new file mode 100644
index 0000000..ef1d6c4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/mfcc/AudioFFT.hpp
@@ -0,0 +1,120 @@
+#pragma once
+
+#ifndef _AUDIOFFT_H
+#define _AUDIOFFT_H
+
+
+
+#include <cstddef>
+#include <memory>
+#include <cassert>
+#include <cmath>
+#include <cstring>
+
+//#define AUDIOFFT_APPLE_ACCELERATE //AUDIOFFT_INTEL_IPP//AUDIOFFT_FFTW3//AUDIOFFT_APPLE_ACCELERATE
+
+#if defined(AUDIOFFT_INTEL_IPP)
+#define AUDIOFFT_INTEL_IPP_USED
+  #include <ipp.h>
+#elif defined(AUDIOFFT_APPLE_ACCELERATE)
+#define AUDIOFFT_APPLE_ACCELERATE_USED
+  #include <Accelerate/Accelerate.h>
+  #include <vector>
+#elif defined (AUDIOFFT_FFTW3)
+#define AUDIOFFT_FFTW3_USED
+  #include <fftw3.h>
+#else
+#if !defined(AUDIOFFT_OOURA)
+#define AUDIOFFT_OOURA
+#endif
+#define AUDIOFFT_OOURA_USED
+#include <vector>
+#endif
+
+namespace audiofft
+{
+
+    namespace detail
+    {
+        class AudioFFTImpl;
+    }
+
+    /**
+     * @class AudioFFT
+     * @brief Performs 1D FFTs
+     *
+     * The object is not copyable; the actual transform is delegated to a
+     * detail::AudioFFTImpl instance held in _impl.
+     */
+    class AudioFFT
+    {
+    public:
+        /**
+         * @brief Constructor
+         */
+        AudioFFT();
+
+        // Copying is disabled.
+        AudioFFT(const AudioFFT&) = delete;
+        AudioFFT& operator=(const AudioFFT&) = delete;
+
+        /**
+         * @brief Destructor
+         */
+        ~AudioFFT();
+
+        /**
+         * @brief Initializes the FFT object
+         * @param size Size of the real input (must be power 2)
+         */
+        void init(size_t size);
+
+        /**
+         * @brief Performs the forward FFT
+         * @param data The real input data (has to be of the length as specified in init())
+         * @param re The real part of the complex output (has to be of length as returned by ComplexSize())
+         * @param im The imaginary part of the complex output (has to be of length as returned by ComplexSize())
+         */
+        void fft(const float* data, float* re, float* im);
+
+        /**
+         * @brief Performs the inverse FFT
+         * @param data The real output data (has to be of the length as specified in init())
+         * @param re The real part of the complex input (has to be of length as returned by ComplexSize())
+         * @param im The imaginary part of the complex input (has to be of length as returned by ComplexSize())
+         */
+        void ifft(float* data, const float* re, const float* im);
+
+        /**
+         * @brief Calculates the necessary size of the real/imaginary complex arrays
+         * @param size The size of the real data
+         * @return The size of the real/imaginary complex arrays
+         */
+        static size_t ComplexSize(size_t size);
+
+    private:
+        // Backend implementation object.
+        std::unique_ptr<detail::AudioFFTImpl> _impl;
+    };
+
+
+    /**
+     * @deprecated
+     * @brief Let's keep an AudioFFTBase type around for now because it has been here already in the 1st version in order to avoid breaking existing code.
+     */
+    typedef AudioFFT AudioFFTBase;
+
+    namespace detail
+    {
+        // Abstract, non-copyable interface of an FFT backend; the three pure
+        // virtual methods have the same signatures as AudioFFT::init/fft/ifft.
+        class AudioFFTImpl
+        {
+        public:
+            AudioFFTImpl() = default;
+            AudioFFTImpl(const AudioFFTImpl&) = delete;
+            AudioFFTImpl& operator=(const AudioFFTImpl&) = delete;
+            virtual ~AudioFFTImpl() = default;
+            virtual void init(size_t size) = 0;
+            virtual void fft(const float* data, float* re, float* im) = 0;
+            virtual void ifft(float* data, const float* re, const float* im) = 0;
+        };
+    }
+
+} // End of namespace
+
+
+#endif // Header guard
diff --git a/duix-sdk/src/main/cpp/dhmfcc/mfcc/iir_filter.hpp b/duix-sdk/src/main/cpp/dhmfcc/mfcc/iir_filter.hpp
new file mode 100644
index 0000000..cb133e9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/mfcc/iir_filter.hpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#ifndef SERVICESUPERVISOR_IIR_FILTER_H
+#define SERVICESUPERVISOR_IIR_FILTER_H
+
+//E(t,f) is computed using a first-order in-finite impulse response (IIR) filter
+#define UES_IIR_I
+//#define UES_IIR_II
+
+#ifdef UES_IIR_I
+
+// Direct form I IIR filter. The object owns four heap buffers (coefficients
+// and input/output history) that are released in the destructor.
+class IIR_I
+{
+private:
+    double *m_pNum;     // numerator coefficients
+    double *m_pDen;     // denominator coefficients
+    double *m_px;       // input history
+    double *m_py;       // output history
+    int m_num_order;
+    int m_den_order;
+public:
+    IIR_I();
+    ~IIR_I();
+    // The buffers are owned through raw pointers, so a member-wise copy would
+    // make two objects delete[] the same memory. Copying is therefore disabled.
+    IIR_I(const IIR_I&) = delete;
+    IIR_I& operator=(const IIR_I&) = delete;
+    void reset();
+    void setPara(double num[], int num_order, double den[], int den_order);
+    void resp(double data_in[], int m, double data_out[], int n);
+    void filter(double data_in[], double data_out[], int len);
+};
+
+#endif
+
+#ifdef UES_IIR_II
+// Direct form II IIR filter. setPara() allocates three heap buffers which the
+// object owns.
+class IIR_II
+{
+public:
+    IIR_II();
+    // Releases the buffers allocated by setPara(); without this they would
+    // leak when the filter is destroyed.
+    ~IIR_II()
+    {
+        delete[] m_pNum;
+        delete[] m_pDen;
+        delete[] m_pW;
+    }
+    // Raw owning pointers: a member-wise copy would lead to a double delete[].
+    IIR_II(const IIR_II&) = delete;
+    IIR_II& operator=(const IIR_II&) = delete;
+    void reset();
+    void setPara(double num[], int num_order, double den[], int den_order);
+    void resp(double data_in[], int m, double data_out[], int n);
+    double filter(double data);
+    void filter(double data[], int len);
+    void filter(double data_in[], double data_out[], int len);
+protected:
+private:
+    double *m_pNum;     // numerator coefficients
+    double *m_pDen;     // denominator coefficients
+    double *m_pW;       // filter state w[n]
+    int m_num_order;
+    int m_den_order;
+    int m_N;            // length of the three buffers
+};
+
+// Frequency-response helper declared alongside IIR_II.
+// NOTE(review): this class uses std::complex but no <complex> include is visible
+// in this header, and no implementation of its methods is visible in
+// iir_filter.cpp -- confirm before enabling UES_IIR_II.
+class IIR_BODE
+{
+private:
+    double *m_pNum;
+    double *m_pDen;
+    int m_num_order;
+    int m_den_order;
+    std::complex<double> poly_val(double p[], int order, double omega);
+public:
+    IIR_BODE();
+    void setPara(double num[], int num_order, double den[], int den_order);
+    std::complex<double> bode(double omega);
+    void bode(double omega[], int n, std::complex<double> resp[]);
+};
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/dhmfcc/mfcc/mfcc.hpp b/duix-sdk/src/main/cpp/dhmfcc/mfcc/mfcc.hpp
new file mode 100644
index 0000000..93697a5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/mfcc/mfcc.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+//#include"../third/numcpp/NumCpp.hpp"
+//#include "sas_util.h"
+
+
+// Computes log-mel features of ifile_length float samples into ofile_data.
+// Returns 0 on success and -1 when nSamples_per_sec is not the supported rate (16000).
+int log_mel(float* ifile_data, int ifile_length,int nSamples_per_sec,float* ofile_data) ;
diff --git a/duix-sdk/src/main/cpp/dhmfcc/mfcc/sas_util.h b/duix-sdk/src/main/cpp/dhmfcc/mfcc/sas_util.h
new file mode 100644
index 0000000..ba23a83
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/mfcc/sas_util.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <string>
+#include <chrono>
+#include <vector>
+#include <assert.h>
+#include <memory>
+#include <fstream>
+#include "opencv2/core.hpp"
+
+//using namespace std;
+//using namespace std::chrono;
+
+// Base class for a named parameter with a help text and a string value.
+class parambase
+{
+public:
+    std::string name;
+    std::string help;
+    std::string strval;
+    parambase(){}
+    virtual  ~parambase(){}
+    // Default implementation ignores the value and reports success.
+    virtual bool set(const char* value) {return true;};
+};
+
+/**
+ * Engine-wide settings exposed as static members.
+ * NOTE(review): the original per-field comments were mis-encoded and are not
+ * recoverable verbatim; the notes below are inferred from the field names and
+ * the digits that survived, and should be confirmed.
+ */
+class EnginePar
+{
+public:
+    static int cs_timeout; // presumably a timeout for a "cs" session (surviving text suggests a default of 5) -- confirm
+    static int cs_detecthandsup_time; // presumably how long hand-raise detection runs during "cs" (surviving text: default 10s)
+    static int cs_detecthandsup_interval ; // presumably the hand-raise detection interval during "cs" -- confirm
+    static int cs_detectsmile_interval; // presumably the smile detection interval during "cs" -- confirm
+    static int cs_detectspeech_interval;// presumably the speech detection interval during "cs" (surviving text: default 20)
+    static int cs_detectpose_interval;  // presumably the pose detection interval during "cs" -- confirm
+    static int detectpose_interval;     // presumably the pose detection interval outside "cs" -- confirm
+    static int detectsmile_interval;    // presumably the smile detection interval outside "cs" -- confirm
+    static int detectappearance_interval; // presumably the appearance detection interval -- confirm
+    static float action_turnpen_thrd;   // presumably a threshold for a "turn pen" action -- confirm
+    static float action_turnchair_thrd; // presumably a threshold for a "turn chair" action -- confirm
+    static float action_record_time;    // presumably a recording duration for actions -- confirm
+    static float sit_supporthead_thrd;  // presumably a threshold for a "support head" posture -- confirm
+    static float sit_layondesk_thrd;    // presumably a threshold for a "lay on desk" posture -- confirm
+    static float sit_relyingonchair_thrd;// presumably a threshold for a "relying on chair" posture -- confirm
+    static std::string log_path;
+    static std::string log_level;
+    static std::string temp_path;
+	static bool set(const char* key, const char* val);
+	static bool haskey(const char* key);
+	static const char* getvalue(const char* key);
+};
+/**
+ * Scene in which a video is captured.
+ * NOTE(review): the original per-value comments were mis-encoded; the names
+ * are the only reliable description.
+ */
+enum VideoScene
+{
+    SCENE_counter,    // counter scene
+    SCENE_financial,   // financial scene
+    SCENE_lobby,       // lobby scene
+    SCENE_hall             // hall scene
+};
+/**
+ * Per-video settings.
+ * NOTE(review): the original field comments were mis-encoded; the notes below
+ * keep the values that survived and are otherwise inferred from the field
+ * names -- confirm.
+ */
+class VideoPar
+{
+private:
+    // Qualified with std:: because this header has no using-directive in effect.
+    std::vector<std::shared_ptr<parambase>> params;
+public:
+    VideoScene scene;            // scene the video belongs to (see VideoScene)
+    bool audio_enable ;          // presumably audio on/off (1 / 0)
+    int audio_channels ;         // audio channel count; surviving values: 0,1,2,4,6
+    int audio_sample_rate ;      // sample rate; surviving values: 44100, 48000, 96000, 192000
+    bool video_enable ;          // presumably video on/off (1 / 0)
+    //int video_analyse_rate ;   // disabled field; original comment not recoverable
+    bool video_sample_keyframe;  // presumably: sample key frames only
+    bool video_record;           // presumably video recording on/off (1 / 0)
+    int video_record_duration;   // presumably the recording duration (surviving text: default 10s)
+    int video_record_reviewtime; // presumably the look-back time of a recording (surviving text: default 5s)
+    int face_minsize;            // presumably the minimum face size
+    VideoPar();
+    //~VideoPar();
+    bool set(const char* key, const char* val);
+    static bool haskey(const char* key);
+};
+
+// Returns the current system_clock time since the epoch, counted in units of
+// the duration type T (e.g. std::chrono::milliseconds).
+template<class T>
+inline int64_t NowTime()
+{
+	const std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+	const auto sinceEpoch = std::chrono::time_point_cast<T>(now).time_since_epoch();
+	return sinceEpoch.count();
+}
+
+/**--------------------------------- Helper functions used by the models code (original comment was mis-encoded) ---------------------------------**/
+
+// Returns true when file_path can be opened for reading, false otherwise.
+inline bool detectFileExist(char *file_path) {
+    std::ifstream probe(file_path, std::ios::in);
+    const bool opened = !(!probe);
+    if (opened) {
+        probe.close();
+    }
+    return opened;
+}
+
+// Rotates the row-vector coordinates in xy by `angle` (radians): xy is
+// multiplied by the transpose of [[cos, -sin], [sin, cos]].
+inline cv::Mat_<double> rotate_point(cv::Mat_<double> xy, double angle) {
+    const double c = cos(angle);
+    const double s = sin(angle);
+    // Written directly in transposed form.
+    cv::Mat rotate_matrix = (cv::Mat_<double>(2, 2) << c, s, -s, c);
+    auto rotate_xy = xy * rotate_matrix;
+    return rotate_xy;
+}
+
+// Returns true only when the point lies strictly inside rect; points on the
+// border or outside yield false.
+inline bool check_point_in_rect(cv::Point point, cv::Rect rect) {
+    const bool insideX = rect.x < point.x && point.x < rect.x + rect.width;
+    const bool insideY = rect.y < point.y && point.y < rect.y + rect.height;
+    return insideX && insideY;
+}
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/wenetai.cpp b/duix-sdk/src/main/cpp/dhmfcc/wenetai.cpp
new file mode 100644
index 0000000..5ff3c56
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/wenetai.cpp
@@ -0,0 +1,119 @@
+#include "wenetai.h"
+// Sets up the tensor geometry for `melcnt` input frames and `bnfcnt` output
+// frames and allocates the scratch buffers that test() passes to dorun().
+//   melcnt - number of input frames (80 floats each)
+//   bnfcnt - number of output frames (256 floats each)
+//   trd    - thread-count hint, stored in n_trd for backends that use it
+WeAI::WeAI(int melcnt,int bnfcnt,int trd){
+  n_trd = trd;
+  dimin = melcnt;
+  dimout = bnfcnt;
+  sizein = melcnt*80*sizeof(float);
+  sizeout = bnfcnt*256*sizeof(float);
+  shapein[1] = melcnt;
+  shapeout[1] = bnfcnt;
+  buflen[0] = melcnt;
+
+  // calloc rather than malloc: test() runs inference straight on bufin, so the
+  // buffer must hold defined (zero) values instead of indeterminate heap data.
+  // The 1024 bytes of slack are kept from the original allocation size.
+  bufin = (float*)calloc(1, sizein+1024);
+  bufout = (float*)calloc(1, sizeout+1024);
+}
+
+// Releases the two scratch buffers allocated by the constructor
+// (in reverse order of allocation).
+WeAI::~WeAI(){
+  free(bufout);
+  free(bufin);
+}
+
+// Base-class inference hook: does no work and reports success (0).
+// Backend subclasses override it; none of the arguments are used here.
+int WeAI::dorun(float* /*mel*/,int /*melcnt*/,float* /*bnf*/,int /*bnfcnt*/){
+  return 0;
+}
+
+
+// Re-derives the tensor geometry for this call's frame counts, then delegates
+// to the backend's dorun().
+//   mel / melcnt - input buffer and its frame count (80 floats per frame)
+//   bnf / bnfcnt - output buffer and its frame count (256 floats per frame)
+// Returns whatever dorun() returns.
+int WeAI::run(float* mel,int melcnt,float* bnf,int bnfcnt){
+  // Input side.
+  dimin = melcnt;
+  shapein[1] = melcnt;
+  buflen[0] = melcnt;
+  sizein = melcnt*80*sizeof(float);
+
+  // Output side.
+  dimout = bnfcnt;
+  shapeout[1] = bnfcnt;
+  sizeout = bnfcnt*256*sizeof(float);
+
+  const int status = dorun(mel,melcnt,bnf,bnfcnt);
+  return status;
+}
+
+// Smoke test: runs the backend once on the internal scratch buffers with the
+// currently stored frame counts and returns dorun()'s result.
+int WeAI::test(){
+  float* const in = bufin;
+  float* const out = bufout;
+  return dorun(in,dimin,out,dimout);
+}
+
+// Runs the ONNX session once. The caller's `mel` and `bnf` buffers and the
+// member `buflen` are wrapped as tensors over their existing memory, and the
+// output tensor is handed to Run() so the result lands in `bnf`.
+// Sizes and shapes come from the members updated by WeAI::run(); melcnt and
+// bnfcnt are not read here. Always returns 0.
+int WeOnnx::dorun(float* mel,int melcnt,float* bnf,int bnfcnt){
+  const ONNXTensorElementDataType f32 = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  const ONNXTensorElementDataType i32 = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
+  Ort::Value inputs[2] = {
+    Ort::Value::CreateTensor( memoryInfo, mel ,sizein ,  shapein, 3 ,f32),      // names_in[0]
+    Ort::Value::CreateTensor( memoryInfo, buflen ,sizelen ,  shapelen, 1 ,i32)  // names_in[1]
+  };
+  Ort::Value outputs[1] = {
+    Ort::Value::CreateTensor( memoryInfo, bnf ,sizeout ,  shapeout, 3 ,f32)     // names_out[0]
+  };
+  session.Run(runOptions, names_in, inputs, 2, names_out, outputs, 1);
+  return 0;
+}
+
+// Builds the ONNX Runtime environment, session options, session and CPU
+// memory info used by dorun().
+//   modelfn - path of the ONNX model file to load
+//   mel/bnf - initial input/output frame counts, forwarded to WeAI
+//   trd     - forwarded to WeAI (stored in n_trd)
+// NOTE(review): the intra-op thread count is hard-coded to 2 below and n_trd
+// is not applied to the session — confirm this is intentional.
+WeOnnx::WeOnnx(std::string modelfn,int mel,int bnf,int trd):WeAI(mel,bnf,trd){
+  //
+  env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "wenet");
+  sessionOptions = Ort::SessionOptions();
+//  sessionOptions.SetIntraOpNumThreads(n_trd);
+    sessionOptions.SetIntraOpNumThreads(2);
+// todo jth add
+  //sessionOptions.SetIntraOpNumThreads(1);
+  //sessionOptions.SetInterOpNumThreads(1);
+  sessionOptions.AddConfigEntry("session.disable_prepacking", "1");
+  sessionOptions.SetGraphOptimizationLevel( GraphOptimizationLevel::ORT_ENABLE_ALL);
+  // The session is created after all options are set; env and sessionOptions
+  // are members, so they outlive this constructor.
+  session = Ort::Session(env, modelfn.c_str(), sessionOptions);
+  memoryInfo = Ort::MemoryInfo::CreateCpu( OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeCPU);
+  //Ort::MemoryInfo::CreateCpu( OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
+  //tensorin = Ort::Value::CreateTensor( memoryInfo, bufin ,sizein ,  shapein, 3 ,ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+  //tensorlen = Ort::Value::CreateTensor( memoryInfo, buflen ,sizelen ,  shapelen, 1 ,ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32);
+  //tensorout = Ort::Value::CreateTensor( memoryInfo, bufout ,sizeout ,  shapeout, 3 ,ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
+}
+
+// Nothing to release by hand: the Ort members clean up in their own destructors.
+WeOnnx::~WeOnnx() = default;
+
+
+#ifdef WENETOPENV
+// Runs one OpenVINO inference: binds the caller's `mel` buffer as input 0,
+// binput_data as input 1 and the caller's `bnf` buffer as output 0, so the
+// result is written into `bnf`. Tensor shapes are the ones set in the
+// constructor; melcnt and bnfcnt are not read here. Always returns 0.
+int WeOpvn::dorun(float* mel,int melcnt,float* bnf,int bnfcnt){
+	// sizeout is int64_t, so print it as long long; the previous "%d" was a
+	// format/argument type mismatch.
+	printf("====opvn run %lld \n",(long long)sizeout);
+	std::cout<<ainput_shape<<std::endl;
+	std::cout<<aoutput_shape<<std::endl;
+  ov::Tensor ainput_tensor = ov::Tensor(ainput_type, ainput_shape, mel);
+  ov::Tensor binput_tensor = ov::Tensor(binput_type, binput_shape, binput_data);
+  ov::Tensor aoutput_tensor = ov::Tensor(aoutput_type, aoutput_shape, bnf);
+  infer_request.set_input_tensor(0,ainput_tensor);
+  infer_request.set_input_tensor(1,binput_tensor);
+  infer_request.set_output_tensor(0,aoutput_tensor);
+  infer_request.infer();
+  return 0;
+}
+
+// Reads the OpenVINO model from xmlfn / modelfn, fixes the element types and
+// shapes of both inputs for `mel` frames, compiles the model for the CPU with
+// n_trd inference threads and creates the infer request used by dorun().
+//   mel/bnf/trd are also forwarded to the WeAI base constructor.
+WeOpvn::WeOpvn(std::string modelfn,std::string xmlfn,int mel,int bnf,int trd):WeAI(mel,bnf,trd){
+  std::shared_ptr<ov::Model> net = core.read_model(xmlfn,modelfn);
+  ov::preprocess::PrePostProcessor prep(net);
+
+  ov::preprocess::InputInfo& speechInfo = prep.input(aname);
+  ov::preprocess::InputInfo& lengthInfo = prep.input(bname);
+
+  // Frame counts for this instance.
+  ainput_shape[1] = mel;
+  aoutput_shape[1] = bnf;
+  binput_data[0] = mel;
+
+  speechInfo.tensor().set_element_type(ainput_type).set_shape(ainput_shape);
+  lengthInfo.tensor().set_element_type(binput_type).set_shape(binput_shape);
+  speechInfo.preprocess();
+  lengthInfo.preprocess();
+
+  ov::preprocess::OutputInfo& encoderOut = prep.output(cname);
+  encoderOut.tensor().set_element_type(aoutput_type);
+
+  net = prep.build();
+  std::string device = "CPU";
+  ov::CompiledModel compiled = core.compile_model(net, device,
+      ov::inference_num_threads(int(n_trd)) );
+
+  infer_request = compiled.create_infer_request();
+}
+
+// Nothing to release by hand: the OpenVINO members clean up in their own destructors.
+WeOpvn::~WeOpvn() = default;
+#endif
diff --git a/duix-sdk/src/main/cpp/dhmfcc/wenetai.h b/duix-sdk/src/main/cpp/dhmfcc/wenetai.h
new file mode 100644
index 0000000..20b55a0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/wenetai.h
@@ -0,0 +1,92 @@
+#pragma once
+#include <stdio.h>
+#include <string>
+#include <vector>
+#include <stdlib.h>
+
+
+// Base class of the wenet encoder runners. It stores the tensor geometry
+// shared by the backends plus two heap scratch buffers; backends override
+// dorun() to perform the actual inference.
+class WeAI{
+  protected:
+    int n_trd = 4;                            // thread-count hint from the constructor
+    int dimin = 321;                          // input frame count
+    int dimout = 78;                          // output frame count
+    int dimlen = 1;
+    int64_t sizein = 321*80*sizeof(float);    // input size in bytes (frames * 80 floats)
+    int64_t sizeout = 78*256*sizeof(float);   // output size in bytes (frames * 256 floats)
+    int64_t sizelen = sizeof(int32_t);        // size in bytes of buflen
+    float* bufin = NULL;                      // scratch input, owned (freed in ~WeAI)
+    float* bufout = NULL;                     // scratch output, owned (freed in ~WeAI)
+    int32_t buflen[1];                        // input frame count passed as a tensor
+    int64_t shapein[3]={1,321,80};
+    int64_t shapelen[1]={1};
+    int64_t shapeout[3]={1,78,256};
+    const char* names_in[2]={"speech","speech_lengths"};
+    const char* names_out[1]={"encoder_out"};
+
+    // Backend hook; the base implementation does nothing and returns 0.
+    virtual int dorun(float* mel,int melcnt,float* bnf,int bnfcnt);
+  public:
+    WeAI(int melcnt,int bnfcnt,int trd=4);
+    // The object owns bufin/bufout and frees them in its destructor, so an
+    // implicit copy would free the same pointers twice. Copying is disabled.
+    WeAI(const WeAI&) = delete;
+    WeAI& operator=(const WeAI&) = delete;
+    // Updates the geometry for melcnt/bnfcnt and calls dorun().
+    int run(float* mel,int melcnt,float* bnf,int bnfcnt);
+    // Calls dorun() on the internal scratch buffers.
+    int test();
+    virtual ~WeAI();
+};
+
+
+#define WENETONNX  1
+#ifdef WENETONNX
+#include "onnxruntime_cxx_api.h"
+// ONNX Runtime backend for WeAI. All Ort handles start out empty ({nullptr})
+// and are assigned in the constructor.
+class WeOnnx:public WeAI{
+  protected:
+
+    //Ort::Value tensorin{nullptr};
+    //Ort::Value tensorlen{nullptr};
+    //Ort::Value tensorout{nullptr};
+
+    // Members are destroyed in reverse declaration order, so `session` is
+    // released before `env`. Keep this order.
+    Ort::Env env{nullptr};
+    Ort::SessionOptions sessionOptions{nullptr};
+    Ort::RunOptions runOptions;
+    Ort::Session session{nullptr};
+    Ort::MemoryInfo memoryInfo{nullptr};
+  protected:
+    // Runs the session on the caller-provided buffers; returns 0.
+    virtual int dorun(float* mel,int melcnt,float* bnf,int bnfcnt);
+  public:
+    // modelfn: path of the ONNX model; mel/bnf/trd are forwarded to WeAI.
+    WeOnnx(std::string modelfn,int mel,int bnf,int trd);
+    virtual ~WeOnnx();
+};
+#endif
+
+#ifdef WENETMNN
+// Placeholder for an MNN backend; compiled only when WENETMNN is defined and
+// adds nothing to WeAI yet.
+class WeMnn : public WeAI {};
+#endif
+
+
+//#define WENETOPENV 
+#ifdef WENETOPENV
+#include "openvino/openvino.hpp"
+// OpenVINO backend for WeAI; compiled only when WENETOPENV is defined.
+class WeOpvn:public WeAI{
+  private:
+    // Element types of the two inputs and the output.
+    ov::element::Type ainput_type = ov::element::f32;
+    ov::element::Type binput_type = ov::element::i32;
+    ov::element::Type aoutput_type = ov::element::f32;
+
+    // Default shapes; index 1 of the a-shapes is overwritten in the constructor.
+    // NOTE(review): the default output frame count here is 79 while WeAI
+    // defaults to 78 — confirm which is intended.
+    ov::Shape ainput_shape = {1, 321,80};
+    ov::Shape binput_shape = {1};
+    ov::Shape aoutput_shape = {1, 79,256};
+
+
+    // Backing storage of the second input tensor (set to `mel` in the constructor).
+    int32_t  binput_data[1];
+
+    ov::Core core;
+    ov::InferRequest infer_request ;
+    // Names used to look up the model's inputs and output.
+    std::string aname = "speech";
+    std::string bname = "speech_lengths";
+    std::string cname = "encoder_out";
+  protected:
+    // Runs one inference on the caller-provided buffers; returns 0.
+    virtual int dorun(float* mel,int melcnt,float* bnf,int bnfcnt);
+  public:
+    // modelfn/xmlfn: the two model files handed to ov::Core::read_model;
+    // mel/bnf/trd are forwarded to WeAI.
+    WeOpvn(std::string modelfn,std::string xmlfn,int mel,int bnf,int trd);
+    virtual ~WeOpvn();
+};
+#endif
+
diff --git a/duix-sdk/src/main/cpp/dhmfcc/wenetov.cpp b/duix-sdk/src/main/cpp/dhmfcc/wenetov.cpp
new file mode 100644
index 0000000..2fd97f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhmfcc/wenetov.cpp
@@ -0,0 +1,134 @@
+// Copyright (C) 2018-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <iterator>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <sys/timeb.h>
+#include <unistd.h>
+#include <time.h>
+
+// clang-format off
+#include "openvino/openvino.hpp"
+#include "openvino/core/preprocess/input_info.hpp"
+
+// Returns the CLOCK_MONOTONIC time in milliseconds, or 0 if the clock cannot
+// be read. Only differences between two return values are meaningful.
+uint64_t jtimer_msstamp(){
+  struct timespec ts;
+  if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
+    return 0;
+  }
+  // 1 ms = 1,000,000 ns. The previous code divided by CLOCKS_PER_SEC, which
+  // only happens to equal 1,000,000 on POSIX and is unrelated to tv_nsec.
+  // The arithmetic is done in uint64_t so it cannot overflow a 32-bit long.
+  return (uint64_t)ts.tv_sec*1000u + (uint64_t)ts.tv_nsec/1000000u;
+}
+
+// clang-format on
+
+/**
+ * @brief Stand-alone benchmark: loads wenet.xml / wenet.bin, runs the model
+ *        10000 times on an all-zero 1x321x80 input while printing the latency
+ *        of each run in milliseconds, then prints the first 10 output values.
+ *        Command-line arguments are ignored.
+ */
+int main(int argc, char* argv[]) {
+
+        const std::string amodel_path = "wenet.xml";
+        const std::string bmodel_path = "wenet.bin";
+
+        // -------- Step 1. Initialize OpenVINO Runtime Core --------
+        ov::Core core;
+
+        // -------- Step 2. Read a model --------
+        printf("===aaa\n");
+        std::shared_ptr<ov::Model> model = core.read_model(amodel_path,bmodel_path);
+        printf("===bbb\n");
+
+        // The model is expected to have two inputs (speech, speech_lengths)
+        // and a single output.
+        OPENVINO_ASSERT(model->inputs().size() == 2, "Sample supports models with 2 inputs only");
+        OPENVINO_ASSERT(model->outputs().size() == 1, "Sample supports models with 1 output only");
+
+        // -------- Step 3. Set up input
+        // std::vector owns the input storage: it is zero-initialised, sized
+        // for the element type and released automatically. The tensors below
+        // only wrap this memory, so the vectors are declared first and
+        // therefore outlive them.
+        ov::element::Type ainput_type = ov::element::f32;
+        ov::Shape ainput_shape = {1, 321,80};
+        std::vector<float> ainput_data(321*80, 0.0f);
+        ov::element::Type binput_type = ov::element::i32;
+        ov::Shape binput_shape = {1};
+        std::vector<int32_t> binput_data(1, 321);
+
+        // Wrap the existing buffers in ov::Tensor without allocating new memory.
+        ov::Tensor ainput_tensor = ov::Tensor(ainput_type, ainput_shape, ainput_data.data());
+        ov::Tensor binput_tensor = ov::Tensor(binput_type, binput_shape, binput_data.data());
+
+        // -------- Step 4. Configure preprocessing --------
+        ov::preprocess::PrePostProcessor ppp(model);
+
+        // Fix shape and element type of both inputs, and the output precision.
+        std::string aname = "speech";
+        ov::preprocess::InputInfo& ainfo = ppp.input(aname);
+        ainfo.tensor().set_shape(ainput_shape).set_element_type(ainput_type);
+        std::string bname = "speech_lengths";
+        ov::preprocess::InputInfo& binfo = ppp.input(bname);
+        binfo.tensor().set_shape(binput_shape).set_element_type(binput_type);
+        ainfo.preprocess();
+        binfo.preprocess();
+        ppp.output().tensor().set_element_type(ov::element::f32);
+
+        // Apply the preprocessing configuration to the model.
+        model = ppp.build();
+
+        std::string device_name = "CPU";
+        // -------- Step 5. Loading a model to the device --------
+        ov::CompiledModel compiled_model = core.compile_model(model, device_name,
+          ov::inference_num_threads(int(4))
+        );
+
+        // -------- Step 6. Create an infer request --------
+        ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+        // -------- Step 7. Prepare input --------
+        infer_request.set_input_tensor(0,ainput_tensor);
+        infer_request.set_input_tensor(1,binput_tensor);
+
+        // -------- Step 8. Do inference synchronously --------
+        for(int k=0;k<10000;k++){
+          uint64_t tick = jtimer_msstamp();
+          infer_request.infer();
+          int dist = (int)(jtimer_msstamp()-tick);
+          printf("===dist %d\n",dist);
+          usleep(1000);
+        }
+
+        // -------- Step 9. Process output
+        const ov::Tensor& output_tensor = infer_request.get_output_tensor();
+        const float* data = output_tensor.data<const float>();
+        for(int k=0;k<10;k++){
+          printf("===%f \n",data[k]);
+        }
+
+    return EXIT_SUCCESS;
+}
diff --git a/duix-sdk/src/main/cpp/dhunet/blendgram.cpp b/duix-sdk/src/main/cpp/dhunet/blendgram.cpp
new file mode 100644
index 0000000..f0799c3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/blendgram.cpp
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "blendgram.h"
+
+
+  // One plain function per blend mode. Each wrapper has the BlendFunc
+  // signature (T, A, B) and forwards to ColorBlend_Buffer(T,A,B,<Mode>), so
+  // the modes can be stored in the blendfuncs dispatch table.
+  // NOTE(review): ColorBlend_Buffer is not defined in this file — presumably
+  // it comes from blendgram.h; T looks like the destination pixel and A/B the
+  // two source pixels — confirm against the macro definition.
+  void  exColorBlend_Normal(uint8* T,uint8* A,uint8* B){ ColorBlend_Buffer(T,A,B,Normal); }
+  void  exColorBlend_Lighten(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,Lighten);}
+  void  exColorBlend_Darken(uint8* T,uint8* A,uint8* B)        { ColorBlend_Buffer(T,A,B,Darken);}
+  void  exColorBlend_Multiply(uint8* T,uint8* A,uint8* B)      { ColorBlend_Buffer(T,A,B,Multiply);}
+  void  exColorBlend_Average(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,Average);}
+  void  exColorBlend_Add(uint8* T,uint8* A,uint8* B)           { ColorBlend_Buffer(T,A,B,Add);}
+
+  void  exColorBlend_Subtract(uint8* T,uint8* A,uint8* B)      { ColorBlend_Buffer(T,A,B,Subtract);}
+  void  exColorBlend_Difference(uint8* T,uint8* A,uint8* B)    { ColorBlend_Buffer(T,A,B,Difference);}
+  void  exColorBlend_Negation(uint8* T,uint8* A,uint8* B)      { ColorBlend_Buffer(T,A,B,Negation);}
+  void  exColorBlend_Screen(uint8* T,uint8* A,uint8* B)        { ColorBlend_Buffer(T,A,B,Screen);}
+  void  exColorBlend_Exclusion(uint8* T,uint8* A,uint8* B)     { ColorBlend_Buffer(T,A,B,Exclusion);}
+
+  void  exColorBlend_Overlay(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,Overlay);}
+  void  exColorBlend_SoftLight(uint8* T,uint8* A,uint8* B)     { ColorBlend_Buffer(T,A,B,SoftLight);}
+  void  exColorBlend_HardLight(uint8* T,uint8* A,uint8* B)     { ColorBlend_Buffer(T,A,B,HardLight);}
+  void  exColorBlend_ColorDodge(uint8* T,uint8* A,uint8* B)    { ColorBlend_Buffer(T,A,B,ColorDodge);}
+  void  exColorBlend_ColorBurn(uint8* T,uint8* A,uint8* B)     { ColorBlend_Buffer(T,A,B,ColorBurn);}
+
+  void  exColorBlend_LinearDodge(uint8* T,uint8* A,uint8* B)   { ColorBlend_Buffer(T,A,B,LinearDodge);}
+  void  exColorBlend_LinearBurn(uint8* T,uint8* A,uint8* B)    { ColorBlend_Buffer(T,A,B,LinearBurn);}
+  void  exColorBlend_LinearLight(uint8* T,uint8* A,uint8* B)   { ColorBlend_Buffer(T,A,B,LinearLight);}
+  void  exColorBlend_VividLight(uint8* T,uint8* A,uint8* B)    { ColorBlend_Buffer(T,A,B,VividLight);}
+  void  exColorBlend_PinLight(uint8* T,uint8* A,uint8* B)      { ColorBlend_Buffer(T,A,B,PinLight);}
+
+  void  exColorBlend_HardMix(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,HardMix);}
+  void  exColorBlend_Reflect(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,Reflect);}
+  void  exColorBlend_Glow(uint8* T,uint8* A,uint8* B)          { ColorBlend_Buffer(T,A,B,Glow);}
+  void  exColorBlend_Phoenix(uint8* T,uint8* A,uint8* B)       { ColorBlend_Buffer(T,A,B,Phoenix);}
+
+// Signature shared by all exColorBlend_* wrappers.
+typedef void (*BlendFunc) (uint8* T,uint8* A,uint8* B);
+// Number of entries in blendfuncs.
+static int MAX_FUNC = 25;
+// Dispatch table of the blend-mode wrappers, used by BlendGramSimp.
+// The trailing comment on each row is the entry's array index.
+static BlendFunc blendfuncs[25]={
+  exColorBlend_Normal,       //  0
+  exColorBlend_Lighten,      //  1
+  exColorBlend_Darken,       //  2
+  exColorBlend_Multiply,     //  3
+  exColorBlend_Average,      //  4
+  exColorBlend_Add,          //  5
+
+  exColorBlend_Subtract,     //  6
+  exColorBlend_Difference,   //  7
+  exColorBlend_Negation,     //  8
+  exColorBlend_Screen,       //  9
+  exColorBlend_Exclusion,    // 10
+
+  exColorBlend_Overlay,      // 11
+  exColorBlend_SoftLight,    // 12
+  exColorBlend_HardLight,    // 13
+  exColorBlend_ColorDodge,   // 14
+  exColorBlend_ColorBurn,    // 15
+
+  exColorBlend_LinearDodge,  // 16
+  exColorBlend_LinearBurn,   // 17
+  exColorBlend_LinearLight,  // 18
+  exColorBlend_VividLight,   // 19
+  exColorBlend_PinLight,     // 20
+
+  exColorBlend_HardMix,      // 21
+  exColorBlend_Reflect,      // 22
+  exColorBlend_Glow,         // 23
+  exColorBlend_Phoenix       // 24
+};
+
+// Applies one blend mode to every pixel of a Width x Height image stored with
+// 4 bytes per pixel and tightly packed rows: for each pixel the selected
+// function is called as func(Dest, Src, Mask).
+//   Mode - blend mode in the 1-based numbering of the ConstBlend_* constants
+//          in blendgram.h (ConstBlend_Normal = 1 ... ConstBlend_Phoenix = 25).
+//          Any value outside [1, MAX_FUNC] leaves Dest untouched.
+void BlendGramSimp(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height, int Mode)
+{
+	if(Mode<1)return;
+	if(Mode>MAX_FUNC)return;
+	// blendfuncs is 0-based (entry 0 is Normal) while Mode is 1-based. Indexing
+	// with Mode directly selected the *next* mode (Normal ran Lighten) and made
+	// the last mode, Phoenix (25), unreachable.
+	BlendFunc func=blendfuncs[Mode-1];
+	unsigned char *LinePS, *LinePD,*LinePM;
+	for (int Y = 0; Y < Height; Y += 1)
+	{
+		LinePS = Src + Y * Width * 4;
+		LinePM = Mask + Y * Width * 4;
+		LinePD = Dest + Y * Width * 4;
+		for (int X = 0; X < Width; X += 1)
+		{
+			func(LinePD,LinePS,LinePM);
+			LinePS += 4;
+			LinePM += 4;
+			LinePD += 4;
+		}
+	}
+}
+
+// Per-pixel alpha blend of two tightly packed 3-byte-per-pixel images. The
+// mask is also 3 bytes per pixel, but only its first byte m is used:
+//   Dest[c] = clamp(Dest[c] * (m/255) + Src[c] * (1 - m/255))   for c = 0..2
+void BlendGramAlpha3(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height)
+{
+	for (int row = 0; row < Height; row++)
+	{
+		unsigned char *srcPx = Src + row * Width * 3;
+		unsigned char *maskPx = Mask + row * Width * 3;
+		unsigned char *dstPx = Dest + row * Width * 3;
+		for (int col = 0; col < Width; col++, srcPx += 3, maskPx += 3, dstPx += 3)
+		{
+			const float keep = *maskPx/255.0f;  // weight of the existing Dest value
+			const float take = 1.0f-keep;       // weight of the Src value
+			for (int ch = 0; ch < 3; ch++)
+			{
+				dstPx[ch] = CLAMPCOLOR(dstPx[ch]*keep+srcPx[ch]*take);
+			}
+		}
+	}
+}
+
+// Blends Src into Dest pixel by pixel through ColorBlend_Alpha, using a
+// 1-byte-per-pixel mask. Src and Dest are 3 bytes per pixel; all three
+// buffers are tightly packed, Width x Height.
+// For each pixel: ColorBlend_Alpha(Dest, Dest, Src, mask).
+// NOTE(review): presumably this yields Dest*(m/255) + Src*(1 - m/255), as in
+// BlendGramAlpha3 — confirm against the ColorBlend_Alpha macro.
+void BlendGramAlpha(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height)
+{
+	for (int row = 0; row < Height; row++)
+	{
+		unsigned char *srcPx = Src + row * Width * 3;
+		unsigned char *maskPx = Mask + row * Width * 1;
+		unsigned char *dstPx = Dest + row * Width * 3;
+		for (int col = 0; col < Width; col++)
+		{
+			ColorBlend_Alpha(dstPx,dstPx,srcPx,*maskPx);
+			srcPx += 3;
+			maskPx += 1;
+			dstPx += 3;
+		}
+	}
+}
+
+// Same traversal as BlendGramAlpha, but with the two colour operands swapped.
+// For each pixel: ColorBlend_Alpha(Dest, Src, Dest, mask).
+// Src and Dest are 3 bytes per pixel, Mask is 1 byte per pixel; all buffers
+// are tightly packed, Width x Height.
+void BlendGramAlphaRev(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height)
+{
+	for (int row = 0; row < Height; row++)
+	{
+		unsigned char *srcPx = Src + row * Width * 3;
+		unsigned char *maskPx = Mask + row * Width * 1;
+		unsigned char *dstPx = Dest + row * Width * 3;
+		for (int col = 0; col < Width; col++)
+		{
+			ColorBlend_Alpha(dstPx,srcPx,dstPx,*maskPx);
+			srcPx += 3;
+			maskPx += 1;
+			dstPx += 3;
+		}
+	}
+}
+
+
+
+
+/*
+void BlendGram(CBitmap* image,CBitmap* mask,int mode)
+{
+	if(mode<1)return;
+	if(mode>=MAX_FUNC)return;
+	BlendFunc func=blendfuncs[mode];
+	int Stride=image->width*4;
+		unsigned char *LinePS, *LinePD,*LinePM;
+	for (int Y = 0; Y < image->height; Y += 1)
+	{
+		LinePS = (unsigned char*)image->pixels +image->stride*Y;
+		LinePM = (unsigned char*)mask->pixels + mask->stride*Y;
+		LinePD = (unsigned char*)image->pixels +image->stride*Y;
+		for (int X = 0; X < image->width; X += 1)
+		{
+			func(LinePD,LinePS,LinePM);
+			LinePS += 4;
+			LinePM += 4;
+			LinePD += 4;
+		}
+	}
+}
+
+void BlendImageAdjustWithMask(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int mode)
+{
+	unsigned char* bmppixels=(unsigned char*)bmp->pixels;
+	unsigned char* mskpixels=(unsigned char*)msk->pixels;
+	unsigned char* dstpixels=(unsigned char*)dst->pixels;
+	unsigned char* adjpixels=(unsigned char*)adj->pixels;
+	int stride=bmp->stride;
+	int width=bmp->width;
+	int height=bmp->height;
+	int X,Y;
+	unsigned char* LinePS , * LinePM , * LinePD , * LinePA ;
+	#pragma omp parallel for private(LinePS,LinePM,LinePD,LinePA,X,Y)
+	for (Y = 0; Y < height; Y ++)
+	{
+		int offset=stride*Y;
+		LinePS = bmppixels +offset;
+		LinePM = mskpixels +offset;
+		LinePD = dstpixels +offset;
+		LinePA = adjpixels +offset;
+		for (X = 0; X < width; X ++)
+		{
+			unsigned char M=*LinePM;
+			if(M==0xFF){
+				LinePD[0]=LinePS[0];
+				LinePD[1]=LinePS[1];
+				LinePD[2]=LinePS[2];
+			}else if(M==0x00){
+				LinePD[0]=LinePA[0];
+				LinePD[1]=LinePA[1];
+				LinePD[2]=LinePA[2];
+			}else{
+				ColorBlend_Alpha(LinePD,LinePS,LinePA,M);
+			}
+			LinePD[3]=LinePS[3];
+			LinePS += 4; LinePM += 4; LinePD += 4; LinePA += 4;
+		}
+	}
+}
+
+
+void BlendImageAdjustWithMaskEx(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int mode)
+{
+	unsigned char* bmppixels=(unsigned char*)bmp->pixels;
+	unsigned char* mskpixels=(unsigned char*)msk->pixels;
+	unsigned char* dstpixels=(unsigned char*)dst->pixels;
+	unsigned char* adjpixels=(unsigned char*)adj->pixels;
+	int stride=bmp->stride;
+	int width=bmp->width;
+	int height=bmp->height;
+	int X,Y;
+	unsigned char* LinePS , * LinePM , * LinePD , * LinePA ;
+	#pragma omp parallel for private(LinePS,LinePM,LinePD,LinePA,X,Y)
+	for (Y = 0; Y < height; Y ++)
+	{
+		int offset=stride*Y;
+		LinePS = bmppixels +offset;
+		LinePM = mskpixels +offset;
+		LinePD = dstpixels +offset;
+		LinePA = adjpixels +offset;
+		for (X = 0; X < width; X ++)
+		{
+			unsigned char M=*LinePM;
+			if(M==0xFF){
+				LinePD[0]=LinePS[0];
+				LinePD[1]=LinePS[1];
+				LinePD[2]=LinePS[2];
+			}else if(M==0x00){
+				LinePD[0]=LinePA[0];
+				LinePD[1]=LinePA[1];
+				LinePD[2]=LinePA[2];
+			}else{
+				//ColorBlend_Alpha(LinePD,LinePS,LinePA,M);
+				LinePD[0]=LinePS[0]*M>>8;
+				LinePD[1]=LinePS[1]*M>>8;
+				LinePD[2]=LinePS[2]*M>>8;
+			}
+			LinePD[3]=M;
+			LinePS += 4; LinePM += 4; LinePD += 4; LinePA += 4;
+		}
+	}
+}
+
+
+
+
+void BlendImageAdjustWithAlpha(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,int alpha,int mode){
+	unsigned char* bmppixels=(unsigned char*)bmp->pixels;
+	unsigned char* dstpixels=(unsigned char*)dst->pixels;
+	unsigned char* adjpixels=(unsigned char*)adj->pixels;
+	int stride=bmp->stride;
+	int width=bmp->width;
+	int height=bmp->height;
+	int X,Y;
+	unsigned char M=CLAMPCOLOR(alpha);
+	unsigned char *LinePS ,  *LinePD , *LinePA ;
+	#pragma omp parallel for private(LinePS,LinePD,LinePA,X,Y)
+	for (Y = 0; Y < height; Y ++)
+	{
+		int offset=stride*Y;
+		LinePS = bmppixels +offset;
+		LinePD = dstpixels +offset;
+		LinePA = adjpixels +offset;
+		for (X = 0; X < width; X ++)
+		{
+			if(M==0xFF){
+				LinePD[0]=LinePS[0];
+				LinePD[1]=LinePS[1];
+				LinePD[2]=LinePS[2];
+			}else if(M==0x00){
+				LinePD[0]=LinePA[0];
+				LinePD[1]=LinePA[1];
+				LinePD[2]=LinePA[2];
+			}else{
+				ColorBlend_Alpha(LinePD,LinePS,LinePA,M);
+			}
+			LinePD[3]=LinePS[3];
+			LinePS += 4;  LinePD += 4; LinePA += 4;
+		}
+	}
+}
+
+void BlendImageAdjustWithAlphaMask(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int alpha,int mode){
+	unsigned char* bmppixels=(unsigned char*)bmp->pixels;
+	unsigned char* mskpixels=(unsigned char*)msk->pixels;
+	unsigned char* dstpixels=(unsigned char*)dst->pixels;
+	unsigned char* adjpixels=(unsigned char*)adj->pixels;
+	int stride=bmp->stride;
+	int width=bmp->width;
+	int height=bmp->height;
+	int X,Y;
+	unsigned char NM=CLAMPCOLOR(alpha);
+	unsigned char *LinePS , *LinePM , *LinePD , *LinePA ;
+	#pragma omp parallel for private(LinePS,LinePM,LinePD,LinePA,X,Y)
+	for (Y = 0; Y < height; Y ++)
+	{
+		int offset=stride*Y;
+		LinePS = bmppixels +offset;
+		LinePM = mskpixels +offset;
+		LinePD = dstpixels +offset;
+		LinePA = adjpixels +offset;
+		for (X = 0; X < width; X ++)
+		{
+			unsigned char M=*LinePM;
+			if(M==0xFF){
+				LinePD[0]=LinePS[0];
+				LinePD[1]=LinePS[1];
+				LinePD[2]=LinePS[2];
+			}else if(M==0x00){
+				if(NM==0xFF){
+					LinePD[0]=LinePS[0];
+					LinePD[1]=LinePS[1];
+					LinePD[2]=LinePS[2];
+				}else {
+					if(NM==0x00){
+					//none
+						LinePD[0]=LinePA[0];
+						LinePD[1]=LinePA[1];
+						LinePD[2]=LinePA[2];
+					}else{
+						ColorBlend_Alpha(LinePD,LinePS,LinePA,NM);
+					}
+				}
+			}else{
+				//
+				if(NM==0xFF){
+					LinePD[0]=LinePS[0];
+					LinePD[1]=LinePS[1];
+					LinePD[2]=LinePS[2];
+				}else{
+					if(NM==0x00){
+						ColorBlend_Alpha(LinePD,LinePS,LinePA,M);
+					}else{
+						ColorBlend_Alpha(LinePA,LinePS,LinePA,NM);
+						ColorBlend_Alpha(LinePD,LinePS,LinePA,M);
+					}
+				}
+			}
+			LinePD[3]=LinePS[3];
+			LinePS += 4; LinePM += 4; LinePD += 4; LinePA += 4;
+		}
+	}
+}
+
+void ReadAlphaBySrc(CBitmap* src,CBitmap* alpha){
+	memcpy(alpha,src,sizeof(CBitmap));
+	alpha->stride=src->width;
+	alpha->channel=1;
+	alpha->pixels=(CPixel*)malloc(alpha->width*alpha->height*sizeof(unsigned char));
+	unsigned char* bmppixels=(unsigned char*)src->pixels;
+	unsigned char* alapixels=(unsigned char*)alpha->pixels;
+	int stride=src->stride;
+	int width=src->width;
+	int height=src->height;
+	int X,Y;
+	unsigned char *LinePS ,  *LinePA;
+	//#pragma omp parallel for private(LinePS,LinePA)
+	for (Y = 0; Y < height; Y ++)
+	{
+		LinePS = bmppixels +stride*Y;
+		LinePA = alapixels +width*Y;
+		for (X = 0; X < width; X ++)
+		{
+			LinePA[0]=LinePS[3];
+			LinePS += 4;  LinePA ++;
+		}
+	}
+}
+
+
+void CheckAlpha(CBitmap* bmp,CBitmap* alpha)
+{
+	unsigned char* bmppixels=(unsigned char*)bmp->pixels;
+	unsigned char* alapixels=(unsigned char*)alpha->pixels;
+	int stride=bmp->stride;
+	int width=bmp->width;
+	int height=bmp->height;
+	int X,Y;
+	unsigned char *LinePS ,  *LinePA;
+	//#pragma omp parallel for private(LinePS,LinePA)
+	for (Y = 0; Y < height; Y ++)
+	{
+		LinePS = bmppixels +stride*Y;
+		LinePA = alapixels +width*Y;
+		for (X = 0; X < width; X ++)
+		{
+			//unsigned char M=LinePA[0];
+			if(*LinePA==0x00){
+				LinePS[0]=0;
+				LinePS[1]=0;
+				LinePS[2]=0;
+				LinePS[3]=0;
+			//}else if(M<0xff){
+				//if(LinePD[0]>M)LinePD[0]=M;
+				//if(LinePD[1]>M)LinePD[1]=M;
+				//if(LinePD[2]>M)LinePD[2]=M;
+				//LinePD[3]=M;
+			}else{
+			}
+			LinePS += 4;  LinePA++;
+		}
+	}
+}
+*/
+
diff --git a/duix-sdk/src/main/cpp/dhunet/blendgram.h b/duix-sdk/src/main/cpp/dhunet/blendgram.h
new file mode 100644
index 0000000..7e8ec8a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/blendgram.h
@@ -0,0 +1,287 @@
+#ifndef __BLENDGRAM_H__
+#define __BLENDGRAM_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+typedef unsigned char uchar;
+#define CLAMPCOLOR(x) (uchar)((x)<(0)?(0):((x)>(255)?(255):(x)))
+
+#define MMAX(A,B)     ((A)>(B)?(A):(B))
+#define MMIN(A,B)     ((A)<(B)?(A):(B))
+
+static int ConstBlend_Buffer = 0; // base id; every id below is a consecutive offset from it (1..25)
+static int ConstBlend_Normal=ConstBlend_Buffer+1;
+static int ConstBlend_Lighten=ConstBlend_Buffer+2;
+static int  ConstBlend_Darken=ConstBlend_Buffer+3;
+static int  ConstBlend_Multiply=ConstBlend_Buffer+4;
+static int  ConstBlend_Average=ConstBlend_Buffer+5;
+// NOTE(review): these are non-const statics defined in a header, so every translation unit including it gets its own mutable copy.
+static int  ConstBlend_Add=ConstBlend_Buffer+6;
+static int  ConstBlend_Subtract=ConstBlend_Buffer+7;
+static int  ConstBlend_Difference=ConstBlend_Buffer+8;
+static int  ConstBlend_Negation=ConstBlend_Buffer+9;
+static int  ConstBlend_Screen=ConstBlend_Buffer+10;
+static int  ConstBlend_Exclusion=ConstBlend_Buffer+11;
+static int  ConstBlend_Overlay=ConstBlend_Buffer+12;
+static int  ConstBlend_SoftLight=ConstBlend_Buffer+13;
+static int  ConstBlend_HardLight=ConstBlend_Buffer+14;
+static int  ConstBlend_ColorDodge=ConstBlend_Buffer+15;
+static int  ConstBlend_ColorBurn=ConstBlend_Buffer+16;
+static int  ConstBlend_LinearDodge=ConstBlend_Buffer+17;
+static int  ConstBlend_LinearBurn=ConstBlend_Buffer+18;
+static int  ConstBlend_LinearLight=ConstBlend_Buffer+19;
+static int  ConstBlend_VividLight=ConstBlend_Buffer+20;
+static int  ConstBlend_PinLight=ConstBlend_Buffer+21;
+static int  ConstBlend_HardMix=ConstBlend_Buffer+22;
+static int  ConstBlend_Reflect=ConstBlend_Buffer+23;
+static int  ConstBlend_Glow=ConstBlend_Buffer+24;
+static int  ConstBlend_Phoenix=ConstBlend_Buffer+25;
+
+//void BlendGram(CBitmap* immage,CBitmap* mask,int mode);
+
+//#typedef unsigned char uint8
+#define uint8 unsigned char
+#define float64 double
+#define TRUE 1
+#define FALSE 0
+
+inline uint8 mmin(uint8 A,uint8 B){ // smaller of two 8-bit values
+	return (B<A)?B:A;
+}
+inline uint8 mmax(uint8 A,uint8 B){ // larger of two 8-bit values
+	return (B>A)?B:A;
+}
+
+#define ChannelBlend_Normal(A,B)     ((uint8)(A)) // A and B are 0..255 channel values; every mode macro yields the blended value as uint8
+#define ChannelBlend_Lighten(A,B)    ((uint8)(((B) > (A)) ? (B):(A)))
+#define ChannelBlend_Darken(A,B)     ((uint8)(((B) > (A)) ? (A):(B)))
+#define ChannelBlend_Multiply(A,B)   ((uint8)(((A) * (B)) / 255))
+#define ChannelBlend_Average(A,B)    ((uint8)(((A) + (B)) / 2))
+#define ChannelBlend_Add(A,B)        ((uint8)(MMIN(255, ((A) + (B))))) // int MMIN: the uint8 mmin() truncated the sum before it could be clamped
+#define ChannelBlend_Subtract(A,B)   ((uint8)(((A) + (B) < 255) ? 0:((A) + (B) - 255)))
+#define ChannelBlend_Difference(A,B) ((uint8)(abs((A) - (B))))
+#define ChannelBlend_Negation(A,B)   ((uint8)(255 - abs(255 - (A) - (B))))
+#define ChannelBlend_Screen(A,B)     ((uint8)(255 - (((255 - (A)) * (255 - (B))) >> 8)))
+#define ChannelBlend_Exclusion(A,B)  ((uint8)((A) + (B) - 2 * (A) * (B) / 255))
+#define ChannelBlend_Overlay(A,B)    ((uint8)(((B) < 128) ? (2 * (A) * (B) / 255):(255 - 2 * (255 - (A)) * (255 - (B)) / 255)))
+#define ChannelBlend_SoftLight(A,B)  ((uint8)(((B) < 128)?(2*(((A)>>1)+64))*((float)(B)/255):(255-(2*(255-(((A)>>1)+64))*(float)(255-(B))/255))))
+#define ChannelBlend_HardLight(A,B)  (ChannelBlend_Overlay(B,A))
+#define ChannelBlend_ColorDodge(A,B) ((uint8)(((B) == 255) ? (B):MMIN(255, (((A) << 8 ) / (255 - (B))))))
+#define ChannelBlend_ColorBurn(A,B)  ((uint8)(((B) == 0) ? (B):MMAX(0, (255 - ((255 - (A)) << 8 ) / (B))))) // int MMAX keeps negative results at 0
+#define ChannelBlend_LinearDodge(A,B)(ChannelBlend_Add(A,B))
+#define ChannelBlend_LinearBurn(A,B) (ChannelBlend_Subtract(A,B))
+#define ChannelBlend_LinearLight(A,B)((uint8)(((B) < 128)?ChannelBlend_LinearBurn(A,(2 * (B))):ChannelBlend_LinearDodge(A,(2 * ((B) - 128))))) // whole conditional is now wrapped
+#define ChannelBlend_VividLight(A,B) ((uint8)(((B) < 128)?ChannelBlend_ColorBurn(A,(2 * (B))):ChannelBlend_ColorDodge(A,(2 * ((B) - 128)))))
+#define ChannelBlend_PinLight(A,B)   ((uint8)(((B) < 128)?ChannelBlend_Darken(A,(2 * (B))):ChannelBlend_Lighten(A,(2 * ((B) - 128)))))
+#define ChannelBlend_HardMix(A,B)    ((uint8)((ChannelBlend_VividLight(A,B) < 128) ? 0:255)) // relies on VividLight being fully parenthesized
+#define ChannelBlend_Reflect(A,B)    ((uint8)(((B) == 255) ? (B):MMIN(255, ((A) * (A) / (255 - (B))))))
+#define ChannelBlend_Glow(A,B)       (ChannelBlend_Reflect(B,A))
+#define ChannelBlend_Phoenix(A,B)    ((uint8)(MMIN(A,B) - MMAX(A,B) + 255))
+#define ChannelBlend_SoftEx(A,B)    ((A)*(B)/255+(A)*(255-((255-(A))*(255-(B))/255)-(A)*(B)/255)/255)
+// Alpha mixes of A over B: ChannelBlend_Alpha takes O as a 0..1 weight, ChannelBlend_AlphaEx takes O as a 0..255 weight; AlphaF first applies mode macro F.
+#define ChannelBlend_Alpha(A,B,O)    ((uint8)((O) * (A) + (1 - (O)) * (B)))
+#define ChannelBlend_AlphaEx(A,B,O)    ((uint8)(((O) * (A) + (255 - (O)) * (B))/255))
+#define ChannelBlend_AlphaF(A,B,F,O) (ChannelBlend_AlphaEx(F(A,B),A,O))
+
+#define ColorBlend_Alpha(T,A,B,O)      (T)[0] = ChannelBlend_AlphaEx((A)[0], (B)[0],O), (T)[1] = ChannelBlend_AlphaEx((A)[1], (B)[1],O), (T)[2] = ChannelBlend_AlphaEx((A)[2], (B)[2],O)
+// ColorBlend_Alpha mixes bytes 0..2 only; the term for (T)[3] is disabled. ColorBlend_AlphaF below does mix byte 3 as well.
+#define ColorBlend_AlphaF(T,A,B,F,O)      (T)[0] = ChannelBlend_AlphaF((A)[0], (B)[0],F,O), (T)[1] = ChannelBlend_AlphaF((A)[1], (B)[1],F,O), (T)[2] = ChannelBlend_AlphaF((A)[2], (B    )[2],F,O) , (T)[3] = ChannelBlend_AlphaEx((A)[3], (B)[3],O)
+// The two macros above expand to bare comma expressions (no enclosing parentheses).
+// ColorBlend_Buffer applies ChannelBlend_<M> to bytes 0..2 of A and B and stores the results in T; M is pasted onto the macro name.
+#define ColorBlend_Buffer(T,A,B,M)      (T)[0] = ChannelBlend_##M((A)[0], (B)[0]), (T)[1] = ChannelBlend_##M((A)[1], (B)[1]), (T)[2] = ChannelBlend_##M((A)[2], (B)[2])
+
+#define ColorBlend_Normal(T,A,B)        (ColorBlend_Buffer(T,A,B,Normal)) // one three-byte wrapper per ChannelBlend_* mode follows
+#define ColorBlend_Lighten(T,A,B)       (ColorBlend_Buffer(T,A,B,Lighten))
+#define ColorBlend_Darken(T,A,B)        (ColorBlend_Buffer(T,A,B,Darken))
+#define ColorBlend_Multiply(T,A,B)      (ColorBlend_Buffer(T,A,B,Multiply))
+#define ColorBlend_Average(T,A,B)       (ColorBlend_Buffer(T,A,B,Average))
+#define ColorBlend_Add(T,A,B)           (ColorBlend_Buffer(T,A,B,Add))
+#define ColorBlend_Subtract(T,A,B)      (ColorBlend_Buffer(T,A,B,Subtract))
+#define ColorBlend_Difference(T,A,B)    (ColorBlend_Buffer(T,A,B,Difference))
+#define ColorBlend_Negation(T,A,B)      (ColorBlend_Buffer(T,A,B,Negation))
+#define ColorBlend_Screen(T,A,B)        (ColorBlend_Buffer(T,A,B,Screen))
+#define ColorBlend_Exclusion(T,A,B)     (ColorBlend_Buffer(T,A,B,Exclusion))
+#define ColorBlend_Overlay(T,A,B)       (ColorBlend_Buffer(T,A,B,Overlay))
+#define ColorBlend_SoftLight(T,A,B)     (ColorBlend_Buffer(T,A,B,SoftLight))
+#define ColorBlend_HardLight(T,A,B)     (ColorBlend_Buffer(T,A,B,HardLight))
+#define ColorBlend_ColorDodge(T,A,B)    (ColorBlend_Buffer(T,A,B,ColorDodge))
+#define ColorBlend_ColorBurn(T,A,B)     (ColorBlend_Buffer(T,A,B,ColorBurn))
+#define ColorBlend_LinearDodge(T,A,B)   (ColorBlend_Buffer(T,A,B,LinearDodge))
+#define ColorBlend_LinearBurn(T,A,B)    (ColorBlend_Buffer(T,A,B,LinearBurn))
+#define ColorBlend_LinearLight(T,A,B)   (ColorBlend_Buffer(T,A,B,LinearLight))
+#define ColorBlend_VividLight(T,A,B)    (ColorBlend_Buffer(T,A,B,VividLight))
+#define ColorBlend_PinLight(T,A,B)      (ColorBlend_Buffer(T,A,B,PinLight))
+#define ColorBlend_HardMix(T,A,B)       (ColorBlend_Buffer(T,A,B,HardMix))
+#define ColorBlend_Reflect(T,A,B)       (ColorBlend_Buffer(T,A,B,Reflect))
+#define ColorBlend_Glow(T,A,B)          (ColorBlend_Buffer(T,A,B,Glow))
+#define ColorBlend_Phoenix(T,A,B)       (ColorBlend_Buffer(T,A,B,Phoenix))
+
+
+#define ColorBlend_Hue(T,B,L)            ColorBlend_Hls(T,B,L,HueL,LuminationB,SaturationB)
+#define ColorBlend_Saturation(T,B,L)     ColorBlend_Hls(T,B,L,HueB,LuminationB,SaturationL)
+#define ColorBlend_Color(T,B,L)          ColorBlend_Hls(T,B,L,HueL,LuminationB,SaturationL)
+#define ColorBlend_Luminosity(T,B,L)     ColorBlend_Hls(T,B,L,HueB,LuminationL,SaturationB)
+// Each mode above picks hue, lumination and saturation from either the B pixel or the L pixel.
+// ColorBlend_Hls converts both pixels to HLS and writes T from the selected triple (O1,O2,O3 name
+// locals declared inside the block). The argument order treats byte [2] as red and byte [0] as blue.
+#define ColorBlend_Hls(T,B,L,O1,O2,O3) {                                        \
+    float64 HueB, LuminationB, SaturationB;                                     \
+    float64 HueL, LuminationL, SaturationL;                                     \
+    Color_RgbToHls((B)[2],(B)[1],(B)[0], &HueB, &LuminationB, &SaturationB);    \
+    Color_RgbToHls((L)[2],(L)[1],(L)[0], &HueL, &LuminationL, &SaturationL);    \
+    Color_HlsToRgb(O1,O2,O3,&(T)[2],&(T)[1],&(T)[0]);                           \
+    }
+
+
+/*********************************************************************/
+
+#define COLOR_OPAQUE                (0)     // NOTE(review): 0 = opaque and 127 = transparent looks like a 7-bit alpha scale — confirm with the users of these constants
+#define COLOR_TRANSPARENT           (127)
+// 24-bit RGB: RGB_SIZE is the per-pixel multiplier used by Rgb_ByteWidth; the maxima are per channel.
+#define RGB_SIZE                    (3)
+#define RGB_BPP                     (24)
+#define RGB_MAXRED                  (255)
+#define RGB_MAXGREEN                (255)
+#define RGB_MAXBLUE                 (255)
+// 32-bit ARGB: the colour maxima reuse the RGB ones, while the alpha maximum is 127.
+#define ARGB_SIZE                   (4)
+#define ARGB_BPP                    (32)
+#define ARGB_MAXALPHA               (127)
+#define ARGB_MAXRED                 (RGB_MAXRED)
+#define ARGB_MAXGREEN               (RGB_MAXGREEN)
+#define ARGB_MAXBLUE                (RGB_MAXBLUE)
+
+/*********************************************************************/
+
+#define Color_GetChannel(c,shift)   ((uint8)((c) >> (shift))) // the byte of `c` that starts at bit `shift`
+#define Color_Reverse(c,bpp)        ((((unsigned int)(uint8)(c) << 24) | ((unsigned int)(uint8)((c) >> 8 ) << 16) | ((unsigned int)(uint8)((c) >> 16) << 8 ) | ((unsigned int)(uint8)((c) >> 24))) >> (32 - (bpp)))
+// Color_Reverse swaps the four bytes of `c` and keeps the top `bpp` bits. A stray mid-line backslash was removed (it did not compile) and bytes are widened to unsigned so bit 31 never lands in a signed int.
+#define Rgb_ByteWidth(width)        ((width) * RGB_SIZE)
+#define Rgb_PixelWidth(width)       ((width) / RGB_SIZE)
+// Channel extractors; the shift encodes the packed layout (Rgb: red in the low byte, Rgba: alpha in the low byte, Argb: blue in the low byte).
+#define Rgb_GetRed(rgb)             (Color_GetChannel(rgb, 0))
+#define Rgb_GetGreen(rgb)           (Color_GetChannel(rgb, 8))
+#define Rgb_GetBlue(rgb)            (Color_GetChannel(rgb, 16))
+
+#define Rgba_GetRed(rgba)           (Color_GetChannel(rgba, 24))
+#define Rgba_GetGreen(rgba)         (Color_GetChannel(rgba, 16))
+#define Rgba_GetBlue(rgba)          (Color_GetChannel(rgba, 8))
+#define Rgba_GetAlpha(rgba)         (Color_GetChannel(rgba, 0))
+
+#define Argb_GetAlpha(argb)         (Color_GetChannel(argb, 24))
+#define Argb_GetRed(argb)           (Color_GetChannel(argb, 16))
+#define Argb_GetGreen(argb)         (Color_GetChannel(argb, 8))
+#define Argb_GetBlue(argb)          (Color_GetChannel(argb, 0))
+// Packers, the inverse of the extractors. NOTE(review): uint32 and uint16 are not defined in this header — the including file has to provide them.
+#define MakeRgb(r,g,b)              (((uint32)(uint8)(b) << 16) | ((uint16)(uint8)(g) << 8 ) | (uint8)(r))
+#define MakeRgba(r,g,b,a)           (((uint32)(uint8)(r) << 24) | ((uint16)(uint8)(g) << 16) | ((uint16)(uint8)(b) << 8 ) | (uint8)(a))
+#define MakeArgb(a,r,g,b)           (((uint32)(uint8)(a) << 24) | ((uint32)(uint8)(r) << 16) | ((uint16)(uint8)(g) << 8 ) | (uint8)(b))
+#define HexToRgb(hex)               (MakeRgb((((hex) & 0xFF0000) >> 16), (((hex) & 0x00FF00) >> 8 ), ((hex) & 0xFF)))
+
+inline int Color_HueToRgb(float64 M1, float64 M2, float64 Hue, float64 *Channel)
+{
+    // Piecewise-linear channel lookup used by Color_HlsToRgb; stores the value in *Channel and always returns TRUE.
+    const float64 Span = M2 - M1;
+    float64 Value = M1;              // result for the last third of the hue range
+    if (Hue < 0.0)                   // a single wrap step in either direction
+        Hue += 1.0;
+    else if (Hue > 1.0)
+        Hue -= 1.0;
+    if ((6.0 * Hue) < 1.0)
+        Value = M1 + Span * Hue * 6.0;
+    else if ((2.0 * Hue) < 1.0)
+        Value = M2;
+    else if ((3.0 * Hue) < 2.0)
+        Value = M1 + Span * ((2.0F / 3.0F) - Hue) * 6.0;
+    *Channel = Value;
+    return TRUE;
+}
+
+inline void Color_RgbToHls(uint8 Red, uint8 Green, uint8 Blue, float64 *Hue, float64 *Lumination, float64 *Saturation)
+{
+    // Converts an 8-bit RGB triple to hue, lumination and saturation, each in [0,1].
+    // Grey input (largest and smallest channel equal) gives hue 0 and saturation 0.
+    const float64 Redf   = (float64)Red   / 255.0;
+    const float64 Greenf = (float64)Green / 255.0;
+    const float64 Bluef  = (float64)Blue  / 255.0;
+
+    // The extremes are chosen on the integer channels and scaled exactly like the
+    // channels above, so the equality tests further down compare identical values.
+    const float64 Max = MMAX(MMAX(Red, Green), Blue)/255.0;
+    const float64 Min = MMIN(MMIN(Red, Green), Blue)/255.0;
+
+    *Hue        = 0;
+    *Lumination = (Max + Min) / 2.0F;
+    *Saturation = 0;
+
+    if (Max != Min) {
+        const float64 Delta = (Max - Min);
+        float64 Sector;
+
+        if (*Lumination < 0.5)
+            *Saturation = Delta / (Max + Min);
+        else
+            *Saturation = Delta / (2.0 - Max - Min);
+
+        // Position on the colour wheel in sixths, measured from the dominant channel.
+        if (Redf == Max)
+            Sector = (Greenf - Bluef) / Delta;
+        else if (Greenf == Max)
+            Sector = 2.0 + (Bluef - Redf) / Delta;
+        else
+            Sector = 4.0 + (Redf - Greenf) / Delta;
+
+        Sector /= 6.0;
+        if (Sector < 0.0)
+            Sector += 1.0;   // wrap negative hues back into range
+
+        *Hue = Sector;
+    }
+
+}
+
+inline void Color_HlsToRgb(float64 Hue, float64 Lumination, float64 Saturation, uint8 *Red, uint8 *Green, uint8 *Blue)
+{
+    // Converts hue/lumination/saturation back to 8-bit RGB (the inverse of Color_RgbToHls).
+    // All three channels start out grey and are only recomputed when there is saturation.
+    float64 Redf   = Lumination;
+    float64 Greenf = Lumination;
+    float64 Bluef  = Lumination;
+
+    if (Saturation != 0) {
+        float64 M2;
+        if (Lumination <= 0.5)
+            M2 = Lumination * (1.0 + Saturation);
+        else
+            M2 = Lumination + Saturation - Lumination * Saturation;
+        const float64 M1 = (2.0 * Lumination - M2);
+
+        Color_HueToRgb(M1, M2, Hue + (1.0F / 3.0F), &Redf);
+        Color_HueToRgb(M1, M2, Hue, &Greenf);
+        Color_HueToRgb(M1, M2, Hue - (1.0F / 3.0F), &Bluef);
+    }
+
+    // Scale to 0..255; the cast truncates toward zero.
+    *Red    = (uint8)(Redf * 255);
+    *Green  = (uint8)(Greenf * 255);
+    *Blue   = (uint8)(Bluef * 255);
+
+}
+
+void BlendGramSimp(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height, int Mode); // declared only; Mode is presumably one of the ConstBlend_* ids — confirm in the implementation
+void BlendGramAlpha(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height);
+void BlendGramAlpha3(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height);
+void BlendGramAlphaRev(unsigned char *Src,unsigned char* Mask, unsigned char *Dest, int Width, int Height);
+/*
+void BlendImageAdjustWithMask(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int mode);
+void BlendImageAdjustWithMaskEx(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int mode);
+void BlendImageAdjustWithAlpha(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,int alpha,int mode);
+void BlendImageAdjustWithAlphaMask(CBitmap* bmp,CBitmap* adj,CBitmap* dst ,CBitmap* msk,int alpha,int mode);
+
+void CheckAlpha(CBitmap* bmp,CBitmap* alpha);
+void ReadAlphaBySrc(CBitmap* src,CBitmap* alpha);
+*/
+
+#endif
diff --git a/duix-sdk/src/main/cpp/dhunet/face_utils.cpp b/duix-sdk/src/main/cpp/dhunet/face_utils.cpp
new file mode 100644
index 0000000..8c842f1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/face_utils.cpp
@@ -0,0 +1,133 @@
+#include "face_utils.h"
+//#include <sys/timeb.h>
+
+/*
+cv::Mat resize_image(cv::Mat srcimg, int height, int width, int* top, int* left){
+    cv::Mat dstimg;
+    int srch = srcimg.rows, srcw = srcimg.cols;
+    int neww = width;
+    int newh = height;
+    if (srch != srcw) {
+        float hw_scale = (float)srch / srcw;
+        if (hw_scale > 1) {
+            newh = height;
+            neww = int(width / hw_scale);
+            cv::resize(srcimg, dstimg, cv::Size(neww, newh), cv::INTER_AREA);
+            *left = int((width - neww) * 0.5);
+            cv::copyMakeBorder(dstimg, dstimg, 0, 0, *left, width - neww - *left, cv::BORDER_CONSTANT, 0);
+        }
+        else
+        {
+            newh = (int)height * hw_scale;
+            neww = width;
+            cv::resize(srcimg, dstimg,cv::Size(neww, newh), cv::INTER_AREA);
+            *top = (int)(height - newh) * 0.5;
+            cv::copyMakeBorder(dstimg, dstimg, *top, height - newh - *top, 0, 0, cv::BORDER_CONSTANT, 0);
+
+        }
+    } else {
+        cv::resize(srcimg, dstimg, cv::Size(neww, newh), cv::INTER_AREA);
+    }
+    return dstimg;
+}
+*/
+
+
+int dumpfile(char* file,char** pbuf){
+    // Reads the whole file into a malloc'ed buffer returned via *pbuf (caller frees it); returns its size in bytes, or -1 with *pbuf = NULL on failure.
+    *pbuf = NULL;
+    std::ifstream cache(file,std::ios::binary|std::ios::ate);  // ate: start at the end so tellg() is the size
+    const int engSize = cache.tellg();                         // -1 when the open failed
+    printf("===engsize %d\n",engSize );
+    if(engSize<0)return -1;
+    char *modelMem = (char*)malloc(engSize+8000);              // the extra 8000 bytes are kept from the original
+    if(!modelMem)return -1;
+    if(!cache.seekg(0,std::ios::beg).read(modelMem,engSize)){ free(modelMem); return -1; }
+    *pbuf = modelMem;
+    return engSize;
+}
+
+void dumpchar(char* abuf,int len){
+    // Prints the first `len` bytes as unsigned decimals wrapped in '=',
+    // sixteen per row, framed by two "chardump" banner lines.
+    const uint8_t* bytes = (const uint8_t*)abuf;
+    int shown = 0;
+    printf("\n----------------------chardump------------------------\n");
+    while(shown < len) {
+        printf("=%u=", bytes[shown]);
+        shown++;
+        if(shown % 16 == 0)printf("\n");
+    }
+    // Terminate a last row that holds fewer than sixteen values.
+    if(shown % 16 != 0)printf("\n");
+    printf("\n----------------------chardump------------------------\n");
+}
+
+
+void dumpfloat(float* abuf,int len){
+    // Prints `len` floats with "%f", each wrapped in '=', sixteen per row,
+    // framed by two "floatdump" banner lines.
+    int shown = 0;
+    printf("\n----------------------floatdump------------------------\n");
+    while(shown < len) {
+        printf("=%f=", abuf[shown]);
+        shown++;
+        if(shown % 16 == 0)printf("\n");
+    }
+    // Terminate a last row that holds fewer than sixteen values.
+    if(shown % 16 != 0)printf("\n");
+    printf("\n----------------------floatdump------------------------\n");
+}
+
+void dumpshort(short* abuf,int len){
+    // Prints `len` 16-bit values as signed decimals wrapped in '=', sixteen per row.
+    // The banners say "shortdump"; they were mislabelled "floatdump", same as dumpfloat().
+    printf("\n----------------------shortdump------------------------\n");
+    int i;
+    for(i = 0; i < len; i++) {
+        printf("=%d=", abuf[i]);
+        if( (i+1) % 16 == 0)printf("\n");
+    }
+    if(i%16 != 0) {
+        printf("\n");
+    }
+    printf("\n----------------------shortdump------------------------\n");
+}
+
+
+void dumphex(char* abuf,int len){
+    // Prints the first `len` bytes as two-digit lowercase hex wrapped in '=',
+    // sixteen per row, framed by two "hexdump" banner lines.
+    const unsigned char* bytes = (const unsigned char*)abuf;
+    int shown = 0;
+    printf("\n----------------------hexdump------------------------\n");
+    while(shown < len) {
+        printf("=%02x=", bytes[shown]);
+        shown++;
+        if(shown % 16 == 0)printf("\n");
+    }
+    // Terminate a last row that holds fewer than sixteen values.
+    if(shown % 16 != 0)printf("\n");
+    printf("---------------------hexdump-------------------------\n\n");
+}
+
+int diffbuf(char* abuf,char* bbuf,int size){
+    // Byte-wise comparison of two buffers of `size` bytes.
+    // Returns how many positions differ (0 means identical; a non-positive
+    // size compares nothing and also yields 0).
+    int mismatches = 0;
+    for(int pos = 0; pos < size; ++pos){
+        if(abuf[pos] != bbuf[pos]){
+            ++mismatches;
+        }
+    }
+    return mismatches;
+}
+
+
+uint64_t aitimer_msstamp() {  // milliseconds elapsed on the monotonic clock
+    struct timespec ts = {0, 0};  // stays zero if clock_gettime fails, so the result is 0 rather than garbage
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return ((uint64_t)ts.tv_sec*1000u) + ((uint64_t)ts.tv_nsec/1000000u);  // widened before multiplying (a 32-bit long overflows); 1e6 ns per ms, not the unrelated CLOCKS_PER_SEC
+}
+
diff --git a/duix-sdk/src/main/cpp/dhunet/face_utils.h b/duix-sdk/src/main/cpp/dhunet/face_utils.h
new file mode 100644
index 0000000..2ea6ffe
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/face_utils.h
@@ -0,0 +1,21 @@
+#pragma once
+#include <stdint.h>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <vector>
+//#include <opencv2/dnn.hpp>
+//#include <opencv2/imgproc.hpp>
+//#include <opencv2/highgui.hpp>
+
+
+void dumpchar(char* abuf,int len);   // debug printers: write `len` elements of abuf to stdout
+void dumphex(char* abuf,int len);
+void dumpshort(short* abuf,int len);
+void dumpfloat(float* abuf,int len);
+void dumpdouble(double* abuf,int len);  // NOTE(review): face_utils.cpp contains no definition for this one — confirm before calling
+int dumpfile(char* file,char** pbuf);  // reads a file into a malloc'ed buffer (*pbuf) and returns its size
+int diffbuf(char* abuf,char* bbuf,int size);  // number of differing bytes
+// Millisecond timestamp taken from CLOCK_MONOTONIC.
+uint64_t aitimer_msstamp();
+
diff --git a/duix-sdk/src/main/cpp/dhunet/jmat.cpp b/duix-sdk/src/main/cpp/dhunet/jmat.cpp
new file mode 100644
index 0000000..044c468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/jmat.cpp
@@ -0,0 +1,507 @@
+#include "jmat.h"
+
+extern "C"{
+#pragma pack(push)
+#pragma pack(4)
+// On-disk header of a .gpg image; savegpg()/loadgpg() only use element [0] of each array.
+  typedef struct _gpg_hdr {
+    char        head[4];     // magic: savegpg writes "gpg1", loadgpg checks the first three bytes
+    int         box[4];      // not touched by the code in this file (left zero by savegpg)
+    int         size[4];     // [0] = payload size in bytes (m_size)
+    int         width[4];    // [0] = m_width
+    int         height[4];   // [0] = m_height
+    uint8_t     channel[4];  // [0] = m_channel
+    uint8_t     bit[4];      // [0] = m_bit
+  }gpg_hdr;
+#pragma pack(pop)
+}
+
+
+int JBuf::zeros(uint8_t val){ // fills the whole buffer with `val`; returns the byte count
+  memset(data(),val,size());
+  return size();
+}
+
+int  JBuf::copyto(JBuf* dst){ // copies this buffer into `dst`; -1 (and no copy) when the sizes differ
+  const bool sameSize = (dst->m_size==m_size);
+  if(sameSize)memcpy(dst->m_buf,m_buf,m_size);
+  return sameSize?(int)m_size:-1;
+}
+
+int  JBuf::copyfrom(JBuf* src){ // copies `src` into this buffer; -1 (and no copy) when the sizes differ
+  const bool sameSize = (src->m_size==m_size);
+  if(sameSize)memcpy(m_buf,src->m_buf,src->m_size);
+  return sameSize?(int)m_size:-1;
+}
+
+
+int JBuf::forceref(int bref){
+  // Unconditionally overrides the borrowed/owned flag: a non-zero `bref` means
+  // the destructor will not free m_buf. Always returns 0.
+  m_ref = (bref!=0);
+  return 0;
+}
+
+JBuf::JBuf(uint32_t size,void* buf ){
+  // Two modes, selected by `buf`:
+  //  - non-null: the memory is borrowed from the caller (m_ref = true) and is
+  //    never freed by this object;
+  //  - null: size+1024 bytes are malloc'ed and owned (m_ref = false).
+  // m_size records `size` in both cases.
+  m_size = size;
+  m_ref = (buf!=nullptr);
+  if(m_ref)m_buf = buf;
+  else m_buf = malloc(size+1024);
+}
+
+JBuf::~JBuf(){
+  // Borrowed buffers (m_ref set) belong to the caller and are left alone;
+  // owned ones are released here.
+  if(m_ref)return;
+  free(m_buf);
+  m_buf = nullptr;
+}
+
+JBuf::JBuf(){ // empty buffer: nothing is allocated
+  m_buf = nullptr;
+  m_size = 0;
+}
+
+JMat::JMat(){ // empty matrix; m_tagarr is the only member without an in-class initializer, so it is cleared here
+  init_tagarr();
+}
+
+void JMat::init_tagarr(){ // zeroes all 512 tag slots
+  memset(m_tagarr,0,sizeof(m_tagarr));
+}
+
+int* JMat::tagarr(){ // direct pointer to the 512-int tag array (not a copy)
+  return m_tagarr;
+}
+
+int JMat::savegpg(std::string gpgfile){
+  // Writes this matrix as a "gpg1" file: a gpg_hdr followed by the m_size raw bytes of m_buf.
+  // Only slot [0] of each header array is filled. Returns 0 on success,
+  // -1 if the file cannot be created and -2 if writing or closing fails.
+  gpg_hdr ghead;
+  memset(&ghead,0,sizeof(gpg_hdr));
+  memcpy(ghead.head,"gpg1",4);
+  ghead.size[0]=m_size;
+  ghead.width[0]=m_width;
+  ghead.height[0]=m_height;
+  ghead.channel[0]=m_channel;
+  ghead.bit[0]=m_bit;
+
+  FILE *gpgFile = fopen(gpgfile.c_str(), "wb");
+  if (gpgFile == NULL)return -1;
+  // fwrite reports the number of complete items written; an empty payload is skipped.
+  bool ok = fwrite(&ghead,sizeof(gpg_hdr),1,gpgFile)==1;
+  if(ok&&m_size)ok = fwrite(m_buf, m_size, 1, gpgFile)==1;
+  if(fclose(gpgFile)!=0)ok = false;   // buffered data may only fail when flushed on close
+  return ok?0:-2;
+}
+
+int JMat::load(std::string picfile,int flag){
+  // Dispatches on the last three characters of the path: "gpg" goes to loadgpg(),
+  // anything else to loadjpg(). Paths shorter than four characters give -1.
+  // `flag` is not forwarded to either loader.
+  const char* path = picfile.c_str();
+  const size_t len = strlen(path);
+  if(len<4)return -1;
+  const char* ext = path+(len-3);
+  if(ext[0]!='g'||ext[1]!='p'||ext[2]!='g'){
+    return loadjpg(picfile);
+  }
+  return loadgpg(picfile);
+}
+
+int JMat::loadgpg(std::string gpgfile){
+  // Loads a file written by savegpg(): a gpg_hdr followed by the raw pixel bytes. The buffer is
+  // replaced by an owned allocation only when the payload does not fit. Returns 0 on success, -1 open
+  // failed, -11 missing or bad header, -12 negative size, -13 out of memory, -14 short payload.
+  FILE *gpgFile = fopen(gpgfile.c_str(), "rb");
+  if (gpgFile == NULL)return -1;
+  int rst = 0;
+  void* grown = NULL;
+  gpg_hdr ghead;
+  memset(&ghead,0,sizeof(gpg_hdr));
+  if((fread(&ghead,sizeof(gpg_hdr),1,gpgFile)!=1)||
+      (ghead.head[0]!='g')||(ghead.head[1]!='p')||(ghead.head[2]!='g')){
+    rst = -11;
+  }else if(ghead.size[0]<0){
+    rst = -12;  // would otherwise wrap to a huge unsigned size
+  }else if((m_size<(uint32_t)ghead.size[0])&&((grown = malloc(ghead.size[0]))==NULL)){
+    rst = -13;  // the current buffer is left untouched
+  }else{
+    if(grown){
+      if((!m_ref)&&m_buf)free(m_buf);
+      m_buf = grown;
+      m_ref = 0;
+    }
+    m_size = ghead.size[0];
+    m_width = ghead.width[0];
+    m_height = ghead.height[0];
+    m_channel = ghead.channel[0];
+    m_bit = ghead.bit[0];
+    // The header carries no stride; recover it from size = bit*stride*height, the relation the constructors use.
+    m_stride = (m_bit>0&&m_height>0)?(int)(m_size/((uint32_t)m_bit*m_height)):m_width*m_channel;
+    if(m_size&&fread(m_buf, m_size, 1, gpgFile)!=1)rst = -14;
+  }
+  fclose(gpgFile);
+  return rst;
+}
+
+#ifdef USE_TURBOJPG
+#include "turbojpeg.h"
+int JMat::loadjpg(std::string picfile,int flag){
+  // Decodes a JPEG file into this matrix as 8-bit, 3-channel BGR (libjpeg-turbo); the buffer is only replaced when too small.
+  // `flag` is unused. Returns 0 ok, -1 open failed, -2 size unknown, -3 empty file, -4 read failed,
+  // -11 no decoder, -12 bad header, -13 bad dimensions, -14 out of memory, -15 decode error.
+  tjhandle tjInstance = NULL;
+  int rst = 0;
+  size_t jpegSize = 0;
+  size_t imgSize = 0;
+  unsigned char *jpegBuf = NULL;
+  long size;
+  FILE *jpegFile = NULL;
+  const char* fn = picfile.c_str();
+  if ((jpegFile = fopen(fn, "rb")) == NULL)return -1;
+  if (fseek(jpegFile, 0, SEEK_END) < 0 || ((size = ftell(jpegFile)) < 0) || (fseek(jpegFile, 0, SEEK_SET) < 0)){
+    fclose(jpegFile);
+    return -2;
+  }
+  if (size == 0){
+    fclose(jpegFile);
+    return -3;
+  }
+  jpegSize = size;
+  jpegBuf = (unsigned char*)tj3Alloc(jpegSize);
+  const bool loaded = jpegBuf && (fread(jpegBuf, jpegSize, 1, jpegFile) == 1);  // only a fully read file is decoded
+  fclose(jpegFile);
+  if (loaded) tjInstance = tj3Init(TJINIT_DECOMPRESS);
+  if (tjInstance == NULL){
+    if(jpegBuf)tj3Free(jpegBuf);   // this buffer used to leak when tj3Init failed
+    return loaded ? -11 : -4;
+  }
+  while(1){
+    int w, h;
+    int pixelFormat = TJPF_BGR;
+    rst = tj3DecompressHeader(tjInstance, jpegBuf, jpegSize);
+    if(rst<0){
+      rst = -12;
+      break;
+    }
+    w = tj3Get(tjInstance, TJPARAM_JPEGWIDTH);
+    h = tj3Get(tjInstance, TJPARAM_JPEGHEIGHT);
+    if(w<=0||h<=0){   // the old `imgSize < 0` test could never fire on an unsigned size
+      rst = -13;
+      break;
+    }
+    imgSize = (size_t)w * h * tjPixelSize[pixelFormat];
+    if(m_size<imgSize){
+      void* grown = malloc(imgSize);
+      if(grown == NULL){
+        rst = -14;   // the current buffer is left untouched
+        break;
+      }
+      if((!m_ref)&&m_buf)free(m_buf);
+      m_buf = grown;
+      m_ref = 0;
+    }
+    m_size = imgSize;
+    if(tj3Decompress8(tjInstance, jpegBuf, jpegSize, (unsigned char *)m_buf, 0, pixelFormat) < 0){
+      rst = -15;
+      break;
+    }
+    m_bit = 1;
+    m_channel = 3;
+    m_stride = w*3;
+    m_width = w;
+    m_height = h;
+    break;
+  }
+  if(jpegBuf)tj3Free(jpegBuf);
+  tj3Destroy(tjInstance);
+  return rst;
+}
+
+#else
+int JMat::loadjpg(std::string picfile,int flag){ // built without USE_TURBOJPG: JPEG loading is unavailable and always reports -1
+  return -1;
+}
+#endif
+
+JMat::JMat(int w,int h,float *buf ,int c  ,int d ):JBuf(){ // view over caller-owned float data; nothing is copied or freed here
+  // `d` is the row stride in elements; 0 selects tightly packed rows (w*c).
+  m_buf = buf;
+  m_ref = 1;
+  m_width = w;
+  m_height = h;
+  m_channel = c;
+  m_bit = sizeof(float);
+  m_stride = (d!=0)?d:w*c;
+  m_size = m_bit*m_stride*m_height;
+  init_tagarr();
+}
+
+JMat::JMat(int w,int h,uint8_t *buf ,int c ,int d ):JBuf(){ // view over caller-owned 8-bit data; nothing is copied or freed here
+  // `d` is the row stride in elements; 0 selects tightly packed rows (w*c).
+  m_buf = buf;
+  m_ref = 1;
+  m_width = w;
+  m_height = h;
+  m_channel = c;
+  m_bit = 1;
+  m_stride = (d!=0)?d:w*c;
+  m_size = m_bit*m_stride*m_height;
+  init_tagarr();
+}
+
+JMat::JMat(int w,int h,int c ,int d ,int b):JBuf(){ // owned, zero-filled matrix; b = bytes per element (0 = float), d = row stride in elements (0 = w*c)
+  m_width = w;
+  m_height = h;
+  m_channel = c;
+  m_bit = (b!=0)?b:sizeof(float);
+  m_stride = (d!=0)?d:w*c;
+  m_size = m_bit*m_stride*m_height;
+  const size_t padded = m_size+m_bit*m_stride;  // one spare row beyond m_size
+  m_buf = malloc(padded);
+  memset(m_buf,0,padded);
+  m_ref = 0;
+  init_tagarr();
+}
+
+#ifdef USE_OPENCV
+
+cv::Mat  JMat::cvmat(){
+  // Returns a cv::Mat header over m_buf (no pixel copy). One- and three-channel data
+  // keep their channel count; any other count is exposed as a single channel that is
+  // m_width*m_channel wide. Elements are 8-bit when m_bit==1, 32-bit float otherwise.
+  // m_stride is not passed on, so OpenCV assumes tightly packed rows.
+  const bool bytes = (m_bit==1);
+  switch(m_channel){
+    case 3:  return cv::Mat(m_height,m_width,bytes?CV_8UC3:CV_32FC3,m_buf);
+    case 1:  return cv::Mat(m_height,m_width,bytes?CV_8UC1:CV_32FC1,m_buf);
+    default: return cv::Mat(m_height,m_width*m_channel,bytes?CV_8UC1:CV_32FC1,m_buf);
+  }
+}
+
+int JMat::show(const char* title,int inx){
+  // Shows the matrix in a HighGUI window named `title`; always returns 0.
+  // For 8-bit data a non-zero `inx` is first drawn in green at (180,450); the cv::Mat
+  // only wraps m_buf, so the digits end up in the pixel data itself.
+  const std::string name(title);
+  const bool color = (m_channel==3);
+  if(m_bit!=1){
+    cv::Mat mat(m_height,m_width,color?CV_32FC3:CV_32FC1,m_buf);
+    cv::imshow(name,mat);
+    return 0;
+  }
+  cv::Mat mat(m_height,m_width,color?CV_8UC3:CV_8UC1,m_buf);
+  if(inx){
+    // Same font face (0), scale (2), thickness (4) and line type (8.0) as before.
+    const cv::Point pt(180,450);
+    cv::putText(mat,std::to_string(inx),pt,0,2,cv::Scalar(0,255,0),4,8.0);
+  }
+  cv::imshow(name,mat);
+  return 0;
+}
+int JMat::tojpg(const char* fn){
+  // Saves the buffer, viewed as a 3-channel image, to `fn` with cv::imwrite and returns
+  // imwrite's result. 8-bit data is written directly; anything else is treated as float,
+  // scaled by 255 into a temporary 8-bit image and written from there.
+  const std::string name(fn);
+  if(m_bit==1){
+    cv::Mat bytes(m_height,m_width,CV_8UC3,m_buf);
+    return cv::imwrite(name,bytes);
+  }
+  // Leftover debug marker, kept as is.
+  printf("====ccc\n");
+  cv::Mat floats(m_height,m_width,CV_32FC3,m_buf);
+  cv::Mat scaled(m_height,m_width,CV_8UC3);
+  floats.convertTo(scaled,CV_8UC3,255.0f);
+  return cv::imwrite(name,scaled);
+}
+#else
+int JMat::show(const char* title,int tag){ // built without USE_OPENCV: nothing is displayed
+  return 0;
+}
+int JMat::tojpg(const char* fn){ // built without USE_OPENCV: nothing is written
+  return 0;
+}
+#endif
+
+
+int JMat::tobin(const char* fn){
+  // Writes the m_size raw bytes of m_buf to `fn`; returns 1 on success and 0 when the file cannot be created or written.
+  FILE* file = fopen(fn, "wb");  // binary mode: the payload must not go through newline translation
+  if(!file)return 0;
+  const bool written = (m_size==0)||(fwrite(m_buf, m_size, 1, file)==1);
+  return (fclose(file)==0)&&written;  // a failed flush on close also counts as a failure
+}
+
+JMat* JMat::reshape(int w,int h,int l,int t,int c){
+  // Creates a heap-allocated view (no copy) of a w x h window whose first element is
+  // at row `t`, column `l`; the view keeps this matrix's stride. `c` overrides the
+  // channel count (0 keeps m_channel). Returns NULL when rows t..t+h run past m_height;
+  // the horizontal extent is not checked.
+  if(t+h>m_height)return NULL;
+  const int channel = (c!=0)?c:m_channel;
+  const int offset = t*m_stride+l*m_channel;
+  if(m_bit==1){
+    return new JMat(w,h,udata()+offset,channel,m_stride);
+  }
+  // Every other element size is addressed as float.
+  return new JMat(w,h,fdata()+offset,channel,m_stride);
+}
+
+JMat* JMat::refclone(int ref){
+  // Heap-allocates a twin of this matrix. A non-zero `ref` yields a view that shares
+  // m_buf (addressed as uint8 when m_bit==1, as float otherwise); zero yields a deep
+  // copy of the pixels together with the tag array.
+  if(!ref){
+    JMat* copy = new JMat(m_width,m_height,m_channel,m_stride,m_bit);
+    memcpy(copy->m_buf,m_buf,m_size);
+    memcpy(copy->m_tagarr,m_tagarr,sizeof(m_tagarr));
+    return copy;
+  }
+  // A view starts with a cleared tag array; the tags are not carried over.
+  if(m_bit==1){
+    return new JMat(m_width,m_height,(uint8_t*)m_buf,m_channel,m_stride);
+  }
+  return new JMat(m_width,m_height,(float*)m_buf,m_channel,m_stride);
+}
+
+JMat JMat::clone(){ // deep copy returned by value: pixels and tag array are duplicated
+  JMat copy(m_width,m_height,m_channel,m_stride,m_bit);
+  memcpy(copy.m_tagarr,m_tagarr,sizeof(m_tagarr));
+  memcpy(copy.m_buf,m_buf,m_size);
+  return copy;
+}
+
+#ifdef USE_OPENCV
+JMat::JMat(std::string picfile,int flag):JBuf(){ // flag != 0: 8-bit elements, flag == 0: float elements
+  cv::Mat image ;// NOTE(review): the cv::imread(picfile) call is disabled, so `image` stays empty and picfile is never read
+  m_bit = flag?1:sizeof(float);
+  m_width = image.cols;
+  m_height = image.rows;
+  m_channel = 3;// fixed at 3 rather than taken from image.channels()
+                // With an empty image both dimensions are 0, so m_stride and m_size below are 0 as well.
+  m_stride = m_width*m_channel;
+  m_size = m_bit*m_stride*m_height;
+  m_buf = malloc(m_size+m_bit*m_stride);
+  m_ref = 0;
+  if(flag){
+    memcpy(m_buf,image.data,m_size);
+    //printf("===w %d h %d\n",image.cols,image.rows);
+    //cv::imshow("aaa",image);
+    //cv::waitKey(0);
+    //cv::Mat fmat(m_height,m_width,CV_8UC3,m_buf);
+    //float scale = 1.0f/255.0f;
+    //image.convertTo(fmat,CV_32F,scale);
+  }else{
+    cv::Mat fmat(m_height,m_width,CV_32FC3,m_buf);
+    float scale = 1.0f/255.0f;
+    image.convertTo(fmat,CV_32F,scale);
+  }
+  image.release();
+  init_tagarr();
+}
+#else
+JMat::JMat(std::string picfile,int flag):JBuf(){
+  // Built without USE_OPENCV: nothing is loaded. Note that init_tagarr() is not called on this path.
+}
+#endif
+
+JMat::~JMat(){
+  // Nothing to release here: an owned m_buf is freed by JBuf::~JBuf().
+}
+
+float* JMat::fdata(){ // start of the buffer, addressed as float
+  return static_cast<float*>(m_buf);
+}
+
+float* JMat::frow(int row){ // first float of row `row`; rows are m_stride floats apart
+  return fdata()+row*m_stride;
+}
+
+char* JMat::row(int row){ // byte address of row `row`; rows are m_stride*m_bit bytes apart
+  return static_cast<char*>(m_buf)+row*m_stride*m_bit;
+}
+
+float* JMat::fitem(int row,int col){
+  // Address of the float at element offset `col` within row `row`.
+  return frow(row)+col;
+}
+
+void JMat::dump(){ // debug print: geometry first, then a few leading and trailing values per row
+  printf("jmat %p size %d bit %d===\n",m_buf,m_size,m_bit);
+  printf("w %d h %d c %d s %d \n",m_width,m_height,m_channel,m_stride);
+  if(m_height!=1){
+    if(m_bit==4){
+      for(int k=0;k<m_height;k++){
+        float* buf = frow(k);
+        float* ebuf = buf + m_width -3;
+        // first three and last three floats of the row; the end is taken from m_width, not m_width*m_channel
+        printf("%d %f %f %f === %f %f %f\n",k,buf[0],buf[1],buf[2],ebuf[0],ebuf[1],ebuf[2]);
+      }
+    }else{
+      for(int k=0;k<m_height;k++){
+        short* buf = (short*)row(k);
+        short* ebuf = buf + m_width -6;
+        printf("%d %hd %hd %hd === %hd %hd %hd\n",k,buf[0],buf[1],buf[2],ebuf[0],ebuf[1],ebuf[2]);
+        // every element size other than 4 is printed as 16-bit values; ebuf starts six shorts before buf + m_width
+      }
+    }
+  }else{
+    short* buf = (short*)row(0);
+    for(int k=0;k<m_width/640;k++){
+      short* ebuf = buf + 640 -6;
+      // single-row data is walked in chunks of 640 shorts, whatever m_bit is
+      printf("%d %hd %hd %hd === %hd %hd %hd\n",k,buf[0],buf[1],buf[2],ebuf[0],ebuf[1],ebuf[2]);
+      buf += 640;
+    }
+  }
+}
+
+
+uint8_t* JMat::udata(){ // start of the buffer, addressed as bytes
+  return static_cast<uint8_t*>(m_buf);
+}
+/*
+   nc::NdArray<float> JMat::ncarray(){
+   bool own = false;
+   nc::NdArray<float> arr = nc::NdArray<float>((float*)m_buf, m_height, m_width, own);
+   return arr;
+   }
+   */
+
+
+#ifdef USE_NCNN
+ncnn::Mat JMat::packingmat(){
+  ncnn::Mat in_pack(m_width,m_height,1,(void*)m_buf,(size_t)4u*3,3);  // wraps m_buf: 12-byte elements, elempack 3 — presumably three interleaved floats per pixel, confirm
+  ncnn::Mat in ;
+  ncnn::convert_packing(in_pack,in,1);  // repacks into `in` with elempack 1
+  return in;
+}
+
+ncnn::Mat           JMat::ncnnmat(){
+  // Builds an ncnn::Mat from m_buf, read as 8-bit pixels, via ncnn::Mat::from_pixels:
+  // 3 channels are passed as BGR, 4 as BGRA and 1 as gray; any other count is passed
+  // as a gray image that is m_width*m_channel wide.
+  unsigned char* data = (unsigned char*)m_buf;
+  int format = ncnn::Mat::PIXEL_GRAY;
+  int cols = m_width;
+  // Only the pixel format and, for unusual channel counts, the width vary.
+  switch(m_channel){
+    case 3: format = ncnn::Mat::PIXEL_BGR; break;
+    case 4: format = ncnn::Mat::PIXEL_BGRA; break;
+    case 1: break;
+    default: cols = m_width*m_channel; break;
+  }
+  return ncnn::Mat::from_pixels(data, format, cols, m_height);
+}
+
+
+#endif
+
diff --git a/duix-sdk/src/main/cpp/dhunet/jmat.h b/duix-sdk/src/main/cpp/dhunet/jmat.h
new file mode 100644
index 0000000..f402853
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/jmat.h
@@ -0,0 +1,99 @@
+#pragma once
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include <string.h>
+#include <string>
+
+//#include "NumCpp.hpp"
+#define USE_OPENCV
+#define USE_NCNN
+#define USE_TURBOJPG
+//#define USE_PPLCV
+
+#ifdef USE_OPENCV
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/highgui.hpp"
+#endif
+
+#ifdef USE_NCNN
+#include "mat.h"
+#endif
+
+#ifdef USE_EIGEN
+#include "eigen3/Eigen/Core"
+typedef Eigen::Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor> Vectorf;
+typedef Eigen::Matrix<std::complex<float>, 1, Eigen::Dynamic, Eigen::RowMajor> Vectorcf;
+typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> Matrixf;
+typedef Eigen::Matrix<std::complex<float>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> Matrixcf;
+#endif
+
+class JBuf{
+    // Base holder for a raw buffer pointer and its size in bytes.
+    public:
+        bool        m_ref = 0;      // presumably set when m_buf is borrowed rather than owned - TODO confirm in jmat.cpp
+        uint32_t    m_size = 0;     // value returned by size()
+        void*       m_buf = NULL;   // value returned by data()
+    public:
+        uint32_t    size(){return m_size;} ;
+        void*       data(){return m_buf;};
+        bool        ref(){return m_ref;};
+        int         zeros(uint8_t val=0);
+        int         copyfrom(JBuf* src);
+        int         copyto(JBuf* dst);
+        int         forceref(int bref);   // NOTE(review): presumably overrides m_ref; defined in jmat.cpp
+        JBuf();
+        JBuf(uint32_t size,void* buf = nullptr);
+        virtual ~JBuf();
+};
+
+class JMat:public JBuf{ // 2-D image/matrix view on top of the JBuf buffer
+    public:
+        int     m_bit = 0;       // element-size tag; dump() prints rows as float when it equals 4
+        int     m_width = 0;
+        int     m_height = 0;
+        int     m_channel = 0;
+        int     m_stride = 0;    // presumably bytes per row - TODO confirm in jmat.cpp
+        int     m_tagarr[512];
+        void    init_tagarr();
+    public:
+        int height(){return m_height;}
+        int width(){return m_width;}
+        int stride(){return m_stride;}
+        int channel(){return m_channel;}
+        JMat(int w,int h,float *buf ,int c = 3 ,int d = 0);      // wraps or copies a float buffer (see jmat.cpp)
+        JMat(int w,int h,uint8_t *buf ,int c = 3 ,int d = 0);    // same for a byte buffer
+        JMat(int w,int h,int c = 3,int d = 0,int b=0);           // no external buffer supplied
+        JMat(std::string picfile,int flag=0);
+        JMat();
+        int load(std::string picfile,int flag=0);
+        int loadjpg(std::string picfile,int flag=0);
+        int savegpg(std::string gpgfile);
+        int loadgpg(std::string gpgfile);
+        float* fdata();
+        char* row(int row);
+        float* frow(int row);
+        float* fitem(int row,int col);
+        int tojpg(const char* fn);
+        int tobin(const char* fn);
+        int show(const char* title,int inx = 0);
+        JMat clone();
+        JMat* refclone(int ref=1);   // returns a heap object; callers in this code base delete it
+        JMat* reshape(int w,int h,int l,int t,int c=0);
+        uint8_t* udata();            // m_buf cast to uint8_t*
+        virtual ~JMat();
+        int*    tagarr();
+        void     dump();             // prints the first and last values of each row to stdout
+        //nc::NdArray<float> ncarray();
+#ifdef USE_OPENCV
+        cv::Mat             cvmat();
+#endif
+#ifdef USE_NCNN
+        ncnn::Mat           ncnnmat();      // from_pixels conversion (BGR / BGRA / GRAY by channel count)
+        ncnn::Mat           packingmat();   // elempack-3 view converted to elempack 1
+#endif
+        //Matrixf  tomatrix();
+};
diff --git a/duix-sdk/src/main/cpp/dhunet/malpha.cpp b/duix-sdk/src/main/cpp/dhunet/malpha.cpp
new file mode 100644
index 0000000..d40fbf6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/malpha.cpp
@@ -0,0 +1,184 @@
+#include "malpha.h"
+
+// Prepares the working buffers for one face box of `pic`.
+//   pic   frame the box is cut from in premunet()
+//   msk   optional mask frame; may be NULL
+//   boxs  {x0, y0, x1, y1}; width/height are stored as x1-x0 and y1-y0
+//   kind  168 or 128 picks the crop geometry; any other value keeps the
+//         in-class defaults declared in malpha.h
+MWorkMat::MWorkMat(JMat* pic,JMat* msk,const int* boxs,int kind){
+    m_boxx = boxs[0];
+    m_boxy = boxs[1];
+    m_boxwidth = boxs[2]-m_boxx;
+    m_boxheight = boxs[3]-m_boxy;
+    m_pic = pic;
+    m_msk = msk;
+    // Fix: this member has no initializer and is only assigned in premunet(),
+    // yet the destructor tests and deletes it. Start from nullptr so that
+    // destroying an object on which premunet() never ran is well defined.
+    pic_cloneadjw = nullptr;
+    if(kind==168){
+      srcw = 168;
+      edge = 4;
+      adjw = 160;
+      mskx = 5;
+      msky = 5;
+      mskw = 150;
+      mskh = 145;
+    }else if(kind==128){
+      srcw = 134;
+      edge = 3;
+      adjw = 128;
+      mskx = 4;
+      msky = 4;
+      mskw = 120;
+      mskh = 120;
+    }
+    // adjw x adjw scratch images: two with 3 channels, one with 1 channel
+    pic_realadjw = new JMat(adjw,adjw,3,0,1);
+    pic_maskadjw = new JMat(adjw,adjw,3,0,1);
+    msk_realadjw = new JMat(adjw,adjw,1,0,1);
+}
+
+MWorkMat::~MWorkMat(){ // drops two cv::Mat temporaries, then frees the JMat objects this class allocated
+    matpic_orgsrcw.release();
+    matpic_roirst.release();
+    JMat* const owned[] = { pic_realadjw, pic_maskadjw, msk_realadjw, pic_cloneadjw };
+    for(JMat* m : owned){
+        delete m;   // deleting a null pointer is a no-op, so no explicit test is needed
+    }
+}
+
+int MWorkMat::munet(JMat** ppic,JMat** pmsk){
+    // Out-params receive the unmasked and the masked crop buffer; this object keeps ownership. Always returns 0.
+    *ppic = pic_realadjw;
+    *pmsk = pic_maskadjw;
+    return 0;
+}
+
+int MWorkMat::premunet(){ // crop the box out of m_pic and fill the real/masked model inputs
+    const cv::Rect box(m_boxx,m_boxy,m_boxwidth,m_boxheight);
+    matpic_roisrc = cv::Mat(m_pic->cvmat(),box);
+    // NOTE(review): cv::resize's 4th positional parameter is fx, so INTER_AREA below
+    // does not act as the interpolation flag - confirm which filter is intended.
+    cv::resize(matpic_roisrc , matpic_orgsrcw, cv::Size(srcw, srcw), cv::INTER_AREA);
+    matpic_roiadjw = cv::Mat(matpic_orgsrcw,cv::Rect(edge,edge,adjw,adjw));
+    cv::Mat cvmask = pic_maskadjw->cvmat();
+    cv::Mat cvreal = pic_realadjw->cvmat();
+    matpic_roiadjw.copyTo(cvreal);
+    matpic_roiadjw.copyTo(cvmask);
+    pic_cloneadjw = pic_realadjw->refclone(0);   // NOTE(review): a clone from an earlier call is not freed
+    cv::rectangle(cvmask,cv::Rect(mskx,msky,mskw,mskh),cv::Scalar(0,0,0),-1);   // negative thickness = filled
+    return 0;
+}
+
+// Writes the model output held in pic_realadjw back out.
+//   fgpic == NULL          : result is resized to the box and copied into the ROI captured by premunet()
+//   fgpic->width() == srcw : fgpic receives the srcw x srcw crop plus a 4th plane
+//                            derived from the mask frame
+//   otherwise              : result is resized and copied into fgpic at the box position
+// Returns 0 on success, -1 when the 4-plane output is requested without a mask.
+int MWorkMat::finmunet(JMat* fgpic){
+    cv::Mat cvreal = pic_realadjw->cvmat();
+    cvreal.copyTo(matpic_roiadjw);   // matpic_roiadjw is the adjw window inside matpic_orgsrcw
+    if(m_msk) vtacc((uint8_t*)matpic_orgsrcw.data,srcw*srcw);
+    if(fgpic&&(fgpic->width()==srcw)){
+      // Fix: this branch dereferenced m_msk unconditionally although MWorkMat may
+      // be constructed with msk == NULL (see the guard on vtacc above).
+      if(!m_msk) return -1;
+      std::vector<cv::Mat> list;
+      cv::split(matpic_orgsrcw,list);
+      matmsk_roisrc = cv::Mat(m_msk->cvmat(),cv::Rect(m_boxx,m_boxy,m_boxwidth,m_boxheight));
+      cv::resize(matmsk_roisrc , matmsk_orgsrcw, cv::Size(srcw, srcw), cv::INTER_AREA);
+      cv::Mat rrr(srcw,srcw,CV_8UC1);
+      cv::cvtColor(matmsk_orgsrcw,rrr,cv::COLOR_RGB2GRAY);
+      list.push_back(rrr);
+      cv::merge(list,fgpic->cvmat());
+    }else{
+      cv::resize(matpic_orgsrcw, matpic_roirst, cv::Size(m_boxwidth, m_boxheight), cv::INTER_AREA);
+      if(fgpic){
+        matpic_roisrc = cv::Mat(fgpic->cvmat(),cv::Rect(m_boxx,m_boxy,m_boxwidth,m_boxheight));
+        matpic_roirst.copyTo(matpic_roisrc);
+      }else{
+        matpic_roirst.copyTo(matpic_roisrc);
+      }
+    }
+    return 0;
+}
+
+int MWorkMat::alpha(JMat** preal,JMat** pimg,JMat** pmsk){ // hands out internal buffers; ownership stays here, always 0
+    *preal = pic_cloneadjw;   // clone taken in premunet()
+    *pimg =  pic_realadjw;    // 3-channel adjw x adjw image
+    *pmsk =  msk_realadjw;    // 1-channel adjw x adjw mask
+    return 0;
+}
+
+int MWorkMat::prealpha(){ // builds the 1-channel adjw x adjw mask input from the box of m_msk
+    printf("x %d y %d w %d h %d \n",m_boxx,m_boxy,m_boxwidth,m_boxheight);
+    const cv::Rect box(m_boxx,m_boxy,m_boxwidth,m_boxheight);
+    matmsk_roisrc = cv::Mat(m_msk->cvmat(),box);
+    cv::resize(matmsk_roisrc , matmsk_orgsrcw, cv::Size(srcw, srcw), cv::INTER_AREA);
+    matmsk_roiadjw = cv::Mat(matmsk_orgsrcw,cv::Rect(edge,edge,adjw,adjw));   // inner window, border of `edge` removed
+    cv::Mat graymask = msk_realadjw->cvmat();
+    cv::cvtColor(matmsk_roiadjw,graymask,cv::COLOR_RGB2GRAY);
+    return 0;
+}
+
+int MWorkMat::finalpha(){ // reverse of prealpha(): expand the gray mask to 3 channels and write it back
+    cv::Mat cvmask = msk_realadjw->cvmat();
+    cv::cvtColor(cvmask,matmsk_roiadjw,cv::COLOR_GRAY2RGB);
+    // scale the srcw x srcw mask crop back to the box size, then copy it into the mask ROI
+    cv::resize(matmsk_orgsrcw, matmsk_roirst, cv::Size(m_boxwidth, m_boxheight), cv::INTER_AREA);
+    matmsk_roirst.copyTo(matmsk_roisrc);
+    return 0;
+}
+
+int MWorkMat::vtacc(uint8_t* buf,int count){ // buf holds `count` 3-byte pixels; edited in place, always returns 0
+    /*
+    int avgr = 0;
+    int avgb = 0;
+    int avgg = 0;
+    if(1){
+        uint8_t* pb = m_pic->udata();
+        for(int k=0;k<10;k++){
+            avgr += *pb++;
+            avgg += *pb++;
+            avgb += *pb++;
+        }
+        avgr =avgr/10 +10;
+        avgg =avgg/10 -20;
+        if(avgg<0)avgg=0;
+        avgb =avgb/10 + 10;
+    }
+    */
+    // For every 3-byte pixel, cap the middle channel (green in RGB/BGR data)
+    // at the mean of the two outer channels. Integer division gives the same
+    // value as the earlier float divide followed by truncation, because the
+    // operands are non-negative byte sums.
+    uint8_t* px = buf;
+    for(int done=0; done<count; ++done){
+        const int cap = (px[0]+px[2])/2;
+        if(px[1] >= cap){
+            px[1] = (uint8_t)cap;
+        }
+        px += 3;
+    }
+
+
+    /*
+    long sum = 0l;
+    float  mean = sum*0.5f/count;
+    uint8_t maxg = (mean>255.f)?255:mean;
+    //printf("sum %ld mean %f maxg %d\n",sum,mean,maxg);
+    //getchar();
+    pb = buf +1;
+    for(int k=0;k<count;k++){
+        if(*pb>maxg){
+            *pb = maxg;
+        }
+        pb+=3;
+    }
+    */
+    return 0;
+}
+
diff --git a/duix-sdk/src/main/cpp/dhunet/malpha.h b/duix-sdk/src/main/cpp/dhunet/malpha.h
new file mode 100644
index 0000000..7ad86d2
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/malpha.h
@@ -0,0 +1,56 @@
+#pragma once
+#include "jmat.h"
+//#include <simpleocv.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <stdio.h>
+
+class MWorkMat{ // per-frame scratch state for cropping a face box, running the model and pasting the result back
+  private:
+    int     srcw = 168;   // side of the resized box crop
+    int     edge = 4;     // border trimmed from the crop on each side
+    int     adjw = 160;   // side of the model input (inner window of the crop)
+
+    int     mskx = 5;     // rectangle blacked out in the masked model input
+    int     msky = 5;
+    int     mskw = 150;
+    int     mskh = 145;
+    int     m_boxx = 0;
+    int     m_boxy = 0;
+    int     m_boxwidth = 0;
+    int     m_boxheight = 0;
+    JMat*   m_pic = nullptr;   // caller-owned frame
+    JMat*   m_msk = nullptr;   // caller-owned mask frame, may stay NULL
+
+    JMat*   pic_realadjw = nullptr;//blendimg
+    JMat*   pic_maskadjw = nullptr;
+
+    cv::Mat matpic_roisrc;//box area
+    cv::Mat matpic_orgsrcw;
+    cv::Mat matpic_roiadjw;
+    // Fix: was left uninitialised although ~MWorkMat() tests and deletes it.
+    JMat*   pic_cloneadjw = nullptr;//blendimg
+    cv::Mat matpic_roirst;
+
+    JMat*   msk_realadjw = nullptr;
+
+    cv::Mat matmsk_roisrc;//box area
+    cv::Mat matmsk_orgsrcw;
+    cv::Mat matmsk_roiadjw;
+
+    cv::Mat matmsk_roirst;
+
+    int vtacc(uint8_t* buf,int count);
+  public:
+    MWorkMat(JMat* pic,JMat* msk,const int* boxs,int kind=168);
+    int premunet();
+    int munet(JMat** ppic,JMat** pmsk);
+    int finmunet(JMat* fgpic=NULL);
+    int prealpha();
+    int alpha(JMat** preal,JMat** pimg,JMat** pmsk);
+    int finalpha();
+
+    virtual ~MWorkMat();
+};
+
diff --git a/duix-sdk/src/main/cpp/dhunet/munet.cpp b/duix-sdk/src/main/cpp/dhunet/munet.cpp
new file mode 100644
index 0000000..54faf27
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/munet.cpp
@@ -0,0 +1,275 @@
+#include "munet.h"
+#include "cpu.h"
+#include "face_utils.h"
+#include "blendgram.h"
+
+Mobunet::Mobunet(const char* fnbin,const char* fnparam,const char* fnmsk,int wenetstep,int rgb)
+  : m_wenetstep(wenetstep), m_rgb(rgb)   // listed in the order the members are declared in munet.h
+{
+    initModel(fnbin,fnparam,fnmsk);      // loads the net and the blend weights; its result is not checked
+}
+
+Mobunet::Mobunet(const char* modeldir,const char* modelid,int rgb){
+    // File names are derived from the directory and model id; m_wenetstep keeps its in-class default.
+    m_rgb = rgb;
+    char fnbin[1024], fnparam[1024], fnmsk[1024];
+    // Fix: bounded formatting - sprintf could write past the buffers on long paths.
+    snprintf(fnbin,sizeof(fnbin),"%s/%s.bin",modeldir,modelid);
+    snprintf(fnparam,sizeof(fnparam),"%s/%s.param",modeldir,modelid);
+    snprintf(fnmsk,sizeof(fnmsk),"%s/weight_168u.bin",modeldir);
+    initModel(fnbin,fnparam,fnmsk);
+}
+
+// Loads the network plus the blend-weight planes. Returns 0 on success and -1
+// when ncnn rejects the param or model file (failures used to be reported as 0).
+int Mobunet::initModel(const char* binfn,const char* paramfn,const char* mskfn){
+    unet.clear();
+    unet.opt.use_vulkan_compute = false;
+    unet.opt.num_threads = ncnn::get_big_cpu_count();
+    // ncnn's load_* return 0 on success. Carry on after a failure so the
+    // members end up populated exactly as before, but tell the caller.
+    int rc = 0;
+    if(unet.load_param(paramfn)!=0) rc = -1;
+    if(unet.load_model(binfn)!=0) rc = -1;
+    char* wbuf = NULL;
+    dumpfile((char*)mskfn,&wbuf);
+    printf("===mskfn %s\n",mskfn);
+    mat_weights = new JMat(160,160,(uint8_t*)wbuf,1);
+    mat_weights->forceref(0);
+    mat_weightmin = new JMat(128,128,1);   // 128x128 copy of the weights, used by domodel() when rect != 160
+    cv::Mat ma = mat_weights->cvmat();
+    cv::Mat mb;
+    cv::resize(ma,mb,cv::Size(128,128));
+    cv::Mat mc = mat_weightmin->cvmat();
+    mb.copyTo(mc);
+    return rc;
+}
+
+Mobunet::~Mobunet(){ // releases the network and both weight planes
+    unet.clear();
+    delete mat_weights;      // delete on nullptr is a no-op
+    mat_weights = nullptr;
+    delete mat_weightmin;    // fix: the 128x128 plane allocated in initModel() was never released
+    mat_weightmin = nullptr;
+}
+
+int Mobunet::domodelold(JMat* pic,JMat* msk,JMat* feat){ // older inference path; blends the result into pic, always returns 0
+    JMat  picall(160*160,2,3,0,1);   // one buffer holding both images back to back
+    uint8_t* buf = picall.udata();
+    int width = pic->width();
+    int height = pic->height();
+
+    cv::Mat c1(height,width,CV_8UC3,buf);                   // first half: pic
+    cv::Mat c2(height,width,CV_8UC3,buf+width*height*3);    // second half: msk
+    cv::cvtColor(pic->cvmat(),c1,cv::COLOR_RGB2BGR);
+    cv::cvtColor(msk->cvmat(),c2,cv::COLOR_RGB2BGR);
+    ncnn::Mat inall ;
+      inall = ncnn::Mat::from_pixels(buf, ncnn::Mat::PIXEL_BGR, 160*160, 2);
+    inall.substract_mean_normalize(mean_vals, norm_vals);
+    //inall.reshape(160,160,6);
+    ncnn::Mat inwenet(256,20,1,feat->data());   // wraps the feature buffer as 256 x 20 x 1 without copying
+    ncnn::Mat outpic;
+    ncnn::Extractor ex = unet.create_extractor();
+    ex.input("face", inall);
+    ex.input("audio", inwenet);
+    ex.extract("output", outpic);
+    float outmean_vals[3] = {-1.0f, -1.0f, -1.0f};
+    float outnorm_vals[3] = { 0.5f,  0.5f,  0.5f};
+    outpic.substract_mean_normalize(outmean_vals, outnorm_vals);   // (x + 1) * 0.5 per channel
+    ncnn::Mat pakpic;
+    ncnn::convert_packing(outpic,pakpic,3);
+    cv::Mat cvadj(160,160,CV_32FC3,pakpic.data);
+    //dumpfloat((float*)cvadj.data,160*160*3);
+    cv::Mat cvreal;
+    float scale = 255.0f;
+    cvadj.convertTo(cvreal,CV_8UC3,scale);
+    cv::Mat cvmask;
+    cv::cvtColor(cvreal,cvmask,cv::COLOR_RGB2BGR);
+    BlendGramAlpha((uchar*)cvmask.data,(uchar*)mat_weights->data(),(uchar*)pic->data(),160,160);
+    return 0;
+}
+
+int Mobunet::domodel(JMat* pic,JMat* msk,JMat* feat,int rect){ // runs the net on pic+msk with audio features feat; always returns 0
+  int width = pic->width();
+  int height = pic->height();
+    ncnn::Mat inmask = ncnn::Mat::from_pixels(msk->udata(), m_rgb?ncnn::Mat::PIXEL_RGB:ncnn::Mat::PIXEL_BGR2RGB, rect, rect);
+    inmask.substract_mean_normalize(mean_vals, norm_vals);
+    ncnn::Mat inreal = ncnn::Mat::from_pixels(pic->udata(), m_rgb?ncnn::Mat::PIXEL_RGB:ncnn::Mat::PIXEL_BGR2RGB, rect, rect);
+    inreal.substract_mean_normalize(mean_vals, norm_vals);
+    ncnn::Mat inpic(width,height,6);   // NOTE(review): sized from pic, while the inputs above use rect x rect - confirm they always match
+    float* buf = (float*)inpic.data;
+    float* pr = (float*)inreal.data;
+    memcpy(buf,pr,inreal.cstep*sizeof(float)*inreal.c);   // channels 0..2: real image
+    buf+= inpic.cstep*inreal.c;
+    float* pm = (float*)inmask.data;
+    memcpy(buf,pm,inmask.cstep*sizeof(float)*inmask.c);   // channels 3..5: masked image
+    float* pf = (float*)feat->data();
+    if(m_wenetstep==10){
+      pf+= 256*5;   // with 10 steps, skip the first 5 rows of 256 floats
+    }
+    ncnn::Mat inwenet(256,m_wenetstep,1,pf);
+    ncnn::Mat outpic;
+    ncnn::Extractor ex = unet.create_extractor();
+    ex.input("face", inpic);
+    ex.input("audio", inwenet);
+    //printf("===debug ncnn\n");
+    ex.extract("output", outpic);
+    float outmean_vals[3] = {-1.0f, -1.0f, -1.0f};
+    float outnorm_vals[3] = { 127.5f,  127.5f,  127.5f};
+    outpic.substract_mean_normalize(outmean_vals, outnorm_vals);   // (x + 1) * 127.5 per channel
+    cv::Mat cvout(width,height,CV_8UC3);
+    outpic.to_pixels(cvout.data,m_rgb?ncnn::Mat::PIXEL_RGB:ncnn::Mat::PIXEL_RGB2BGR);
+
+    if(rect==160){
+      BlendGramAlpha((uchar*)cvout.data,(uchar*)mat_weights->data(),(uchar*)pic->data(),width,height);
+    }else{
+      BlendGramAlpha((uchar*)cvout.data,(uchar*)mat_weightmin->data(),(uchar*)pic->data(),width,height);
+    }
+    return 0;
+}
+
+
+int Mobunet::preprocess(JMat* pic,JMat* feat){
+    // pic is the 168x168 crop; the model works on its inner 160x160 window
+    cv::Mat window(pic->cvmat(),cv::Rect(4,4,160,160));
+    JMat  maskedin(160,160,3,0,1);
+    JMat  realin(160,160,3,0,1);
+    cv::Mat cvmask = maskedin.cvmat();
+    cv::Mat cvreal = realin.cvmat();
+    window.copyTo(cvmask);
+    window.copyTo(cvreal);
+    cv::rectangle(cvmask,cv::Rect(5,5,150,145),cv::Scalar(0,0,0),-1);   // negative thickness = filled black rectangle
+    domodel(&realin,&maskedin,feat);
+    cvreal.copyTo(window);   // copy realin (passed to domodel as `pic`) back into the window
+    return 0;
+}
+
+int Mobunet::fgprocess(JMat* pic,const int* boxs,JMat* feat,JMat* fg){
+    // boxs = {x0,y0,x1,y1}: run the model on that box of pic and write the result into the same box of fg
+    const int boxx = boxs[0], boxy = boxs[1];
+    const int boxwidth = boxs[2]-boxx, boxheight = boxs[3]-boxy;
+    cv::Mat roisrc(pic->cvmat(),cv::Rect(boxx,boxy,boxwidth,boxheight));
+    cv::Mat crop168;
+    cv::resize(roisrc , crop168, cv::Size(168, 168), cv::INTER_AREA);
+    JMat  pic168(168,168,(uint8_t*)crop168.data);
+    preprocess(&pic168,feat);
+    cv::Mat restored;
+    cv::resize(crop168 , restored, cv::Size(boxwidth, boxheight), cv::INTER_AREA);
+    cv::Mat roidst(fg->cvmat(),cv::Rect(boxx,boxy,boxwidth,boxheight));
+    restored.copyTo(roidst);
+    return 0;
+}
+
+int Mobunet::process(JMat* pic,const int* boxs,JMat* feat){
+    // boxs = {x0,y0,x1,y1}: run the model on that box of pic and write the result back in place
+    const int boxx = boxs[0], boxy = boxs[1];
+    const int boxwidth = boxs[2]-boxx, boxheight = boxs[3]-boxy;
+    cv::Mat roisrc(pic->cvmat(),cv::Rect(boxx,boxy,boxwidth,boxheight));
+    cv::Mat crop168;
+    cv::resize(roisrc , crop168, cv::Size(168, 168), cv::INTER_AREA);
+    JMat  pic168(168,168,(uint8_t*)crop168.data);
+    preprocess(&pic168,feat);
+    cv::Mat restored;
+    cv::resize(crop168 , restored, cv::Size(boxwidth, boxheight), cv::INTER_AREA);
+    restored.copyTo(roisrc);
+    return 0;
+}
+
+int Mobunet::process2(JMat* pic,const int* boxs,JMat* feat){ // self-contained variant of process(): no blend-weight step; always returns 0
+    int boxx, boxy ,boxwidth, boxheight ;
+    boxx = boxs[0];boxy=boxs[1];boxwidth=boxs[2]-boxx;boxheight=boxs[3]-boxy;
+    int stride = pic->stride();   // not used by the active code below
+
+    cv::Mat cvsrc = pic->cvmat();
+    printf("cvsrc %d %d \n",cvsrc.cols,cvsrc.rows);
+    cv::Mat roisrc(cvsrc,cv::Rect(boxx,boxy,boxwidth,boxheight));
+    cv::Mat cvorig;
+    cv::resize(roisrc , cvorig, cv::Size(168, 168), cv::INTER_AREA);
+    /*
+    uint8_t* data =(uint8_t*)pic->data() + boxy*stride + boxx*pic->channel();
+    int scale_w = 168;
+    int scale_h = 168;
+    ncnn::Mat prepic = ncnn::Mat::from_pixels_resize(data, ncnn::Mat::PIXEL_BGR, boxwidth, boxheight, stride,scale_w, scale_h);
+    //pic 168
+    cv::Mat cvorig(168,168,CV_8UC3,prepic.data);
+     */
+
+    cv::Mat roimask(cvorig,cv::Rect(4,4,160,160));   // inner 160x160 window of the 168x168 crop
+    JMat  picmask(160,160,3,0,1);
+    JMat  picreal(160,160,3,0,1);
+    cv::Mat cvmask = picmask.cvmat();
+    cv::Mat cvreal = picreal.cvmat();
+    roimask.copyTo(cvmask);
+    roimask.copyTo(cvreal);
+
+    cv::rectangle(cvmask,cv::Rect(5,5,150,150),cv::Scalar(0,0,0),-1);//,cv::LineTypes::FILLED);
+
+    ncnn::Mat inmask = ncnn::Mat::from_pixels(picmask.udata(), ncnn::Mat::PIXEL_BGR2RGB, 160, 160);
+    inmask.substract_mean_normalize(mean_vals, norm_vals);
+    ncnn::Mat inreal = ncnn::Mat::from_pixels(picreal.udata(), ncnn::Mat::PIXEL_BGR2RGB, 160, 160);
+    inreal.substract_mean_normalize(mean_vals, norm_vals);
+
+    JMat  picin(160*160,2,3);   // staging buffer: real planes followed by masked planes
+    char*  pd = (char*)picin.data();
+    memcpy(pd,inreal.data,160*160*3*4);                 // 3 float planes of 160x160
+    memcpy(pd+ 160*160*3*4,inmask.data,160*160*3*4);
+
+//    char* pinpic = NULL;
+//    dumpfile("pic.bin",&pinpic);
+//    dumpfloat((float*)pd,10);
+//    dumpfloat((float*)pinpic,10);
+    //ncnn::Mat inpic(160,160,6,pd,4);
+    ncnn::Mat inpack(160,160,1,pd,(size_t)4u*6,6);   // view the staging buffer as elempack-6 data
+    ncnn::Mat inpic;
+    ncnn::convert_packing(inpack,inpic,1);
+
+//    char* pwenet = NULL;
+//    dumpfile("wenet.bin",&pwenet);
+    ncnn::Mat inwenet(256,20,1,feat->data(),4);
+    ncnn::Mat outpic;
+    ncnn::Extractor ex = unet.create_extractor();
+    ex.input("face", inpic);
+    ex.input("audio", inwenet);
+    ex.extract("output", outpic);
+
+    float outmean_vals[3] = {-1.0f, -1.0f, -1.0f};
+//    float outnorm_vals[3] = { 2.0f,  2.0f,  2.0f};
+    float outnorm_vals[3] = { 127.5f,  127.5f,  127.5f};
+    outpic.substract_mean_normalize(outmean_vals, outnorm_vals);   // (x + 1) * 127.5 per channel
+
+    ncnn::Mat pakpic;
+    ncnn::convert_packing(outpic,pakpic,3);
+
+    cv::Mat cvadj(160,160,CV_32FC3,pakpic.data);
+    cv::Mat cvout(160,160,CV_8UC3);
+    float scale = 1.0f;
+    cvadj.convertTo(cvout,CV_8UC3,scale);
+    //cv::imwrite("cvout.jpg",cvout);
+    cv::cvtColor(cvout,roimask,cv::COLOR_RGB2BGR);   // writes into the window of cvorig
+//    cvout.copyTo(roimask);
+
+    //cv::imwrite("roimask.jpg",roimask);
+    //cv::imwrite("cvorig.jpg",cvorig);
+    //cv::waitKey(0);
+    cv::resize(cvorig , roisrc, cv::Size(boxwidth, boxheight), cv::INTER_AREA);   // back to box size, into pic's ROI
+    //cv::imwrite("roisrc.jpg",roisrc);
+        //cv::imshow("cvsrc",cvsrc);
+//    cv::imshow("roisrc",roisrc);
+//    cv::imshow("cvorig",cvorig);
+//    cv::waitKey(20);
+    /*
+    {
+        uint8_t *pr = (uint8_t *) cvoutc.data;
+        printf("==%u %u %u\n", pr[0], pr[1], pr[2]);
+    }
+    //
+    float* p = (float*)cvadj.data;
+    printf("==%f %f %f\n",p[0],p[1],p[2]);
+    p+=160*160;
+    printf("==%f %f %f\n",p[0],p[1],p[2]);
+    p+=160*160;
+    printf("==%f %f %f\n",p[0],p[1],p[2]);
+    */
+    return 0;
+}
+
diff --git a/duix-sdk/src/main/cpp/dhunet/munet.h b/duix-sdk/src/main/cpp/dhunet/munet.h
new file mode 100644
index 0000000..c840e1b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/dhunet/munet.h
@@ -0,0 +1,31 @@
+#pragma once
+#include "jmat.h"
+#include "net.h"
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <stdio.h>
+#include <vector>
+
+
+class Mobunet{ // wraps the ncnn network plus the blend-weight planes used after inference
+    private:
+      int m_wenetstep = 20;   // height of the audio feature Mat passed to the net in domodel()
+      int m_rgb =0;           // non-zero: pixels are passed to ncnn as RGB; zero: converted BGR<->RGB
+        ncnn::Net unet;
+        float mean_vals[3] = {127.5f, 127.5f, 127.5f};               // input normalisation: (x - 127.5) / 127.5
+        float norm_vals[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
+        JMat*   mat_weights = nullptr;     // 160x160 plane loaded from the mask file in initModel()
+        JMat*   mat_weightmin = nullptr;   // 128x128 resize of mat_weights
+        int initModel(const char* binfn,const char* paramfn,const char* mskfn);
+    public:
+        int domodel(JMat* pic,JMat* msk,JMat* feat,int rect = 160);
+        int domodelold(JMat* pic,JMat* msk,JMat* feat);
+        int preprocess(JMat* pic,JMat* feat);                           // pic is a 168x168 crop
+        int process(JMat* pic,const int* boxs,JMat* feat);              // boxs = {x0,y0,x1,y1}; result written into pic
+        int fgprocess(JMat* pic,const int* boxs,JMat* feat,JMat* fg);   // same, result written into fg
+        int process2(JMat* pic,const int* boxs,JMat* feat);
+        Mobunet(const char* modeldir,const char* modelid,int rgb = 0);
+        Mobunet(const char* fnbin,const char* fnparam,const char* fnmsk,int wenetstep = 20,int rgb = 0);
+        ~Mobunet();
+};
diff --git a/duix-sdk/src/main/cpp/duix/gjduix.cpp b/duix-sdk/src/main/cpp/duix/gjduix.cpp
new file mode 100644
index 0000000..5e66165
--- /dev/null
+++ b/duix-sdk/src/main/cpp/duix/gjduix.cpp
@@ -0,0 +1,290 @@
+#include <stdlib.h>
+#include <pthread.h>
+#include "gjduix.h"
+#include "dhwenet.h"
+#include "wenetai.h"
+#include "dhpcm.h"
+#include "munet.h"
+#include "malpha.h"
+#include "dhwenet.h"
+
+struct dhmfcc_s{ // state behind the opaque dhmfcc_t handle
+  int mincalc;   // passed to PcmSession::runcalc(); dhmfcc_alloc() maps 0 to 1
+  int minoff;    // the next three are forwarded to every new PcmSession
+  int minblock;  
+  int maxblock;  
+  int inited;    // set by dhmfcc_initPcmex()
+  char* wenetfn; // strdup() copy of the model path given to dhmfcc_initWenet()
+
+  //DhWenet* wenet;
+  WeAI*   weai_first;    // used by dhmfcc_pushpcm() via runfirst()
+  WeAI*   weai_common;   // used by the worker thread via runcalc()
+  PcmSession* cursess;   // session matching sessid
+  PcmSession* presess;   // previous session, destroyed on the next dhmfcc_newsession()
+  volatile uint64_t  sessid;
+
+  volatile int running;  // cleared by dhmfcc_free() to stop the worker
+  pthread_t *calcthread;
+  pthread_mutex_t pushmutex;   // held around PcmSession::pushpcm()
+  pthread_mutex_t readmutex;   // held around PcmSession::readnext()
+};
+
+
+// Background worker started by dhmfcc_alloc(). While `running` is set it lets
+// the current session run its calculation step with weai_common, then sleeps:
+// 10 ms when runcalc() returned 1, otherwise (including no usable session)
+// 20 ms.
+static void *calcworker(void *arg){
+  dhmfcc_t* mfcc = (dhmfcc_t*)arg;
+  while(mfcc->running){
+    PcmSession* sess = mfcc->cursess;
+    int rst = 0;
+    if(sess &&(sess->sessid()==mfcc->sessid)){
+      rst = sess->runcalc(mfcc->sessid,mfcc->weai_common,mfcc->mincalc);
+    }
+    const int pausems = (rst==1) ? 10 : 20;
+    jtimer_mssleep(pausems);
+  }
+  return NULL;
+}
+
+int dhmfcc_alloc(dhmfcc_t** pdg,int mincalc){ // 0 on success, -1 if allocation or thread start fails
+  dhmfcc_t* mfcc = (dhmfcc_t*)calloc(1,sizeof(dhmfcc_t));   // zero-filled, like the old malloc+memset
+  if(!mfcc) return -1;                                        // fix: the result was used unchecked
+  mfcc->mincalc = mincalc?mincalc:1;
+  mfcc->minoff = STREAM_BASE_MINOFF;
+  mfcc->minblock = STREAM_BASE_MINBLOCK;
+  mfcc->maxblock = STREAM_BASE_MAXBLOCK;
+  mfcc->calcthread = (pthread_t *)malloc(sizeof(pthread_t) );
+  mfcc->running = 1;   // polled by calcworker, so it must be set before the thread exists
+  if(!mfcc->calcthread || pthread_create(mfcc->calcthread, NULL, calcworker, (void*)mfcc)!=0){ free(mfcc->calcthread); free(mfcc); return -1; }
+  pthread_mutex_init(&mfcc->pushmutex,NULL);   // calcworker never touches these two mutexes
+  pthread_mutex_init(&mfcc->readmutex,NULL);
+  *pdg = mfcc;
+  return 0;
+}
+
+int dhmfcc_initPcmex(dhmfcc_t* dg,int maxsize,int minoff ,int minblock ,int maxblock){ // stores the block limits; maxsize is unused; always 0
+  dg->minoff = minoff;
+  dg->minblock = minblock;
+  dg->maxblock = maxblock;
+  dg->inited = 1;
+#ifdef WENETOPENV
+  if(dg->wenetfn){
+    // a model path is already known: rebuild weai_first for the new minblock
+    std::string fnonnx(dg->wenetfn);
+    std::string fnovbin = fnonnx+"_ov.bin";
+    std::string fnovxml = fnonnx+"_ov.xml";
+    int melcnt = DhWenet::cntmel(dg->minblock);
+    int bnfcnt = DhWenet::cntbnf(melcnt);
+    WeAI*  awenet ;
+    awenet = new WeOpvn(fnovbin,fnovxml,melcnt,bnfcnt,4);
+    if(dg->weai_first){
+      WeAI* oldw = dg->weai_first;
+      dg->weai_first = awenet;   // publish the replacement before destroying the old runner
+      delete oldw;
+    }else{
+      dg->weai_first = awenet;
+    }
+    awenet->test();
+  }
+#endif
+  return 0;
+}
+
+// (Re)creates the two WeNet runners: weai_first (used by dhmfcc_pushpcm) and
+// weai_common (used by the worker thread), and remembers the model path for
+// dhmfcc_initPcmex(). Returns 0 on success, -1 on a NULL argument.
+int dhmfcc_initWenet(dhmfcc_t* dg,char* fnwenet){
+  if(!dg || !fnwenet) return -1;
+  free(dg->wenetfn);   // fix: calling this twice leaked the previous strdup() copy
+  dg->wenetfn = strdup(fnwenet);
+
+  std::string fnonnx(fnwenet);
+  WeAI*  awenet ;
+  int melcnt = DhWenet::cntmel(dg->minblock);
+  int bnfcnt = DhWenet::cntbnf(melcnt);
+#ifdef WENETOPENV
+  if(dg->inited){
+    std::string fnovbin = fnonnx+"_ov.bin";
+    std::string fnovxml = fnonnx+"_ov.xml";
+    awenet = new WeOpvn(fnovbin,fnovxml,melcnt,bnfcnt,4);
+  }else{
+    awenet = new WeOnnx(fnwenet,melcnt,bnfcnt,4);
+  }
+#else
+  awenet = new WeOnnx(fnwenet,melcnt,bnfcnt,4);
+#endif
+  WeAI* bwenet = new WeOnnx(fnwenet,321,79,4);
+  // Publish the new runners before destroying the old ones (delete on NULL is a no-op).
+  // NOTE(review): calcworker reads weai_common without a lock; confirm no session
+  // is being processed while this runs.
+  WeAI* oldfirst = dg->weai_first;
+  dg->weai_first = awenet;
+  delete oldfirst;
+  WeAI* oldcommon = dg->weai_common;
+  dg->weai_common = bwenet;
+  delete oldcommon;
+  awenet->test();
+  bwenet->test();
+  return 0;   // the old `awenet?0:-1` was evaluated after awenet had already been dereferenced
+}
+
+uint64_t dhmfcc_newsession(dhmfcc_t* dg){ // opens a new PCM session and returns its id
+  const uint64_t sessid = ++dg->sessid;
+  PcmSession* fresh = new PcmSession(sessid,dg->minoff,dg->minblock,dg->maxblock);
+  PcmSession* retired = dg->presess;
+  dg->presess = dg->cursess;   // the outgoing session stays alive for one more rotation
+  dg->cursess = fresh;
+  delete retired;              // delete on NULL is a no-op, like the original guard
+  return sessid;
+}
+
+// Appends PCM bytes to the current session (`kind` is currently unused).
+// Returns 0 when the data was accepted, -1 stale session id, -2 not running,
+// -3 no session, otherwise PcmSession::pushpcm()'s non-positive result.
+int dhmfcc_pushpcm(dhmfcc_t* dg,uint64_t sessid,char* buf,int size,int kind){
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  PcmSession* sess = dg->cursess;
+  if(!sess)return -3;
+  pthread_mutex_lock(&dg->pushmutex);
+  int rst = sess->pushpcm(sessid,(uint8_t*)buf,size);
+  pthread_mutex_unlock(&dg->pushmutex);
+  if(rst<=0) return rst;
+  if(sess->first()){
+    sess->runfirst(sessid,dg->weai_first);
+    uint64_t tick = jtimer_msstamp();
+    // fix: %ld does not match uint64_t where long is 32 bits wide (e.g. armeabi-v7a)
+    printf("====runfirst  %llu %llu \n",(unsigned long long)sessid,(unsigned long long)tick);
+  }
+  return 0;
+}
+
+int dhmfcc_readpcm(dhmfcc_t* dg,uint64_t sessid,char* pcmbuf,int pcmlen,char* bnfbuf,int bnflen){
+  // -1 stale id, -2 stopped, -3 no session; otherwise PcmSession::readnext()'s result
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  PcmSession* const sess = dg->cursess;
+  if(!sess)return -3;
+  pthread_mutex_lock(&dg->readmutex);
+  const int got = sess->readnext(sessid,(uint8_t*)pcmbuf,pcmlen,(uint8_t*)bnfbuf,bnflen);
+  pthread_mutex_unlock(&dg->readmutex);
+  return got;
+}
+
+int dhmfcc_consession(dhmfcc_t* dg,uint64_t sessid){
+  // Same guards as the other session calls (-1/-2/-3), then forwards to PcmSession::conpcm().
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  PcmSession* const sess = dg->cursess;
+  return sess ? sess->conpcm(sessid) : -3;
+}
+
+int dhmfcc_finsession(dhmfcc_t* dg,uint64_t sessid){
+  // Same guards as the other session calls (-1/-2/-3), then forwards to PcmSession::finpcm().
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  PcmSession* const sess = dg->cursess;
+  return sess ? sess->finpcm(sessid) : -3;
+}
+
+// Stops the worker thread, then releases everything the handle owns.
+// A NULL handle is ignored. Always returns 0.
+int dhmfcc_free(dhmfcc_t* dg){
+  if(!dg) return 0;
+  dg->running = 0;
+  pthread_join(*dg->calcthread, NULL);   // the worker reads the members freed below
+
+  // delete on NULL is a no-op, so no per-member test is needed
+  delete dg->weai_first;
+  dg->weai_first = NULL;
+  delete dg->weai_common;
+  dg->weai_common = NULL;
+  delete dg->cursess;
+  dg->cursess = NULL;
+  delete dg->presess;
+  dg->presess = NULL;
+
+  pthread_mutex_destroy(&dg->pushmutex);
+  pthread_mutex_destroy(&dg->readmutex);
+  free(dg->calcthread);
+  // fix: the strdup() copy made by dhmfcc_initWenet() was never released
+  free(dg->wenetfn);
+
+  free(dg);
+  return 0;
+}
+
+struct dhunet_s{ // state behind the opaque dhunet_t handle
+  int inited;          // set by dhunet_initMunet(), cleared by dhunet_free()
+  int rgb;             // forwarded to the Mobunet constructor
+  Mobunet     *munet;  // created in dhunet_initMunet()
+};
+
+int dhunet_alloc(dhunet_t** pdg,int rgb){ // 0 on success, -1 when out of memory
+  dhunet_t* unet = (dhunet_t*)calloc(1,sizeof(dhunet_t));   // zero-filled, like the old malloc+memset
+  if(!unet) return -1;   // fix: the allocation result was dereferenced unchecked
+  unet->rgb = 1;         // NOTE(review): the argument is ignored (gjduix.h names it minrender) - confirm intent
+  *pdg = unet;
+  return 0;
+}
+int dhunet_initMunet(dhunet_t* dg,char* fnparam,char* fnbin,char* fnmsk){ // always returns 0
+  Mobunet* previous = dg->munet;   // NULL on first use: dhunet_alloc() zero-fills the handle
+  dg->munet = new Mobunet(fnbin,fnparam,fnmsk,20,dg->rgb);   // Mobunet takes (bin, param, msk)
+  delete previous;                 // fix: re-initialising used to leak the earlier model
+  dg->inited = 1;
+  printf("===init munet \n");
+  return 0;
+}
+#define AIRUN_FLAG 1
+// Runs the model on `box` of the width x height frame bpic, using bnfbuf as the
+// audio feature input. The result is written into bfg when given, else back into bpic.
+// Returns 0, or -1 not initialised, -2 bnflen != STREAM_ALL_BNF, -3 no model,
+// -4 NULL bpic/box/bnfbuf. sessid is not used.
+int dhunet_simprst(dhunet_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,uint8_t* bnfbuf,int bnflen){
+  if(!dg || !dg->inited)return -1;
+  if(bnflen!=STREAM_ALL_BNF)return -2;
+  if(!dg->munet)return -3;
+  if(!bpic || !box || !bnfbuf)return -4;   // fix: these were dereferenced without any check
+  int rst = 0;
+  JMat* mat_pic = new JMat(width,height,bpic);
+  JMat* mat_msk = bmsk?new JMat(width,height,bmsk):NULL;
+  JMat* mat_fg = bfg?new JMat(width,height,bfg):NULL;
+  JMat* feat = new JMat(STREAM_CNT_BNF,STREAM_BASE_BNF,(float*)bnfbuf,1);
+
+  MWorkMat wmat(mat_pic,mat_msk,box);
+  wmat.premunet();
+  JMat* mpic;
+  JMat* mmsk;
+  wmat.munet(&mpic,&mmsk);
+#ifdef AIRUN_FLAG
+  uint64_t ticka = jtimer_msstamp();
+  rst = dg->munet->domodel(mpic, mmsk, feat);
+  uint64_t tickb = jtimer_msstamp();
+  uint64_t dist = tickb-ticka;
+  if(dist>40){
+    // fix: %ld is the wrong conversion for uint64_t where long is 32 bits wide
+    printf("===domodel %d dist %llu\n",rst,(unsigned long long)dist);
+  }
+#endif
+  wmat.finmunet(mat_fg ? mat_fg : mat_pic);
+  delete feat;
+  delete mat_pic;
+  delete mat_fg;    // delete on NULL is a no-op
+  delete mat_msk;
+  return 0;
+}
+
+int dhunet_free(dhunet_t* dg){
+  // Destroys the model, if one was created, and then the handle itself.
+  // Always returns 0.
+  dg->inited = 0;
+  Mobunet* model = dg->munet;
+  dg->munet = NULL;
+  delete model;   // no-op when dhunet_initMunet() was never called
+  free(dg);
+  return 0;
+}
+
+
diff --git a/duix-sdk/src/main/cpp/duix/gjduix.h b/duix-sdk/src/main/cpp/duix/gjduix.h
new file mode 100644
index 0000000..e2f058c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/duix/gjduix.h
@@ -0,0 +1,38 @@
+#ifndef GJDUIX_
+#define GJDUIX_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+typedef struct dhmfcc_s dhmfcc_t;   /* opaque audio-feature handle (stray second ';' removed) */
+/* Lifecycle: alloc -> initPcmex / initWenet -> sessions -> free. */
+int dhmfcc_alloc(dhmfcc_t** pdg,int mincalc);
+int dhmfcc_initPcmex(dhmfcc_t* dg,int maxsize,int minoff ,int minblock ,int maxblock);
+int dhmfcc_initWenet(dhmfcc_t* dg,char* fnwenet);
+/* Session calls return -1 for a stale session id, -2 when stopped, -3 without a session. */
+uint64_t dhmfcc_newsession(dhmfcc_t* dg);
+int dhmfcc_pushpcm(dhmfcc_t* dg,uint64_t sessid,char* buf,int size,int kind);
+int dhmfcc_readpcm(dhmfcc_t* dg,uint64_t sessid,char* pcmbuf,int pcmlen,char* bnfbuf,int bnflen);
+int dhmfcc_finsession(dhmfcc_t* dg,uint64_t sessid);
+int dhmfcc_consession(dhmfcc_t* dg,uint64_t sessid);
+
+int dhmfcc_free(dhmfcc_t* dg);
+
+/* Rendering side: one handle wraps one loaded model. */
+typedef struct dhunet_s dhunet_t;   /* opaque model handle (stray second ';' removed) */
+int dhunet_alloc(dhunet_t** pdg,int minrender);   /* NOTE(review): gjduix.cpp names this parameter rgb and ignores it */
+int dhunet_initMunet(dhunet_t* dg,char* fnparam,char* fnbin,char* fnmsk);
+int dhunet_simprst(dhunet_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,uint8_t* bnfbuf,int bnflen);
+int dhunet_free(dhunet_t* pdg);
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/duix/gjsimp.cpp b/duix-sdk/src/main/cpp/duix/gjsimp.cpp
new file mode 100644
index 0000000..79a7485
--- /dev/null
+++ b/duix-sdk/src/main/cpp/duix/gjsimp.cpp
@@ -0,0 +1,453 @@
+#include "gjsimp.h"
+#include <stdlib.h>
+#include <pthread.h>
+#include "dhwenet.h"
+#include "wenetai.h"
+#include "dhpcm.h"
+#include "munet.h"
+#include "malpha.h"
+#include "dhwenet.h"
+#include <queue>
+//#include "Log.h"
+
+
+// State behind the opaque dhduix_t handle: audio session bookkeeping, the
+// background calc thread and the rendering network with its scratch images.
+struct dhduix_s{
+  int kind;      // passed to MWorkMat in dhduix_simprst; set to 168 or 128
+  int rect;      // passed to Mobunet::domodel; set to 160 or 128
+  int width;     // frame width given to dhduix_alloc
+  int height;    // frame height given to dhduix_alloc
+  int mincalc;   // forwarded to PcmSession::runcalc; 0 is replaced by 1 in dhduix_alloc
+  int minoff;  // forwarded to the PcmSession constructor
+  int minblock;  // forwarded to the PcmSession constructor and to DhWenet::cntmel
+  int maxblock;  // forwarded to the PcmSession constructor
+  int inited;    // set to 1 by dhduix_initPcmex / dhduix_initMunet*; checked by dhduix_simprst
+  char* wenetfn; // strdup'ed copy of the path given to dhduix_initWenet
+
+  //DhWenet* wenet;
+  WeAI*   weai_first;   // model handed to PcmSession::runfirst and PcmFile::process
+  WeAI*   weai_common;  // model handed to PcmSession::runcalc by the calc thread
+  PcmSession* cursess;  // session currently receiving and serving audio
+  //PcmSession* presess;
+  volatile uint64_t  sessid;  // id of the newest session; incremented by dhduix_newsession
+
+  jmat_t    *mat_feat;        // feature buffer filled by PcmSession::readblock
+  volatile int running;       // calc thread loops while this is non-zero
+  pthread_t *calcthread;      // heap-allocated id of the calc thread
+  pthread_mutex_t pushmutex;  // held around PcmSession::pushpcm and the session swap
+  pthread_mutex_t readmutex;  // held around readnext/readblock and the session swap
+  pthread_mutex_t freemutex;  // guards slist
+  std::queue<PcmSession*> *slist;  // replaced sessions waiting to be deleted by the calc thread
+
+  int rgb;                // forwarded to the Mobunet constructor
+  Mobunet     *munet; // rendering network created by dhduix_initMunet*
+  JMat        *mat_pic;   // width x height scratch image used by dhduix_fileinx
+  JMat        *mat_fg;    // width x height scratch foreground used by dhduix_fileinx
+  JMat        *mat_msk;   // width x height scratch mask used by dhduix_fileinx
+};
+
+/*
+ * Background thread body started by dhduix_alloc.
+ * While the handle is running it
+ *   1. lets the current session compute pending blocks (PcmSession::runcalc), and
+ *   2. when runcalc did not return 1, deletes one replaced session from slist.
+ * The queue is inspected and popped entirely under freemutex because
+ * dhduix_newsession pushes to it from another thread; the session itself is
+ * deleted after the lock is dropped to keep the critical section short.
+ * arg is the dhduix_t handle; the return value is always NULL.
+ */
+static void *calcworker(void *arg){
+  dhduix_t* mfcc = (dhduix_t*)arg;
+  while(mfcc->running){
+    int rst = 0;
+    PcmSession* sess = mfcc->cursess;
+    if(sess &&(sess->sessid()==mfcc->sessid)){
+      rst = sess->runcalc(mfcc->sessid,mfcc->weai_common,mfcc->mincalc);
+    }
+    if(rst==1){
+      jtimer_mssleep(10);
+      continue;
+    }
+    // Nothing was computed this round: use the time to reclaim one old session.
+    PcmSession* retired = NULL;
+    bool popped = false;
+    pthread_mutex_lock(&mfcc->freemutex);
+    if(!mfcc->slist->empty()){
+      retired = mfcc->slist->front();
+      mfcc->slist->pop();
+      popped = true;
+    }
+    pthread_mutex_unlock(&mfcc->freemutex);
+    if(popped){
+      delete retired;  // the entry may be NULL; delete handles that
+      jtimer_mssleep(10);
+    }else{
+      jtimer_mssleep(20);
+    }
+  }
+  return NULL;
+}
+
+/*
+ * Create a dhduix handle and start its background calc thread.
+ *   pdg     receives the new handle (set to NULL on failure)
+ *   mincalc forwarded to PcmSession::runcalc; 0 selects 1
+ *   width, height  size of the scratch images used by dhduix_fileinx
+ * Returns 0 on success, -1 when pdg is NULL, memory is exhausted or the
+ * thread cannot be started.
+ */
+int dhduix_alloc(dhduix_t** pdg,int mincalc,int width,int height){
+  if(!pdg)return -1;
+  *pdg = NULL;
+  // calloc zero-fills the handle, so every field not assigned below starts as 0/NULL
+  dhduix_t* duix = (dhduix_t*)calloc(1,sizeof(dhduix_t));
+  if(!duix)return -1;
+  duix->calcthread = (pthread_t *)malloc(sizeof(pthread_t) );
+  if(!duix->calcthread){
+    free(duix);
+    return -1;
+  }
+  duix->mincalc = mincalc?mincalc:1;
+  duix->minoff = STREAM_BASE_MINOFF;
+  duix->minblock = STREAM_BASE_MINBLOCK;
+  duix->maxblock = STREAM_BASE_MAXBLOCK;
+  pthread_mutex_init(&duix->pushmutex,NULL);
+  pthread_mutex_init(&duix->readmutex,NULL);
+  pthread_mutex_init(&duix->freemutex,NULL);
+  duix->slist = new std::queue<PcmSession*>();
+  duix->rgb = 1;
+  duix->width = width;
+  duix->height = height;
+  duix->mat_msk = new JMat(width,height);
+  duix->mat_fg = new JMat(width,height);
+  duix->mat_pic = new JMat(width,height);
+  //duix->mat_feat = jmat_alloc(20,STREAM_BASE_BNF,1,0,4,NULL);
+  duix->mat_feat = jmat_alloc(STREAM_BASE_BNF,20,1,0,4,NULL);
+  duix->kind = 168;
+  duix->rect = 160;
+  // Start the worker last so it only ever sees a fully populated handle.
+  duix->running = 1;
+  if(pthread_create(duix->calcthread, NULL, calcworker, (void*)duix)!=0){
+    duix->running = 0;
+    delete duix->mat_msk;
+    delete duix->mat_fg;
+    delete duix->mat_pic;
+    if(duix->mat_feat)jmat_free(duix->mat_feat);
+    delete duix->slist;
+    pthread_mutex_destroy(&duix->pushmutex);
+    pthread_mutex_destroy(&duix->readmutex);
+    pthread_mutex_destroy(&duix->freemutex);
+    free(duix->calcthread);
+    free(duix);
+    return -1;
+  }
+  *pdg = duix;
+  return 0;
+}
+
+/*
+ * Store the block limits and colour flag used by later sessions and mark the
+ * handle as initialised. maxsize is accepted but not used here.
+ * With WENETOPENV defined and a model path already recorded, the "first"
+ * model is additionally rebuilt from the "<path>_ov.bin" / "<path>_ov.xml" pair.
+ * Always returns 0.
+ */
+int dhduix_initPcmex(dhduix_t* dg,int maxsize,int minoff ,int minblock ,int maxblock,int rgb){
+  dg->minoff = minoff;
+  dg->minblock = minblock;
+  dg->maxblock = maxblock;
+  dg->inited = 1;
+#ifdef WENETOPENV
+  if(dg->wenetfn){
+    std::string base(dg->wenetfn);
+    std::string binpath = base+"_ov.bin";
+    std::string xmlpath = base+"_ov.xml";
+    int nmel = DhWenet::cntmel(dg->minblock);
+    int nbnf = DhWenet::cntbnf(nmel);
+    WeAI* fresh = new WeOpvn(binpath,xmlpath,nmel,nbnf,4);
+    // publish the new model first, then drop the previous one (delete NULL is a no-op)
+    WeAI* stale = dg->weai_first;
+    dg->weai_first = fresh;
+    delete stale;
+    fresh->test();
+  }
+#endif
+  dg->rgb = rgb;
+  return 0;
+}
+
+/*
+ * Load the audio model from fnwenet and install it as both the "first" and
+ * the "common" model of the handle, replacing (and deleting) earlier ones.
+ * The path is remembered in dg->wenetfn; a previously stored path is freed.
+ * Returns 0 on success, -1 when fnwenet is NULL or the path cannot be copied.
+ * NOTE(review): the old models are deleted without synchronising with the
+ * calc thread or dhduix_pushpcm - presumably this is only called before any
+ * session is active; confirm against callers.
+ */
+int dhduix_initWenet(dhduix_t* dg,char* fnwenet){
+  if(!fnwenet)return -1;
+  // Copy before freeing so that passing dg->wenetfn itself stays valid.
+  char* pathcopy = strdup(fnwenet);
+  if(!pathcopy)return -1;
+  free(dg->wenetfn);
+  dg->wenetfn = pathcopy;
+
+  WeAI*  awenet ;
+  int melcnt = DhWenet::cntmel(dg->minblock);
+  int bnfcnt = DhWenet::cntbnf(melcnt);
+#ifdef WENETOPENV
+  if(dg->inited){
+    std::string fnonnx(fnwenet);
+    std::string fnovbin = fnonnx+"_ov.bin";
+    std::string fnovxml = fnonnx+"_ov.xml";
+    awenet = new WeOpvn(fnovbin,fnovxml,melcnt,bnfcnt,4);
+  }else{
+    awenet = new WeOnnx(fnwenet,melcnt,bnfcnt,4);
+  }
+#else
+  awenet = new WeOnnx(fnwenet,melcnt,bnfcnt,4);
+#endif
+  WeAI* bwenet = new WeOnnx(fnwenet,321,79,4);
+
+  // Publish each new model before deleting its predecessor (delete NULL is a no-op).
+  WeAI* oldfirst = dg->weai_first;
+  dg->weai_first = awenet;
+  delete oldfirst;
+
+  WeAI* oldcommon = dg->weai_common;
+  dg->weai_common = bwenet;
+  delete oldcommon;
+
+  awenet->test();
+  bwenet->test();
+  return 0;
+}
+
+/*
+ * Start a new audio session and make it the current one.
+ * The session id is the previous id plus one. The replaced session, if any,
+ * is queued on slist so the calc thread deletes it later.
+ * Returns the new session id.
+ */
+uint64_t dhduix_newsession(dhduix_t* dg){
+  uint64_t sessid = ++dg->sessid;
+  PcmSession* sess = new PcmSession(sessid,dg->minoff,dg->minblock,dg->maxblock);
+  // Swap under both locks so neither a push nor a read is in flight on the old session.
+  pthread_mutex_lock(&dg->pushmutex);
+  pthread_mutex_lock(&dg->readmutex);
+  PcmSession* olds = dg->cursess;
+  dg->cursess = sess;
+  pthread_mutex_unlock(&dg->readmutex);
+  pthread_mutex_unlock(&dg->pushmutex);
+  // The very first call has no predecessor; do not enqueue a NULL entry.
+  if(olds){
+    pthread_mutex_lock(&dg->freemutex);
+    dg->slist->push(olds);
+    pthread_mutex_unlock(&dg->freemutex);
+  }
+  return sessid;
+}
+
+/*
+ * Append PCM bytes to the current session.
+ *   sessid must be the id of the newest session; kind is not used here.
+ * Returns -1 for a stale session id, -2 when the handle is shutting down,
+ * -3 when no session exists, 0 when PcmSession::pushpcm reported a positive
+ * result, otherwise that (non-positive) result.
+ * The session pointer is read while holding pushmutex: dhduix_newsession
+ * swaps cursess under the same lock, so the push cannot land in a session
+ * that has already been queued for deletion.
+ * NOTE(review): first()/runfirst() still run after the lock is released, as
+ * before - confirm whether they need the same protection.
+ */
+int dhduix_pushpcm(dhduix_t* dg,uint64_t sessid,char* buf,int size,int kind){
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  pthread_mutex_lock(&dg->pushmutex);
+  PcmSession* sess = dg->cursess;
+  if(!sess){
+    pthread_mutex_unlock(&dg->pushmutex);
+    return -3;
+  }
+  int rst = sess->pushpcm(sessid,(uint8_t*)buf,size);
+  pthread_mutex_unlock(&dg->pushmutex);
+  if(rst<=0)return rst;
+  if(sess->first()){
+    sess->runfirst(sessid,dg->weai_first);
+    uint64_t tick = jtimer_msstamp();
+    // uint64_t is not "long" on every ABI; print through unsigned long long
+    printf("====runfirst  %llu %llu \n",(unsigned long long)sessid,(unsigned long long)tick);
+  }
+  return 0;
+}
+
+/*
+ * Read the next PCM chunk and its feature block from the current session
+ * into pcmbuf (pcmlen bytes) and bnfbuf (bnflen bytes).
+ * Returns -1 for a stale session id, -2 when the handle is shutting down,
+ * -3 when no session exists, otherwise the result of PcmSession::readnext.
+ * The session pointer is read under readmutex so a concurrent
+ * dhduix_newsession cannot retire the session between lookup and use.
+ */
+int dhduix_readpcm(dhduix_t* dg,uint64_t sessid,char* pcmbuf,int pcmlen,char* bnfbuf,int bnflen){
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  int rst = -3;
+  pthread_mutex_lock(&dg->readmutex);
+  PcmSession* sess = dg->cursess;
+  if(sess){
+    rst =  sess->readnext(sessid,(uint8_t*)pcmbuf,pcmlen,(uint8_t*)bnfbuf,bnflen);
+  }
+  pthread_mutex_unlock(&dg->readmutex);
+  return rst;
+}
+
+/*
+ * Forward to PcmSession::conpcm on the current session.
+ * Returns -1 for a stale session id, -2 when the handle is not running,
+ * -3 when no session exists, otherwise the conpcm result.
+ */
+int dhduix_consession(dhduix_t* dg,uint64_t sessid){
+  if(dg->sessid!=sessid){
+    return -1;
+  }
+  if(dg->running==0){
+    return -2;
+  }
+  PcmSession* cur = dg->cursess;
+  return cur ? cur->conpcm(sessid) : -3;
+}
+
+/*
+ * Forward to PcmSession::finpcm on the current session.
+ * Returns -1 for a stale session id, -2 when the handle is not running,
+ * -3 when no session exists, otherwise the finpcm result.
+ */
+int dhduix_finsession(dhduix_t* dg,uint64_t sessid){
+  if(dg->sessid!=sessid){
+    return -1;
+  }
+  if(dg->running==0){
+    return -2;
+  }
+  PcmSession* cur = dg->cursess;
+  return cur ? cur->finpcm(sessid) : -3;
+}
+
+/*
+ * Stop the calc thread and release everything owned by the handle,
+ * including the model path copied by dhduix_initWenet.
+ * A NULL handle is accepted and ignored. Always returns 0.
+ */
+int dhduix_free(dhduix_t* dg){
+  if(!dg)return 0;
+  // Ask the worker to leave its loop, then wait for it before touching shared state.
+  dg->running = 0;
+  pthread_join(*dg->calcthread, NULL);
+  if(dg->slist){
+    pthread_mutex_lock(&dg->freemutex);
+    while(!dg->slist->empty()){
+      PcmSession* sess = dg->slist->front();
+      dg->slist->pop();
+      delete sess;
+    }
+    pthread_mutex_unlock(&dg->freemutex);
+    delete dg->slist;
+    dg->slist = NULL;
+  }
+
+  // delete on a null pointer is a no-op, so unset members need no check
+  delete dg->weai_first;
+  dg->weai_first = NULL;
+  delete dg->weai_common;
+  dg->weai_common = NULL;
+  delete dg->cursess;
+  dg->cursess = NULL;
+  delete dg->munet;
+  dg->munet = NULL;
+  delete dg->mat_fg;
+  dg->mat_fg = NULL;
+  delete dg->mat_pic;
+  dg->mat_pic = NULL;
+  delete dg->mat_msk;
+  dg->mat_msk = NULL;
+  // wenetfn comes from strdup in dhduix_initWenet; free(NULL) is a no-op
+  free(dg->wenetfn);
+  dg->wenetfn = NULL;
+  pthread_mutex_destroy(&dg->pushmutex);
+  pthread_mutex_destroy(&dg->readmutex);
+  pthread_mutex_destroy(&dg->freemutex);
+  free(dg->calcthread);
+  jmat_free(dg->mat_feat);
+  free(dg);
+  return 0;
+}
+
+
+/*
+ * Create the rendering network from the given param/bin/mask files using
+ * kind 168 and rect 160, and mark the handle as initialised.
+ * A network installed by an earlier call is deleted instead of leaked.
+ * Always returns 0.
+ */
+int dhduix_initMunet(dhduix_t* dg,char* fnparam,char* fnbin,char* fnmsk){
+  Mobunet* previous = dg->munet;
+  dg->munet = new Mobunet(fnbin,fnparam,fnmsk,20,dg->rgb);
+  delete previous;  // no-op on the first call
+  dg->inited = 1;
+  printf("===init munet \n");
+  dg->kind = 168;
+  dg->rect = 160;
+  return 0;
+}
+
+/*
+ * Same as dhduix_initMunet but lets the caller choose the model geometry:
+ * rect == 128 selects kind 128 / rect 128, any other value kind 168 / rect 160.
+ * A network installed by an earlier call is deleted instead of leaked.
+ * Always returns 0.
+ */
+int dhduix_initMunetex(dhduix_t* dg,char* fnparam,char* fnbin,char* fnmsk,int rect){
+  Mobunet* previous = dg->munet;
+  dg->munet = new Mobunet(fnbin,fnparam,fnmsk,20,dg->rgb);
+  delete previous;  // no-op on the first call
+  dg->inited = 1;
+  if(rect==128){
+    dg->kind = 128;
+    dg->rect = 128;
+  }else{
+    dg->kind = 168;
+    dg->rect = 160;
+  }
+  printf("===init munet \n");
+  return 0;
+}
+
+/*
+ * One-shot processing of a whole PCM buffer through a temporary PcmFile.
+ * Returns -2 when the handle is not running, otherwise the result of
+ * PcmFile::readbnf. The temporary PcmFile is released before returning.
+ * NOTE(review): the result is read back into buf/size while bnf/bnfsize are
+ * never used - confirm whether the output was meant to go to bnf.
+ */
+int dhduix_simppcm(dhduix_t* dg,char* buf,int size,char* pre,int presize,char* bnf,int bnfsize){
+  if(!dg->running)return -2;
+  PcmFile* mfcc = new PcmFile(25,10,STREAM_BASE_MAXBLOCK,STREAM_BASE_MAXBLOCK*20);
+  mfcc->prepare(buf,size,pre,presize);
+  mfcc->process(-1,dg->weai_first);
+  int rst = mfcc->readbnf(buf,size);
+  delete mfcc;
+  return rst;
+}
+
+/*
+ * Report PcmSession::fileBlock() of the current session.
+ * Returns -3 when no session exists and 0 when sessid is not the current
+ * session's id.
+ */
+int dhduix_allcnt(dhduix_t* dg,uint64_t sessid){
+  PcmSession* cur = dg->cursess;
+  if(cur==NULL){
+    return -3;
+  }
+  if(cur->sessid()==sessid){
+    return cur->fileBlock();
+  }
+  return 0;
+}
+
+/*
+ * Report PcmSession::calcBlock() of the current session.
+ * Returns -3 when no session exists and 0 when sessid is not the current
+ * session's id.
+ */
+int dhduix_readycnt(dhduix_t* dg,uint64_t sessid){
+  PcmSession* cur = dg->cursess;
+  if(cur==NULL){
+    return -3;
+  }
+  if(cur->sessid()==sessid){
+    return cur->calcBlock();
+  }
+  return 0;
+}
+
+
+#define AIRUN_FLAG 1
+/*
+ * Render one frame from image files.
+ *   fnpic          picture to load (required)
+ *   fnmsk, fnfg    optional mask / foreground paths; NULL or "" means "none"
+ *   box            face box forwarded to dhduix_simpinx; NULL selects dhduix_simpblend
+ *   bnfinx         feature block index forwarded to dhduix_simpinx
+ *   bimg           receives width*height*3 bytes of the foreground if one was
+ *                  loaded, otherwise of the picture
+ *   mskbuf         receives width*height*3 bytes of the mask when one was loaded
+ * Returns -1 for a stale session id, -2 when the handle is not running,
+ * -4 when fnpic or bimg is NULL, otherwise the result of the render call.
+ * NOTE(review): imgsize is not consulted; the copies always use
+ * width*height*3 - confirm the caller's buffers are at least that large.
+ */
+int dhduix_fileinx(dhduix_t* dg,uint64_t sessid,char* fnpic,int* box,char* fnmsk,char* fnfg,int bnfinx,char* bimg,char* mskbuf,int imgsize){
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  if(!fnpic||!bimg)return -4;
+
+  // std::string must not be constructed from a null pointer
+  std::string sfnpic(fnpic);
+  std::string sfnmsk(fnmsk?fnmsk:"");
+  std::string sfnfg(fnfg?fnfg:"");
+  int size = dg->width*dg->height*3;
+
+  JMat* mat_pic = dg->mat_pic;
+  mat_pic->loadjpg(sfnpic,1);
+  uint8_t* bpic = (uint8_t*)mat_pic->data();
+  uint8_t* bmsk = NULL;
+  uint8_t* bfg = NULL;
+  if(sfnmsk.length()){
+    JMat* mat_msk = dg->mat_msk;
+    mat_msk->loadjpg(sfnmsk,1);
+    bmsk = (uint8_t*)mat_msk->data();
+    if(mskbuf)memcpy(mskbuf,bmsk,size);
+  }
+  if(sfnfg.length()){
+    JMat* mat_fg = dg->mat_fg;
+    mat_fg->loadjpg(sfnfg,1);
+    bfg = (uint8_t*)mat_fg->data();
+  }
+  int rst = 0;
+  if(box){
+    rst = dhduix_simpinx(dg,sessid, bpic,dg->width,dg->height, box, bmsk, bfg,bnfinx);
+  }else{
+    rst = dhduix_simpblend(dg,sessid, bpic,dg->width,dg->height,  bmsk, bfg);
+  }
+  // The render writes into the foreground when present, else into the picture.
+  memcpy(bimg,bfg?bfg:bpic,size);
+  if(bmsk&&mskbuf) memcpy(mskbuf,bmsk,size);
+  return rst;
+}
+
+/*
+ * Render one frame using feature block number inx of the current session.
+ *   width/height of 0 fall back to the sizes given to dhduix_alloc.
+ * Returns -1 for a stale session id, -2 when the handle is not running,
+ * -3 when no session exists; when PcmSession::readblock yields no block its
+ * (non-positive) result is returned; a negative dhduix_simprst result is
+ * passed through; 1 means the frame was rendered.
+ * The session pointer is read under readmutex so a concurrent
+ * dhduix_newsession cannot retire the session between lookup and use.
+ */
+int dhduix_simpinx(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,int inx){
+  if(sessid!=dg->sessid)return -1;
+  if(!dg->running)return -2;
+  int w = width?width:dg->width;
+  int h = height?height:dg->height;
+  int rst = -3;
+  pthread_mutex_lock(&dg->readmutex);
+  PcmSession* sess = dg->cursess;
+  if(sess){
+    rst =  sess->readblock(sessid,dg->mat_feat,inx);
+  }
+  pthread_mutex_unlock(&dg->readmutex);
+  if(rst<=0)return rst;
+  rst = dhduix_simprst(dg,sessid, bpic,w,h, box, bmsk, bfg,(uint8_t*)dg->mat_feat->data,STREAM_ALL_BNF);
+  // Do not report success when the renderer refused to run (not initialised / no network).
+  if(rst<0)return rst;
+  return 1;
+}
+
+/*
+ * Placeholder for the box-less render path used by dhduix_fileinx.
+ * No blending is implemented: every argument is ignored and 0 is returned.
+ */
+int dhduix_simpblend(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,uint8_t* bmsk,uint8_t* bfg){
+  (void)dg; (void)sessid; (void)bpic;
+  (void)width; (void)height;
+  (void)bmsk; (void)bfg;
+  return 0;
+}
+
+/*
+ * Run the rendering network on one frame.
+ *   bpic     picture pixels (width x height), wrapped without copying is assumed - TODO confirm JMat semantics
+ *   box      face box handed to MWorkMat
+ *   bmsk     accepted for interface compatibility; not used by this function
+ *   bfg      optional foreground; when given the result is written into it,
+ *            otherwise into bpic
+ *   bnfbuf   feature data, read as STREAM_CNT_BNF x STREAM_BASE_BNF floats
+ *   bnflen   not consulted here
+ * Returns -1 when the handle is not initialised, -3 when no network is
+ * loaded, -4 when bpic or bnfbuf is NULL, otherwise 0 (the domodel result is
+ * only logged, as before).
+ */
+int dhduix_simprst(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,uint8_t* bnfbuf,int bnflen){
+  //printf("simprst gogogo %d \n",dg->inited);
+  if(!dg->inited)return -1;
+  if(!dg->munet)return -3;
+  if(!bpic||!bnfbuf)return -4;
+  int rst = 0;
+  JMat* mat_pic = new JMat(width,height,bpic);
+  JMat* mat_fg = bfg?new JMat(width,height,bfg):NULL;
+  //read pcm
+  JMat* feat = new JMat(STREAM_CNT_BNF,STREAM_BASE_BNF,(float*)bnfbuf,1);
+
+//    MWorkMat wmat(mat_pic,mat_msk,box);
+  MWorkMat wmat(mat_pic, NULL,box,dg->kind);
+  wmat.premunet();
+  JMat* mpic = NULL;
+  JMat* mmsk = NULL;
+  wmat.munet(&mpic,&mmsk);
+  //tooken
+#ifdef AIRUN_FLAG
+  uint64_t ticka = jtimer_msstamp();
+  rst = dg->munet->domodel(mpic, mmsk, feat,dg->rect);
+  uint64_t tickb = jtimer_msstamp();
+  uint64_t dist = tickb-ticka;
+  //LOGD("tooken","===domodel %ld\n",dist);
+  if(dist>40){
+    // uint64_t is not "long" on every ABI; print through unsigned long long
+    printf("===domodel %d dist %llu\n",rst,(unsigned long long)dist);
+  }
+#endif
+  // Paste the generated region back into the output image.
+  wmat.finmunet(mat_fg?mat_fg:mat_pic);
+  delete feat;
+  delete mat_pic;
+  delete mat_fg;  // no-op when no foreground was supplied
+  return 0;
+}
+
+
diff --git a/duix-sdk/src/main/cpp/include/aicommon.h b/duix-sdk/src/main/cpp/include/aicommon.h
new file mode 100644
index 0000000..bb62453
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/aicommon.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// Tuning constants shared by the audio-feature code.
+// The MFCC_* values are not referenced in the sources shown alongside this
+// header; their meaning below is taken from their names only - TODO confirm.
+
+//#define MFCC_OFFSET  6436
+#define MFCC_OFFSET  6400
+//##define MFCC_OFFSET  0
+#define MFCC_DEFRMS  0.1f
+#define MFCC_FPS    25
+#define MFCC_RATE   16000
+//#define MFCC_WAVCHUNK  960000
+#define MFCC_WAVCHUNK  560000
+//#define MFCC_WAVCHUNK  512
+
+//#define MFCC_MELBASE  6001
+#define MFCC_MELBASE  3501
+#define MFCC_MELCHUNK  80
+//#define MFCC_MELCHUNK  20
+
+//#define MFCC_BNFBASE  1499
+#define MFCC_BNFBASE  874
+#define MFCC_BNFCHUNK  256
+// Model signature noted by the original author:
+//input==== NodeArg(name='speech', type='tensor(float)', shape=['B', 'T', 80])
+//input==== NodeArg(name='speech_lengths', type='tensor(int32)', shape=['B'])
+//output==== NodeArg(name='encoder_out', type='tensor(float)', shape=['B', 'T_OUT', 'Addencoder_out_dim_2'])
+// Streaming defaults. gjsimp.cpp seeds each handle's minoff/minblock/maxblock
+// from the next three values.
+#define STREAM_BASE_MINOFF 10 
+#define STREAM_BASE_MINBLOCK 20
+#define STREAM_BASE_MAXBLOCK 50
+#define STREAM_BASE_TICK 40
+#define STREAM_BASE_PCM 1280
+#define STREAM_BASE_SAMP 640
+// One feature frame is read in gjsimp.cpp as STREAM_CNT_BNF x STREAM_BASE_BNF
+// floats; STREAM_ALL_BNF equals 20 * 256 * 4 and is passed there as the byte length.
+#define STREAM_BASE_BNF 256
+#define STREAM_CNT_BNF 20
+#define STREAM_OFF_BNF 20
+#define STREAM_ALL_BNF 20480
+#define STREAM_BASE_MEL 80
+#define STREAM_BASE_CNT 1500
+//#define STREAM_BASE_CNT 050
+#define STREAM_MFCC_FILL 10
+//#define STREAM_MFCC_FILL 5
+
diff --git a/duix-sdk/src/main/cpp/include/dhextctrl.h b/duix-sdk/src/main/cpp/include/dhextctrl.h
new file mode 100644
index 0000000..322f8a0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/dhextctrl.h
@@ -0,0 +1,55 @@
+#ifndef GJ_EXTCTRL
+#define GJ_EXTCTRL
+
+#include <stdio.h>
+#include "dhextend.h"
+#include "gj_threadpool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+  /* Callback type stored in ext_process_t::fn_process. */
+  typedef void (*func_extprocess)(void *data);
+  /*
+   * Wiring of one processing stage: its environment, its message, input and
+   * output queues, and the extension handle with the callbacks that drive it.
+   * How the fields are used is defined by the implementation, not shown here.
+   */
+  typedef struct{
+    ext_env_t* env;
+    jqueue_t* q_msg;
+    jqueue_t* q_input;
+    jqueue_t* q_output;
+    uint64_t  tick_process;
+    ext_handle_t*   hnd_process;
+    func_extrun     fn_run;
+    func_extprocess fn_process;
+  }ext_process_t;
+
+  /*
+   * Set of extension models handed to ext_createsess, one slot per field.
+   * The field names suggest ASR, chat, TTS, BNF-feature and render stages;
+   * which slots are mandatory is not visible from this header.
+   */
+  typedef struct{
+    ext_model_t*  asr_model;
+    ext_model_t*  chat_model;
+    ext_model_t*  tts_model;
+    ext_model_t*  bnf_model;
+    ext_model_t*  render_model;
+  }extmain_t;
+
+  /*
+   * One running extension session: its id and running flag (both volatile),
+   * the session environment, three stage descriptors and a thread pool.
+   * Ownership and locking rules are defined by the implementation.
+   */
+  typedef struct{
+    volatile  uint64_t  m_sessid;
+    volatile int m_running;
+    ext_env_t* env_sess;
+    ext_process_t* asr_proc;
+    ext_process_t* chat_proc;
+    ext_process_t* tts_proc;
+    threadpool_t*  pool;
+  }extsess_t;
+
+  /* Input/output callback given to ext_startsess; receives a loop tick and the caller's tag. */
+  typedef int (*func_inout)(uint64_t looptick,void* arg);
+
+  /* Session lifecycle: create -> start -> stop -> destroy. */
+  /* ext_createsess stores the new session in *pext; ext_destroysess takes the address of that pointer. */
+  int ext_createsess( extmain_t* extmain,char* uuid,extsess_t** pext);
+  int ext_startsess(extsess_t* ext,func_inout fn_input,func_inout fn_output,void* tag); 
+  int ext_stopsess(extsess_t* ext); 
+  int ext_destroysess(extsess_t** pext);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/include/dhextend.h b/duix-sdk/src/main/cpp/include/dhextend.h
new file mode 100644
index 0000000..dde12c8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/dhextend.h
@@ -0,0 +1,62 @@
+
+#ifndef GJ_BOTCORE
+#define GJ_BOTCORE
+#include "dh_mem.h"
+#include "dh_data.h"
+#include "dh_que.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* Forward declarations; both structs are defined further down in this header. */
+  typedef struct ext_handle_t     ext_handle_t;
+  typedef struct ext_env_t     ext_env_t;
+
+  /* Factory / teardown pair stored in ext_model_t. */
+  typedef ext_handle_t* (*func_extcreate)(char* uuid,void* env,void* tag);
+  typedef int (*func_extdestroy)(ext_handle_t *exthandle);
+  /* Per-handle callbacks stored in ext_handle_t. */
+  typedef int  (*func_extupsess)(ext_handle_t *handle,uint64_t sessid);
+  typedef int  (*func_extstart)(ext_handle_t *handle);
+  typedef int  (*func_extstop)(ext_handle_t *handle);
+  typedef int  (*func_extrun)(ext_handle_t *handle,uint64_t sessid,jbuf_t* buf);
+  typedef int  (*func_extrunex)(ext_handle_t *handle,uint64_t sessid,jbuf_t** buf);
+
+  /* Descriptor of a loadable extension model: an id, a name and the
+   * functions that create and destroy handles for it. */
+  typedef struct ext_model_t{
+    int             m_id;
+    char*           m_name;
+    func_extcreate  fn_create;
+    func_extdestroy fn_destroy;
+  }ext_model_t;
+
+
+  /* Instance of an extension: implementation tag, uuid, current session id
+   * and the callback table used to drive it. */
+  struct ext_handle_t{
+    void            *ext_tag;
+    char            *m_uuid;
+    uint64_t        m_sessid;
+    func_extstart   fn_start;
+    func_extstop    fn_stop;
+    func_extupsess  fn_upsess;
+    func_extrun     fn_extrun;
+    /* NOTE(review): declared as func_extrun although a func_extrunex type
+     * (jbuf_t** argument) exists above - confirm which signature is intended. */
+    func_extrun     fn_extrunex;
+  };
+
+  /* Shared environment of a session: its id, running flag and up to 16
+   * queues addressed by the INX_Q* indices defined below. */
+  struct ext_env_t{
+    uint64_t  m_sessid;
+    volatile int  m_running;
+    jqueue_t* q_arrext[16];// slots 0..7: msg pcm asr chat answer tts mfcc render
+  };
+
+/* Indices into ext_env_t::q_arrext. */
+#define INX_QMSG 0
+#define INX_QPCM 1
+#define INX_QASR 2
+#define INX_QCHAT 3
+#define INX_QANSWER 4
+#define INX_QTTS 5
+#define INX_QMFCC 6
+#define INX_QRENDER 7
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/include/dhextinc.h b/duix-sdk/src/main/cpp/include/dhextinc.h
new file mode 100644
index 0000000..2831225
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/dhextinc.h
@@ -0,0 +1,22 @@
+
+#ifndef GJ_EXTINC
+#define GJ_EXTINC
+#include "dhextend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Loader entry points of the individual extension models. Each takes a
+ * configuration string and returns a model descriptor; the format of cfg and
+ * the failure value are defined by the respective implementation. */
+ext_model_t*  load_funasrext(char* cfg);
+ext_model_t*  load_chatggmlext(char* cfg);
+ext_model_t*  load_piperext(char* cfg);
+ext_model_t* load_msasrext(char* cfg);
+
+ext_model_t* load_aliasrext(char* cfg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/include/gj_dll.h b/duix-sdk/src/main/cpp/include/gj_dll.h
new file mode 100644
index 0000000..9d1c953
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/gj_dll.h
@@ -0,0 +1,21 @@
+/* Defines GJLIBAPI, the export decoration for public library symbols. */
+#ifndef __GJ_DLL_H__
+#define __GJ_DLL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* GJLIB_EXPORT is defined unconditionally here, so the empty GJLIBAPI
+ * fallback in the #else branch below is never selected. */
+#define GJLIB_EXPORT 1
+#if defined(GJLIB_EXPORT)
+    #if defined _WIN32 || defined __CYGWIN__
+        /* Windows / Cygwin: export from the DLL. */
+        #define GJLIBAPI __declspec(dllexport)
+    #else
+        /* Other toolchains: make the symbol visible outside the shared object. */
+        #define GJLIBAPI __attribute__((visibility("default")))
+    #endif
+#else
+    #define GJLIBAPI
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/duix-sdk/src/main/cpp/include/gjduix.h b/duix-sdk/src/main/cpp/include/gjduix.h
new file mode 100644
index 0000000..4cbd3b6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/gjduix.h
@@ -0,0 +1,38 @@
+#ifndef GJDUIX_
+#define GJDUIX_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * C interface of the audio-feature (dhmfcc) and rendering (dhunet) handles.
+ * Both handle types are opaque here; their layout and the exact meaning of
+ * the return codes are defined by the implementation files.
+ */
+
+/* Opaque audio-feature handle. */
+typedef struct dhmfcc_s dhmfcc_t;
+
+/* Lifecycle and model setup. */
+int dhmfcc_alloc(dhmfcc_t** pdg,int mincalc);
+int dhmfcc_initPcmex(dhmfcc_t* dg,int maxsize,int minoff ,int minblock ,int maxblock);
+int dhmfcc_initWenet(dhmfcc_t* dg,char* fnwenet);
+
+/* Session handling: a session id returned by dhmfcc_newsession is passed to every later call. */
+uint64_t dhmfcc_newsession(dhmfcc_t* dg);
+int dhmfcc_pushpcm(dhmfcc_t* dg,uint64_t sessid,char* buf,int size,int kind);
+int dhmfcc_readpcm(dhmfcc_t* dg,uint64_t sessid,char* pcmbuf,int pcmlen,char* bnfbuf,int bnflen);
+int dhmfcc_finsession(dhmfcc_t* dg,uint64_t sessid);
+int dhmfcc_consession(dhmfcc_t* dg,uint64_t sessid);
+
+int dhmfcc_free(dhmfcc_t* dg);
+
+
+/* Opaque rendering handle. */
+typedef struct dhunet_s dhunet_t;
+int dhunet_alloc(dhunet_t** pdg,int minrender);
+int dhunet_initMunet(dhunet_t* dg,char* fnparam,char* fnbin,char* fnmsk);
+int dhunet_simprst(dhunet_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,uint8_t* bnfbuf,int bnflen);
+int dhunet_free(dhunet_t* pdg);
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/include/gjsimp.h b/duix-sdk/src/main/cpp/include/gjsimp.h
new file mode 100644
index 0000000..abb3838
--- /dev/null
+++ b/duix-sdk/src/main/cpp/include/gjsimp.h
@@ -0,0 +1,52 @@
+/* C interface of the combined audio + rendering handle implemented in gjsimp.cpp. */
+#ifndef GJSIMP
+#define GJSIMP
+
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+/* Opaque handle; created by dhduix_alloc and released by dhduix_free. */
+typedef struct dhduix_s dhduix_t;
+
+/* Create a handle for width x height frames and start its background calc thread. */
+int dhduix_alloc(dhduix_t** pdg,int mincalc,int width,int height);
+/* Store the block limits and the rgb flag; maxsize is not used by the implementation. */
+int dhduix_initPcmex(dhduix_t* dg,int maxsize,int minoff ,int minblock ,int maxblock,int rgb);
+/* Load the audio model from the given file. */
+int dhduix_initWenet(dhduix_t* dg,char* fnwenet); 
+/* Load the rendering network; the ...ex variant selects the 128 geometry when rect == 128. */
+int dhduix_initMunet(dhduix_t* dg,char* fnparam,char* fnbin,char* fnmsk);
+int dhduix_initMunetex(dhduix_t* dg,char* fnparam,char* fnbin,char* fnmsk,int rect);
+
+/* Start a new session, replacing the current one; returns its id. */
+uint64_t dhduix_newsession(dhduix_t* dg);
+
+/* Session calls below return -1 for a stale session id, -2 when the handle
+ * is not running and -3 when no session exists. */
+int dhduix_pushpcm(dhduix_t* dg,uint64_t sessid,char* buf,int size,int kind);
+int dhduix_readpcm(dhduix_t* dg,uint64_t sessid,char* pcmbuf,int pcmlen,char* bnfbuf,int bnflen);
+/* Render with caller-supplied feature data; -1 when not initialised, -3 when no network is loaded. */
+int dhduix_simprst(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,uint8_t* bnfbuf,int bnflen);
+
+/* Block counters of the current session; -3 without a session, 0 when sessid does not match it. */
+int dhduix_allcnt(dhduix_t* dg,uint64_t sessid);
+int dhduix_readycnt(dhduix_t* dg,uint64_t sessid);
+/* Render using feature block bnfinx of the current session; a result > 0 means success. */
+int dhduix_simpinx(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,int* box,uint8_t* bmsk,uint8_t* bfg,int bnfinx);
+/* Like dhduix_simpinx but loads the images from files and copies the result to bimg / mskbuf. */
+int dhduix_fileinx(dhduix_t* dg,uint64_t sessid,char* fnpic,int* box,char* fnmsk,char* fnfg,int bnfinx,char* bimg,char* mskbuf,int imgsize);
+/* Currently a stub that returns 0. */
+int dhduix_simpblend(dhduix_t* dg,uint64_t sessid,uint8_t* bpic,int width,int height,uint8_t* bmsk,uint8_t* bfg);
+
+/* One-shot processing of a complete PCM buffer; -2 when the handle is not running. */
+int dhduix_simppcm(dhduix_t* dg,char* buf,int size,char* pre,int presize,char* bnf,int bnfsize);
+
+
+/* Forwarded to the current session's finpcm / conpcm. */
+int dhduix_finsession(dhduix_t* dg,uint64_t sessid);
+int dhduix_consession(dhduix_t* dg,uint64_t sessid);
+
+
+
+/* Stop the calc thread and release the handle. */
+int dhduix_free(dhduix_t* dg);
+
+
+
+
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/duix-sdk/src/main/cpp/iostest/testduix.cpp b/duix-sdk/src/main/cpp/iostest/testduix.cpp
new file mode 100644
index 0000000..3f847bb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/iostest/testduix.cpp
@@ -0,0 +1,129 @@
+#include <stdlib.h>
+#include <string>
+#include <stdio.h>
+#include "gjduix.h"
+#include "jmat.h"
+#include <pthread.h>
+#include "dh_data.h"
+
+
+static volatile int g_running = 0;
+static volatile uint64_t g_sessid = 0;
+/*
+ * Feeder thread of the test: opens a new session and pushes data/b10.wav to
+ * it in 1000-byte pieces, 5 ms apart, for as long as g_sessid still names
+ * this session. Waits 10 s afterwards and then finishes the session.
+ * The first 44 bytes of the file are skipped (presumably the WAV header).
+ * The process exits with status 1 when the file or the buffer is unavailable,
+ * because the render loop in main would otherwise wait forever.
+ */
+static void* mfccworker(void* arg){
+  dhmfcc_t* mfcc = (dhmfcc_t*)arg;
+  FILE* wavfile = fopen("data/b10.wav","rb");
+  if(!wavfile){
+    perror("data/b10.wav");
+    exit(1);
+  }
+  fseek(wavfile,44,SEEK_SET);
+  const int psize = 1000;
+  char* pcm = (char*)malloc(psize);
+  if(!pcm){
+    fclose(wavfile);
+    exit(1);
+  }
+  uint64_t sessid = dhmfcc_newsession(mfcc);
+  g_sessid = sessid;
+  while(sessid == g_sessid){
+    int readpcm = fread(pcm,1,psize,wavfile);
+    if(readpcm<1)break;
+    dhmfcc_pushpcm(mfcc,sessid,pcm,readpcm,0);
+    jtimer_mssleep(5);
+  }
+  jtimer_mssleep(10000);
+  dhmfcc_finsession(mfcc,sessid);
+  free(pcm);
+  fclose(wavfile);
+  printf("===finish\n");
+  return NULL;
+}
+
+
+// Manual end-to-end test: one thread pushes a WAV file into a dhmfcc handle
+// while this thread reads feature blocks and renders them onto a still image.
+int main(int argc,char** argv){
+  // --- set up the audio-feature handle and the rendering handle ---
+  dhmfcc_t* mfcc = NULL;
+  int rst = 0;
+  rst = dhmfcc_alloc(&mfcc,2);
+  //char* fnwenet = "model/wenet.onnx";
+  char* fnwenet = "model/wenet.onnx";
+  rst = dhmfcc_initWenet(mfcc,fnwenet);
+  rst = dhmfcc_initPcmex(mfcc,0,10,20,50);
+  dhunet_t* unet = NULL;
+  rst = dhunet_alloc(&unet,20);
+  rst = dhunet_initMunet(unet,"model/xinyan_opt.param","model/xinyan_opt.bin","model/weight_168u.bin");
+
+  // --- load the still picture, its mask and the foreground to draw into ---
+  std::string fnpic = "data/xinyan.jpg";
+  std::string fnmsk = "data/m1.jpg";
+  std::string fnfg = "data/xinyan.jpg";
+  JMat* mat_msk = new JMat();
+  mat_msk->loadjpg(fnmsk,1);
+  JMat* mat_pic = new JMat();
+  mat_pic->loadjpg(fnpic,1);
+  JMat* mat_fg = new JMat();
+  mat_fg->loadjpg(fnfg,1);
+  int width = mat_pic->width();
+  int height = mat_pic->height();
+  // hard-coded face box for the sample picture
+  int m_boxs[4];
+  m_boxs[0]=170;m_boxs[2]=382;m_boxs[1]=382;m_boxs[3]=592;
+  uint8_t* bpic = (uint8_t*)mat_pic->data();
+  uint8_t* bmsk = (uint8_t*)mat_msk->data();
+  uint8_t* bfg = (uint8_t*)mat_fg->data();
+  int* box = m_boxs;
+  int pcmsize = 1280;
+  char* pcm = (char*)malloc(1280);
+  int bnfsize = 1024*20;
+  char* bnf = (char*)malloc(1024*20);
+  // --- start the feeder thread ---
+  pthread_t audtrd;
+  pthread_create(&audtrd, NULL, mfccworker, (void*)mfcc);
+  //mfccworker(mfcc);
+
+  printf("====render\n");
+  //getchar();
+  // --- render loop: runs until dhmfcc_readpcm returns a negative value ---
+  while(1){
+    // the feeder has not opened its session yet
+    if(!g_sessid){
+      printf("+");
+      //cv::waitKey(40);
+      jtimer_mssleep(40);
+      continue;
+    }
+    rst = dhmfcc_readpcm(mfcc,g_sessid,pcm,pcmsize,bnf,bnfsize);
+    printf("===readpcm %ld %d\n",g_sessid,rst);
+    if(rst>0){
+      // a block is available: render it into mat_fg and show the result
+      uint64_t tick = jtimer_msstamp();
+      printf("====read  %ld \n",tick);
+      rst = dhunet_simprst(unet,g_sessid, bpic,width,height, box, bmsk, bfg, (uint8_t*)bnf,bnfsize);
+      printf("===simprst %d\n",rst);
+      mat_fg->show("aaa");
+      cv::waitKey(30);
+      jtimer_mssleep(40);
+    }else if(rst < 0){
+      break;
+    }else{
+      // nothing ready yet
+      //cv::waitKey(40);
+      jtimer_mssleep(40);
+    }
+  }
+  // Clearing g_sessid makes the feeder leave its push loop.
+  g_sessid = 0;
+  pthread_join(audtrd,NULL);
+  printf("====exit\n");
+  //
+  rst = dhmfcc_free(mfcc);
+  printf("====exitmfcc\n");
+  // NOTE(review): releasing unet and the three JMat objects is commented out
+  // below, so they are not freed before exit - confirm whether that is intended.
+  /*
+  rst = dhunet_free(unet);
+  delete mat_pic;
+  delete mat_msk;
+  delete mat_fg;
+  */
+  free(pcm);
+  free(bnf);
+  return 0;
+}
diff --git a/duix-sdk/src/main/cpp/iostest/testsimp.cpp b/duix-sdk/src/main/cpp/iostest/testsimp.cpp
new file mode 100644
index 0000000..8536c6c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/iostest/testsimp.cpp
@@ -0,0 +1,200 @@
+#include <stdlib.h>
+#include <string>
+#include <stdio.h>
+#include "gjsimp.h"
+#include "jmat.h"
+#include <pthread.h>
+#include "dh_data.h"
+
+
+static volatile int g_running = 0;
+static volatile uint64_t g_sessid = 0;
+/*
+ * Feeder thread of the test: opens a new session and pushes data/b10.wav to
+ * it in 1000-byte pieces, 5 ms apart, for as long as g_sessid still names
+ * this session. Waits 10 s afterwards and then finishes the session.
+ * The first 44 bytes of the file are skipped (presumably the WAV header).
+ * The process exits with status 1 when the file or the buffer is unavailable,
+ * because the render loop in main would otherwise wait forever.
+ */
+static void* mfccworker(void* arg){
+  dhduix_t* mfcc = (dhduix_t*)arg;
+  FILE* wavfile = fopen("data/b10.wav","rb");
+  if(!wavfile){
+    perror("data/b10.wav");
+    exit(1);
+  }
+  fseek(wavfile,44,SEEK_SET);
+  const int psize = 1000;
+  char* pcm = (char*)malloc(psize);
+  if(!pcm){
+    fclose(wavfile);
+    exit(1);
+  }
+  uint64_t sessid = dhduix_newsession(mfcc);
+  g_sessid = sessid;
+  while(sessid == g_sessid){
+    int readpcm = fread(pcm,1,psize,wavfile);
+    if(readpcm<1)break;
+    dhduix_pushpcm(mfcc,sessid,pcm,readpcm,0);
+    jtimer_mssleep(5);
+  }
+  jtimer_mssleep(10000);
+  dhduix_finsession(mfcc,sessid);
+  free(pcm);
+  fclose(wavfile);
+  printf("===finish\n");
+  return NULL;
+}
+
+
+/*
+ * Memory-churn test: runs ten sessions back to back, each pushing 100
+ * buffers of 102400 zero bytes and then rendering up to 100 feature blocks.
+ * Not wired up as the program entry point; rename to main to use it.
+ * Returns 0.
+ */
+int mainmemcheck(int argc,char** argv){
+  dhduix_t* dg = NULL;
+  int rst = 0;
+  int width = 540;
+  int height = 720;
+  rst = dhduix_alloc(&dg,20,width,height);
+  rst = dhduix_initPcmex(dg,0,10,20,50,0);
+  rst = dhduix_initMunetex(dg,"model/xinyan_opt.param","model/xinyan_opt.bin","model/weight_168u.bin",128);
+  char* fnwenet = "model/wenet.onnx";
+  rst = dhduix_initWenet(dg,fnwenet);
+  // calloc: the buffer is pushed as PCM, so it must not hold indeterminate bytes
+  char* pcm = (char*)calloc(102400,1);
+
+  std::string fnpic = "data/xinyan.jpg";
+  std::string fnmsk = "data/m1.jpg";
+  std::string fnfg = "data/xinyan.jpg";
+  JMat* mat_msk = new JMat();
+  mat_msk->loadjpg(fnmsk,1);
+  JMat* mat_pic = new JMat();
+  mat_pic->loadjpg(fnpic,1);
+  JMat* mat_fg = new JMat();
+  mat_fg->loadjpg(fnfg,1);
+  // hard-coded face box for the sample picture
+  int m_boxs[4];
+  m_boxs[0]=170;m_boxs[2]=382;m_boxs[1]=382;m_boxs[3]=592;
+  uint8_t* bpic = (uint8_t*)mat_pic->data();
+  uint8_t* bmsk = (uint8_t*)mat_msk->data();
+  uint8_t* bfg = (uint8_t*)mat_fg->data();
+  int* box = m_boxs;
+  for(int m=0;m<10;m++){
+    g_sessid = dhduix_newsession(dg);
+    for(int k=0;k<100;k++){
+      dhduix_pushpcm(dg,g_sessid,pcm,102400,0);
+      int allcnt = dhduix_allcnt(dg,g_sessid);
+      printf("===allcnt %d\n",allcnt);
+    }
+    // Wait for the calc thread; the counter has to be re-read on every pass,
+    // otherwise the loop can never terminate.
+    int readycnt = dhduix_readycnt(dg,g_sessid);
+    while(readycnt<1){
+      jtimer_mssleep(10);
+      readycnt = dhduix_readycnt(dg,g_sessid);
+    }
+    for(int i=0;i<100;i++){
+      readycnt = dhduix_readycnt(dg,g_sessid);
+      //printf("===readycnt %d\n",readycnt);
+      rst = dhduix_simpinx(dg,g_sessid, bpic,width,height, box, bmsk, bfg, i);
+      printf("==simp %d\n",rst);
+      jtimer_mssleep(10);
+      if(rst<0)break;
+    }
+    dhduix_finsession(dg,g_sessid);
+  }
+  free(pcm);
+  delete mat_pic;
+  delete mat_msk;
+  delete mat_fg;
+  dhduix_free(dg);
+  return 0;
+}
+
+// Manual end-to-end test of the combined dhduix handle: one thread pushes a
+// WAV file while this thread renders successive feature blocks by index.
+int main(int argc,char** argv){
+  // --- set up the handle: audio model, block limits, 128 rendering network ---
+  dhduix_t* dg = NULL;
+  int rst = 0;
+  int width = 1080;
+  int height = 1920;
+  rst = dhduix_alloc(&dg,20,width,height);
+  //char* fnwenet = "model/wenet.onnx";
+  char* fnwenet = "model/wenet.onnx";
+  rst = dhduix_initWenet(dg,fnwenet);
+  rst = dhduix_initPcmex(dg,0,10,20,50,0);
+  rst = dhduix_initMunetex(dg,
+    "mdl128/pro128/dh_model.param",
+      "mdl128/pro128/dh_model.bin","model/weight_168u.bin",128);
+
+  // --- load the sample picture, mask and foreground ---
+  //std::string fnpic = "data/xinyan.jpg";
+  //std::string fnmsk = "data/m1.jpg";
+  std::string fnpic = "mdl128/pro128/raw_jpgs/1.sij";
+  std::string fnmsk = "mdl128/pro128/pha/1.sij";
+  std::string fnfg = "mdl128/pro128/raw_sg/1.sij";
+  //std::string fnfg = "data/xinyan.jpg";
+  JMat* mat_msk = new JMat();
+  mat_msk->loadjpg(fnmsk,1);
+  JMat* mat_pic = new JMat();
+  mat_pic->loadjpg(fnpic,1);
+  JMat* mat_fg = new JMat();
+  mat_fg->loadjpg(fnfg,1);
+  // hard-coded face box for the sample picture
+  int m_boxs[4];
+  //m_boxs[0]=170;m_boxs[2]=382;m_boxs[1]=382;m_boxs[3]=592;
+  m_boxs[0]=414;m_boxs[2]=669;m_boxs[1]=925;m_boxs[3]=1180;
+  uint8_t* bpic = (uint8_t*)mat_pic->data();
+  uint8_t* bmsk = (uint8_t*)mat_msk->data();
+  uint8_t* bfg = (uint8_t*)mat_fg->data();
+  int* box = m_boxs;
+  // pcm and bnf are only needed by the disabled dhduix_readpcm path below
+  int pcmsize = 1280;
+  char* pcm = (char*)malloc(1280);
+  int bnfsize = 1024*20;
+  char* bnf = (char*)malloc(1024*20);
+  // --- start the feeder thread ---
+  pthread_t audtrd;
+  pthread_create(&audtrd, NULL, mfccworker, (void*)dg);
+  //mfccworker(mfcc);
+
+  printf("====render\n");
+  //getchar();
+  // index of the next feature block to render; advanced only after a successful render
+  int bnfinx = 0;
+  // --- render loop: leaves only when dhduix_simpinx returns a negative value ---
+  while(1){
+    // the feeder has not opened its session yet
+    if(!g_sessid){
+      printf("+");
+      //cv::waitKey(40);
+      jtimer_mssleep(40);
+      continue;
+    }
+    int readycnt = dhduix_readycnt(dg,g_sessid);
+    printf("====readycnt %d\n",readycnt);
+    if(!readycnt){
+      jtimer_mssleep(40);
+      continue;
+    }
+    // the readpcm path is disabled; rst is forced to 1 so the render branch always runs
+    rst = 1;//dhduix_readpcm(dg,g_sessid,pcm,pcmsize,bnf,bnfsize);
+    printf("===readpcm %ld %d\n",g_sessid,rst);
+    if(rst>0){
+      uint64_t tick = jtimer_msstamp();
+      //printf("====read  %ld \n",tick);
+      //rst = dhduix_simprst(dg,g_sessid, bpic,width,height, box, bmsk, bfg, (uint8_t*)bnf,bnfsize);
+      rst = dhduix_simpinx(dg,g_sessid, bpic,width,height, box, bmsk, bfg, bnfinx);
+      if(rst>0)bnfinx ++;
+      printf("===simprst %d\n",rst);
+      mat_fg->show("aaa");
+      cv::waitKey(20);
+      //jtimer_mssleep(40);
+    }else if(rst < 0){
+      break;
+    }else{
+      //cv::waitKey(40);
+      jtimer_mssleep(40);
+    }
+  }
+  // Clearing g_sessid makes the feeder leave its push loop.
+  g_sessid = 0;
+  pthread_join(audtrd,NULL);
+  printf("====exit\n");
+  //
+  rst = dhduix_free(dg);
+  printf("====exitmfcc\n");
+  // NOTE(review): the three JMat objects are never deleted because the
+  // cleanup below is commented out - confirm whether that is intended.
+  /*
+     rst = dhduix_free(unet);
+     delete mat_pic;
+     delete mat_msk;
+     delete mat_fg;
+     */
+  free(pcm);
+  free(bnf);
+  return 0;
+}
diff --git a/duix-sdk/src/main/cpp/mk/Android.mk64 b/duix-sdk/src/main/cpp/mk/Android.mk64
new file mode 100644
index 0000000..c7c3065
--- /dev/null
+++ b/duix-sdk/src/main/cpp/mk/Android.mk64
@@ -0,0 +1,37 @@
+#/****************************************************************************
+#*   Cartoonifier, for Android.
+#*****************************************************************************
+#*   by Shervin Emami, 5th Dec 2012 (shervin.emami@gmail.com)
+#*   http://www.shervinemami.info/
+#*****************************************************************************
+#*   Ch1 of the book "Mastering OpenCV with Practical Computer Vision Projects"
+#*   Copyright Packt Publishing 2012.
+#*   http://www.packtpub.com/cool-projects-with-opencv/book
+#****************************************************************************/
+
+# ndk-build script for the "facedetect" shared library, built from src/kmatarm.cpp.
+LOCAL_PATH := $(call my-dir)
+# Reset the LOCAL_* variables before describing this module.
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES  += src/kmatarm.cpp
+LOCAL_ARM_NEON := true
+LOCAL_MODULE := facedetect
+# LOCAL_LDLIBS is reserved for -l system libraries; other link options belong in LOCAL_LDFLAGS.
+LOCAL_LDLIBS +=  -llog -ldl -lm -lmediandk
+LOCAL_LDLIBS += -ljnigraphics
+# NOTE(review): -fopenmp is passed at link time only; "#pragma omp" also needs it in LOCAL_CFLAGS - confirm intent.
+LOCAL_LDFLAGS += -fopenmp
+LOCAL_CFLAGS += -fpermissive
+LOCAL_CPPFLAGS += -fpermissive
+#LOCAL_CFLAGS += -ftree-vectorizer-verbose=2
+LOCAL_CPPFLAGS += -std=c++17
+LOCAL_LDLIBS += -lstdc++
+
+# Header search paths; the relative entries presumably resolve against the directory ndk-build runs from - TODO confirm.
+LOCAL_C_INCLUDES += $(LOCAL_PATH)
+LOCAL_C_INCLUDES += include
+LOCAL_C_INCLUDES += opencv-mobile-4.6.0-android/sdk/native/jni/include/
+LOCAL_C_INCLUDES += ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/duix-sdk/src/main/cpp/mk/android.sh b/duix-sdk/src/main/cpp/mk/android.sh
new file mode 100644
index 0000000..16da804
--- /dev/null
+++ b/duix-sdk/src/main/cpp/mk/android.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Configures an arm64-v8a Release build in ./android-arm64 with the NDK CMake toolchain; ANDROID_NDK may be preset in the environment.
+ANDROID_NDK="${ANDROID_NDK:-$HOME/tools/android-ndk-r25c}"
+TOOLCHAIN="$ANDROID_NDK/build/cmake/android.toolchain.cmake"
+BUILD_DIR=android-arm64
+# Stop if the build directory cannot be entered; otherwise cmake would configure ".." from the wrong place.
+mkdir -p "$BUILD_DIR" && cd "$BUILD_DIR" || exit 1
+#-G Ninja # fail
+cmake \
+    -DCMAKE_TOOLCHAIN_FILE="$TOOLCHAIN" \
+    -DANDROID_LD=lld \
+    -DANDROID_ABI="arm64-v8a" \
+    -DANDROID_PLATFORM=android-24 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DPPLCV_USE_AARCH64=ON \
+    ..
+# -DHPCC_USE_AARCH64=ON \
diff --git a/duix-sdk/src/main/cpp/mk/bt b/duix-sdk/src/main/cpp/mk/bt
new file mode 100644
index 0000000..f469b21
--- /dev/null
+++ b/duix-sdk/src/main/cpp/mk/bt
@@ -0,0 +1,58 @@
+g++ -g  \
+    -Iinclude -Ibase -Irender -Idigit -Iaisdk \
+    -I/usr/include/opencv4/ \
+		   -Ithird/x86/include/ \
+		   -Ithird/x86/include/ncnn/  \
+		   -Ithird/x86/include/onnx/ \
+		   -Ithird/x86/include/turbojpeg/ \
+    aisdk/jmat.cpp \
+    src/kmatx86.cpp \
+    aisdk/wavreader.cpp \
+    aisdk/wenet.cpp \
+    aisdk/aimodel.cpp \
+    aisdk/scrfd.cpp \
+    aisdk/pfpld.cpp \
+    aisdk/munet.cpp \
+    aisdk/malpha.cpp \
+    aisdk/wavcache.cpp \
+    aisdk/blendgram.cpp \
+    aisdk/face_utils.cpp \
+    digit/netwav.cpp \
+    digit/looper.cpp \
+    digit/netcurl.cpp \
+    digit/GRender.cpp \
+    digit/GDigit.cpp \
+    digit/dispatchqueue.cpp \
+    base/BaseRenderHelper.cpp \
+    base/AudioTrack.cpp \
+    render/EglRenderer.cpp \
+    render/RgbVideoRenderer.cpp \
+    render/SurfaceVideoRenderer.cpp \
+    render/RenderHelper.cpp \
+    render/AudioRenderer.cpp \
+    render/GlesProgram.cpp \
+    base/Log.cpp \
+    base/FrameSource.cpp \
+    base/MediaData.cpp \
+    base/MessageSource.cpp \
+    base/MessageHelper.cpp \
+    base/LoopThread.cpp \
+    base/XThread.cpp \
+    base/XTick.c \
+    base/cJSON.c \
+    base/dh_mem.c \
+    digit/grtcfg.c \
+    base/LoopThreadHelper.cpp \
+    linux/linuxtest.cpp \
+    lib/libpplcv_static.a \
+    lib/libpplcommon_static.a \
+    -fpermissive   -Wwrite-strings \
+        -Llib \
+        -L/usr/lib/x86_64-linux-gnu/ \
+        -Lthird/ncnn-20221128-android-vulkan-shared/x86_64/lib/ \
+	   -Lthird/x86/lib	\
+       -ljpeg -lturbojpeg \
+		-lopencv_core -lopencv_dnn -lopencv_imgcodecs -lopencv_imgproc -lopencv_highgui -lopencv_videoio \
+		-lonnxruntime -lncnn -lcurl \
+        -lEGL -lOpenGL -lGLESv2 -lX11 \
+        -fopenmp # host x86_64 Linux debug (-g) build of linux/linuxtest.cpp plus the SDK sources; no -o is given, so g++ writes ./a.out
diff --git a/duix-sdk/src/main/cpp/mk/exbuildso64 b/duix-sdk/src/main/cpp/mk/exbuildso64
new file mode 100644
index 0000000..0728800
--- /dev/null
+++ b/duix-sdk/src/main/cpp/mk/exbuildso64
@@ -0,0 +1,4 @@
+ndk-build NDK_PROJECT_PATH=. APP_BUILD_SCRIPT=./Android.mk64 APP_PLATFORM=android-26 APP_STL=c++_static APP_CPPFLAGS=-fexceptions APP_CFLAGS=-Wno-error APP_ABI=arm64-v8a
+#arm64-v8a
+#armeabi-v7a
+
diff --git a/duix-sdk/src/main/cpp/third/arm/arm64-v8a/ffmpeg-lite/build_free_arm64_lite.sh b/duix-sdk/src/main/cpp/third/arm/arm64-v8a/ffmpeg-lite/build_free_arm64_lite.sh
new file mode 100644
index 0000000..371b3be
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/arm64-v8a/ffmpeg-lite/build_free_arm64_lite.sh
@@ -0,0 +1,119 @@
+# build.sh
+# 在Linux下编译FFmpeg成功的脚本
+# 注意Linux和windows的换行符\r\n不太一样，要转换（dos2unix）
+#!/bin/sh
+make clean
+export NDK=~/work/android-ndk-r15c-linux-x86_64/android-ndk-r15c
+export PREBUILT=$NDK/toolchains/aarch64-linux-android-4.9/prebuilt
+export PLATFORM=$NDK/platforms/android-21/arch-arm64
+export PREFIX=../fflib/free-arm64-lite
+build_one(){ # Runs FFmpeg's ./configure for the Android aarch64 cross-build; glob arguments are quoted so they reach configure verbatim.
+./configure --target-os=android --prefix="$PREFIX" \
+--enable-cross-compile \
+--enable-runtime-cpudetect \
+--arch=aarch64 \
+--cross-prefix="$PREBUILT/linux-x86_64/bin/aarch64-linux-android-" \
+--cc="$PREBUILT/linux-x86_64/bin/aarch64-linux-android-gcc" \
+--nm="$PREBUILT/linux-x86_64/bin/aarch64-linux-android-nm" \
+--sysroot="$PLATFORM" \
+--disable-gpl --disable-nonfree \
+--enable-shared --enable-static --enable-small \
+--disable-doc --disable-ffprobe --disable-ffplay --disable-debug \
+--enable-jni \
+--enable-mediacodec \
+--disable-avdevice \
+--enable-avcodec \
+--enable-avformat \
+--enable-avutil \
+--enable-swresample \
+--enable-swscale \
+--disable-postproc \
+--enable-avfilter \
+--disable-avresample \
+--disable-decoders \
+--enable-decoder=aac \
+--enable-decoder=aac_latm \
+--enable-decoder=flv \
+--enable-decoder=h264 \
+'--enable-decoder=mp3*' \
+--enable-decoder=vp6f \
+--enable-decoder=flac \
+--enable-decoder=hevc \
+--enable-decoder=vp8 \
+--enable-decoder=vp9 \
+--enable-decoder=amrnb \
+--enable-decoder=amrwb \
+--enable-decoder=mjpeg \
+--enable-decoder=png \
+--enable-decoder=h264_mediacodec \
+--enable-hwaccel=h264_mediacodec \
+--disable-encoders \
+--enable-encoder=aac \
+--enable-encoder=h264 \
+--enable-encoder=hevc \
+--enable-encoder=png \
+--enable-encoder=mjpeg \
+--disable-demuxers \
+--enable-demuxer=aac \
+--enable-demuxer=concat \
+--enable-demuxer=data \
+--enable-demuxer=flv \
+--enable-demuxer=hls \
+--enable-demuxer=live_flv \
+--enable-demuxer=mov \
+--enable-demuxer=mp3 \
+--enable-demuxer=mpegps \
+--enable-demuxer=mpegts \
+--enable-demuxer=mpegvideo \
+--enable-demuxer=flac \
+--enable-demuxer=hevc \
+--enable-demuxer=webm_dash_manifest \
+--enable-demuxer=rtsp \
+--enable-demuxer=rtp \
+--enable-demuxer=h264 \
+--enable-demuxer=mp4 \
+--enable-demuxer=image2 \
+--disable-muxers \
+--enable-muxer=rtsp \
+--enable-muxer=rtp \
+--enable-muxer=flv \
+--enable-muxer=h264 \
+--enable-muxer=mp4 \
+--enable-muxer=hevc \
+--enable-muxer=image2 \
+--disable-parsers \
+--enable-parser=aac \
+--enable-parser=aac_latm \
+--enable-parser=h264 \
+--enable-parser=flac \
+--enable-parser=hevc \
+--enable-protocols \
+--enable-protocol=async \
+--disable-protocol=bluray \
+--disable-protocol=concat \
+--disable-protocol=crypto \
+--disable-protocol=ffrtmpcrypt \
+--enable-protocol=ffrtmphttp \
+--disable-protocol=gopher \
+--disable-protocol=icecast \
+'--disable-protocol=librtmp*' \
+--disable-protocol=libssh \
+--disable-protocol=md5 \
+--disable-protocol=mmsh \
+--disable-protocol=mmst \
+'--disable-protocol=rtmp*' \
+--enable-protocol=rtmp \
+--enable-protocol=rtmpt \
+--disable-protocol=rtp \
+--disable-protocol=sctp \
+--disable-protocol=srtp \
+--disable-protocol=subfile \
+--disable-protocol=unix \
+--disable-indevs \
+--disable-outdevs \
+--disable-stripping \
+--enable-asm
+}
+build_one || exit 1 # stop when configure fails instead of compiling a stale or missing configuration
+make || exit 1 # do not install the results of a failed build
+make install
diff --git a/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/ffmpeg-lite/build_free_arm_lite.sh b/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/ffmpeg-lite/build_free_arm_lite.sh
new file mode 100644
index 0000000..cb8d6f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/ffmpeg-lite/build_free_arm_lite.sh
@@ -0,0 +1,119 @@
+# build.sh
+# 在Linux下编译FFmpeg成功的脚本
+# 注意Linux和windows的换行符\r\n不太一样，要转换（dos2unix）
+#!/bin/sh
+make clean
+export NDK=~/work/android-ndk-r15c-linux-x86_64/android-ndk-r15c
+export PREBUILT=$NDK/toolchains/arm-linux-androideabi-4.9/prebuilt
+export PLATFORM=$NDK/platforms/android-21/arch-arm
+export PREFIX=../fflib/free-arm-lite
+build_one(){ # Runs FFmpeg's ./configure for the Android arm cross-build; glob arguments are quoted so they reach configure verbatim.
+./configure --target-os=android --prefix="$PREFIX" \
+--enable-cross-compile \
+--enable-runtime-cpudetect \
+--arch=arm \
+--cross-prefix="$PREBUILT/linux-x86_64/bin/arm-linux-androideabi-" \
+--cc="$PREBUILT/linux-x86_64/bin/arm-linux-androideabi-gcc" \
+--nm="$PREBUILT/linux-x86_64/bin/arm-linux-androideabi-nm" \
+--sysroot="$PLATFORM" \
+--disable-gpl --disable-nonfree \
+--enable-shared --enable-static --enable-small \
+--disable-doc --disable-ffprobe --disable-ffplay --disable-debug \
+--enable-jni \
+--enable-mediacodec \
+--disable-avdevice \
+--enable-avcodec \
+--enable-avformat \
+--enable-avutil \
+--enable-swresample \
+--enable-swscale \
+--disable-postproc \
+--enable-avfilter \
+--disable-avresample \
+--disable-decoders \
+--enable-decoder=aac \
+--enable-decoder=aac_latm \
+--enable-decoder=flv \
+--enable-decoder=h264 \
+'--enable-decoder=mp3*' \
+--enable-decoder=vp6f \
+--enable-decoder=flac \
+--enable-decoder=hevc \
+--enable-decoder=vp8 \
+--enable-decoder=vp9 \
+--enable-decoder=amrnb \
+--enable-decoder=amrwb \
+--enable-decoder=mjpeg \
+--enable-decoder=png \
+--enable-decoder=h264_mediacodec \
+--enable-hwaccel=h264_mediacodec \
+--disable-encoders \
+--enable-encoder=aac \
+--enable-encoder=h264 \
+--enable-encoder=hevc \
+--enable-encoder=png \
+--enable-encoder=mjpeg \
+--disable-demuxers \
+--enable-demuxer=aac \
+--enable-demuxer=concat \
+--enable-demuxer=data \
+--enable-demuxer=flv \
+--enable-demuxer=hls \
+--enable-demuxer=live_flv \
+--enable-demuxer=mov \
+--enable-demuxer=mp3 \
+--enable-demuxer=mpegps \
+--enable-demuxer=mpegts \
+--enable-demuxer=mpegvideo \
+--enable-demuxer=flac \
+--enable-demuxer=hevc \
+--enable-demuxer=webm_dash_manifest \
+--enable-demuxer=rtsp \
+--enable-demuxer=rtp \
+--enable-demuxer=h264 \
+--enable-demuxer=mp4 \
+--enable-demuxer=image2 \
+--disable-muxers \
+--enable-muxer=rtsp \
+--enable-muxer=rtp \
+--enable-muxer=flv \
+--enable-muxer=h264 \
+--enable-muxer=mp4 \
+--enable-muxer=hevc \
+--enable-muxer=image2 \
+--disable-parsers \
+--enable-parser=aac \
+--enable-parser=aac_latm \
+--enable-parser=h264 \
+--enable-parser=flac \
+--enable-parser=hevc \
+--enable-protocols \
+--enable-protocol=async \
+--disable-protocol=bluray \
+--disable-protocol=concat \
+--disable-protocol=crypto \
+--disable-protocol=ffrtmpcrypt \
+--enable-protocol=ffrtmphttp \
+--disable-protocol=gopher \
+--disable-protocol=icecast \
+'--disable-protocol=librtmp*' \
+--disable-protocol=libssh \
+--disable-protocol=md5 \
+--disable-protocol=mmsh \
+--disable-protocol=mmst \
+'--disable-protocol=rtmp*' \
+--enable-protocol=rtmp \
+--enable-protocol=rtmpt \
+--disable-protocol=rtp \
+--disable-protocol=sctp \
+--disable-protocol=srtp \
+--disable-protocol=subfile \
+--disable-protocol=unix \
+--disable-indevs \
+--disable-outdevs \
+--disable-stripping \
+--enable-asm
+}
+build_one || exit 1 # stop when configure fails instead of compiling a stale or missing configuration
+make || exit 1 # do not install the results of a failed build
+make install
diff --git a/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/libcurl.la b/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/libcurl.la
new file mode 100644
index 0000000..a4accf9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/armeabi-v7a/libcurl.la
@@ -0,0 +1,41 @@
+# libcurl.la - a libtool library file
+# Generated by libtool (GNU libtool) 2.4.6
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname=''
+
+# Names of this library.
+library_names=''
+
+# The name of the static archive.
+old_library='libcurl.a'
+
+# Linker flags that cannot go in dependency_libs.
+inherited_linker_flags=''
+
+# Libraries that this one depends upon.
+dependency_libs=' -L/Users/rying/repo/openssl-curl-android/openssl/build/armeabi-v7a/lib -lssl -lcrypto -lz'
+
+# Names of additional weak libraries provided by this library
+weak_library_names=''
+
+# Version information for libcurl.
+current=0
+age=0
+revision=0
+
+# Is this an already installed library?
+installed=yes
+
+# Should we warn about portability when linking against -modules?
+shouldnotlink=no
+
+# Files to dlopen/dlpreopen
+dlopen=''
+dlpreopen=''
+
+# Directory that this library needs to be installed in:
+libdir='/Users/rying/repo/openssl-curl-android/curl/build/armeabi-v7a/lib'
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avcodec.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avcodec.h
new file mode 100644
index 0000000..57334df
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avcodec.h
@@ -0,0 +1,6342 @@
+/*
+ * copyright (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVCODEC_H
+#define AVCODEC_AVCODEC_H
+
+/**
+ * @file
+ * @ingroup libavc
+ * Libavcodec external API header
+ */
+
+#include <errno.h>
+#include "libavutil/samplefmt.h"
+#include "libavutil/attributes.h"
+#include "libavutil/avutil.h"
+#include "libavutil/buffer.h"
+#include "libavutil/cpu.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/dict.h"
+#include "libavutil/frame.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/rational.h"
+
+#include "version.h"
+
+/**
+ * @defgroup libavc libavcodec
+ * Encoding/Decoding Library
+ *
+ * @{
+ *
+ * @defgroup lavc_decoding Decoding
+ * @{
+ * @}
+ *
+ * @defgroup lavc_encoding Encoding
+ * @{
+ * @}
+ *
+ * @defgroup lavc_codec Codecs
+ * @{
+ * @defgroup lavc_codec_native Native Codecs
+ * @{
+ * @}
+ * @defgroup lavc_codec_wrappers External library wrappers
+ * @{
+ * @}
+ * @defgroup lavc_codec_hwaccel Hardware Accelerators bridge
+ * @{
+ * @}
+ * @}
+ * @defgroup lavc_internal Internal
+ * @{
+ * @}
+ * @}
+ */
+
+/**
+ * @ingroup libavc
+ * @defgroup lavc_encdec send/receive encoding and decoding API overview
+ * @{
+ *
+ * The avcodec_send_packet()/avcodec_receive_frame()/avcodec_send_frame()/
+ * avcodec_receive_packet() functions provide an encode/decode API, which
+ * decouples input and output.
+ *
+ * The API is very similar for encoding/decoding and audio/video, and works as
+ * follows:
+ * - Set up and open the AVCodecContext as usual.
+ * - Send valid input:
+ *   - For decoding, call avcodec_send_packet() to give the decoder raw
+ *     compressed data in an AVPacket.
+ *   - For encoding, call avcodec_send_frame() to give the encoder an AVFrame
+ *     containing uncompressed audio or video.
+ *   In both cases, it is recommended that AVPackets and AVFrames are
+ *   refcounted, or libavcodec might have to copy the input data. (libavformat
+ *   always returns refcounted AVPackets, and av_frame_get_buffer() allocates
+ *   refcounted AVFrames.)
+ * - Receive output in a loop. Periodically call one of the avcodec_receive_*()
+ *   functions and process their output:
+ *   - For decoding, call avcodec_receive_frame(). On success, it will return
+ *     an AVFrame containing uncompressed audio or video data.
+ *   - For encoding, call avcodec_receive_packet(). On success, it will return
+ *     an AVPacket with a compressed frame.
+ *   Repeat this call until it returns AVERROR(EAGAIN) or an error. The
+ *   AVERROR(EAGAIN) return value means that new input data is required to
+ *   return new output. In this case, continue with sending input. For each
+ *   input frame/packet, the codec will typically return 1 output frame/packet,
+ *   but it can also be 0 or more than 1.
+ *
+ * At the beginning of decoding or encoding, the codec might accept multiple
+ * input frames/packets without returning a frame, until its internal buffers
+ * are filled. This situation is handled transparently if you follow the steps
+ * outlined above.
+ *
+ * In theory, sending input can result in EAGAIN - this should happen only if
+ * not all output was received. You can use this to structure alternative decode
+ * or encode loops other than the one suggested above. For example, you could
+ * try sending new input on each iteration, and try to receive output if that
+ * returns EAGAIN.
+ *
+ * End of stream situations. These require "flushing" (aka draining) the codec,
+ * as the codec might buffer multiple frames or packets internally for
+ * performance or out of necessity (consider B-frames).
+ * This is handled as follows:
+ * - Instead of valid input, send NULL to the avcodec_send_packet() (decoding)
+ *   or avcodec_send_frame() (encoding) functions. This will enter draining
+ *   mode.
+ * - Call avcodec_receive_frame() (decoding) or avcodec_receive_packet()
+ *   (encoding) in a loop until AVERROR_EOF is returned. The functions will
+ *   not return AVERROR(EAGAIN), unless you forgot to enter draining mode.
+ * - Before decoding can be resumed again, the codec has to be reset with
+ *   avcodec_flush_buffers().
+ *
+ * Using the API as outlined above is highly recommended. But it is also
+ * possible to call functions outside of this rigid schema. For example, you can
+ * call avcodec_send_packet() repeatedly without calling
+ * avcodec_receive_frame(). In this case, avcodec_send_packet() will succeed
+ * until the codec's internal buffer has been filled up (which is typically of
+ * size 1 per output frame, after initial input), and then reject input with
+ * AVERROR(EAGAIN). Once it starts rejecting input, you have no choice but to
+ * read at least some output.
+ *
+ * Not all codecs will follow a rigid and predictable dataflow; the only
+ * guarantee is that an AVERROR(EAGAIN) return value on a send/receive call on
+ * one end implies that a receive/send call on the other end will succeed, or
+ * at least will not fail with AVERROR(EAGAIN). In general, no codec will
+ * permit unlimited buffering of input or output.
+ *
+ * This API replaces the following legacy functions:
+ * - avcodec_decode_video2() and avcodec_decode_audio4():
+ *   Use avcodec_send_packet() to feed input to the decoder, then use
+ *   avcodec_receive_frame() to receive decoded frames after each packet.
+ *   Unlike with the old video decoding API, multiple frames might result from
+ *   a packet. For audio, splitting the input packet into frames by partially
+ *   decoding packets becomes transparent to the API user. You never need to
+ *   feed an AVPacket to the API twice (unless it is rejected with AVERROR(EAGAIN) - then
+ *   no data was read from the packet).
+ *   Additionally, sending a flush/draining packet is required only once.
+ * - avcodec_encode_video2()/avcodec_encode_audio2():
+ *   Use avcodec_send_frame() to feed input to the encoder, then use
+ *   avcodec_receive_packet() to receive encoded packets.
+ *   Providing user-allocated buffers for avcodec_receive_packet() is not
+ *   possible.
+ * - The new API does not handle subtitles yet.
+ *
+ * Mixing new and old function calls on the same AVCodecContext is not allowed,
+ * and will result in undefined behavior.
+ *
+ * Some codecs might require using the new API; using the old API will return
+ * an error when calling it. All codecs support the new API.
+ *
+ * A codec is not allowed to return AVERROR(EAGAIN) for both sending and receiving. This
+ * would be an invalid state, which could put the codec user into an endless
+ * loop. The API has no concept of time either: it cannot happen that trying to
+ * do avcodec_send_packet() results in AVERROR(EAGAIN), but a repeated call 1 second
+ * later accepts the packet (with no other receive/flush API calls involved).
+ * The API is a strict state machine, and the passage of time is not supposed
+ * to influence it. Some timing-dependent behavior might still be deemed
+ * acceptable in certain cases. But it must never result in both send/receive
+ * returning EAGAIN at the same time at any point. It must also absolutely be
+ * avoided that the current state is "unstable" and can "flip-flop" between
+ * the send/receive APIs allowing progress. For example, it's not allowed that
+ * the codec randomly decides that it actually wants to consume a packet now
+ * instead of returning a frame, after it just returned AVERROR(EAGAIN) on an
+ * avcodec_send_packet() call.
+ * @}
+ */
+
+/**
+ * @defgroup lavc_core Core functions/structures.
+ * @ingroup libavc
+ *
+ * Basic definitions, functions for querying libavcodec capabilities,
+ * allocating core structures, etc.
+ * @{
+ */
+
+
+/**
+ * Identify the syntax and semantics of the bitstream.
+ * The principle is roughly:
+ * Two decoders with the same ID can decode the same streams.
+ * Two encoders with the same ID can encode compatible streams.
+ * There may be slight deviations from the principle due to implementation
+ * details.
+ *
+ * If you add a codec ID to this list, add it so that
+ * 1. no value of an existing codec ID changes (that would break ABI),
+ * 2. it is as close as possible to similar codecs
+ *
+ * After adding new codec IDs, do not forget to add an entry to the codec
+ * descriptor list and bump libavcodec minor version.
+ */
+enum AVCodecID {
+    AV_CODEC_ID_NONE,
+
+    /* video codecs */
+    AV_CODEC_ID_MPEG1VIDEO,
+    AV_CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding
+#if FF_API_XVMC
+    AV_CODEC_ID_MPEG2VIDEO_XVMC,
+#endif /* FF_API_XVMC */
+    AV_CODEC_ID_H261,
+    AV_CODEC_ID_H263,
+    AV_CODEC_ID_RV10,
+    AV_CODEC_ID_RV20,
+    AV_CODEC_ID_MJPEG,
+    AV_CODEC_ID_MJPEGB,
+    AV_CODEC_ID_LJPEG,
+    AV_CODEC_ID_SP5X,
+    AV_CODEC_ID_JPEGLS,
+    AV_CODEC_ID_MPEG4,
+    AV_CODEC_ID_RAWVIDEO,
+    AV_CODEC_ID_MSMPEG4V1,
+    AV_CODEC_ID_MSMPEG4V2,
+    AV_CODEC_ID_MSMPEG4V3,
+    AV_CODEC_ID_WMV1,
+    AV_CODEC_ID_WMV2,
+    AV_CODEC_ID_H263P,
+    AV_CODEC_ID_H263I,
+    AV_CODEC_ID_FLV1,
+    AV_CODEC_ID_SVQ1,
+    AV_CODEC_ID_SVQ3,
+    AV_CODEC_ID_DVVIDEO,
+    AV_CODEC_ID_HUFFYUV,
+    AV_CODEC_ID_CYUV,
+    AV_CODEC_ID_H264,
+    AV_CODEC_ID_INDEO3,
+    AV_CODEC_ID_VP3,
+    AV_CODEC_ID_THEORA,
+    AV_CODEC_ID_ASV1,
+    AV_CODEC_ID_ASV2,
+    AV_CODEC_ID_FFV1,
+    AV_CODEC_ID_4XM,
+    AV_CODEC_ID_VCR1,
+    AV_CODEC_ID_CLJR,
+    AV_CODEC_ID_MDEC,
+    AV_CODEC_ID_ROQ,
+    AV_CODEC_ID_INTERPLAY_VIDEO,
+    AV_CODEC_ID_XAN_WC3,
+    AV_CODEC_ID_XAN_WC4,
+    AV_CODEC_ID_RPZA,
+    AV_CODEC_ID_CINEPAK,
+    AV_CODEC_ID_WS_VQA,
+    AV_CODEC_ID_MSRLE,
+    AV_CODEC_ID_MSVIDEO1,
+    AV_CODEC_ID_IDCIN,
+    AV_CODEC_ID_8BPS,
+    AV_CODEC_ID_SMC,
+    AV_CODEC_ID_FLIC,
+    AV_CODEC_ID_TRUEMOTION1,
+    AV_CODEC_ID_VMDVIDEO,
+    AV_CODEC_ID_MSZH,
+    AV_CODEC_ID_ZLIB,
+    AV_CODEC_ID_QTRLE,
+    AV_CODEC_ID_TSCC,
+    AV_CODEC_ID_ULTI,
+    AV_CODEC_ID_QDRAW,
+    AV_CODEC_ID_VIXL,
+    AV_CODEC_ID_QPEG,
+    AV_CODEC_ID_PNG,
+    AV_CODEC_ID_PPM,
+    AV_CODEC_ID_PBM,
+    AV_CODEC_ID_PGM,
+    AV_CODEC_ID_PGMYUV,
+    AV_CODEC_ID_PAM,
+    AV_CODEC_ID_FFVHUFF,
+    AV_CODEC_ID_RV30,
+    AV_CODEC_ID_RV40,
+    AV_CODEC_ID_VC1,
+    AV_CODEC_ID_WMV3,
+    AV_CODEC_ID_LOCO,
+    AV_CODEC_ID_WNV1,
+    AV_CODEC_ID_AASC,
+    AV_CODEC_ID_INDEO2,
+    AV_CODEC_ID_FRAPS,
+    AV_CODEC_ID_TRUEMOTION2,
+    AV_CODEC_ID_BMP,
+    AV_CODEC_ID_CSCD,
+    AV_CODEC_ID_MMVIDEO,
+    AV_CODEC_ID_ZMBV,
+    AV_CODEC_ID_AVS,
+    AV_CODEC_ID_SMACKVIDEO,
+    AV_CODEC_ID_NUV,
+    AV_CODEC_ID_KMVC,
+    AV_CODEC_ID_FLASHSV,
+    AV_CODEC_ID_CAVS,
+    AV_CODEC_ID_JPEG2000,
+    AV_CODEC_ID_VMNC,
+    AV_CODEC_ID_VP5,
+    AV_CODEC_ID_VP6,
+    AV_CODEC_ID_VP6F,
+    AV_CODEC_ID_TARGA,
+    AV_CODEC_ID_DSICINVIDEO,
+    AV_CODEC_ID_TIERTEXSEQVIDEO,
+    AV_CODEC_ID_TIFF,
+    AV_CODEC_ID_GIF,
+    AV_CODEC_ID_DXA,
+    AV_CODEC_ID_DNXHD,
+    AV_CODEC_ID_THP,
+    AV_CODEC_ID_SGI,
+    AV_CODEC_ID_C93,
+    AV_CODEC_ID_BETHSOFTVID,
+    AV_CODEC_ID_PTX,
+    AV_CODEC_ID_TXD,
+    AV_CODEC_ID_VP6A,
+    AV_CODEC_ID_AMV,
+    AV_CODEC_ID_VB,
+    AV_CODEC_ID_PCX,
+    AV_CODEC_ID_SUNRAST,
+    AV_CODEC_ID_INDEO4,
+    AV_CODEC_ID_INDEO5,
+    AV_CODEC_ID_MIMIC,
+    AV_CODEC_ID_RL2,
+    AV_CODEC_ID_ESCAPE124,
+    AV_CODEC_ID_DIRAC,
+    AV_CODEC_ID_BFI,
+    AV_CODEC_ID_CMV,
+    AV_CODEC_ID_MOTIONPIXELS,
+    AV_CODEC_ID_TGV,
+    AV_CODEC_ID_TGQ,
+    AV_CODEC_ID_TQI,
+    AV_CODEC_ID_AURA,
+    AV_CODEC_ID_AURA2,
+    AV_CODEC_ID_V210X,
+    AV_CODEC_ID_TMV,
+    AV_CODEC_ID_V210,
+    AV_CODEC_ID_DPX,
+    AV_CODEC_ID_MAD,
+    AV_CODEC_ID_FRWU,
+    AV_CODEC_ID_FLASHSV2,
+    AV_CODEC_ID_CDGRAPHICS,
+    AV_CODEC_ID_R210,
+    AV_CODEC_ID_ANM,
+    AV_CODEC_ID_BINKVIDEO,
+    AV_CODEC_ID_IFF_ILBM,
+#define AV_CODEC_ID_IFF_BYTERUN1 AV_CODEC_ID_IFF_ILBM
+    AV_CODEC_ID_KGV1,
+    AV_CODEC_ID_YOP,
+    AV_CODEC_ID_VP8,
+    AV_CODEC_ID_PICTOR,
+    AV_CODEC_ID_ANSI,
+    AV_CODEC_ID_A64_MULTI,
+    AV_CODEC_ID_A64_MULTI5,
+    AV_CODEC_ID_R10K,
+    AV_CODEC_ID_MXPEG,
+    AV_CODEC_ID_LAGARITH,
+    AV_CODEC_ID_PRORES,
+    AV_CODEC_ID_JV,
+    AV_CODEC_ID_DFA,
+    AV_CODEC_ID_WMV3IMAGE,
+    AV_CODEC_ID_VC1IMAGE,
+    AV_CODEC_ID_UTVIDEO,
+    AV_CODEC_ID_BMV_VIDEO,
+    AV_CODEC_ID_VBLE,
+    AV_CODEC_ID_DXTORY,
+    AV_CODEC_ID_V410,
+    AV_CODEC_ID_XWD,
+    AV_CODEC_ID_CDXL,
+    AV_CODEC_ID_XBM,
+    AV_CODEC_ID_ZEROCODEC,
+    AV_CODEC_ID_MSS1,
+    AV_CODEC_ID_MSA1,
+    AV_CODEC_ID_TSCC2,
+    AV_CODEC_ID_MTS2,
+    AV_CODEC_ID_CLLC,
+    AV_CODEC_ID_MSS2,
+    AV_CODEC_ID_VP9,
+    AV_CODEC_ID_AIC,
+    AV_CODEC_ID_ESCAPE130,
+    AV_CODEC_ID_G2M,
+    AV_CODEC_ID_WEBP,
+    AV_CODEC_ID_HNM4_VIDEO,
+    AV_CODEC_ID_HEVC,
+#define AV_CODEC_ID_H265 AV_CODEC_ID_HEVC
+    AV_CODEC_ID_FIC,
+    AV_CODEC_ID_ALIAS_PIX,
+    AV_CODEC_ID_BRENDER_PIX,
+    AV_CODEC_ID_PAF_VIDEO,
+    AV_CODEC_ID_EXR,
+    AV_CODEC_ID_VP7,
+    AV_CODEC_ID_SANM,
+    AV_CODEC_ID_SGIRLE,
+    AV_CODEC_ID_MVC1,
+    AV_CODEC_ID_MVC2,
+    AV_CODEC_ID_HQX,
+    AV_CODEC_ID_TDSC,
+    AV_CODEC_ID_HQ_HQA,
+    AV_CODEC_ID_HAP,
+    AV_CODEC_ID_DDS,
+    AV_CODEC_ID_DXV,
+    AV_CODEC_ID_SCREENPRESSO,
+    AV_CODEC_ID_RSCC,
+
+    AV_CODEC_ID_Y41P = 0x8000,
+    AV_CODEC_ID_AVRP,
+    AV_CODEC_ID_012V,
+    AV_CODEC_ID_AVUI,
+    AV_CODEC_ID_AYUV,
+    AV_CODEC_ID_TARGA_Y216,
+    AV_CODEC_ID_V308,
+    AV_CODEC_ID_V408,
+    AV_CODEC_ID_YUV4,
+    AV_CODEC_ID_AVRN,
+    AV_CODEC_ID_CPIA,
+    AV_CODEC_ID_XFACE,
+    AV_CODEC_ID_SNOW,
+    AV_CODEC_ID_SMVJPEG,
+    AV_CODEC_ID_APNG,
+    AV_CODEC_ID_DAALA,
+    AV_CODEC_ID_CFHD,
+    AV_CODEC_ID_TRUEMOTION2RT,
+    AV_CODEC_ID_M101,
+    AV_CODEC_ID_MAGICYUV,
+    AV_CODEC_ID_SHEERVIDEO,
+    AV_CODEC_ID_YLC,
+    AV_CODEC_ID_PSD,
+    AV_CODEC_ID_PIXLET,
+    AV_CODEC_ID_SPEEDHQ,
+    AV_CODEC_ID_FMVC,
+    AV_CODEC_ID_SCPR,
+    AV_CODEC_ID_CLEARVIDEO,
+    AV_CODEC_ID_XPM,
+    AV_CODEC_ID_AV1,
+
+    /* various PCM "codecs" */
+    AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
+    AV_CODEC_ID_PCM_S16LE = 0x10000,
+    AV_CODEC_ID_PCM_S16BE,
+    AV_CODEC_ID_PCM_U16LE,
+    AV_CODEC_ID_PCM_U16BE,
+    AV_CODEC_ID_PCM_S8,
+    AV_CODEC_ID_PCM_U8,
+    AV_CODEC_ID_PCM_MULAW,
+    AV_CODEC_ID_PCM_ALAW,
+    AV_CODEC_ID_PCM_S32LE,
+    AV_CODEC_ID_PCM_S32BE,
+    AV_CODEC_ID_PCM_U32LE,
+    AV_CODEC_ID_PCM_U32BE,
+    AV_CODEC_ID_PCM_S24LE,
+    AV_CODEC_ID_PCM_S24BE,
+    AV_CODEC_ID_PCM_U24LE,
+    AV_CODEC_ID_PCM_U24BE,
+    AV_CODEC_ID_PCM_S24DAUD,
+    AV_CODEC_ID_PCM_ZORK,
+    AV_CODEC_ID_PCM_S16LE_PLANAR,
+    AV_CODEC_ID_PCM_DVD,
+    AV_CODEC_ID_PCM_F32BE,
+    AV_CODEC_ID_PCM_F32LE,
+    AV_CODEC_ID_PCM_F64BE,
+    AV_CODEC_ID_PCM_F64LE,
+    AV_CODEC_ID_PCM_BLURAY,
+    AV_CODEC_ID_PCM_LXF,
+    AV_CODEC_ID_S302M,
+    AV_CODEC_ID_PCM_S8_PLANAR,
+    AV_CODEC_ID_PCM_S24LE_PLANAR,
+    AV_CODEC_ID_PCM_S32LE_PLANAR,
+    AV_CODEC_ID_PCM_S16BE_PLANAR,
+
+    AV_CODEC_ID_PCM_S64LE = 0x10800,
+    AV_CODEC_ID_PCM_S64BE,
+    AV_CODEC_ID_PCM_F16LE,
+    AV_CODEC_ID_PCM_F24LE,
+
+    /* various ADPCM codecs */
+    AV_CODEC_ID_ADPCM_IMA_QT = 0x11000,
+    AV_CODEC_ID_ADPCM_IMA_WAV,
+    AV_CODEC_ID_ADPCM_IMA_DK3,
+    AV_CODEC_ID_ADPCM_IMA_DK4,
+    AV_CODEC_ID_ADPCM_IMA_WS,
+    AV_CODEC_ID_ADPCM_IMA_SMJPEG,
+    AV_CODEC_ID_ADPCM_MS,
+    AV_CODEC_ID_ADPCM_4XM,
+    AV_CODEC_ID_ADPCM_XA,
+    AV_CODEC_ID_ADPCM_ADX,
+    AV_CODEC_ID_ADPCM_EA,
+    AV_CODEC_ID_ADPCM_G726,
+    AV_CODEC_ID_ADPCM_CT,
+    AV_CODEC_ID_ADPCM_SWF,
+    AV_CODEC_ID_ADPCM_YAMAHA,
+    AV_CODEC_ID_ADPCM_SBPRO_4,
+    AV_CODEC_ID_ADPCM_SBPRO_3,
+    AV_CODEC_ID_ADPCM_SBPRO_2,
+    AV_CODEC_ID_ADPCM_THP,
+    AV_CODEC_ID_ADPCM_IMA_AMV,
+    AV_CODEC_ID_ADPCM_EA_R1,
+    AV_CODEC_ID_ADPCM_EA_R3,
+    AV_CODEC_ID_ADPCM_EA_R2,
+    AV_CODEC_ID_ADPCM_IMA_EA_SEAD,
+    AV_CODEC_ID_ADPCM_IMA_EA_EACS,
+    AV_CODEC_ID_ADPCM_EA_XAS,
+    AV_CODEC_ID_ADPCM_EA_MAXIS_XA,
+    AV_CODEC_ID_ADPCM_IMA_ISS,
+    AV_CODEC_ID_ADPCM_G722,
+    AV_CODEC_ID_ADPCM_IMA_APC,
+    AV_CODEC_ID_ADPCM_VIMA,
+#if FF_API_VIMA_DECODER
+    AV_CODEC_ID_VIMA = AV_CODEC_ID_ADPCM_VIMA,
+#endif
+
+    AV_CODEC_ID_ADPCM_AFC = 0x11800,
+    AV_CODEC_ID_ADPCM_IMA_OKI,
+    AV_CODEC_ID_ADPCM_DTK,
+    AV_CODEC_ID_ADPCM_IMA_RAD,
+    AV_CODEC_ID_ADPCM_G726LE,
+    AV_CODEC_ID_ADPCM_THP_LE,
+    AV_CODEC_ID_ADPCM_PSX,
+    AV_CODEC_ID_ADPCM_AICA,
+    AV_CODEC_ID_ADPCM_IMA_DAT4,
+    AV_CODEC_ID_ADPCM_MTAF,
+
+    /* AMR */
+    AV_CODEC_ID_AMR_NB = 0x12000,
+    AV_CODEC_ID_AMR_WB,
+
+    /* RealAudio codecs*/
+    AV_CODEC_ID_RA_144 = 0x13000,
+    AV_CODEC_ID_RA_288,
+
+    /* various DPCM codecs */
+    AV_CODEC_ID_ROQ_DPCM = 0x14000,
+    AV_CODEC_ID_INTERPLAY_DPCM,
+    AV_CODEC_ID_XAN_DPCM,
+    AV_CODEC_ID_SOL_DPCM,
+
+    AV_CODEC_ID_SDX2_DPCM = 0x14800,
+
+    /* audio codecs */
+    AV_CODEC_ID_MP2 = 0x15000,
+    AV_CODEC_ID_MP3, ///< preferred ID for decoding MPEG audio layer 1, 2 or 3
+    AV_CODEC_ID_AAC,
+    AV_CODEC_ID_AC3,
+    AV_CODEC_ID_DTS,
+    AV_CODEC_ID_VORBIS,
+    AV_CODEC_ID_DVAUDIO,
+    AV_CODEC_ID_WMAV1,
+    AV_CODEC_ID_WMAV2,
+    AV_CODEC_ID_MACE3,
+    AV_CODEC_ID_MACE6,
+    AV_CODEC_ID_VMDAUDIO,
+    AV_CODEC_ID_FLAC,
+    AV_CODEC_ID_MP3ADU,
+    AV_CODEC_ID_MP3ON4,
+    AV_CODEC_ID_SHORTEN,
+    AV_CODEC_ID_ALAC,
+    AV_CODEC_ID_WESTWOOD_SND1,
+    AV_CODEC_ID_GSM, ///< as in Berlin toast format
+    AV_CODEC_ID_QDM2,
+    AV_CODEC_ID_COOK,
+    AV_CODEC_ID_TRUESPEECH,
+    AV_CODEC_ID_TTA,
+    AV_CODEC_ID_SMACKAUDIO,
+    AV_CODEC_ID_QCELP,
+    AV_CODEC_ID_WAVPACK,
+    AV_CODEC_ID_DSICINAUDIO,
+    AV_CODEC_ID_IMC,
+    AV_CODEC_ID_MUSEPACK7,
+    AV_CODEC_ID_MLP,
+    AV_CODEC_ID_GSM_MS, /* as found in WAV */
+    AV_CODEC_ID_ATRAC3,
+#if FF_API_VOXWARE
+    AV_CODEC_ID_VOXWARE,
+#endif
+    AV_CODEC_ID_APE,
+    AV_CODEC_ID_NELLYMOSER,
+    AV_CODEC_ID_MUSEPACK8,
+    AV_CODEC_ID_SPEEX,
+    AV_CODEC_ID_WMAVOICE,
+    AV_CODEC_ID_WMAPRO,
+    AV_CODEC_ID_WMALOSSLESS,
+    AV_CODEC_ID_ATRAC3P,
+    AV_CODEC_ID_EAC3,
+    AV_CODEC_ID_SIPR,
+    AV_CODEC_ID_MP1,
+    AV_CODEC_ID_TWINVQ,
+    AV_CODEC_ID_TRUEHD,
+    AV_CODEC_ID_MP4ALS,
+    AV_CODEC_ID_ATRAC1,
+    AV_CODEC_ID_BINKAUDIO_RDFT,
+    AV_CODEC_ID_BINKAUDIO_DCT,
+    AV_CODEC_ID_AAC_LATM,
+    AV_CODEC_ID_QDMC,
+    AV_CODEC_ID_CELT,
+    AV_CODEC_ID_G723_1,
+    AV_CODEC_ID_G729,
+    AV_CODEC_ID_8SVX_EXP,
+    AV_CODEC_ID_8SVX_FIB,
+    AV_CODEC_ID_BMV_AUDIO,
+    AV_CODEC_ID_RALF,
+    AV_CODEC_ID_IAC,
+    AV_CODEC_ID_ILBC,
+    AV_CODEC_ID_OPUS,
+    AV_CODEC_ID_COMFORT_NOISE,
+    AV_CODEC_ID_TAK,
+    AV_CODEC_ID_METASOUND,
+    AV_CODEC_ID_PAF_AUDIO,
+    AV_CODEC_ID_ON2AVC,
+    AV_CODEC_ID_DSS_SP,
+
+    AV_CODEC_ID_FFWAVESYNTH = 0x15800,
+    AV_CODEC_ID_SONIC,
+    AV_CODEC_ID_SONIC_LS,
+    AV_CODEC_ID_EVRC,
+    AV_CODEC_ID_SMV,
+    AV_CODEC_ID_DSD_LSBF,
+    AV_CODEC_ID_DSD_MSBF,
+    AV_CODEC_ID_DSD_LSBF_PLANAR,
+    AV_CODEC_ID_DSD_MSBF_PLANAR,
+    AV_CODEC_ID_4GV,
+    AV_CODEC_ID_INTERPLAY_ACM,
+    AV_CODEC_ID_XMA1,
+    AV_CODEC_ID_XMA2,
+    AV_CODEC_ID_DST,
+    AV_CODEC_ID_ATRAC3AL,
+    AV_CODEC_ID_ATRAC3PAL,
+
+    /* subtitle codecs */
+    AV_CODEC_ID_FIRST_SUBTITLE = 0x17000,          ///< A dummy ID pointing at the start of subtitle codecs.
+    AV_CODEC_ID_DVD_SUBTITLE = 0x17000,
+    AV_CODEC_ID_DVB_SUBTITLE,
+    AV_CODEC_ID_TEXT,  ///< raw UTF-8 text
+    AV_CODEC_ID_XSUB,
+    AV_CODEC_ID_SSA,
+    AV_CODEC_ID_MOV_TEXT,
+    AV_CODEC_ID_HDMV_PGS_SUBTITLE,
+    AV_CODEC_ID_DVB_TELETEXT,
+    AV_CODEC_ID_SRT,
+
+    AV_CODEC_ID_MICRODVD   = 0x17800,
+    AV_CODEC_ID_EIA_608,
+    AV_CODEC_ID_JACOSUB,
+    AV_CODEC_ID_SAMI,
+    AV_CODEC_ID_REALTEXT,
+    AV_CODEC_ID_STL,
+    AV_CODEC_ID_SUBVIEWER1,
+    AV_CODEC_ID_SUBVIEWER,
+    AV_CODEC_ID_SUBRIP,
+    AV_CODEC_ID_WEBVTT,
+    AV_CODEC_ID_MPL2,
+    AV_CODEC_ID_VPLAYER,
+    AV_CODEC_ID_PJS,
+    AV_CODEC_ID_ASS,
+    AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
+
+    /* other specific kind of codecs (generally used for attachments) */
+    AV_CODEC_ID_FIRST_UNKNOWN = 0x18000,           ///< A dummy ID pointing at the start of various fake codecs.
+    AV_CODEC_ID_TTF = 0x18000,
+
+    AV_CODEC_ID_SCTE_35, ///< Contain timestamp estimated through PCR of program stream.
+    AV_CODEC_ID_BINTEXT    = 0x18800,
+    AV_CODEC_ID_XBIN,
+    AV_CODEC_ID_IDF,
+    AV_CODEC_ID_OTF,
+    AV_CODEC_ID_SMPTE_KLV,
+    AV_CODEC_ID_DVD_NAV,
+    AV_CODEC_ID_TIMED_ID3,
+    AV_CODEC_ID_BIN_DATA,
+
+
+    AV_CODEC_ID_PROBE = 0x19000, ///< codec_id is not known (like AV_CODEC_ID_NONE) but lavf should attempt to identify it
+
+    AV_CODEC_ID_MPEG2TS = 0x20000, /**< _FAKE_ codec to indicate a raw MPEG-2 TS
+                                * stream (only used by libavformat) */
+    AV_CODEC_ID_MPEG4SYSTEMS = 0x20001, /**< _FAKE_ codec to indicate a MPEG-4 Systems
+                                * stream (only used by libavformat) */
+    AV_CODEC_ID_FFMETADATA = 0x21000,   ///< Dummy codec for streams containing only metadata information.
+    AV_CODEC_ID_WRAPPED_AVFRAME = 0x21001, ///< Passthrough codec, AVFrames wrapped in AVPacket
+};
+
+/**
+ * This struct describes the properties of a single codec described by an
+ * AVCodecID.
+ * @see avcodec_descriptor_get()
+ */
+typedef struct AVCodecDescriptor {
+    enum AVCodecID     id; ///< ID of the codec this descriptor describes.
+    enum AVMediaType type; ///< Media type of the codec, as an enum AVMediaType value.
+    /**
+     * Name of the codec described by this descriptor. It is non-empty and
+     * unique for each codec descriptor. It should contain alphanumeric
+     * characters and '_' only.
+     */
+    const char      *name;
+    /**
+     * A more descriptive name for this codec. May be NULL.
+     */
+    const char *long_name;
+    /**
+     * Codec properties, a bitwise OR of AV_CODEC_PROP_* flags.
+     */
+    int             props;
+    /**
+     * MIME type(s) associated with the codec.
+     * May be NULL; if not, a NULL-terminated array of MIME types.
+     * The first item is always non-NULL and is the preferred MIME type.
+     */
+    const char *const *mime_types;
+    /**
+     * If non-NULL, an array of profiles recognized for this codec.
+     * Terminated with FF_PROFILE_UNKNOWN.
+     */
+    const struct AVProfile *profiles;
+} AVCodecDescriptor;
+
+/**
+ * Codec uses only intra compression.
+ * Video codecs only.
+ */
+#define AV_CODEC_PROP_INTRA_ONLY    (1 << 0)
+/**
+ * Codec supports lossy compression. Audio and video codecs only.
+ * @note a codec may support both lossy and lossless
+ * compression modes
+ */
+#define AV_CODEC_PROP_LOSSY         (1 << 1)
+/**
+ * Codec supports lossless compression. Audio and video codecs only.
+ */
+#define AV_CODEC_PROP_LOSSLESS      (1 << 2)
+/**
+ * Codec supports frame reordering. That is, the coded order (the order in which
+ * the encoded packets are output by the encoders / stored / input to the
+ * decoders) may be different from the presentation order of the corresponding
+ * frames.
+ *
+ * For codecs that do not have this property set, PTS and DTS should always be
+ * equal.
+ */
+#define AV_CODEC_PROP_REORDER       (1 << 3)
+/**
+ * Subtitle codec is bitmap based
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->pict field.
+ */
+#define AV_CODEC_PROP_BITMAP_SUB    (1 << 16)
+/**
+ * Subtitle codec is text based.
+ * Decoded AVSubtitle data can be read from the AVSubtitleRect->ass field.
+ */
+#define AV_CODEC_PROP_TEXT_SUB      (1 << 17)
+
+/**
+ * @ingroup lavc_decoding
+ * Required number of additionally allocated bytes at the end of the input bitstream for decoding.
+ * This is mainly needed because some optimized bitstream readers read
+ * 32 or 64 bit at once and could read over the end.<br>
+ * Note: If the first 23 bits of the additional bytes are not 0, then damaged
+ * MPEG bitstreams could cause overread and segfault.
+ */
+#define AV_INPUT_BUFFER_PADDING_SIZE 32
+
+/**
+ * @ingroup lavc_encoding
+ * minimum encoding buffer size
+ * Used to avoid some checks during header writing.
+ */
+#define AV_INPUT_BUFFER_MIN_SIZE 16384
+
+#if FF_API_WITHOUT_PREFIX
+/**
+ * @deprecated use AV_INPUT_BUFFER_PADDING_SIZE instead
+ */
+#define FF_INPUT_BUFFER_PADDING_SIZE 32
+
+/**
+ * @deprecated use AV_INPUT_BUFFER_MIN_SIZE instead
+ */
+#define FF_MIN_BUFFER_SIZE 16384
+#endif /* FF_API_WITHOUT_PREFIX */
+
+/**
+ * @ingroup lavc_encoding
+ * motion estimation type.
+ * @deprecated use codec private option instead
+ */
+#if FF_API_MOTION_EST
+enum Motion_Est_ID {
+    ME_ZERO = 1,    ///< no search, i.e. the (0,0) vector is used whenever one is needed; numbering starts at 1
+    ME_FULL,
+    ME_LOG,
+    ME_PHODS,
+    ME_EPZS,        ///< enhanced predictive zonal search
+    ME_X1,          ///< reserved for experiments
+    ME_HEX,         ///< hexagon based search
+    ME_UMH,         ///< uneven multi-hexagon search
+    ME_TESA,        ///< transformed exhaustive search algorithm
+    ME_ITER=50,     ///< iterative search; explicitly numbered 50, leaving a gap after ME_TESA (9)
+};
+#endif
+
+/**
+ * @ingroup lavc_decoding
+ */
+enum AVDiscard{
+    /* The values are deliberately spaced apart to leave room for extensions
+     * (drop some keyframes for intra-only or drop just some bidir frames). */
+    AVDISCARD_NONE    =-16, ///< discard nothing
+    AVDISCARD_DEFAULT =  0, ///< discard useless packets, such as 0-size packets in AVI
+    AVDISCARD_NONREF  =  8, ///< discard all non-reference frames
+    AVDISCARD_BIDIR   = 16, ///< discard all bidirectional frames
+    AVDISCARD_NONINTRA= 24, ///< discard all non-intra frames
+    AVDISCARD_NONKEY  = 32, ///< discard all frames except keyframes
+    AVDISCARD_ALL     = 48, ///< discard everything
+};
+
+enum AVAudioServiceType { // Values carried by AV_PKT_DATA_AUDIO_SERVICE_TYPE packet side data.
+    AV_AUDIO_SERVICE_TYPE_MAIN              = 0,
+    AV_AUDIO_SERVICE_TYPE_EFFECTS           = 1,
+    AV_AUDIO_SERVICE_TYPE_VISUALLY_IMPAIRED = 2,
+    AV_AUDIO_SERVICE_TYPE_HEARING_IMPAIRED  = 3,
+    AV_AUDIO_SERVICE_TYPE_DIALOGUE          = 4,
+    AV_AUDIO_SERVICE_TYPE_COMMENTARY        = 5,
+    AV_AUDIO_SERVICE_TYPE_EMERGENCY         = 6,
+    AV_AUDIO_SERVICE_TYPE_VOICE_OVER        = 7,
+    AV_AUDIO_SERVICE_TYPE_KARAOKE           = 8,
+    AV_AUDIO_SERVICE_TYPE_NB                   , ///< Number of service types listed above; not part of ABI
+};
+
+/**
+ * @ingroup lavc_encoding
+ */
+typedef struct RcOverride{
+    int start_frame;
+    int end_frame;
+    int qscale; // If this is 0, quality_factor is used instead of qscale.
+    float quality_factor; // Only used when qscale is 0 (see qscale).
+} RcOverride;
+
+#if FF_API_MAX_BFRAMES
+/**
+ * @deprecated there is no libavcodec-wide limit on the number of B-frames
+ */
+#define FF_MAX_B_FRAMES 16
+#endif
+
+/* encoding support
+   These flags can be passed in AVCodecContext.flags before initialization.
+   Note: Not everything is supported yet.
+*/
+
+/**
+ * Allow decoders to produce frames with data planes that are not aligned
+ * to CPU requirements (e.g. due to cropping).
+ */
+#define AV_CODEC_FLAG_UNALIGNED       (1 <<  0)
+/**
+ * Use fixed qscale.
+ */
+#define AV_CODEC_FLAG_QSCALE          (1 <<  1)
+/**
+ * 4 MV per MB allowed / advanced prediction for H.263.
+ */
+#define AV_CODEC_FLAG_4MV             (1 <<  2)
+/**
+ * Output even those frames that might be corrupted.
+ */
+#define AV_CODEC_FLAG_OUTPUT_CORRUPT  (1 <<  3)
+/**
+ * Use qpel MC.
+ */
+#define AV_CODEC_FLAG_QPEL            (1 <<  4)
+/**
+ * Use internal 2pass ratecontrol in first pass mode.
+ */
+#define AV_CODEC_FLAG_PASS1           (1 <<  9)
+/**
+ * Use internal 2pass ratecontrol in second pass mode.
+ */
+#define AV_CODEC_FLAG_PASS2           (1 << 10)
+/**
+ * loop filter.
+ */
+#define AV_CODEC_FLAG_LOOP_FILTER     (1 << 11)
+/**
+ * Only decode/encode grayscale.
+ */
+#define AV_CODEC_FLAG_GRAY            (1 << 13)
+/**
+ * error[?] variables will be set during encoding.
+ */
+#define AV_CODEC_FLAG_PSNR            (1 << 15)
+/**
+ * Input bitstream might be truncated at a random location
+ * instead of only at frame boundaries.
+ */
+#define AV_CODEC_FLAG_TRUNCATED       (1 << 16)
+/**
+ * Use interlaced DCT.
+ */
+#define AV_CODEC_FLAG_INTERLACED_DCT  (1 << 18)
+/**
+ * Force low delay.
+ */
+#define AV_CODEC_FLAG_LOW_DELAY       (1 << 19)
+/**
+ * Place global headers in extradata instead of every keyframe.
+ */
+#define AV_CODEC_FLAG_GLOBAL_HEADER   (1 << 22)
+/**
+ * Use only bitexact stuff (except (I)DCT).
+ */
+#define AV_CODEC_FLAG_BITEXACT        (1 << 23)
+/* Fx : Flag for H.263+ extra options */
+/**
+ * H.263 advanced intra coding / MPEG-4 AC prediction
+ */
+#define AV_CODEC_FLAG_AC_PRED         (1 << 24)
+/**
+ * interlaced motion estimation
+ */
+#define AV_CODEC_FLAG_INTERLACED_ME   (1 << 29)
+#define AV_CODEC_FLAG_CLOSED_GOP      (1U << 31) /* unsigned literal: 1 << 31 would overflow a 32-bit signed int */
+
+/**
+ * Allow non spec compliant speedup tricks.
+ */
+#define AV_CODEC_FLAG2_FAST           (1 <<  0)
+/**
+ * Skip bitstream encoding.
+ */
+#define AV_CODEC_FLAG2_NO_OUTPUT      (1 <<  2)
+/**
+ * Place global headers at every keyframe instead of in extradata.
+ */
+#define AV_CODEC_FLAG2_LOCAL_HEADER   (1 <<  3)
+
+/**
+ * timecode is in drop frame format. DEPRECATED!!!!
+ */
+#define AV_CODEC_FLAG2_DROP_FRAME_TIMECODE (1 << 13)
+
+/**
+ * Input bitstream might be truncated at packet boundaries
+ * instead of only at frame boundaries.
+ */
+#define AV_CODEC_FLAG2_CHUNKS         (1 << 15)
+/**
+ * Discard cropping information from SPS.
+ */
+#define AV_CODEC_FLAG2_IGNORE_CROP    (1 << 16)
+
+/**
+ * Show all frames before the first keyframe
+ */
+#define AV_CODEC_FLAG2_SHOW_ALL       (1 << 22)
+/**
+ * Export motion vectors through frame side data
+ */
+#define AV_CODEC_FLAG2_EXPORT_MVS     (1 << 28)
+/**
+ * Do not skip samples and export skip information as frame side data
+ */
+#define AV_CODEC_FLAG2_SKIP_MANUAL    (1 << 29)
+/**
+ * Do not reset ASS ReadOrder field on flush (subtitles decoding)
+ */
+#define AV_CODEC_FLAG2_RO_FLUSH_NOOP  (1 << 30)
+
+/* Unsupported options :
+ *              Syntax Arithmetic coding (SAC)
+ *              Reference Picture Selection
+ *              Independent Segment Decoding */
+/* /Fx */
+/* codec capabilities */
+
+/**
+ * Decoder can use draw_horiz_band callback.
+ */
+#define AV_CODEC_CAP_DRAW_HORIZ_BAND     (1 <<  0)
+/**
+ * Codec uses get_buffer() for allocating buffers and supports custom allocators.
+ * If not set, it might not use get_buffer() at all or use operations that
+ * assume the buffer was allocated by avcodec_default_get_buffer.
+ */
+#define AV_CODEC_CAP_DR1                 (1 <<  1)
+#define AV_CODEC_CAP_TRUNCATED           (1 <<  3)
+/**
+ * Encoder or decoder requires flushing with NULL input at the end in order to
+ * give the complete and correct output.
+ *
+ * NOTE: If this flag is not set, the codec is guaranteed to never be fed with
+ *       NULL data. The user can still send NULL data to the public encode
+ *       or decode function, but libavcodec will not pass it along to the codec
+ *       unless this flag is set.
+ *
+ * Decoders:
+ * The decoder has a non-zero delay and needs to be fed with avpkt->data=NULL,
+ * avpkt->size=0 at the end to get the delayed data until the decoder no longer
+ * returns frames.
+ *
+ * Encoders:
+ * The encoder needs to be fed with NULL data at the end of encoding until the
+ * encoder no longer returns data.
+ *
+ * NOTE: For encoders implementing the AVCodec.encode2() function, setting this
+ *       flag also means that the encoder must set the pts and duration for
+ *       each output packet. If this flag is not set, the pts and duration will
+ *       be determined by libavcodec from the input frame.
+ */
+#define AV_CODEC_CAP_DELAY               (1 <<  5)
+/**
+ * Codec can be fed a final frame with a smaller size.
+ * This can be used to prevent truncation of the last audio samples.
+ */
+#define AV_CODEC_CAP_SMALL_LAST_FRAME    (1 <<  6)
+
+#if FF_API_CAP_VDPAU
+/**
+ * Codec can export data for HW decoding (VDPAU).
+ */
+#define AV_CODEC_CAP_HWACCEL_VDPAU       (1 <<  7)
+#endif
+
+/**
+ * Codec can output multiple frames per AVPacket
+ * Normally demuxers return one frame at a time; demuxers which do not do so
+ * are connected to a parser to split what they return into proper frames.
+ * This flag is reserved to the very rare category of codecs which have a
+ * bitstream that cannot be split into frames without time-consuming
+ * operations like full decoding. Demuxers carrying such bitstreams thus
+ * may return multiple frames in a packet. This has many disadvantages like
+ * prohibiting stream copy in many cases thus it should only be considered
+ * as a last resort.
+ */
+#define AV_CODEC_CAP_SUBFRAMES           (1 <<  8)
+/**
+ * Codec is experimental and is thus avoided in favor of non experimental
+ * encoders
+ */
+#define AV_CODEC_CAP_EXPERIMENTAL        (1 <<  9)
+/**
+ * Codec should fill in channel configuration and samplerate instead of container
+ */
+#define AV_CODEC_CAP_CHANNEL_CONF        (1 << 10)
+/**
+ * Codec supports frame-level multithreading.
+ */
+#define AV_CODEC_CAP_FRAME_THREADS       (1 << 12)
+/**
+ * Codec supports slice-based (or partition-based) multithreading.
+ */
+#define AV_CODEC_CAP_SLICE_THREADS       (1 << 13)
+/**
+ * Codec supports changed parameters at any point.
+ */
+#define AV_CODEC_CAP_PARAM_CHANGE        (1 << 14)
+/**
+ * Codec supports avctx->thread_count == 0 (auto).
+ */
+#define AV_CODEC_CAP_AUTO_THREADS        (1 << 15)
+/**
+ * Audio encoder supports receiving a different number of samples in each call.
+ */
+#define AV_CODEC_CAP_VARIABLE_FRAME_SIZE (1 << 16)
+/**
+ * Decoder is not a preferred choice for probing.
+ * This indicates that the decoder is not a good choice for probing.
+ * It could for example be an expensive to spin up hardware decoder,
+ * or it could simply not provide a lot of useful information about
+ * the stream.
+ * A decoder marked with this flag should only be used as last resort
+ * choice for probing.
+ */
+#define AV_CODEC_CAP_AVOID_PROBING       (1 << 17)
+/**
+ * Codec is intra only.
+ */
+#define AV_CODEC_CAP_INTRA_ONLY       0x40000000
+/**
+ * Codec is lossless.
+ */
+#define AV_CODEC_CAP_LOSSLESS         0x80000000
+
+
+#if FF_API_WITHOUT_PREFIX
+/**
+ * Allow decoders to produce frames with data planes that are not aligned
+ * to CPU requirements (e.g. due to cropping).
+ */
+#define CODEC_FLAG_UNALIGNED AV_CODEC_FLAG_UNALIGNED
+#define CODEC_FLAG_QSCALE AV_CODEC_FLAG_QSCALE
+#define CODEC_FLAG_4MV    AV_CODEC_FLAG_4MV
+#define CODEC_FLAG_OUTPUT_CORRUPT AV_CODEC_FLAG_OUTPUT_CORRUPT
+#define CODEC_FLAG_QPEL   AV_CODEC_FLAG_QPEL
+#if FF_API_GMC
+/**
+ * @deprecated use the "gmc" private option of the libxvid encoder
+ */
+#define CODEC_FLAG_GMC    0x0020  ///< Use GMC.
+#endif
+#if FF_API_MV0
+/**
+ * @deprecated use the flag "mv0" in the "mpv_flags" private option of the
+ * mpegvideo encoders
+ */
+#define CODEC_FLAG_MV0    0x0040
+#endif
+#if FF_API_INPUT_PRESERVED
+/**
+ * @deprecated passing reference-counted frames to the encoders replaces this
+ * flag
+ */
+#define CODEC_FLAG_INPUT_PRESERVED 0x0100
+#endif
+#define CODEC_FLAG_PASS1           AV_CODEC_FLAG_PASS1
+#define CODEC_FLAG_PASS2           AV_CODEC_FLAG_PASS2
+#define CODEC_FLAG_GRAY            AV_CODEC_FLAG_GRAY
+#if FF_API_EMU_EDGE
+/**
+ * @deprecated edges are not used/required anymore. I.e. this flag is now always
+ * set.
+ */
+#define CODEC_FLAG_EMU_EDGE        0x4000
+#endif
+#define CODEC_FLAG_PSNR            AV_CODEC_FLAG_PSNR
+#define CODEC_FLAG_TRUNCATED       AV_CODEC_FLAG_TRUNCATED
+
+#if FF_API_NORMALIZE_AQP
+/**
+ * @deprecated use the flag "naq" in the "mpv_flags" private option of the
+ * mpegvideo encoders
+ */
+#define CODEC_FLAG_NORMALIZE_AQP  0x00020000
+#endif
+#define CODEC_FLAG_INTERLACED_DCT AV_CODEC_FLAG_INTERLACED_DCT
+#define CODEC_FLAG_LOW_DELAY      AV_CODEC_FLAG_LOW_DELAY
+#define CODEC_FLAG_GLOBAL_HEADER  AV_CODEC_FLAG_GLOBAL_HEADER
+#define CODEC_FLAG_BITEXACT       AV_CODEC_FLAG_BITEXACT
+#define CODEC_FLAG_AC_PRED        AV_CODEC_FLAG_AC_PRED
+#define CODEC_FLAG_LOOP_FILTER    AV_CODEC_FLAG_LOOP_FILTER
+#define CODEC_FLAG_INTERLACED_ME  AV_CODEC_FLAG_INTERLACED_ME
+#define CODEC_FLAG_CLOSED_GOP     AV_CODEC_FLAG_CLOSED_GOP
+#define CODEC_FLAG2_FAST          AV_CODEC_FLAG2_FAST
+#define CODEC_FLAG2_NO_OUTPUT     AV_CODEC_FLAG2_NO_OUTPUT
+#define CODEC_FLAG2_LOCAL_HEADER  AV_CODEC_FLAG2_LOCAL_HEADER
+#define CODEC_FLAG2_DROP_FRAME_TIMECODE AV_CODEC_FLAG2_DROP_FRAME_TIMECODE
+#define CODEC_FLAG2_IGNORE_CROP   AV_CODEC_FLAG2_IGNORE_CROP
+
+#define CODEC_FLAG2_CHUNKS        AV_CODEC_FLAG2_CHUNKS
+#define CODEC_FLAG2_SHOW_ALL      AV_CODEC_FLAG2_SHOW_ALL
+#define CODEC_FLAG2_EXPORT_MVS    AV_CODEC_FLAG2_EXPORT_MVS
+#define CODEC_FLAG2_SKIP_MANUAL   AV_CODEC_FLAG2_SKIP_MANUAL
+
+/* Unsupported options :
+ *              Syntax Arithmetic coding (SAC)
+ *              Reference Picture Selection
+ *              Independent Segment Decoding */
+/* /Fx */
+/* codec capabilities */
+
+#define CODEC_CAP_DRAW_HORIZ_BAND AV_CODEC_CAP_DRAW_HORIZ_BAND ///< Decoder can use draw_horiz_band callback.
+/**
+ * Codec uses get_buffer() for allocating buffers and supports custom allocators.
+ * If not set, it might not use get_buffer() at all or use operations that
+ * assume the buffer was allocated by avcodec_default_get_buffer.
+ */
+#define CODEC_CAP_DR1             AV_CODEC_CAP_DR1
+#define CODEC_CAP_TRUNCATED       AV_CODEC_CAP_TRUNCATED
+#if FF_API_XVMC
+/* Codec can export data for HW decoding. This flag indicates that
+ * the codec would call get_format() with list that might contain HW accelerated
+ * pixel formats (XvMC, VDPAU, VAAPI, etc). The application can pick any of them
+ * including raw image format.
+ * The application can use the passed context to determine bitstream version,
+ * chroma format, resolution etc.
+ */
+#define CODEC_CAP_HWACCEL         0x0010
+#endif /* FF_API_XVMC */
+/**
+ * Encoder or decoder requires flushing with NULL input at the end in order to
+ * give the complete and correct output.
+ *
+ * NOTE: If this flag is not set, the codec is guaranteed to never be fed with
+ *       NULL data. The user can still send NULL data to the public encode
+ *       or decode function, but libavcodec will not pass it along to the codec
+ *       unless this flag is set.
+ *
+ * Decoders:
+ * The decoder has a non-zero delay and needs to be fed with avpkt->data=NULL,
+ * avpkt->size=0 at the end to get the delayed data until the decoder no longer
+ * returns frames.
+ *
+ * Encoders:
+ * The encoder needs to be fed with NULL data at the end of encoding until the
+ * encoder no longer returns data.
+ *
+ * NOTE: For encoders implementing the AVCodec.encode2() function, setting this
+ *       flag also means that the encoder must set the pts and duration for
+ *       each output packet. If this flag is not set, the pts and duration will
+ *       be determined by libavcodec from the input frame.
+ */
+#define CODEC_CAP_DELAY           AV_CODEC_CAP_DELAY
+/**
+ * Codec can be fed a final frame with a smaller size.
+ * This can be used to prevent truncation of the last audio samples.
+ */
+#define CODEC_CAP_SMALL_LAST_FRAME AV_CODEC_CAP_SMALL_LAST_FRAME
+#if FF_API_CAP_VDPAU
+/**
+ * Codec can export data for HW decoding (VDPAU).
+ */
+#define CODEC_CAP_HWACCEL_VDPAU    AV_CODEC_CAP_HWACCEL_VDPAU
+#endif
+/**
+ * Codec can output multiple frames per AVPacket
+ * Normally demuxers return one frame at a time; demuxers which do not do so
+ * are connected to a parser to split what they return into proper frames.
+ * This flag is reserved to the very rare category of codecs which have a
+ * bitstream that cannot be split into frames without time-consuming
+ * operations like full decoding. Demuxers carrying such bitstreams thus
+ * may return multiple frames in a packet. This has many disadvantages like
+ * prohibiting stream copy in many cases thus it should only be considered
+ * as a last resort.
+ */
+#define CODEC_CAP_SUBFRAMES        AV_CODEC_CAP_SUBFRAMES
+/**
+ * Codec is experimental and is thus avoided in favor of non experimental
+ * encoders
+ */
+#define CODEC_CAP_EXPERIMENTAL     AV_CODEC_CAP_EXPERIMENTAL
+/**
+ * Codec should fill in channel configuration and samplerate instead of container
+ */
+#define CODEC_CAP_CHANNEL_CONF     AV_CODEC_CAP_CHANNEL_CONF
+#if FF_API_NEG_LINESIZES
+/**
+ * @deprecated no codecs use this capability
+ */
+#define CODEC_CAP_NEG_LINESIZES    0x0800
+#endif
+/**
+ * Codec supports frame-level multithreading.
+ */
+#define CODEC_CAP_FRAME_THREADS    AV_CODEC_CAP_FRAME_THREADS
+/**
+ * Codec supports slice-based (or partition-based) multithreading.
+ */
+#define CODEC_CAP_SLICE_THREADS    AV_CODEC_CAP_SLICE_THREADS
+/**
+ * Codec supports changed parameters at any point.
+ */
+#define CODEC_CAP_PARAM_CHANGE     AV_CODEC_CAP_PARAM_CHANGE
+/**
+ * Codec supports avctx->thread_count == 0 (auto).
+ */
+#define CODEC_CAP_AUTO_THREADS     AV_CODEC_CAP_AUTO_THREADS
+/**
+ * Audio encoder supports receiving a different number of samples in each call.
+ */
+#define CODEC_CAP_VARIABLE_FRAME_SIZE AV_CODEC_CAP_VARIABLE_FRAME_SIZE
+/**
+ * Codec is intra only.
+ */
+#define CODEC_CAP_INTRA_ONLY       AV_CODEC_CAP_INTRA_ONLY
+/**
+ * Codec is lossless.
+ */
+#define CODEC_CAP_LOSSLESS         AV_CODEC_CAP_LOSSLESS
+
+/**
+ * HWAccel is experimental and is thus avoided in favor of non experimental
+ * codecs
+ */
+#define HWACCEL_CODEC_CAP_EXPERIMENTAL     0x0200
+#endif /* FF_API_WITHOUT_PREFIX */
+
+#if FF_API_MB_TYPE
+//The following defines may change, don't expect compatibility if you use them.
+#define MB_TYPE_INTRA4x4   0x0001
+#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific
+#define MB_TYPE_INTRA_PCM  0x0004 //FIXME H.264-specific
+#define MB_TYPE_16x16      0x0008
+#define MB_TYPE_16x8       0x0010
+#define MB_TYPE_8x16       0x0020
+#define MB_TYPE_8x8        0x0040
+#define MB_TYPE_INTERLACED 0x0080
+#define MB_TYPE_DIRECT2    0x0100 //FIXME
+#define MB_TYPE_ACPRED     0x0200
+#define MB_TYPE_GMC        0x0400
+#define MB_TYPE_SKIP       0x0800
+#define MB_TYPE_P0L0       0x1000
+#define MB_TYPE_P1L0       0x2000
+#define MB_TYPE_P0L1       0x4000
+#define MB_TYPE_P1L1       0x8000
+#define MB_TYPE_L0         (MB_TYPE_P0L0 | MB_TYPE_P1L0)
+#define MB_TYPE_L1         (MB_TYPE_P0L1 | MB_TYPE_P1L1)
+#define MB_TYPE_L0L1       (MB_TYPE_L0   | MB_TYPE_L1)
+#define MB_TYPE_QUANT      0x00010000
+#define MB_TYPE_CBP        0x00020000
+// Note bits 24-31 are reserved for codec specific use (H.264 ref0, MPEG-1 0mv, ...)
+#endif
+
+/**
+ * Pan Scan area.
+ * This specifies the area which should be displayed.
+ * Note there may be multiple such areas for one frame.
+ */
+typedef struct AVPanScan{
+    /**
+     * ID of this pan scan area.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    int id;
+
+    /**
+     * Width and height of the area, in units of 1/16 pel.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    int width;
+    int height;
+
+    /**
+     * Position of the top left corner, in units of 1/16 pel, for up to 3 fields/frames.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    int16_t position[3][2];
+}AVPanScan;
+
+/**
+ * This structure describes the bitrate properties of an encoded bitstream. It
+ * roughly corresponds to a subset of the VBV parameters for MPEG-2 or HRD
+ * parameters for H.264/HEVC.
+ */
+typedef struct AVCPBProperties {
+    /**
+     * Maximum bitrate of the stream, in bits per second.
+     * Zero if unknown or unspecified.
+     */
+    int max_bitrate;
+    /**
+     * Minimum bitrate of the stream, in bits per second.
+     * Zero if unknown or unspecified.
+     */
+    int min_bitrate;
+    /**
+     * Average bitrate of the stream, in bits per second.
+     * Zero if unknown or unspecified.
+     */
+    int avg_bitrate;
+
+    /**
+     * The size of the buffer to which the ratecontrol is applied, in bits.
+     * Zero if unknown or unspecified.
+     */
+    int buffer_size;
+
+    /**
+     * The delay between the time the packet this structure is associated with
+     * is received and the time when it should be decoded, in periods of a 27MHz
+     * clock.
+     *
+     * UINT64_MAX when unknown or unspecified.
+     */
+    uint64_t vbv_delay;
+} AVCPBProperties;
+
+#if FF_API_QSCALE_TYPE
+#define FF_QSCALE_TYPE_MPEG1 0
+#define FF_QSCALE_TYPE_MPEG2 1
+#define FF_QSCALE_TYPE_H264  2
+#define FF_QSCALE_TYPE_VP56  3
+#endif
+
+/**
+ * The decoder will keep a reference to the frame and may reuse it later.
+ */
+#define AV_GET_BUFFER_FLAG_REF (1 << 0)
+
+/**
+ * @defgroup lavc_packet AVPacket
+ *
+ * Types and functions for working with AVPacket.
+ * @{
+ */
+enum AVPacketSideDataType {
+    /**
+     * An AV_PKT_DATA_PALETTE side data packet contains exactly AVPALETTE_SIZE
+     * bytes worth of palette. This side data signals that a new palette is
+     * present.
+     */
+    AV_PKT_DATA_PALETTE,
+
+    /**
+     * The AV_PKT_DATA_NEW_EXTRADATA is used to notify the codec or the format
+     * that the extradata buffer was changed and the receiving side should
+     * act upon it appropriately. The new extradata is embedded in the side
+     * data buffer and should be immediately used for processing the current
+     * frame or packet.
+     */
+    AV_PKT_DATA_NEW_EXTRADATA,
+
+    /**
+     * An AV_PKT_DATA_PARAM_CHANGE side data packet is laid out as follows:
+     * @code
+     * u32le param_flags
+     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT)
+     *     s32le channel_count
+     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT)
+     *     u64le channel_layout
+     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE)
+     *     s32le sample_rate
+     * if (param_flags & AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS)
+     *     s32le width
+     *     s32le height
+     * @endcode
+     */
+    AV_PKT_DATA_PARAM_CHANGE,
+
+    /**
+     * An AV_PKT_DATA_H263_MB_INFO side data packet contains a number of
+     * structures with info about macroblocks relevant to splitting the
+     * packet into smaller packets on macroblock edges (e.g. as for RFC 2190).
+     * That is, it does not necessarily contain info about all macroblocks,
+     * as long as the distance between macroblocks in the info is smaller
+     * than the target payload size.
+     * Each MB info structure is 12 bytes, and is laid out as follows:
+     * @code
+     * u32le bit offset from the start of the packet
+     * u8    current quantizer at the start of the macroblock
+     * u8    GOB number
+     * u16le macroblock address within the GOB
+     * u8    horizontal MV predictor
+     * u8    vertical MV predictor
+     * u8    horizontal MV predictor for block number 3
+     * u8    vertical MV predictor for block number 3
+     * @endcode
+     */
+    AV_PKT_DATA_H263_MB_INFO,
+
+    /**
+     * This side data should be associated with an audio stream and contains
+     * ReplayGain information in form of the AVReplayGain struct.
+     */
+    AV_PKT_DATA_REPLAYGAIN,
+
+    /**
+     * This side data contains a 3x3 transformation matrix describing an affine
+     * transformation that needs to be applied to the decoded video frames for
+     * correct presentation.
+     *
+     * See libavutil/display.h for a detailed description of the data.
+     */
+    AV_PKT_DATA_DISPLAYMATRIX,
+
+    /**
+     * This side data should be associated with a video stream and contains
+     * Stereoscopic 3D information in form of the AVStereo3D struct.
+     */
+    AV_PKT_DATA_STEREO3D,
+
+    /**
+     * This side data should be associated with an audio stream and corresponds
+     * to enum AVAudioServiceType.
+     */
+    AV_PKT_DATA_AUDIO_SERVICE_TYPE,
+
+    /**
+     * This side data contains quality related information from the encoder.
+     * @code
+     * u32le quality factor of the compressed frame. Allowed range is between 1 (good) and FF_LAMBDA_MAX (bad).
+     * u8    picture type
+     * u8    error count
+     * u16   reserved
+     * u64le[error count] sum of squared differences between encoder in and output
+     * @endcode
+     */
+    AV_PKT_DATA_QUALITY_STATS,
+
+    /**
+     * This side data contains an integer value representing the stream index
+     * of a "fallback" track.  A fallback track indicates an alternate
+     * track to use when the current track can not be decoded for some reason.
+     * e.g. no decoder available for codec.
+     */
+    AV_PKT_DATA_FALLBACK_TRACK,
+
+    /**
+     * This side data corresponds to the AVCPBProperties struct.
+     */
+    AV_PKT_DATA_CPB_PROPERTIES,
+
+    /**
+     * Recommmends skipping the specified number of samples
+     * @code
+     * u32le number of samples to skip from start of this packet
+     * u32le number of samples to skip from end of this packet
+     * u8    reason for start skip
+     * u8    reason for end   skip (0=padding silence, 1=convergence)
+     * @endcode
+     */
+    AV_PKT_DATA_SKIP_SAMPLES=70,
+
+    /**
+     * An AV_PKT_DATA_JP_DUALMONO side data packet indicates that
+     * the packet may contain "dual mono" audio specific to Japanese DTV
+     * and if it is true, recommends only the selected channel to be used.
+     * @code
+     * u8    selected channels (0=mail/left, 1=sub/right, 2=both)
+     * @endcode
+     */
+    AV_PKT_DATA_JP_DUALMONO,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop.
+     */
+    AV_PKT_DATA_STRINGS_METADATA,
+
+    /**
+     * Subtitle event position
+     * @code
+     * u32le x1
+     * u32le y1
+     * u32le x2
+     * u32le y2
+     * @endcode
+     */
+    AV_PKT_DATA_SUBTITLE_POSITION,
+
+    /**
+     * Data found in BlockAdditional element of matroska container. There is
+     * no end marker for the data, so it is required to rely on the side data
+     * size to recognize the end. 8 byte id (as found in BlockAddId) followed
+     * by data.
+     */
+    AV_PKT_DATA_MATROSKA_BLOCKADDITIONAL,
+
+    /**
+     * The optional first identifier line of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_IDENTIFIER,
+
+    /**
+     * The optional settings (rendering instructions) that immediately
+     * follow the timestamp specifier of a WebVTT cue.
+     */
+    AV_PKT_DATA_WEBVTT_SETTINGS,
+
+    /**
+     * A list of zero terminated key/value strings. There is no end marker for
+     * the list, so it is required to rely on the side data size to stop. This
+     * side data includes updated metadata which appeared in the stream.
+     */
+    AV_PKT_DATA_METADATA_UPDATE,
+
+    /**
+     * MPEGTS stream ID, this is required to pass the stream ID
+     * information from the demuxer to the corresponding muxer.
+     */
+    AV_PKT_DATA_MPEGTS_STREAM_ID,
+
+    /**
+     * Mastering display metadata (based on SMPTE-2086:2014). This metadata
+     * should be associated with a video stream and contains data in the form
+     * of the AVMasteringDisplayMetadata struct.
+     */
+    AV_PKT_DATA_MASTERING_DISPLAY_METADATA,
+
+    /**
+     * This side data should be associated with a video stream and corresponds
+     * to the AVSphericalMapping structure.
+     */
+    AV_PKT_DATA_SPHERICAL,
+
+    /**
+     * The number of side data elements (in fact a bit more than it).
+     * This is not part of the public API/ABI in the sense that it may
+     * change when new side data types are added.
+     * This must stay the last enum value.
+     * If its value becomes huge, some code using it
+     * needs to be updated as it assumes it to be smaller than other limits.
+     */
+    AV_PKT_DATA_NB
+};
+
+#define AV_PKT_DATA_QUALITY_FACTOR AV_PKT_DATA_QUALITY_STATS // DEPRECATED alias; expands to AV_PKT_DATA_QUALITY_STATS
+
+typedef struct AVPacketSideData {
+    uint8_t *data; ///< side data payload; its layout depends on type
+    int      size; ///< size of data (presumably in bytes -- confirm)
+    enum AVPacketSideDataType type; ///< which AV_PKT_DATA_* kind this entry holds
+} AVPacketSideData;
+
+/**
+ * This structure stores compressed data. It is typically exported by demuxers
+ * and then passed as input to decoders, or received as output from encoders and
+ * then passed to muxers.
+ *
+ * For video, it should typically contain one compressed frame. For audio it may
+ * contain several compressed frames. Encoders are allowed to output empty
+ * packets, with no compressed data, containing only side data
+ * (e.g. to update some stream parameters at the end of encoding).
+ *
+ * AVPacket is one of the few structs in FFmpeg, whose size is a part of public
+ * ABI. Thus it may be allocated on stack and no new fields can be added to it
+ * without libavcodec and libavformat major bump.
+ *
+ * The semantics of data ownership depends on the buf field.
+ * If it is set, the packet data is dynamically allocated and is
+ * valid indefinitely until a call to av_packet_unref() reduces the
+ * reference count to 0.
+ *
+ * If the buf field is not set av_packet_ref() would make a copy instead
+ * of increasing the reference count.
+ *
+ * The side data is always allocated with av_malloc(), copied by
+ * av_packet_ref() and freed by av_packet_unref().
+ *
+ * @see av_packet_ref
+ * @see av_packet_unref
+ */
+typedef struct AVPacket {
+    /**
+     * A reference to the reference-counted buffer where the packet data is
+     * stored.
+     * May be NULL, then the packet data is not reference-counted.
+     */
+    AVBufferRef *buf;
+    /**
+     * Presentation timestamp in AVStream->time_base units; the time at which
+     * the decompressed packet will be presented to the user.
+     * Can be AV_NOPTS_VALUE if it is not stored in the file.
+     * pts MUST be larger or equal to dts as presentation cannot happen before
+     * decompression, unless one wants to view hex dumps. Some formats misuse
+     * the terms dts and pts/cts to mean something different. Such timestamps
+     * must be converted to true pts/dts before they are stored in AVPacket.
+     */
+    int64_t pts;
+    /**
+     * Decompression timestamp in AVStream->time_base units; the time at which
+     * the packet is decompressed.
+     * Can be AV_NOPTS_VALUE if it is not stored in the file.
+     */
+    int64_t dts;
+    uint8_t *data; ///< packet payload; ownership depends on buf (NULL buf means not reference-counted)
+    int   size; ///< size of data (presumably in bytes -- confirm)
+    int   stream_index; ///< NOTE(review): presumably the index of the stream this packet belongs to -- confirm
+    /**
+     * A combination of AV_PKT_FLAG values
+     */
+    int   flags;
+    /**
+     * Additional packet data that can be provided by the container.
+     * Packet can contain several types of side information.
+     */
+    AVPacketSideData *side_data;
+    int side_data_elems; ///< NOTE(review): presumably the number of entries in side_data -- confirm
+
+    /**
+     * Duration of this packet in AVStream->time_base units, 0 if unknown.
+     * Equals next_pts - this_pts in presentation order.
+     */
+    int64_t duration;
+
+    int64_t pos;                            ///< byte position in stream, -1 if unknown
+
+#if FF_API_CONVERGENCE_DURATION
+    /**
+     * @deprecated Same as the duration field, but as int64_t. This was required
+     * for Matroska subtitles, whose duration values could overflow when the
+     * duration field was still an int.
+     */
+    attribute_deprecated
+    int64_t convergence_duration;
+#endif
+} AVPacket;
+#define AV_PKT_FLAG_KEY     0x0001 ///< The packet contains a keyframe
+#define AV_PKT_FLAG_CORRUPT 0x0002 ///< The packet content is corrupted
+/**
+ * AVPacket.flags bit: the packet is required to maintain valid decoder
+ * state but is not required for output, and should be dropped after
+ * decoding.
+ */
+#define AV_PKT_FLAG_DISCARD   0x0004
+
+enum AVSideDataParamChangeFlags { // each value is a distinct single bit; presumably OR-ed into a mask -- confirm
+    AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT  = 0x0001,
+    AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_LAYOUT = 0x0002,
+    AV_SIDE_DATA_PARAM_CHANGE_SAMPLE_RATE    = 0x0004,
+    AV_SIDE_DATA_PARAM_CHANGE_DIMENSIONS     = 0x0008,
+};
+/**
+ * @}
+ */
+
+struct AVCodecInternal;
+
+enum AVFieldOrder {
+    AV_FIELD_UNKNOWN,
+    AV_FIELD_PROGRESSIVE,
+    AV_FIELD_TT,          ///< Top coded first, top displayed first
+    AV_FIELD_BB,          ///< Bottom coded first, bottom displayed first
+    AV_FIELD_TB,          ///< Top coded first, bottom displayed first
+    AV_FIELD_BT,          ///< Bottom coded first, top displayed first
+};
+
+/**
+ * main external API structure.
+ * New fields can be added to the end with minor version bumps.
+ * Removal, reordering and changes to existing fields require a major
+ * version bump.
+ * You can use AVOptions (av_opt* / av_set/get*()) to access these fields from user
+ * applications.
+ * The name string for AVOptions options matches the associated command line
+ * parameter name and can be found in libavcodec/options_table.h
+ * The AVOption/command line parameter names differ in some cases from the C
+ * structure field names for historic reasons or brevity.
+ * sizeof(AVCodecContext) must not be used outside libav*.
+ */
+typedef struct AVCodecContext {
+    /**
+     * information on struct for av_log
+     * - set by avcodec_alloc_context3
+     */
+    const AVClass *av_class;
+    int log_level_offset;
+
+    enum AVMediaType codec_type; /* see AVMEDIA_TYPE_xxx */
+    const struct AVCodec  *codec;
+#if FF_API_CODEC_NAME
+    /**
+     * @deprecated this field is not used for anything in libavcodec
+     */
+    attribute_deprecated
+    char             codec_name[32];
+#endif
+    enum AVCodecID     codec_id; /* see AV_CODEC_ID_xxx */
+
+    /**
+     * fourcc (LSB first, so "ABCD" -> ('D'<<24) + ('C'<<16) + ('B'<<8) + 'A').
+     * This is used to work around some encoder bugs.
+     * A demuxer should set this to what is stored in the field used to identify the codec.
+     * If there are multiple such fields in a container then the demuxer should choose the one
+     * which maximizes the information about the used codec.
+     * If the codec tag field in a container is larger than 32 bits then the demuxer should
+     * remap the longer ID to 32 bits with a table or other structure. Alternatively a new
+     * extra_codec_tag + size could be added but for this a clear advantage must be demonstrated
+     * first.
+     * - encoding: Set by user, if not then the default based on codec_id will be used.
+     * - decoding: Set by user, will be converted to uppercase by libavcodec during init.
+     */
+    unsigned int codec_tag;
+
+#if FF_API_STREAM_CODEC_TAG
+    /**
+     * @deprecated this field is unused
+     */
+    attribute_deprecated
+    unsigned int stream_codec_tag;
+#endif
+
+    void *priv_data;
+
+    /**
+     * Private context used for internal data.
+     *
+     * Unlike priv_data, this is not codec-specific. It is used in general
+     * libavcodec functions.
+     */
+    struct AVCodecInternal *internal;
+
+    /**
+     * Private data of the user, can be used to carry app specific stuff.
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    void *opaque;
+
+    /**
+     * the average bitrate
+     * - encoding: Set by user; unused for constant quantizer encoding.
+     * - decoding: Set by user, may be overwritten by libavcodec
+     *             if this info is available in the stream
+     */
+    int64_t bit_rate;
+
+    /**
+     * number of bits the bitstream is allowed to diverge from the reference.
+     *           the reference can be CBR (for CBR pass1) or VBR (for pass2)
+     * - encoding: Set by user; unused for constant quantizer encoding.
+     * - decoding: unused
+     */
+    int bit_rate_tolerance;
+
+    /**
+     * Global quality for codecs which cannot change it per frame.
+     * This should be proportional to MPEG-1/2/4 qscale.
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int global_quality;
+
+    /**
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int compression_level;
+#define FF_COMPRESSION_DEFAULT -1
+
+    /**
+     * AV_CODEC_FLAG_*.
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int flags;
+
+    /**
+     * AV_CODEC_FLAG2_*
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int flags2;
+
+    /**
+     * some codecs need / can use extradata like Huffman tables.
+     * MJPEG: Huffman tables
+     * rv10: additional flags
+     * MPEG-4: global headers (they can be in the bitstream or here)
+     * The allocated memory should be AV_INPUT_BUFFER_PADDING_SIZE bytes larger
+     * than extradata_size to avoid problems if it is read with the bitstream reader.
+     * The bytewise contents of extradata must not depend on the architecture or CPU endianness.
+     * - encoding: Set/allocated/freed by libavcodec.
+     * - decoding: Set/allocated/freed by user.
+     */
+    uint8_t *extradata;
+    int extradata_size;
+
+    /**
+     * This is the fundamental unit of time (in seconds) in terms
+     * of which frame timestamps are represented. For fixed-fps content,
+     * timebase should be 1/framerate and timestamp increments should be
+     * identically 1.
+     * This often, but not always is the inverse of the frame rate or field rate
+     * for video. 1/time_base is not the average frame rate if the frame rate is not
+     * constant.
+     *
+     * Like containers, elementary streams also can store timestamps, 1/time_base
+     * is the unit in which these timestamps are specified.
+     * As example of such codec time base see ISO/IEC 14496-2:2001(E)
+     * vop_time_increment_resolution and fixed_vop_rate
+     * (fixed_vop_rate == 0 implies that it is different from the framerate)
+     *
+     * - encoding: MUST be set by user.
+     * - decoding: the use of this field for decoding is deprecated.
+     *             Use framerate instead.
+     */
+    AVRational time_base;
+
+    /**
+     * For some codecs, the time base is closer to the field rate than the frame rate.
+     * Most notably, H.264 and MPEG-2 specify time_base as half of frame duration
+     * if no telecine is used ...
+     *
+     * Set to time_base ticks per frame. Default 1, e.g., H.264/MPEG-2 set it to 2.
+     */
+    int ticks_per_frame;
+
+    /**
+     * Codec delay.
+     *
+     * Encoding: Number of frames delay there will be from the encoder input to
+     *           the decoder output. (we assume the decoder matches the spec)
+     * Decoding: Number of frames delay in addition to what a standard decoder
+     *           as specified in the spec would produce.
+     *
+     * Video:
+     *   Number of frames the decoded output will be delayed relative to the
+     *   encoded input.
+     *
+     * Audio:
+     *   For encoding, this field is unused (see initial_padding).
+     *
+     *   For decoding, this is the number of samples the decoder needs to
+     *   output before the decoder's output is valid. When seeking, you should
+     *   start decoding this many samples prior to your desired seek point.
+     *
+     * - encoding: Set by libavcodec.
+     * - decoding: Set by libavcodec.
+     */
+    int delay;
+
+
+    /* video only */
+    /**
+     * picture width / height.
+     *
+     * @note Those fields may not match the values of the last
+     * AVFrame output by avcodec_decode_video2 due to frame
+     * reordering.
+     *
+     * - encoding: MUST be set by user.
+     * - decoding: May be set by the user before opening the decoder if known e.g.
+     *             from the container. Some decoders will require the dimensions
+     *             to be set by the caller. During decoding, the decoder may
+     *             overwrite those values as required while parsing the data.
+     */
+    int width, height;
+
+    /**
+     * Bitstream width / height, may be different from width/height e.g. when
+     * the decoded frame is cropped before being output or lowres is enabled.
+     *
+     * @note Those fields may not match the values of the last
+     * AVFrame output by avcodec_receive_frame() due to frame
+     * reordering.
+     *
+     * - encoding: unused
+     * - decoding: May be set by the user before opening the decoder if known
+     *             e.g. from the container. During decoding, the decoder may
+     *             overwrite those values as required while parsing the data.
+     */
+    int coded_width, coded_height;
+
+#if FF_API_ASPECT_EXTENDED
+#define FF_ASPECT_EXTENDED 15
+#endif
+
+    /**
+     * the number of pictures in a group of pictures, or 0 for intra_only
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int gop_size;
+
+    /**
+     * Pixel format, see AV_PIX_FMT_xxx.
+     * May be set by the demuxer if known from headers.
+     * May be overridden by the decoder if it knows better.
+     *
+     * @note This field may not match the value of the last
+     * AVFrame output by avcodec_receive_frame() due to frame
+     * reordering.
+     *
+     * - encoding: Set by user.
+     * - decoding: Set by user if known, overridden by libavcodec while
+     *             parsing the data.
+     */
+    enum AVPixelFormat pix_fmt;
+
+#if FF_API_MOTION_EST
+    /**
+     * This option does nothing
+     * @deprecated use codec private options instead
+     */
+    attribute_deprecated int me_method;
+#endif
+
+    /**
+     * If non NULL, 'draw_horiz_band' is called by the libavcodec
+     * decoder to draw a horizontal band. It improves cache usage. Not
+     * all codecs can do that. You must check the codec capabilities
+     * beforehand.
+     * When multithreading is used, it may be called from multiple threads
+     * at the same time; threads might draw different parts of the same AVFrame,
+     * or multiple AVFrames, and there is no guarantee that slices will be drawn
+     * in order.
+     * The function is also used by hardware acceleration APIs.
+     * It is called at least once during frame decoding to pass
+     * the data needed for hardware render.
+     * In that mode instead of pixel data, AVFrame points to
+     * a structure specific to the acceleration API. The application
+     * reads the structure and can change some fields to indicate progress
+     * or mark state.
+     * - encoding: unused
+     * - decoding: Set by user.
+     * @param height the height of the slice
+     * @param y the y position of the slice
+     * @param type 1->top field, 2->bottom field, 3->frame
+     * @param offset offset into the AVFrame.data from which the slice should be read
+     */
+    void (*draw_horiz_band)(struct AVCodecContext *s,
+                            const AVFrame *src, int offset[AV_NUM_DATA_POINTERS],
+                            int y, int type, int height);
+
+    /**
+     * callback to negotiate the pixelFormat
+     * @param fmt is the list of formats which are supported by the codec,
+     * it is terminated by -1 as 0 is a valid format, the formats are ordered by quality.
+     * The first is always the native one.
+     * @note The callback may be called again immediately if initialization for
+     * the selected (hardware-accelerated) pixel format failed.
+     * @warning Behavior is undefined if the callback returns a value not
+     * in the fmt list of formats.
+     * @return the chosen format
+     * - encoding: unused
+     * - decoding: Set by user, if not set the native format will be chosen.
+     */
+    enum AVPixelFormat (*get_format)(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
+
+    /**
+     * maximum number of B-frames between non-B-frames
+     * Note: The output will be delayed by max_b_frames+1 relative to the input.
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int max_b_frames;
+
+    /**
+     * qscale factor between IP and B-frames
+     * If > 0 then the last P-frame quantizer will be used (q= lastp_q*factor+offset).
+     * If < 0 then normal ratecontrol will be done (q= -normal_q*factor+offset).
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float b_quant_factor;
+
+#if FF_API_RC_STRATEGY
+    /** @deprecated use codec private option instead */
+    attribute_deprecated int rc_strategy;
+#define FF_RC_STRATEGY_XVID 1
+#endif
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int b_frame_strategy;
+#endif
+
+    /**
+     * qscale offset between IP and B-frames
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float b_quant_offset;
+
+    /**
+     * Size of the frame reordering buffer in the decoder.
+     * For MPEG-2 it is 1 IPB or 0 low delay IP.
+     * - encoding: Set by libavcodec.
+     * - decoding: Set by libavcodec.
+     */
+    int has_b_frames;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int mpeg_quant;
+#endif
+
+    /**
+     * qscale factor between P- and I-frames
+     * If > 0 then the last P-frame quantizer will be used (q = lastp_q * factor + offset).
+     * If < 0 then normal ratecontrol will be done (q= -normal_q*factor+offset).
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float i_quant_factor;
+
+    /**
+     * qscale offset between P and I-frames
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float i_quant_offset;
+
+    /**
+     * luminance masking (0-> disabled)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float lumi_masking;
+
+    /**
+     * temporal complexity masking (0-> disabled)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float temporal_cplx_masking;
+
+    /**
+     * spatial complexity masking (0-> disabled)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float spatial_cplx_masking;
+
+    /**
+     * p block masking (0-> disabled)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float p_masking;
+
+    /**
+     * darkness masking (0-> disabled)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    float dark_masking;
+
+    /**
+     * slice count
+     * - encoding: Set by libavcodec.
+     * - decoding: Set by user (or 0).
+     */
+    int slice_count;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+     int prediction_method;
+#define FF_PRED_LEFT   0
+#define FF_PRED_PLANE  1
+#define FF_PRED_MEDIAN 2
+#endif
+
+    /**
+     * slice offsets in the frame in bytes
+     * - encoding: Set/allocated by libavcodec.
+     * - decoding: Set/allocated by user (or NULL).
+     */
+    int *slice_offset;
+
+    /**
+     * sample aspect ratio (0 if unknown)
+     * That is the width of a pixel divided by the height of the pixel.
+     * Numerator and denominator must be relatively prime and smaller than 256 for some video standards.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    AVRational sample_aspect_ratio;
+
+    /**
+     * motion estimation comparison function
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int me_cmp;
+    /**
+     * subpixel motion estimation comparison function
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int me_sub_cmp;
+    /**
+     * macroblock comparison function (not supported yet)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int mb_cmp;
+    /**
+     * interlaced DCT comparison function
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int ildct_cmp;
+#define FF_CMP_SAD          0
+#define FF_CMP_SSE          1
+#define FF_CMP_SATD         2
+#define FF_CMP_DCT          3
+#define FF_CMP_PSNR         4
+#define FF_CMP_BIT          5
+#define FF_CMP_RD           6
+#define FF_CMP_ZERO         7
+#define FF_CMP_VSAD         8
+#define FF_CMP_VSSE         9
+#define FF_CMP_NSSE         10
+#define FF_CMP_W53          11
+#define FF_CMP_W97          12
+#define FF_CMP_DCTMAX       13
+#define FF_CMP_DCT264       14
+#define FF_CMP_MEDIAN_SAD   15
+#define FF_CMP_CHROMA       256
+
+    /**
+     * ME diamond size & shape
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int dia_size;
+
+    /**
+     * amount of previous MV predictors (2a+1 x 2a+1 square)
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int last_predictor_count;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int pre_me;
+#endif
+
+    /**
+     * motion estimation prepass comparison function
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int me_pre_cmp;
+
+    /**
+     * ME prepass diamond size & shape
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int pre_dia_size;
+
+    /**
+     * subpel ME quality
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int me_subpel_quality;
+
+#if FF_API_AFD
+    /**
+     * DTG active format information (additional aspect ratio
+     * information only used in DVB MPEG-2 transport streams)
+     * 0 if not set.
+     *
+     * - encoding: unused
+     * - decoding: Set by decoder.
+     * @deprecated Deprecated in favor of AVSideData
+     */
+    attribute_deprecated int dtg_active_format;
+#define FF_DTG_AFD_SAME         8
+#define FF_DTG_AFD_4_3          9
+#define FF_DTG_AFD_16_9         10
+#define FF_DTG_AFD_14_9         11
+#define FF_DTG_AFD_4_3_SP_14_9  13
+#define FF_DTG_AFD_16_9_SP_14_9 14
+#define FF_DTG_AFD_SP_4_3       15
+#endif /* FF_API_AFD */
+
+    /**
+     * maximum motion estimation search range in subpel units
+     * If 0 then no limit.
+     *
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int me_range;
+
+#if FF_API_QUANT_BIAS
+    /**
+     * @deprecated use encoder private option instead
+     */
+    attribute_deprecated int intra_quant_bias;
+#define FF_DEFAULT_QUANT_BIAS 999999
+
+    /**
+     * @deprecated use encoder private option instead
+     */
+    attribute_deprecated int inter_quant_bias;
+#endif
+
+    /**
+     * slice flags
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int slice_flags;
+#define SLICE_FLAG_CODED_ORDER    0x0001 ///< draw_horiz_band() is called in coded order instead of display
+#define SLICE_FLAG_ALLOW_FIELD    0x0002 ///< allow draw_horiz_band() with field slices (MPEG-2 field pics)
+#define SLICE_FLAG_ALLOW_PLANE    0x0004 ///< allow draw_horiz_band() with 1 component at a time (SVQ1)
+
+#if FF_API_XVMC
+    /**
+     * XVideo Motion Acceleration
+     * - encoding: forbidden
+     * - decoding: set by decoder
+     * @deprecated XvMC doesn't need it anymore.
+     */
+    attribute_deprecated int xvmc_acceleration;
+#endif /* FF_API_XVMC */
+
+    /**
+     * macroblock decision mode
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int mb_decision;
+#define FF_MB_DECISION_SIMPLE 0        ///< uses mb_cmp
+#define FF_MB_DECISION_BITS   1        ///< chooses the one which needs the fewest bits
+#define FF_MB_DECISION_RD     2        ///< rate distortion
+
+    /**
+     * custom intra quantization matrix
+     * - encoding: Set by user, can be NULL.
+     * - decoding: Set by libavcodec.
+     */
+    uint16_t *intra_matrix;
+
+    /**
+     * custom inter quantization matrix
+     * - encoding: Set by user, can be NULL.
+     * - decoding: Set by libavcodec.
+     */
+    uint16_t *inter_matrix;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int scenechange_threshold;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int noise_reduction;
+#endif
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated this field is unused
+     */
+    attribute_deprecated
+    int me_threshold;
+
+    /**
+     * @deprecated this field is unused
+     */
+    attribute_deprecated
+    int mb_threshold;
+#endif
+
+    /**
+     * precision of the intra DC coefficient - 8
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec
+     */
+    int intra_dc_precision;
+
+    /**
+     * Number of macroblock rows at the top which are skipped.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int skip_top;
+
+    /**
+     * Number of macroblock rows at the bottom which are skipped.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int skip_bottom;
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    float border_masking;
+#endif
+
+    /**
+     * minimum MB Lagrange multiplier
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int mb_lmin;
+
+    /**
+     * maximum MB Lagrange multiplier
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int mb_lmax;
+
+#if FF_API_PRIVATE_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    int me_penalty_compensation;
+#endif
+
+    /**
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int bidir_refine;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int brd_scale;
+#endif
+
+    /**
+     * minimum GOP size
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int keyint_min;
+
+    /**
+     * number of reference frames
+     * - encoding: Set by user.
+     * - decoding: Set by lavc.
+     */
+    int refs;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int chromaoffset;
+#endif
+
+#if FF_API_UNUSED_MEMBERS
+    /**
+     * Multiplied by qscale for each frame and added to scene_change_score.
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    attribute_deprecated int scenechange_factor;
+#endif
+
+    /**
+     * Note: Value depends upon the compare function used for fullpel ME.
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int mv0_threshold;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int b_sensitivity;
+#endif
+
+    /**
+     * Chromaticity coordinates of the source primaries.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorPrimaries color_primaries;
+
+    /**
+     * Color Transfer Characteristic.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorTransferCharacteristic color_trc;
+
+    /**
+     * YUV colorspace type.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorSpace colorspace;
+
+    /**
+     * MPEG vs JPEG YUV range.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorRange color_range;
+
+    /**
+     * This defines the location of chroma samples.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVChromaLocation chroma_sample_location;
+
+    /**
+     * Number of slices.
+     * Indicates number of picture subdivisions. Used for parallelized
+     * decoding.
+     * - encoding: Set by user
+     * - decoding: unused
+     */
+    int slices;
+
+    /** Field order
+     * - encoding: set by libavcodec
+     * - decoding: Set by user.
+     */
+    enum AVFieldOrder field_order;
+
+    /* audio only */
+    int sample_rate; ///< samples per second
+    int channels;    ///< number of audio channels
+
+    /**
+     * audio sample format
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    enum AVSampleFormat sample_fmt;  ///< sample format
+
+    /* The following data should not be initialized. */
+    /**
+     * Number of samples per channel in an audio frame.
+     *
+     * - encoding: set by libavcodec in avcodec_open2(). Each submitted frame
+     *   except the last must contain exactly frame_size samples per channel.
+     *   May be 0 when the codec has AV_CODEC_CAP_VARIABLE_FRAME_SIZE set, then the
+     *   frame size is not restricted.
+     * - decoding: may be set by some decoders to indicate constant frame size
+     */
+    int frame_size;
+
+    /**
+     * Frame counter, set by libavcodec.
+     *
+     * - decoding: total number of frames returned from the decoder so far.
+     * - encoding: total number of frames passed to the encoder so far.
+     *
+     *   @note the counter is not incremented if encoding/decoding resulted in
+     *   an error.
+     */
+    int frame_number;
+
+    /**
+     * number of bytes per packet if constant and known or 0
+     * Used by some WAV based audio codecs.
+     */
+    int block_align;
+
+    /**
+     * Audio cutoff bandwidth (0 means "automatic")
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int cutoff;
+
+    /**
+     * Audio channel layout.
+     * - encoding: set by user.
+     * - decoding: set by user, may be overwritten by libavcodec.
+     */
+    uint64_t channel_layout;
+
+    /**
+     * Request decoder to use this channel layout if it can (0 for default)
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    uint64_t request_channel_layout;
+
+    /**
+     * Type of service that the audio stream conveys.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    enum AVAudioServiceType audio_service_type;
+
+    /**
+     * desired sample format
+     * - encoding: Not used.
+     * - decoding: Set by user.
+     * Decoder will decode to this format if it can.
+     */
+    enum AVSampleFormat request_sample_fmt;
+
+    /**
+     * This callback is called at the beginning of each frame to get data
+     * buffer(s) for it. There may be one contiguous buffer for all the data or
+     * there may be a buffer per each data plane or anything in between. What
+     * this means is, you may set however many entries in buf[] you feel necessary.
+     * Each buffer must be reference-counted using the AVBuffer API (see description
+     * of buf[] below).
+     *
+     * The following fields will be set in the frame before this callback is
+     * called:
+     * - format
+     * - width, height (video only)
+     * - sample_rate, channel_layout, nb_samples (audio only)
+     * Their values may differ from the corresponding values in
+     * AVCodecContext. This callback must use the frame values, not the codec
+     * context values, to calculate the required buffer size.
+     *
+     * This callback must fill the following fields in the frame:
+     * - data[]
+     * - linesize[]
+     * - extended_data:
+     *   * if the data is planar audio with more than 8 channels, then this
+     *     callback must allocate and fill extended_data to contain all pointers
+     *     to all data planes. data[] must hold as many pointers as it can.
+     *     extended_data must be allocated with av_malloc() and will be freed in
+     *     av_frame_unref().
+     *   * otherwise extended_data must point to data
+     * - buf[] must contain one or more pointers to AVBufferRef structures. Each of
+     *   the frame's data and extended_data pointers must be contained in these. That
+     *   is, one AVBufferRef for each allocated chunk of memory, not necessarily one
+     *   AVBufferRef per data[] entry. See: av_buffer_create(), av_buffer_alloc(),
+     *   and av_buffer_ref().
+     * - extended_buf and nb_extended_buf must be allocated with av_malloc() by
+     *   this callback and filled with the extra buffers if there are more
+     *   buffers than buf[] can hold. extended_buf will be freed in
+     *   av_frame_unref().
+     *
+     * If AV_CODEC_CAP_DR1 is not set then get_buffer2() must call
+     * avcodec_default_get_buffer2() instead of providing buffers allocated by
+     * some other means.
+     *
+     * Each data plane must be aligned to the maximum required by the target
+     * CPU.
+     *
+     * @see avcodec_default_get_buffer2()
+     *
+     * Video:
+     *
+     * If AV_GET_BUFFER_FLAG_REF is set in flags then the frame may be reused
+     * (read and/or written to if it is writable) later by libavcodec.
+     *
+     * avcodec_align_dimensions2() should be used to find the required width and
+     * height, as they normally need to be rounded up to the next multiple of 16.
+     *
+     * Some decoders do not support linesizes changing between frames.
+     *
+     * If frame multithreading is used and thread_safe_callbacks is set,
+     * this callback may be called from a different thread, but not from more
+     * than one at once. Does not need to be reentrant.
+     *
+     * @see avcodec_align_dimensions2()
+     *
+     * Audio:
+     *
+     * Decoders request a buffer of a particular size by setting
+     * AVFrame.nb_samples prior to calling get_buffer2(). The decoder may,
+     * however, utilize only part of the buffer by setting AVFrame.nb_samples
+     * to a smaller value in the output frame.
+     *
+     * As a convenience, av_samples_get_buffer_size() and
+     * av_samples_fill_arrays() in libavutil may be used by custom get_buffer2()
+     * functions to find the required data size and to fill data pointers and
+     * linesize. In AVFrame.linesize, only linesize[0] may be set for audio
+     * since all planes must be the same size.
+     *
+     * @see av_samples_get_buffer_size(), av_samples_fill_arrays()
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec, user can override.
+     */
+    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+    /**
+     * If non-zero, the decoded audio and video frames returned from
+     * avcodec_decode_video2() and avcodec_decode_audio4() are reference-counted
+     * and are valid indefinitely. The caller must free them with
+     * av_frame_unref() when they are not needed anymore.
+     * Otherwise, the decoded frames must not be freed by the caller and are
+     * only valid until the next decode call.
+     *
+     * This is always automatically enabled if avcodec_receive_frame() is used.
+     *
+     * - encoding: unused
+     * - decoding: set by the caller before avcodec_open2().
+     */
+    int refcounted_frames;
+
+    /* - encoding parameters */
+    float qcompress;  ///< amount of qscale change between easy & hard scenes (0.0-1.0)
+    float qblur;      ///< amount of qscale smoothing over time (0.0-1.0)
+
+    /**
+     * minimum quantizer
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int qmin;
+
+    /**
+     * maximum quantizer
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int qmax;
+
+    /**
+     * maximum quantizer difference between frames
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int max_qdiff;
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    float rc_qsquish;
+
+    attribute_deprecated
+    float rc_qmod_amp;
+    attribute_deprecated
+    int rc_qmod_freq;
+#endif
+
+    /**
+     * decoder bitstream buffer size
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int rc_buffer_size;
+
+    /**
+     * ratecontrol override, see RcOverride
+     * - encoding: Allocated/set/freed by user.
+     * - decoding: unused
+     */
+    int rc_override_count;
+    RcOverride *rc_override;
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    const char *rc_eq;
+#endif
+
+    /**
+     * maximum bitrate
+     * - encoding: Set by user.
+     * - decoding: Set by user, may be overwritten by libavcodec.
+     */
+    int64_t rc_max_rate;
+
+    /**
+     * minimum bitrate
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int64_t rc_min_rate;
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    float rc_buffer_aggressivity;
+
+    attribute_deprecated
+    float rc_initial_cplx;
+#endif
+
+    /**
+     * Rate control attempts to use, at maximum, <value> of what can be used without an underflow.
+     * - encoding: Set by user.
+     * - decoding: unused.
+     */
+    float rc_max_available_vbv_use;
+
+    /**
+     * Rate control attempts to use, at least, <value> times the amount needed to prevent a VBV overflow.
+     * - encoding: Set by user.
+     * - decoding: unused.
+     */
+    float rc_min_vbv_overflow_use;
+
+    /**
+     * Number of bits which should be loaded into the rc buffer before decoding starts.
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int rc_initial_buffer_occupancy;
+
+#if FF_API_CODER_TYPE
+#define FF_CODER_TYPE_VLC       0
+#define FF_CODER_TYPE_AC        1
+#define FF_CODER_TYPE_RAW       2
+#define FF_CODER_TYPE_RLE       3
+#if FF_API_UNUSED_MEMBERS
+#define FF_CODER_TYPE_DEFLATE   4
+#endif /* FF_API_UNUSED_MEMBERS */
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    int coder_type;
+#endif /* FF_API_CODER_TYPE */
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int context_model;
+#endif
+
+#if FF_API_MPV_OPT
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    int lmin;
+
+    /**
+     * @deprecated use encoder private options instead
+     */
+    attribute_deprecated
+    int lmax;
+#endif
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int frame_skip_threshold;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int frame_skip_factor;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int frame_skip_exp;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int frame_skip_cmp;
+#endif /* FF_API_PRIVATE_OPT */
+
+    /**
+     * trellis RD quantization
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int trellis;
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int min_prediction_order;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int max_prediction_order;
+
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int64_t timecode_frame_start;
+#endif
+
+#if FF_API_RTP_CALLBACK
+    /**
+     * @deprecated unused
+     */
+    /* The RTP callback: This function is called    */
+    /* every time the encoder has a packet to send. */
+    /* It depends on the encoder if the data starts */
+    /* with a Start Code (it should). H.263 does.   */
+    /* mb_nb contains the number of macroblocks     */
+    /* encoded in the RTP payload.                  */
+    attribute_deprecated
+    void (*rtp_callback)(struct AVCodecContext *avctx, void *data, int size, int mb_nb);
+#endif
+
+#if FF_API_PRIVATE_OPT
+    /** @deprecated use encoder private options instead */
+    attribute_deprecated
+    int rtp_payload_size;   /* The size of the RTP payload: the coder will  */
+                            /* do its best to deliver a chunk with size     */
+                            /* below rtp_payload_size, the chunk will start */
+                            /* with a start code on some codecs like H.263. */
+                            /* This doesn't take account of any particular  */
+                            /* headers inside the transmitted RTP payload.  */
+#endif
+
+#if FF_API_STAT_BITS
+    /* statistics, used for 2-pass encoding */
+    attribute_deprecated
+    int mv_bits;
+    attribute_deprecated
+    int header_bits;
+    attribute_deprecated
+    int i_tex_bits;
+    attribute_deprecated
+    int p_tex_bits;
+    attribute_deprecated
+    int i_count;
+    attribute_deprecated
+    int p_count;
+    attribute_deprecated
+    int skip_count;
+    attribute_deprecated
+    int misc_bits;
+
+    /** @deprecated this field is unused */
+    attribute_deprecated
+    int frame_bits;
+#endif
+
+    /**
+     * pass1 encoding statistics output buffer
+     * - encoding: Set by libavcodec.
+     * - decoding: unused
+     */
+    char *stats_out;
+
+    /**
+     * pass2 encoding statistics input buffer
+     * Concatenated stuff from stats_out of pass1 should be placed here.
+     * - encoding: Allocated/set/freed by user.
+     * - decoding: unused
+     */
+    char *stats_in;
+
+    /**
+     * Work around bugs in encoders which sometimes cannot be detected automatically.
+     * - encoding: Set by user
+     * - decoding: Set by user
+     */
+    int workaround_bugs;
+#define FF_BUG_AUTODETECT       1  ///< autodetection
+#if FF_API_OLD_MSMPEG4
+#define FF_BUG_OLD_MSMPEG4      2
+#endif
+#define FF_BUG_XVID_ILACE       4
+#define FF_BUG_UMP4             8
+#define FF_BUG_NO_PADDING       16
+#define FF_BUG_AMV              32
+#if FF_API_AC_VLC
+#define FF_BUG_AC_VLC           0  ///< Will be removed, libavcodec can now handle these non-compliant files by default.
+#endif
+#define FF_BUG_QPEL_CHROMA      64
+#define FF_BUG_STD_QPEL         128
+#define FF_BUG_QPEL_CHROMA2     256
+#define FF_BUG_DIRECT_BLOCKSIZE 512
+#define FF_BUG_EDGE             1024
+#define FF_BUG_HPEL_CHROMA      2048
+#define FF_BUG_DC_CLIP          4096
+#define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
+#define FF_BUG_TRUNCATED       16384
+#define FF_BUG_IEDGE           32768
+
+    /**
+     * strictly follow the standard (MPEG-4, ...).
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     * Setting this to STRICT or higher means the encoder and decoder will
+     * generally do stupid things, whereas setting it to unofficial or lower
+     * will mean the encoder might produce output that is not supported by all
+     * spec-compliant decoders. Decoders don't differentiate between normal,
+     * unofficial and experimental (that is, they always try to decode things
+     * when they can) unless they are explicitly asked to behave stupidly
+     * (=strictly conform to the specs)
+     */
+    int strict_std_compliance;
+#define FF_COMPLIANCE_VERY_STRICT   2 ///< Strictly conform to an older more strict version of the spec or reference software.
+#define FF_COMPLIANCE_STRICT        1 ///< Strictly conform to all the things in the spec no matter what consequences.
+#define FF_COMPLIANCE_NORMAL        0
+#define FF_COMPLIANCE_UNOFFICIAL   -1 ///< Allow unofficial extensions
+#define FF_COMPLIANCE_EXPERIMENTAL -2 ///< Allow nonstandardized experimental things.
+
+    /**
+     * error concealment flags
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int error_concealment;
+#define FF_EC_GUESS_MVS   1
+#define FF_EC_DEBLOCK     2
+#define FF_EC_FAVOR_INTER 256
+
+    /**
+     * debug
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug;
+#define FF_DEBUG_PICT_INFO   1
+#define FF_DEBUG_RC          2
+#define FF_DEBUG_BITSTREAM   4
+#define FF_DEBUG_MB_TYPE     8
+#define FF_DEBUG_QP          16
+#if FF_API_DEBUG_MV
+/**
+ * @deprecated this option does nothing
+ */
+#define FF_DEBUG_MV          32
+#endif
+#define FF_DEBUG_DCT_COEFF   0x00000040
+#define FF_DEBUG_SKIP        0x00000080
+#define FF_DEBUG_STARTCODE   0x00000100
+#if FF_API_UNUSED_MEMBERS
+#define FF_DEBUG_PTS         0x00000200
+#endif /* FF_API_UNUSED_MEMBERS */
+#define FF_DEBUG_ER          0x00000400
+#define FF_DEBUG_MMCO        0x00000800
+#define FF_DEBUG_BUGS        0x00001000
+#if FF_API_DEBUG_MV
+#define FF_DEBUG_VIS_QP      0x00002000
+#define FF_DEBUG_VIS_MB_TYPE 0x00004000
+#endif
+#define FF_DEBUG_BUFFERS     0x00008000
+#define FF_DEBUG_THREADS     0x00010000
+#define FF_DEBUG_GREEN_MD    0x00800000
+#define FF_DEBUG_NOMC        0x01000000
+
+#if FF_API_DEBUG_MV
+    /**
+     * debug
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 // visualize forward predicted MVs of P-frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 // visualize forward predicted MVs of B-frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 // visualize backward predicted MVs of B-frames
+#endif
+
+    /**
+     * Error recognition; may misdetect some more or less valid parts as errors.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int err_recognition;
+
+/**
+ * Verify checksums embedded in the bitstream (could be of either encoded or
+ * decoded data, depending on the codec) and print an error message on mismatch.
+ * If AV_EF_EXPLODE is also set, a mismatching checksum will result in the
+ * decoder returning an error.
+ */
+#define AV_EF_CRCCHECK  (1<<0)
+#define AV_EF_BITSTREAM (1<<1)          ///< detect bitstream specification deviations
+#define AV_EF_BUFFER    (1<<2)          ///< detect improper bitstream length
+#define AV_EF_EXPLODE   (1<<3)          ///< abort decoding on minor error detection
+
+#define AV_EF_IGNORE_ERR (1<<15)        ///< ignore errors and continue
+#define AV_EF_CAREFUL    (1<<16)        ///< consider things that violate the spec, are fast to calculate and have not been seen in the wild as errors
+#define AV_EF_COMPLIANT  (1<<17)        ///< consider all spec non compliances as errors
+#define AV_EF_AGGRESSIVE (1<<18)        ///< consider things that a sane encoder should not do as an error
+
+
+    /**
+     * opaque 64-bit number (generally a PTS) that will be reordered and
+     * output in AVFrame.reordered_opaque
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    int64_t reordered_opaque;
+
+    /**
+     * Hardware accelerator in use
+     * - encoding: unused.
+     * - decoding: Set by libavcodec
+     */
+    struct AVHWAccel *hwaccel;
+
+    /**
+     * Hardware accelerator context.
+     * For some hardware accelerators, a global context needs to be
+     * provided by the user. In that case, this holds display-dependent
+     * data FFmpeg cannot instantiate itself. Please refer to the
+     * FFmpeg HW accelerator documentation to know how to fill this
+     * in. E.g. for VA API, this is a struct vaapi_context.
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    void *hwaccel_context;
+
+    /**
+     * error
+     * - encoding: Set by libavcodec if flags & AV_CODEC_FLAG_PSNR.
+     * - decoding: unused
+     */
+    uint64_t error[AV_NUM_DATA_POINTERS];
+
+    /**
+     * DCT algorithm, see FF_DCT_* below
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+    int dct_algo;
+#define FF_DCT_AUTO    0
+#define FF_DCT_FASTINT 1
+#define FF_DCT_INT     2
+#define FF_DCT_MMX     3
+#define FF_DCT_ALTIVEC 5
+#define FF_DCT_FAAN    6
+
+    /**
+     * IDCT algorithm, see FF_IDCT_* below.
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int idct_algo;
+#define FF_IDCT_AUTO          0
+#define FF_IDCT_INT           1
+#define FF_IDCT_SIMPLE        2
+#define FF_IDCT_SIMPLEMMX     3
+#define FF_IDCT_ARM           7
+#define FF_IDCT_ALTIVEC       8
+#if FF_API_ARCH_SH4
+#define FF_IDCT_SH4           9
+#endif
+#define FF_IDCT_SIMPLEARM     10
+#if FF_API_UNUSED_MEMBERS
+#define FF_IDCT_IPP           13
+#endif /* FF_API_UNUSED_MEMBERS */
+#define FF_IDCT_XVID          14
+#if FF_API_IDCT_XVIDMMX
+#define FF_IDCT_XVIDMMX       14
+#endif /* FF_API_IDCT_XVIDMMX */
+#define FF_IDCT_SIMPLEARMV5TE 16
+#define FF_IDCT_SIMPLEARMV6   17
+#if FF_API_ARCH_SPARC
+#define FF_IDCT_SIMPLEVIS     18
+#endif
+#define FF_IDCT_FAAN          20
+#define FF_IDCT_SIMPLENEON    22
+#if FF_API_ARCH_ALPHA
+#define FF_IDCT_SIMPLEALPHA   23
+#endif
+#define FF_IDCT_SIMPLEAUTO    128
+
+    /**
+     * bits per sample/pixel from the demuxer (needed for huffyuv).
+     * - encoding: Set by libavcodec.
+     * - decoding: Set by user.
+     */
+     int bits_per_coded_sample;
+
+    /**
+     * Bits per sample/pixel of internal libavcodec pixel/sample format.
+     * - encoding: set by user.
+     * - decoding: set by libavcodec.
+     */
+    int bits_per_raw_sample;
+
+#if FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+     int lowres;
+#endif
+
+#if FF_API_CODED_FRAME
+    /**
+     * the picture in the bitstream
+     * - encoding: Set by libavcodec.
+     * - decoding: unused
+     *
+     * @deprecated use the quality factor packet side data instead
+     */
+    attribute_deprecated AVFrame *coded_frame;
+#endif
+
+    /**
+     * thread count
+     * is used to decide how many independent tasks should be passed to execute()
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int thread_count;
+
+    /**
+     * Which multithreading methods to use.
+     * Use of FF_THREAD_FRAME will increase decoding delay by one frame per thread,
+     * so clients which cannot provide future frames should not use it.
+     *
+     * - encoding: Set by user, otherwise the default is used.
+     * - decoding: Set by user, otherwise the default is used.
+     */
+    int thread_type;
+#define FF_THREAD_FRAME   1 ///< Decode more than one frame at once
+#define FF_THREAD_SLICE   2 ///< Decode more than one part of a single frame at once
+
+    /**
+     * Which multithreading methods are in use by the codec.
+     * - encoding: Set by libavcodec.
+     * - decoding: Set by libavcodec.
+     */
+    int active_thread_type;
+
+    /**
+     * Set by the client if its custom get_buffer2() callback can be called
+     * synchronously from another thread, which allows faster multithreaded decoding.
+     * draw_horiz_band() will be called from other threads regardless of this setting.
+     * Ignored if the default avcodec_default_get_buffer2() is used.
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int thread_safe_callbacks;
+
+    /**
+     * The codec may call this to execute several independent things.
+     * It will return only after finishing all tasks.
+     * The user may replace this with some multithreaded implementation,
+     * the default implementation will execute the parts serially.
+     * @param count the number of things to execute
+     * - encoding: Set by libavcodec, user can override.
+     * - decoding: Set by libavcodec, user can override.
+     */
+    int (*execute)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg), void *arg2, int *ret, int count, int size);
+
+    /**
+     * The codec may call this to execute several independent things.
+     * It will return only after finishing all tasks.
+     * The user may replace this with some multithreaded implementation,
+     * the default implementation will execute the parts serially.
+     * Also see avcodec_thread_init and e.g. the --enable-pthread configure option.
+     * @param c context passed also to func
+     * @param count the number of things to execute
+     * @param arg2 argument passed unchanged to func
+     * @param ret return values of executed functions, must have space for "count" values. May be NULL.
+     * @param func function that will be called count times, with jobnr from 0 to count-1.
+     *             threadnr will be in the range 0 to c->thread_count-1 < MAX_THREADS and so that no
+     *             two instances of func executing at the same time will have the same threadnr.
+     * @return always 0 currently, but code should handle a future improvement where when any call to func
+     *         returns < 0 no further calls to func may be done and < 0 is returned.
+     * - encoding: Set by libavcodec, user can override.
+     * - decoding: Set by libavcodec, user can override.
+     */
+    int (*execute2)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, int count);
+
+    /**
+     * noise vs. sse weight for the nsse comparison function
+     * - encoding: Set by user.
+     * - decoding: unused
+     */
+     int nsse_weight;
+
+    /**
+     * profile
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+     int profile;
+#define FF_PROFILE_UNKNOWN -99
+#define FF_PROFILE_RESERVED -100
+
+#define FF_PROFILE_AAC_MAIN 0
+#define FF_PROFILE_AAC_LOW  1
+#define FF_PROFILE_AAC_SSR  2
+#define FF_PROFILE_AAC_LTP  3
+#define FF_PROFILE_AAC_HE   4
+#define FF_PROFILE_AAC_HE_V2 28
+#define FF_PROFILE_AAC_LD   22
+#define FF_PROFILE_AAC_ELD  38
+#define FF_PROFILE_MPEG2_AAC_LOW 128
+#define FF_PROFILE_MPEG2_AAC_HE  131
+
+#define FF_PROFILE_DNXHD         0
+#define FF_PROFILE_DNXHR_LB      1
+#define FF_PROFILE_DNXHR_SQ      2
+#define FF_PROFILE_DNXHR_HQ      3
+#define FF_PROFILE_DNXHR_HQX     4
+#define FF_PROFILE_DNXHR_444     5
+
+#define FF_PROFILE_DTS         20
+#define FF_PROFILE_DTS_ES      30
+#define FF_PROFILE_DTS_96_24   40
+#define FF_PROFILE_DTS_HD_HRA  50
+#define FF_PROFILE_DTS_HD_MA   60
+#define FF_PROFILE_DTS_EXPRESS 70
+
+#define FF_PROFILE_MPEG2_422    0
+#define FF_PROFILE_MPEG2_HIGH   1
+#define FF_PROFILE_MPEG2_SS     2
+#define FF_PROFILE_MPEG2_SNR_SCALABLE  3
+#define FF_PROFILE_MPEG2_MAIN   4
+#define FF_PROFILE_MPEG2_SIMPLE 5
+
+#define FF_PROFILE_H264_CONSTRAINED  (1<<9)  // 8+1; constraint_set1_flag
+#define FF_PROFILE_H264_INTRA        (1<<11) // 8+3; constraint_set3_flag
+
+#define FF_PROFILE_H264_BASELINE             66
+#define FF_PROFILE_H264_CONSTRAINED_BASELINE (66|FF_PROFILE_H264_CONSTRAINED)
+#define FF_PROFILE_H264_MAIN                 77
+#define FF_PROFILE_H264_EXTENDED             88
+#define FF_PROFILE_H264_HIGH                 100
+#define FF_PROFILE_H264_HIGH_10              110
+#define FF_PROFILE_H264_HIGH_10_INTRA        (110|FF_PROFILE_H264_INTRA)
+#define FF_PROFILE_H264_MULTIVIEW_HIGH       118
+#define FF_PROFILE_H264_HIGH_422             122
+#define FF_PROFILE_H264_HIGH_422_INTRA       (122|FF_PROFILE_H264_INTRA)
+#define FF_PROFILE_H264_STEREO_HIGH          128
+#define FF_PROFILE_H264_HIGH_444             144
+#define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
+#define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
+#define FF_PROFILE_H264_CAVLC_444            44
+
+#define FF_PROFILE_VC1_SIMPLE   0
+#define FF_PROFILE_VC1_MAIN     1
+#define FF_PROFILE_VC1_COMPLEX  2
+#define FF_PROFILE_VC1_ADVANCED 3
+
+#define FF_PROFILE_MPEG4_SIMPLE                     0
+#define FF_PROFILE_MPEG4_SIMPLE_SCALABLE            1
+#define FF_PROFILE_MPEG4_CORE                       2
+#define FF_PROFILE_MPEG4_MAIN                       3
+#define FF_PROFILE_MPEG4_N_BIT                      4
+#define FF_PROFILE_MPEG4_SCALABLE_TEXTURE           5
+#define FF_PROFILE_MPEG4_SIMPLE_FACE_ANIMATION      6
+#define FF_PROFILE_MPEG4_BASIC_ANIMATED_TEXTURE     7
+#define FF_PROFILE_MPEG4_HYBRID                     8
+#define FF_PROFILE_MPEG4_ADVANCED_REAL_TIME         9
+#define FF_PROFILE_MPEG4_CORE_SCALABLE             10
+#define FF_PROFILE_MPEG4_ADVANCED_CODING           11
+#define FF_PROFILE_MPEG4_ADVANCED_CORE             12
+#define FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE 13
+#define FF_PROFILE_MPEG4_SIMPLE_STUDIO             14
+#define FF_PROFILE_MPEG4_ADVANCED_SIMPLE           15
+
+#define FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_0   1
+#define FF_PROFILE_JPEG2000_CSTREAM_RESTRICTION_1   2
+#define FF_PROFILE_JPEG2000_CSTREAM_NO_RESTRICTION  32768
+#define FF_PROFILE_JPEG2000_DCINEMA_2K              3
+#define FF_PROFILE_JPEG2000_DCINEMA_4K              4
+
+#define FF_PROFILE_VP9_0                            0
+#define FF_PROFILE_VP9_1                            1
+#define FF_PROFILE_VP9_2                            2
+#define FF_PROFILE_VP9_3                            3
+
+#define FF_PROFILE_HEVC_MAIN                        1
+#define FF_PROFILE_HEVC_MAIN_10                     2
+#define FF_PROFILE_HEVC_MAIN_STILL_PICTURE          3
+#define FF_PROFILE_HEVC_REXT                        4
+
+    /**
+     * level
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+     int level;
+#define FF_LEVEL_UNKNOWN -99
+
+    /**
+     * Skip loop filtering for selected frames.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    enum AVDiscard skip_loop_filter;
+
+    /**
+     * Skip IDCT/dequantization for selected frames.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    enum AVDiscard skip_idct;
+
+    /**
+     * Skip decoding for selected frames.
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+    enum AVDiscard skip_frame;
+
+    /**
+     * Header containing style information for text subtitles.
+     * For SUBTITLE_ASS subtitle type, it should contain the whole ASS
+     * [Script Info] and [V4+ Styles] section, plus the [Events] line and
+     * the Format line following. It shouldn't include any Dialogue line.
+     * - encoding: Set/allocated/freed by user (before avcodec_open2())
+     * - decoding: Set/allocated/freed by libavcodec (by avcodec_open2())
+     */
+    uint8_t *subtitle_header;
+    int subtitle_header_size;
+
+#if FF_API_ERROR_RATE
+    /**
+     * @deprecated use the 'error_rate' private AVOption of the mpegvideo
+     * encoders
+     */
+    attribute_deprecated
+    int error_rate;
+#endif
+
+#if FF_API_VBV_DELAY
+    /**
+     * VBV delay coded in the last frame (in periods of a 27 MHz clock).
+     * Used for compliant TS muxing.
+     * - encoding: Set by libavcodec.
+     * - decoding: unused.
+     * @deprecated this value is now exported as a part of
+     * AV_PKT_DATA_CPB_PROPERTIES packet side data
+     */
+    attribute_deprecated
+    uint64_t vbv_delay;
+#endif
+
+#if FF_API_SIDEDATA_ONLY_PKT
+    /**
+     * Encoding only and set by default. Allow encoders to output packets
+     * that do not contain any encoded data, only side data.
+     *
+     * Some encoders need to output such packets, e.g. to update some stream
+     * parameters at the end of encoding.
+     *
+     * @deprecated this field disables the default behaviour and
+     *             it is kept only for compatibility.
+     */
+    attribute_deprecated
+    int side_data_only_packets;
+#endif
+
+    /**
+     * Audio only. The number of "priming" samples (padding) inserted by the
+     * encoder at the beginning of the audio. I.e. this number of leading
+     * decoded samples must be discarded by the caller to get the original audio
+     * without leading padding.
+     *
+     * - decoding: unused
+     * - encoding: Set by libavcodec. The timestamps on the output packets are
+     *             adjusted by the encoder so that they always refer to the
+     *             first sample of the data actually contained in the packet,
+     *             including any added padding.  E.g. if the timebase is
+     *             1/samplerate and the timestamp of the first input sample is
+     *             0, the timestamp of the first output packet will be
+     *             -initial_padding.
+     */
+    int initial_padding;
+
+    /**
+     * - decoding: For codecs that store a framerate value in the compressed
+     *             bitstream, the decoder may export it here. { 0, 1} when
+     *             unknown.
+     * - encoding: May be used to signal the framerate of CFR content to an
+     *             encoder.
+     */
+    AVRational framerate;
+
+    /**
+     * Nominal unaccelerated pixel format, see AV_PIX_FMT_xxx.
+     * - encoding: unused.
+     * - decoding: Set by libavcodec before calling get_format()
+     */
+    enum AVPixelFormat sw_pix_fmt;
+
+    /**
+     * Timebase in which pkt_dts/pts and AVPacket.dts/pts are.
+     * - encoding: unused.
+     * - decoding: set by user.
+     */
+    AVRational pkt_timebase;
+
+    /**
+     * AVCodecDescriptor
+     * - encoding: unused.
+     * - decoding: set by libavcodec.
+     */
+    const AVCodecDescriptor *codec_descriptor;
+
+#if !FF_API_LOWRES
+    /**
+     * low resolution decoding, 1-> 1/2 size, 2->1/4 size
+     * - encoding: unused
+     * - decoding: Set by user.
+     */
+     int lowres;
+#endif
+
+    /**
+     * Current statistics for PTS correction.
+     * - decoding: maintained and used by libavcodec, not intended to be used by user apps
+     * - encoding: unused
+     */
+    int64_t pts_correction_num_faulty_pts; ///< Number of incorrect PTS values so far
+    int64_t pts_correction_num_faulty_dts; ///< Number of incorrect DTS values so far
+    int64_t pts_correction_last_pts;       ///< PTS of the last frame
+    int64_t pts_correction_last_dts;       ///< DTS of the last frame
+
+    /**
+     * Character encoding of the input subtitles file.
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    char *sub_charenc;
+
+    /**
+     * Subtitles character encoding mode. Formats or codecs might be adjusting
+     * this setting (if they are doing the conversion themselves for instance).
+     * - decoding: set by libavcodec
+     * - encoding: unused
+     */
+    int sub_charenc_mode;
+#define FF_SUB_CHARENC_MODE_DO_NOTHING  -1  ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
+#define FF_SUB_CHARENC_MODE_AUTOMATIC    0  ///< libavcodec will select the mode itself
+#define FF_SUB_CHARENC_MODE_PRE_DECODER  1  ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
+
+    /**
+     * Skip processing alpha if supported by codec.
+     * Note that if the format uses pre-multiplied alpha (common with VP6,
+     * and recommended due to better video quality/compression)
+     * the image will look as if alpha-blended onto a black background.
+     * However for formats that do not use pre-multiplied alpha
+     * there might be serious artefacts (though e.g. libswscale currently
+     * assumes pre-multiplied alpha anyway).
+     *
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int skip_alpha;
+
+    /**
+     * Number of samples to skip after a discontinuity
+     * - decoding: unused
+     * - encoding: set by libavcodec
+     */
+    int seek_preroll;
+
+#if !FF_API_DEBUG_MV
+    /**
+     * debug motion vectors
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    int debug_mv;
+#define FF_DEBUG_VIS_MV_P_FOR  0x00000001 //visualize forward predicted MVs of P frames
+#define FF_DEBUG_VIS_MV_B_FOR  0x00000002 //visualize forward predicted MVs of B frames
+#define FF_DEBUG_VIS_MV_B_BACK 0x00000004 //visualize backward predicted MVs of B frames
+#endif
+
+    /**
+     * custom intra quantization matrix
+     * - encoding: Set by user, can be NULL.
+     * - decoding: unused.
+     */
+    uint16_t *chroma_intra_matrix;
+
+    /**
+     * dump format separator.
+     * can be ", " or "\n      " or anything else
+     * - encoding: Set by user.
+     * - decoding: Set by user.
+     */
+    uint8_t *dump_separator;
+
+    /**
+     * ',' separated list of allowed decoders.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *codec_whitelist;
+
+    /**
+     * Properties of the stream that gets decoded
+     * - encoding: unused
+     * - decoding: set by libavcodec
+     */
+    unsigned properties;
+#define FF_CODEC_PROPERTY_LOSSLESS        0x00000001
+#define FF_CODEC_PROPERTY_CLOSED_CAPTIONS 0x00000002
+
+    /**
+     * Additional data associated with the entire coded stream.
+     *
+     * - decoding: unused
+     * - encoding: may be set by libavcodec after avcodec_open2().
+     */
+    AVPacketSideData *coded_side_data;
+    int            nb_coded_side_data;
+
+    /**
+     * A reference to the AVHWFramesContext describing the input (for encoding)
+     * or output (decoding) frames. The reference is set by the caller and
+     * afterwards owned (and freed) by libavcodec - it should never be read by
+     * the caller after being set.
+     *
+     * - decoding: This field should be set by the caller from the get_format()
+     *             callback. The previous reference (if any) will always be
+     *             unreffed by libavcodec before the get_format() call.
+     *
+     *             If the default get_buffer2() is used with a hwaccel pixel
+     *             format, then this AVHWFramesContext will be used for
+     *             allocating the frame buffers.
+     *
+     * - encoding: For hardware encoders configured to use a hwaccel pixel
+     *             format, this field should be set by the caller to a reference
+     *             to the AVHWFramesContext describing input frames.
+     *             AVHWFramesContext.format must be equal to
+     *             AVCodecContext.pix_fmt.
+     *
+     *             This field should be set before avcodec_open2() is called.
+     */
+    AVBufferRef *hw_frames_ctx;
+
+    /**
+     * Control the form of AVSubtitle.rects[N]->ass
+     * - decoding: set by user
+     * - encoding: unused
+     */
+    int sub_text_format;
+#define FF_SUB_TEXT_FMT_ASS              0
+#if FF_API_ASS_TIMING
+#define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+#endif
+
+    /**
+     * Audio only. The amount of padding (in samples) appended by the encoder to
+     * the end of the audio. I.e. this number of decoded samples must be
+     * discarded by the caller from the end of the stream to get the original
+     * audio without any trailing padding.
+     *
+     * - decoding: unused
+     * - encoding: unused
+     */
+    int trailing_padding;
+
+    /**
+     * The number of pixels per image to maximally accept.
+     *
+     * - decoding: set by user
+     * - encoding: set by user
+     */
+    int64_t max_pixels;
+
+    /**
+     * A reference to the AVHWDeviceContext describing the device which will
+     * be used by a hardware encoder/decoder.  The reference is set by the
+     * caller and afterwards owned (and freed) by libavcodec.
+     *
+     * This should be used if either the codec device does not require
+     * hardware frames or any that are used are to be allocated internally by
+     * libavcodec.  If the user wishes to supply any of the frames used as
+     * encoder input or decoder output then hw_frames_ctx should be used
+     * instead.  When hw_frames_ctx is set in get_format() for a decoder, this
+     * field will be ignored while decoding the associated stream segment, but
+     * may again be used on a following one after another get_format() call.
+     *
+     * For both encoders and decoders this field should be set before
+     * avcodec_open2() is called and must not be written to thereafter.
+     *
+     * Note that some decoders may require this field to be set initially in
+     * order to support hw_frames_ctx at all - in that case, all frames
+     * contexts used must be created on the same device.
+     */
+    AVBufferRef *hw_device_ctx;
+
+    /**
+     * Bit set of AV_HWACCEL_FLAG_* flags, which affect hardware accelerated
+     * decoding (if active).
+     * - encoding: unused
+     * - decoding: Set by user (either before avcodec_open2(), or in the
+     *             AVCodecContext.get_format callback)
+     */
+    int hwaccel_flags;
+} AVCodecContext;
+
+AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx); /* get/set accessor pair; only the prototypes live in this header */
+void       av_codec_set_pkt_timebase         (AVCodecContext *avctx, AVRational val);
+
+const AVCodecDescriptor *av_codec_get_codec_descriptor(const AVCodecContext *avctx); /* get/set accessor pair; only the prototypes live in this header */
+void                     av_codec_set_codec_descriptor(AVCodecContext *avctx, const AVCodecDescriptor *desc);
+
+unsigned av_codec_get_codec_properties(const AVCodecContext *avctx); /* getter only: no matching setter is declared in this group */
+
+int  av_codec_get_lowres(const AVCodecContext *avctx); /* NOTE(review): presumably reads AVCodecContext.lowres - confirm */
+void av_codec_set_lowres(AVCodecContext *avctx, int val);
+
+int  av_codec_get_seek_preroll(const AVCodecContext *avctx); /* NOTE(review): presumably reads AVCodecContext.seek_preroll - confirm */
+void av_codec_set_seek_preroll(AVCodecContext *avctx, int val);
+
+uint16_t *av_codec_get_chroma_intra_matrix(const AVCodecContext *avctx); /* NOTE(review): presumably reads AVCodecContext.chroma_intra_matrix - confirm */
+void av_codec_set_chroma_intra_matrix(AVCodecContext *avctx, uint16_t *val);
+
+/**
+ * AVProfile: pairs a numeric codec profile value with its short name.
+ */
+typedef struct AVProfile {
+    int profile;      ///< profile value; FF_PROFILE_UNKNOWN terminates an AVCodec.profiles array
+    const char *name; ///< short name for the profile
+} AVProfile;
+
+typedef struct AVCodecDefault AVCodecDefault; /* incomplete type here; referenced by AVCodec.defaults */
+
+struct AVSubtitle; /* forward declaration for AVCodec.encode_sub(); the struct is defined further down in this header */
+
+/**
+ * AVCodec: describes one codec implementation (an encoder or a decoder).
+ */
+typedef struct AVCodec {
+    /**
+     * Name of the codec implementation.
+     * The name is globally unique among encoders and among decoders (but an
+     * encoder and a decoder can share the same name).
+     * This is the primary way to find a codec from the user perspective.
+     */
+    const char *name;
+    /**
+     * Descriptive name for the codec, meant to be more human readable than name.
+     * You should use the NULL_IF_CONFIG_SMALL() macro to define it.
+     */
+    const char *long_name;
+    enum AVMediaType type; ///< see AVMEDIA_TYPE_xxx
+    enum AVCodecID id;     ///< see AV_CODEC_ID_xxx
+    /**
+     * Codec capabilities.
+     * see AV_CODEC_CAP_*
+     */
+    int capabilities;
+    const AVRational *supported_framerates; ///< array of supported framerates, or NULL if any, array is terminated by {0,0}
+    const enum AVPixelFormat *pix_fmts;     ///< array of supported pixel formats, or NULL if unknown, array is terminated by -1
+    const int *supported_samplerates;       ///< array of supported audio samplerates, or NULL if unknown, array is terminated by 0
+    const enum AVSampleFormat *sample_fmts; ///< array of supported sample formats, or NULL if unknown, array is terminated by -1
+    const uint64_t *channel_layouts;         ///< array of supported channel layouts, or NULL if unknown, array is terminated by 0
+    uint8_t max_lowres;                     ///< maximum value for lowres supported by the decoder
+    const AVClass *priv_class;              ///< AVClass for the private context
+    const AVProfile *profiles;              ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN}
+
+    /*****************************************************************
+     * No fields below this line are part of the public API. They
+     * may not be used outside of libavcodec and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    int priv_data_size;
+    struct AVCodec *next;
+    /**
+     * @name Frame-level threading support functions
+     * @{
+     */
+    /**
+     * If defined, called on thread contexts when they are created.
+     * If the codec allocates writable tables in init(), re-allocate them here.
+     * priv_data will be set to a copy of the original.
+     */
+    int (*init_thread_copy)(AVCodecContext *);
+    /**
+     * Copy necessary context variables from a previous thread context to the current one.
+     * If not defined, the next thread will start automatically; otherwise, the codec
+     * must call ff_thread_finish_setup().
+     *
+     * dst and src will (rarely) point to the same context, in which case memcpy should be skipped.
+     */
+    int (*update_thread_context)(AVCodecContext *dst, const AVCodecContext *src);
+    /** @} */
+
+    /**
+     * Private codec-specific defaults.
+     */
+    const AVCodecDefault *defaults;
+
+    /**
+     * Initialize codec static data, called from avcodec_register().
+     */
+    void (*init_static_data)(struct AVCodec *codec);
+
+    int (*init)(AVCodecContext *);
+    int (*encode_sub)(AVCodecContext *, uint8_t *buf, int buf_size,
+                      const struct AVSubtitle *sub);
+    /**
+     * Encode data to an AVPacket.
+     *
+     * @param      avctx          codec context
+     * @param      avpkt          output AVPacket (may contain a user-provided buffer)
+     * @param[in]  frame          AVFrame containing the raw data to be encoded
+     * @param[out] got_packet_ptr encoder sets to 0 or 1 to indicate that a
+     *                            non-empty packet was returned in avpkt.
+     * @return 0 on success, negative error code on failure
+     */
+    int (*encode2)(AVCodecContext *avctx, AVPacket *avpkt, const AVFrame *frame,
+                   int *got_packet_ptr);
+    int (*decode)(AVCodecContext *, void *outdata, int *outdata_size, AVPacket *avpkt);
+    int (*close)(AVCodecContext *);
+    /**
+     * Decode/encode API with decoupled packet/frame dataflow. The API is the
+     * same as the avcodec_ prefixed APIs (avcodec_send_frame() etc.), except
+     * that:
+     * - never called if the codec is closed or the wrong type,
+     * - AVPacket parameter change side data is applied right before calling
+     *   AVCodec->send_packet,
+     * - if AV_CODEC_CAP_DELAY is not set, drain packets or frames are never sent,
+     * - only one drain packet is ever passed down (until the next flush()),
+     * - a drain AVPacket is always NULL (no need to check for avpkt->size).
+     */
+    int (*send_frame)(AVCodecContext *avctx, const AVFrame *frame);
+    int (*send_packet)(AVCodecContext *avctx, const AVPacket *avpkt);
+    int (*receive_frame)(AVCodecContext *avctx, AVFrame *frame);
+    int (*receive_packet)(AVCodecContext *avctx, AVPacket *avpkt);
+    /**
+     * Flush buffers.
+     * Will be called when seeking
+     */
+    void (*flush)(AVCodecContext *);
+    /**
+     * Internal codec capabilities.
+     * See FF_CODEC_CAP_* in internal.h
+     */
+    int caps_internal;
+} AVCodec;
+
+int av_codec_get_max_lowres(const AVCodec *codec); /* NOTE(review): presumably returns AVCodec.max_lowres - confirm */
+
+struct MpegEncContext; /* forward declaration; needed as a pointer parameter type by AVHWAccel.decode_mb() below */
+
+/**
+ * @defgroup lavc_hwaccel AVHWAccel
+ * @{
+ */
+typedef struct AVHWAccel {
+    /**
+     * Name of the hardware accelerated codec.
+     * The name is globally unique among encoders and among decoders (but an
+     * encoder and a decoder can share the same name).
+     */
+    const char *name;
+
+    /**
+     * Type of codec implemented by the hardware accelerator.
+     *
+     * See AVMEDIA_TYPE_xxx
+     */
+    enum AVMediaType type;
+
+    /**
+     * Codec implemented by the hardware accelerator.
+     *
+     * See AV_CODEC_ID_xxx
+     */
+    enum AVCodecID id;
+
+    /**
+     * Supported pixel format.
+     *
+     * Only hardware accelerated formats are supported here.
+     */
+    enum AVPixelFormat pix_fmt;
+
+    /**
+     * Hardware accelerated codec capabilities.
+     * see HWACCEL_CODEC_CAP_*
+     */
+    int capabilities;
+
+    /*****************************************************************
+     * No fields below this line are part of the public API. They
+     * may not be used outside of libavcodec and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    struct AVHWAccel *next;
+
+    /**
+     * Allocate a custom buffer
+     */
+    int (*alloc_frame)(AVCodecContext *avctx, AVFrame *frame);
+
+    /**
+     * Called at the beginning of each frame or field picture.
+     *
+     * Meaningful frame information (codec specific) is guaranteed to
+     * be parsed at this point. This function is mandatory.
+     *
+     * Note that buf can be NULL along with buf_size set to 0.
+     * Otherwise, this means the whole frame is available at this point.
+     *
+     * @param avctx the codec context
+     * @param buf the frame data buffer base
+     * @param buf_size the size of the frame in bytes
+     * @return zero if successful, a negative value otherwise
+     */
+    int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+
+    /**
+     * Callback for each slice.
+     *
+     * Meaningful slice information (codec specific) is guaranteed to
+     * be parsed at this point. This function is mandatory.
+     * The only exception is XvMC, which works at macroblock (MB) level.
+     *
+     * @param avctx the codec context
+     * @param buf the slice data buffer base
+     * @param buf_size the size of the slice in bytes
+     * @return zero if successful, a negative value otherwise
+     */
+    int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
+
+    /**
+     * Called at the end of each frame or field picture.
+     *
+     * The whole picture is parsed at this point and can now be sent
+     * to the hardware accelerator. This function is mandatory.
+     *
+     * @param avctx the codec context
+     * @return zero if successful, a negative value otherwise
+     */
+    int (*end_frame)(AVCodecContext *avctx);
+
+    /**
+     * Size of per-frame hardware accelerator private data.
+     *
+     * Private data is allocated with av_mallocz() before
+     * AVCodecContext.get_buffer() and deallocated after
+     * AVCodecContext.release_buffer().
+     */
+    int frame_priv_data_size;
+
+    /**
+     * Called for every Macroblock in a slice.
+     *
+     * XvMC uses it to replace ff_mpv_decode_mb().
+     * Instead of decoding to a raw picture, MB parameters are
+     * stored in an array provided by the video driver.
+     *
+     * @param s the mpeg context
+     */
+    void (*decode_mb)(struct MpegEncContext *s);
+
+    /**
+     * Initialize the hwaccel private data.
+     *
+     * This will be called from ff_get_format(), after hwaccel and
+     * hwaccel_context are set and the hwaccel private data in AVCodecInternal
+     * is allocated.
+     */
+    int (*init)(AVCodecContext *avctx);
+
+    /**
+     * Uninitialize the hwaccel private data.
+     *
+     * This will be called from get_format() or avcodec_close(), after hwaccel
+     * and hwaccel_context are already uninitialized.
+     */
+    int (*uninit)(AVCodecContext *avctx);
+
+    /**
+     * Size of the private data to allocate in
+     * AVCodecInternal.hwaccel_priv_data.
+     */
+    int priv_data_size;
+
+    /**
+     * Internal hwaccel capabilities.
+     */
+    int caps_internal;
+} AVHWAccel;
+
+/**
+ * Hardware acceleration should be used for decoding even if the codec level
+ * used is unknown or higher than the maximum supported level reported by the
+ * hardware driver.
+ *
+ * It's generally a good idea to pass this flag unless you have a specific
+ * reason not to, as hardware tends to under-report supported levels.
+ */
+#define AV_HWACCEL_FLAG_IGNORE_LEVEL (1 << 0) /* bit for AVCodecContext.hwaccel_flags */
+
+/**
+ * Hardware acceleration can output YUV pixel formats with a different chroma
+ * sampling than 4:2:0 and/or other than 8 bits per component.
+ */
+#define AV_HWACCEL_FLAG_ALLOW_HIGH_DEPTH (1 << 1) /* bit for AVCodecContext.hwaccel_flags */
+
+/**
+ * @}
+ */
+
+#if FF_API_AVPICTURE
+/**
+ * @defgroup lavc_picture AVPicture
+ *
+ * Functions for working with AVPicture
+ * @{
+ */
+
+/**
+ * Picture data structure.
+ *
+ * Up to four components can be stored into it; the last component is
+ * alpha.
+ * @deprecated use AVFrame or imgutils functions instead
+ */
+typedef struct AVPicture {
+    attribute_deprecated
+    uint8_t *data[AV_NUM_DATA_POINTERS];    ///< pointers to the image data planes
+    attribute_deprecated
+    int linesize[AV_NUM_DATA_POINTERS];     ///< number of bytes per line
+} AVPicture;
+
+/**
+ * @}
+ */
+#endif
+
+enum AVSubtitleType {
+    SUBTITLE_NONE,
+
+    SUBTITLE_BITMAP,                ///< A bitmap, pict will be set (NOTE(review): AVSubtitleRect.pict is marked deprecated; the bitmap is documented under data/linesize - confirm)
+
+    /**
+     * Plain text, the text field must be set by the decoder and is
+     * authoritative. ass and pict fields may contain approximations.
+     */
+    SUBTITLE_TEXT,
+
+    /**
+     * Formatted text, the ass field must be set by the decoder and is
+     * authoritative. pict and text fields may contain approximations.
+     */
+    SUBTITLE_ASS,
+};
+
+#define AV_SUBTITLE_FLAG_FORCED 0x00000001 /* NOTE(review): presumably a bit for AVSubtitleRect.flags - confirm */
+
+typedef struct AVSubtitleRect {
+    int x;         ///< x coordinate of the top left corner of pict, undefined when pict is not set
+    int y;         ///< y coordinate of the top left corner of pict, undefined when pict is not set
+    int w;         ///< width            of pict, undefined when pict is not set
+    int h;         ///< height           of pict, undefined when pict is not set
+    int nb_colors; ///< number of colors in pict, undefined when pict is not set
+
+#if FF_API_AVPICTURE
+    /**
+     * @deprecated unused
+     */
+    attribute_deprecated
+    AVPicture pict;
+#endif
+    /**
+     * data+linesize for the bitmap of this subtitle.
+     * Can be set for text/ass as well once they are rendered.
+     */
+    uint8_t *data[4];
+    int linesize[4];
+
+    enum AVSubtitleType type;
+
+    char *text;                     ///< 0 terminated plain UTF-8 text
+
+    /**
+     * 0 terminated ASS/SSA compatible event line.
+     * The presentation of this is unaffected by the other values in this
+     * struct.
+     */
+    char *ass;
+
+    int flags; ///< NOTE(review): presumably a combination of AV_SUBTITLE_FLAG_* bits - confirm
+} AVSubtitleRect;
+
+typedef struct AVSubtitle {
+    uint16_t format; /* 0 = graphics */
+    uint32_t start_display_time; /* relative to packet pts, in ms */
+    uint32_t end_display_time; /* relative to packet pts, in ms */
+    unsigned num_rects; /* NOTE(review): presumably the number of entries in rects - confirm */
+    AVSubtitleRect **rects;
+    int64_t pts;    ///< Same as packet pts, in AV_TIME_BASE
+} AVSubtitle;
+
+/**
+ * This struct describes the properties of an encoded stream.
+ *
+ * sizeof(AVCodecParameters) is not a part of the public ABI, this struct must
+ * be allocated with avcodec_parameters_alloc() and freed with
+ * avcodec_parameters_free().
+ */
+typedef struct AVCodecParameters {
+    /**
+     * General type of the encoded data.
+     */
+    enum AVMediaType codec_type;
+    /**
+     * Specific type of the encoded data (the codec used).
+     */
+    enum AVCodecID   codec_id;
+    /**
+     * Additional information about the codec (corresponds to the AVI FOURCC).
+     */
+    uint32_t         codec_tag;
+
+    /**
+     * Extra binary data needed for initializing the decoder, codec-dependent.
+     *
+     * Must be allocated with av_malloc() and will be freed by
+     * avcodec_parameters_free(). The allocated size of extradata must be at
+     * least extradata_size + AV_INPUT_BUFFER_PADDING_SIZE, with the padding
+     * bytes zeroed.
+     */
+    uint8_t *extradata;
+    /**
+     * Size of the extradata content in bytes.
+     */
+    int      extradata_size;
+
+    /**
+     * - video: the pixel format, the value corresponds to enum AVPixelFormat.
+     * - audio: the sample format, the value corresponds to enum AVSampleFormat.
+     */
+    int format;
+
+    /**
+     * The average bitrate of the encoded data (in bits per second).
+     */
+    int64_t bit_rate;
+
+    /**
+     * The number of bits per sample in the coded words.
+     *
+     * This is basically the bitrate per sample. It is mandatory for a bunch of
+     * formats to actually decode them. It's the number of bits for one sample in
+     * the actual coded bitstream.
+     *
+     * This could be for example 4 for ADPCM
+     * For PCM formats this matches bits_per_raw_sample
+     * Can be 0
+     */
+    int bits_per_coded_sample;
+
+    /**
+     * This is the number of valid bits in each output sample. If the
+     * sample format has more bits, the least significant bits are additional
+     * padding bits, which are always 0. Use right shifts to reduce the sample
+     * to its actual size. For example, audio formats with 24 bit samples will
+     * have bits_per_raw_sample set to 24, and format set to AV_SAMPLE_FMT_S32.
+     * To get the original sample use "(int32_t)sample >> 8".
+     *
+     * For ADPCM this might be 12 or 16 or similar
+     * Can be 0
+     */
+    int bits_per_raw_sample;
+
+    /**
+     * Codec-specific bitstream restrictions that the stream conforms to.
+     */
+    int profile;
+    int level;
+
+    /**
+     * Video only. The dimensions of the video frame in pixels.
+     */
+    int width;
+    int height;
+
+    /**
+     * Video only. The aspect ratio (width / height) which a single pixel
+     * should have when displayed.
+     *
+     * When the aspect ratio is unknown / undefined, the numerator should be
+     * set to 0 (the denominator may have any value).
+     */
+    AVRational sample_aspect_ratio;
+
+    /**
+     * Video only. The order of the fields in interlaced video.
+     */
+    enum AVFieldOrder                  field_order;
+
+    /**
+     * Video only. Additional colorspace characteristics.
+     */
+    enum AVColorRange                  color_range;
+    enum AVColorPrimaries              color_primaries;
+    enum AVColorTransferCharacteristic color_trc;
+    enum AVColorSpace                  color_space;
+    enum AVChromaLocation              chroma_location;
+
+    /**
+     * Video only. Number of delayed frames.
+     */
+    int video_delay;
+
+    /**
+     * Audio only. The channel layout bitmask. May be 0 if the channel layout is
+     * unknown or unspecified, otherwise the number of bits set must be equal to
+     * the channels field.
+     */
+    uint64_t channel_layout;
+    /**
+     * Audio only. The number of audio channels.
+     */
+    int      channels;
+    /**
+     * Audio only. The number of audio samples per second.
+     */
+    int      sample_rate;
+    /**
+     * Audio only. The number of bytes per coded audio frame, required by some
+     * formats.
+     *
+     * Corresponds to nBlockAlign in WAVEFORMATEX.
+     */
+    int      block_align;
+    /**
+     * Audio only. Audio frame size, if known. Required by some formats to be static.
+     */
+    int      frame_size;
+
+    /**
+     * Audio only. The amount of padding (in samples) inserted by the encoder at
+     * the beginning of the audio. I.e. this number of leading decoded samples
+     * must be discarded by the caller to get the original audio without leading
+     * padding.
+     */
+    int initial_padding;
+    /**
+     * Audio only. The amount of padding (in samples) appended by the encoder to
+     * the end of the audio. I.e. this number of decoded samples must be
+     * discarded by the caller from the end of the stream to get the original
+     * audio without any trailing padding.
+     */
+    int trailing_padding;
+    /**
+     * Audio only. Number of samples to skip after a discontinuity.
+     */
+    int seek_preroll;
+} AVCodecParameters;
+
+/**
+ * If c is NULL, returns the first registered codec,
+ * if c is non-NULL, returns the next registered codec after c,
+ * or NULL if c is the last one.
+ */
+AVCodec *av_codec_next(const AVCodec *c);
+
+/**
+ * Return the LIBAVCODEC_VERSION_INT constant.
+ */
+unsigned avcodec_version(void);
+
+/**
+ * Return the libavcodec build-time configuration.
+ */
+const char *avcodec_configuration(void);
+
+/**
+ * Return the libavcodec license.
+ */
+const char *avcodec_license(void);
+
+/**
+ * Register the given codec and initialize libavcodec.
+ *
+ * @warning either this function or avcodec_register_all() must be called
+ * before any other libavcodec functions.
+ *
+ * @see avcodec_register_all()
+ */
+void avcodec_register(AVCodec *codec);
+
+/**
+ * Register all the codecs, parsers and bitstream filters which were enabled at
+ * configuration time. If you do not call this function you can select exactly
+ * which formats you want to support, by using the individual registration
+ * functions.
+ *
+ * @see avcodec_register
+ * @see av_register_codec_parser
+ * @see av_register_bitstream_filter
+ */
+void avcodec_register_all(void);
+
+/**
+ * Allocate an AVCodecContext and set its fields to default values. The
+ * resulting struct should be freed with avcodec_free_context().
+ *
+ * @param codec if non-NULL, allocate private data and initialize defaults
+ *              for the given codec. It is illegal to then call avcodec_open2()
+ *              with a different codec.
+ *              If NULL, then the codec-specific defaults won't be initialized,
+ *              which may result in suboptimal default settings (this is
+ *              important mainly for encoders, e.g. libx264).
+ *
+ * @return An AVCodecContext filled with default values or NULL on failure.
+ */
+AVCodecContext *avcodec_alloc_context3(const AVCodec *codec);
+
+/**
+ * Free the codec context and everything associated with it and write NULL to
+ * the provided pointer.
+ */
+void avcodec_free_context(AVCodecContext **avctx);
+
+#if FF_API_GET_CONTEXT_DEFAULTS
+/**
+ * @deprecated This function should not be used, as closing and opening a codec
+ * context multiple times is not supported. A new codec context should be
+ * allocated for each new use.
+ */
+int avcodec_get_context_defaults3(AVCodecContext *s, const AVCodec *codec);
+#endif
+
+/**
+ * Get the AVClass for AVCodecContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_class(void);
+
+#if FF_API_COPY_CONTEXT /* NOTE(review): this guard also wraps the two class getters below, not only avcodec_copy_context() - confirm that is intended */
+/**
+ * Get the AVClass for AVFrame. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_frame_class(void);
+
+/**
+ * Get the AVClass for AVSubtitleRect. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avcodec_get_subtitle_rect_class(void);
+
+/**
+ * Copy the settings of the source AVCodecContext into the destination
+ * AVCodecContext. The resulting destination codec context will be
+ * unopened, i.e. you are required to call avcodec_open2() before you
+ * can use this AVCodecContext to decode/encode video/audio data.
+ *
+ * @param dest target codec context, should be initialized with
+ *             avcodec_alloc_context3(NULL), but otherwise uninitialized
+ * @param src source codec context
+ * @return AVERROR() on error (e.g. memory allocation error), 0 on success
+ *
+ * @deprecated The semantics of this function are ill-defined and it should not
+ * be used. If you need to transfer the stream parameters from one codec context
+ * to another, use an intermediate AVCodecParameters instance and the
+ * avcodec_parameters_from_context() / avcodec_parameters_to_context()
+ * functions.
+ */
+attribute_deprecated
+int avcodec_copy_context(AVCodecContext *dest, const AVCodecContext *src);
+#endif /* FF_API_COPY_CONTEXT */
+
+/**
+ * Allocate a new AVCodecParameters and set its fields to default values
+ * (unknown/invalid/0). The returned struct must be freed with
+ * avcodec_parameters_free().
+ */
+AVCodecParameters *avcodec_parameters_alloc(void);
+
+/**
+ * Free an AVCodecParameters instance and everything associated with it and
+ * write NULL to the supplied pointer.
+ */
+void avcodec_parameters_free(AVCodecParameters **par);
+
+/**
+ * Copy the contents of src to dst. Any allocated fields in dst are freed and
+ * replaced with newly allocated duplicates of the corresponding fields in src.
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure.
+ */
+int avcodec_parameters_copy(AVCodecParameters *dst, const AVCodecParameters *src);
+
+/**
+ * Fill the parameters struct based on the values from the supplied codec
+ * context. Any allocated fields in par are freed and replaced with duplicates
+ * of the corresponding fields in codec.
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure.
+ */
+int avcodec_parameters_from_context(AVCodecParameters *par,
+                                    const AVCodecContext *codec);
+
+/**
+ * Fill the codec context based on the values from the supplied codec
+ * parameters. Any allocated fields in codec that have a corresponding field in
+ * par are freed and replaced with duplicates of the corresponding field in par.
+ * Fields in codec that do not have a counterpart in par are not touched.
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure.
+ */
+int avcodec_parameters_to_context(AVCodecContext *codec,
+                                  const AVCodecParameters *par);
+
+/**
+ * Initialize the AVCodecContext to use the given AVCodec. Prior to using this
+ * function the context has to be allocated with avcodec_alloc_context3().
+ *
+ * The functions avcodec_find_decoder_by_name(), avcodec_find_encoder_by_name(),
+ * avcodec_find_decoder() and avcodec_find_encoder() provide an easy way for
+ * retrieving a codec.
+ *
+ * @warning This function is not thread safe!
+ *
+ * @note Always call this function before using decoding routines (such as
+ * @ref avcodec_receive_frame()).
+ *
+ * @code
+ * avcodec_register_all();
+ * av_dict_set(&opts, "b", "2.5M", 0);
+ * codec = avcodec_find_decoder(AV_CODEC_ID_H264);
+ * if (!codec)
+ *     exit(1);
+ *
+ * context = avcodec_alloc_context3(codec);
+ *
+ * if (avcodec_open2(context, codec, &opts) < 0)
+ *     exit(1);
+ * @endcode
+ *
+ * @param avctx The context to initialize.
+ * @param codec The codec to open this context for. If a non-NULL codec has been
+ *              previously passed to avcodec_alloc_context3() for this
+ *              context, then this parameter MUST be either NULL or
+ *              equal to the previously passed codec.
+ * @param options A dictionary filled with AVCodecContext and codec-private options.
+ *                On return this object will be filled with options that were not found.
+ *
+ * @return zero on success, a negative value on error
+ * @see avcodec_alloc_context3(), avcodec_find_decoder(), avcodec_find_encoder(),
+ *      av_dict_set(), av_opt_find().
+ */
+int avcodec_open2(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options);
+
+/**
+ * Close a given AVCodecContext and free all the data associated with it
+ * (but not the AVCodecContext itself).
+ *
+ * Calling this function on an AVCodecContext that hasn't been opened will free
+ * the codec-specific data allocated in avcodec_alloc_context3() with a non-NULL
+ * codec. Subsequent calls will do nothing.
+ *
+ * @note Do not use this function. Use avcodec_free_context() to destroy a
+ * codec context (either open or closed). Opening and closing a codec context
+ * multiple times is not supported anymore -- use multiple codec contexts
+ * instead.
+ */
+int avcodec_close(AVCodecContext *avctx);
+
+/**
+ * Free all allocated data in the given subtitle struct.
+ *
+ * @param sub AVSubtitle to free.
+ */
+void avsubtitle_free(AVSubtitle *sub);
+
+/**
+ * @}
+ */
+
+/**
+ * @addtogroup lavc_packet
+ * @{
+ */
+
+/**
+ * Allocate an AVPacket and set its fields to default values.  The resulting
+ * struct must be freed using av_packet_free().
+ *
+ * @return An AVPacket filled with default values or NULL on failure.
+ *
+ * @note this only allocates the AVPacket itself, not the data buffers. Those
+ * must be allocated through other means such as av_new_packet.
+ *
+ * @see av_new_packet
+ */
+AVPacket *av_packet_alloc(void);
+
+/**
+ * Create a new packet that references the same data as src.
+ *
+ * This is a shortcut for av_packet_alloc()+av_packet_ref().
+ *
+ * @return newly created AVPacket on success, NULL on error.
+ *
+ * @see av_packet_alloc
+ * @see av_packet_ref
+ */
+AVPacket *av_packet_clone(const AVPacket *src);
+
+/**
+ * Free the packet. If the packet is reference counted, it will be
+ * unreferenced first.
+ *
+ * @param pkt packet to be freed. The pointer will be set to NULL.
+ * @note passing NULL is a no-op.
+ */
+void av_packet_free(AVPacket **pkt);
+
+/**
+ * Initialize optional fields of a packet with default values.
+ *
+ * Note, this does not touch the data and size members, which have to be
+ * initialized separately.
+ *
+ * @param pkt packet
+ */
+void av_init_packet(AVPacket *pkt);
+
+/**
+ * Allocate the payload of a packet and initialize its fields with
+ * default values.
+ *
+ * @param pkt packet
+ * @param size wanted payload size
+ * @return 0 if OK, AVERROR_xxx otherwise
+ */
+int av_new_packet(AVPacket *pkt, int size);
+
+/**
+ * Reduce packet size, correctly zeroing padding
+ *
+ * @param pkt packet
+ * @param size new size
+ */
+void av_shrink_packet(AVPacket *pkt, int size);
+
+/**
+ * Increase packet size, correctly zeroing padding
+ *
+ * @param pkt packet
+ * @param grow_by number of bytes by which to increase the size of the packet
+ */
+int av_grow_packet(AVPacket *pkt, int grow_by);
+
+/**
+ * Initialize a reference-counted packet from av_malloc()ed data.
+ *
+ * @param pkt packet to be initialized. This function will set the data, size,
+ *        buf and destruct fields, all others are left untouched.
+ * @param data Data allocated by av_malloc() to be used as packet data. If this
+ *        function returns successfully, the data is owned by the underlying AVBuffer.
+ *        The caller may not access the data through other means.
+ * @param size size of data in bytes, without the padding. I.e. the full buffer
+ *        size is assumed to be size + AV_INPUT_BUFFER_PADDING_SIZE.
+ *
+ * @return 0 on success, a negative AVERROR on error
+ */
+int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size);
+
+#if FF_API_AVPACKET_OLD_API
+/**
+ * @warning This is a hack - the packet memory allocation stuff is broken. The
+ * packet is allocated if it was not really allocated.
+ *
+ * @deprecated Use av_packet_ref
+ */
+attribute_deprecated
+int av_dup_packet(AVPacket *pkt);
+/**
+ * Copy packet, including contents
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Copy packet side data
+ *
+ * @return 0 on success, negative AVERROR on fail
+ */
+int av_copy_packet_side_data(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Free a packet.
+ *
+ * @deprecated Use av_packet_unref
+ *
+ * @param pkt packet to free
+ */
+attribute_deprecated
+void av_free_packet(AVPacket *pkt);
+#endif
+/**
+ * Allocate new information of a packet.
+ *
+ * @param pkt packet
+ * @param type side information type
+ * @param size side information size
+ * @return pointer to fresh allocated data or NULL otherwise
+ */
+uint8_t* av_packet_new_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
+                                 int size);
+
+/**
+ * Wrap an existing array as a packet side data.
+ *
+ * @param pkt packet
+ * @param type side information type
+ * @param data the side data array. It must be allocated with the av_malloc()
+ *             family of functions. The ownership of the data is transferred to
+ *             pkt.
+ * @param size side information size
+ * @return a non-negative number on success, a negative AVERROR code on
+ *         failure. On failure, the packet is unchanged and the data remains
+ *         owned by the caller.
+ */
+int av_packet_add_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
+                            uint8_t *data, size_t size);
+
+/**
+ * Shrink the already allocated side data buffer
+ *
+ * @param pkt packet
+ * @param type side information type
+ * @param size new side information size
+ * @return 0 on success, < 0 on failure
+ */
+int av_packet_shrink_side_data(AVPacket *pkt, enum AVPacketSideDataType type,
+                               int size);
+
+/**
+ * Get side information from packet.
+ *
+ * @param pkt packet
+ * @param type desired side information type
+ * @param size pointer for side information size to store (optional)
+ * @return pointer to data if present or NULL otherwise
+ */
+uint8_t* av_packet_get_side_data(const AVPacket *pkt, enum AVPacketSideDataType type,
+                                 int *size);
+
+#if FF_API_MERGE_SD_API
+attribute_deprecated
+int av_packet_merge_side_data(AVPacket *pkt);
+
+attribute_deprecated
+int av_packet_split_side_data(AVPacket *pkt);
+#endif
+
+const char *av_packet_side_data_name(enum AVPacketSideDataType type);
+
+/**
+ * Pack a dictionary for use in side_data.
+ *
+ * @param dict The dictionary to pack.
+ * @param size pointer to store the size of the returned data
+ * @return pointer to data if successful, NULL otherwise
+ */
+uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size);
+/**
+ * Unpack a dictionary from side_data.
+ *
+ * @param data data from side_data
+ * @param size size of the data
+ * @param dict the metadata storage dictionary
+ * @return 0 on success, < 0 on failure
+ */
+int av_packet_unpack_dictionary(const uint8_t *data, int size, AVDictionary **dict);
+
+
+/**
+ * Convenience function to free all the side data stored.
+ * All the other fields stay untouched.
+ *
+ * @param pkt packet
+ */
+void av_packet_free_side_data(AVPacket *pkt);
+
+/**
+ * Setup a new reference to the data described by a given packet
+ *
+ * If src is reference-counted, setup dst as a new reference to the
+ * buffer in src. Otherwise allocate a new buffer in dst and copy the
+ * data from src into it.
+ *
+ * All the other fields are copied from src.
+ *
+ * @see av_packet_unref
+ *
+ * @param dst Destination packet
+ * @param src Source packet
+ *
+ * @return 0 on success, a negative AVERROR on error.
+ */
+int av_packet_ref(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Wipe the packet.
+ *
+ * Unreference the buffer referenced by the packet and reset the
+ * remaining packet fields to their default values.
+ *
+ * @param pkt The packet to be unreferenced.
+ */
+void av_packet_unref(AVPacket *pkt);
+
+/**
+ * Move every field in src to dst and reset src.
+ *
+ * @see av_packet_unref
+ *
+ * @param src Source packet, will be reset
+ * @param dst Destination packet
+ */
+void av_packet_move_ref(AVPacket *dst, AVPacket *src);
+
+/**
+ * Copy only "properties" fields from src to dst.
+ *
+ * Properties for the purpose of this function are all the fields
+ * beside those related to the packet data (buf, data, size)
+ *
+ * @param dst Destination packet
+ * @param src Source packet
+ *
+ * @return 0 on success, a negative AVERROR on failure.
+ */
+int av_packet_copy_props(AVPacket *dst, const AVPacket *src);
+
+/**
+ * Convert valid timing fields (timestamps / durations) in a packet from one
+ * timebase to another. Timestamps with unknown values (AV_NOPTS_VALUE) will be
+ * ignored.
+ *
+ * @param pkt packet on which the conversion will be performed
+ * @param tb_src source timebase, in which the timing fields in pkt are
+ *               expressed
+ * @param tb_dst destination timebase, to which the timing fields will be
+ *               converted
+ */
+void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst);
+
+/**
+ * @}
+ */
+
+/**
+ * @addtogroup lavc_decoding
+ * @{
+ */
+
+/**
+ * Find a registered decoder with a matching codec ID.
+ *
+ * @param id AVCodecID of the requested decoder
+ * @return A decoder if one was found, NULL otherwise.
+ */
+AVCodec *avcodec_find_decoder(enum AVCodecID id);
+
+/**
+ * Find a registered decoder with the specified name.
+ *
+ * @param name name of the requested decoder
+ * @return A decoder if one was found, NULL otherwise.
+ */
+AVCodec *avcodec_find_decoder_by_name(const char *name);
+
+/**
+ * The default callback for AVCodecContext.get_buffer2(). It is made public so
+ * it can be called by custom get_buffer2() implementations for decoders without
+ * AV_CODEC_CAP_DR1 set.
+ */
+int avcodec_default_get_buffer2(AVCodecContext *s, AVFrame *frame, int flags);
+
+#if FF_API_EMU_EDGE
+/**
+ * Return the amount of padding in pixels which the get_buffer callback must
+ * provide around the edge of the image for codecs which do not have the
+ * CODEC_FLAG_EMU_EDGE flag.
+ *
+ * @return Required padding in pixels.
+ *
+ * @deprecated CODEC_FLAG_EMU_EDGE is deprecated, so this function is no longer
+ * needed
+ */
+attribute_deprecated
+unsigned avcodec_get_edge_width(void);
+#endif
+
+/**
+ * Modify width and height values so that they will result in a memory
+ * buffer that is acceptable for the codec if you do not use any horizontal
+ * padding.
+ *
+ * May only be used if a codec with AV_CODEC_CAP_DR1 has been opened.
+ */
+void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height);
+
+/**
+ * Modify width and height values so that they will result in a memory
+ * buffer that is acceptable for the codec if you also ensure that all
+ * line sizes are a multiple of the respective linesize_align[i].
+ *
+ * May only be used if a codec with AV_CODEC_CAP_DR1 has been opened.
+ */
+void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
+                               int linesize_align[AV_NUM_DATA_POINTERS]);
+
+/**
+ * Converts AVChromaLocation to swscale x/y chroma position.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+int avcodec_enum_to_chroma_pos(int *xpos, int *ypos, enum AVChromaLocation pos);
+
+/**
+ * Converts swscale x/y chroma position to AVChromaLocation.
+ *
+ * The positions represent the chroma (0,0) position in a coordinates system
+ * with luma (0,0) representing the origin and luma(1,1) representing 256,256
+ *
+ * @param xpos  horizontal chroma sample position
+ * @param ypos  vertical   chroma sample position
+ */
+enum AVChromaLocation avcodec_chroma_pos_to_enum(int xpos, int ypos);
+
+/**
+ * Decode the audio frame of size avpkt->size from avpkt->data into frame.
+ *
+ * Some decoders may support multiple frames in a single AVPacket. Such
+ * decoders would then just decode the first frame and the return value would be
+ * less than the packet size. In this case, avcodec_decode_audio4 has to be
+ * called again with an AVPacket containing the remaining data in order to
+ * decode the second frame, etc...  Even if no frames are returned, the packet
+ * needs to be fed to the decoder with remaining data until it is completely
+ * consumed or an error occurs.
+ *
+ * Some decoders (those marked with AV_CODEC_CAP_DELAY) have a delay between input
+ * and output. This means that for some packets they will not immediately
+ * produce decoded output and need to be flushed at the end of decoding to get
+ * all the decoded data. Flushing is done by calling this function with packets
+ * with avpkt->data set to NULL and avpkt->size set to 0 until it stops
+ * returning samples. It is safe to flush even those decoders that are not
+ * marked with AV_CODEC_CAP_DELAY, then no samples will be returned.
+ *
+ * @warning The input buffer, avpkt->data must be AV_INPUT_BUFFER_PADDING_SIZE
+ *          larger than the actual read bytes because some optimized bitstream
+ *          readers read 32 or 64 bits at once and could read over the end.
+ *
+ * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
+ * before packets may be fed to the decoder.
+ *
+ * @param      avctx the codec context
+ * @param[out] frame The AVFrame in which to store decoded audio samples.
+ *                   The decoder will allocate a buffer for the decoded frame by
+ *                   calling the AVCodecContext.get_buffer2() callback.
+ *                   When AVCodecContext.refcounted_frames is set to 1, the frame is
+ *                   reference counted and the returned reference belongs to the
+ *                   caller. The caller must release the frame using av_frame_unref()
+ *                   when the frame is no longer needed. The caller may safely write
+ *                   to the frame if av_frame_is_writable() returns 1.
+ *                   When AVCodecContext.refcounted_frames is set to 0, the returned
+ *                   reference belongs to the decoder and is valid only until the
+ *                   next call to this function or until closing or flushing the
+ *                   decoder. The caller may not write to it.
+ * @param[out] got_frame_ptr Zero if no frame could be decoded, otherwise it is
+ *                           non-zero. Note that this field being set to zero
+ *                           does not mean that an error has occurred. For
+ *                           decoders with AV_CODEC_CAP_DELAY set, no given decode
+ *                           call is guaranteed to produce a frame.
+ * @param[in]  avpkt The input AVPacket containing the input buffer.
+ *                   At least avpkt->data and avpkt->size should be set. Some
+ *                   decoders might also require additional fields to be set.
+ * @return A negative error code is returned if an error occurred during
+ *         decoding, otherwise the number of bytes consumed from the input
+ *         AVPacket is returned.
+ *
+ * @deprecated Use avcodec_send_packet() and avcodec_receive_frame().
+ */
+attribute_deprecated
+int avcodec_decode_audio4(AVCodecContext *avctx, AVFrame *frame,
+                          int *got_frame_ptr, const AVPacket *avpkt);
+
+/**
+ * Decode the video frame of size avpkt->size from avpkt->data into picture.
+ * Some decoders may support multiple frames in a single AVPacket, such
+ * decoders would then just decode the first frame.
+ *
+ * @warning The input buffer must be AV_INPUT_BUFFER_PADDING_SIZE larger than
+ * the actual read bytes because some optimized bitstream readers read 32 or 64
+ * bits at once and could read over the end.
+ *
+ * @warning The end of the input buffer buf should be set to 0 to ensure that
+ * no overreading happens for damaged MPEG streams.
+ *
+ * @note Codecs which have the AV_CODEC_CAP_DELAY capability set have a delay
+ * between input and output, these need to be fed with avpkt->data=NULL,
+ * avpkt->size=0 at the end to return the remaining frames.
+ *
+ * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
+ * before packets may be fed to the decoder.
+ *
+ * @param avctx the codec context
+ * @param[out] picture The AVFrame in which the decoded video frame will be stored.
+ *             Use av_frame_alloc() to get an AVFrame. The codec will
+ *             allocate memory for the actual bitmap by calling the
+ *             AVCodecContext.get_buffer2() callback.
+ *             When AVCodecContext.refcounted_frames is set to 1, the frame is
+ *             reference counted and the returned reference belongs to the
+ *             caller. The caller must release the frame using av_frame_unref()
+ *             when the frame is no longer needed. The caller may safely write
+ *             to the frame if av_frame_is_writable() returns 1.
+ *             When AVCodecContext.refcounted_frames is set to 0, the returned
+ *             reference belongs to the decoder and is valid only until the
+ *             next call to this function or until closing or flushing the
+ *             decoder. The caller may not write to it.
+ *
+ * @param[in] avpkt The input AVPacket containing the input buffer.
+ *            You can create such packet with av_init_packet() and by then setting
+ *            data and size, some decoders might in addition need other fields like
+ *            flags&AV_PKT_FLAG_KEY. All decoders are designed to use the least
+ *            fields possible.
+ * @param[in,out] got_picture_ptr Zero if no frame could be decompressed, otherwise, it is nonzero.
+ * @return On error a negative value is returned, otherwise the number of bytes
+ * used or zero if no frame could be decompressed.
+ *
+ * @deprecated Use avcodec_send_packet() and avcodec_receive_frame().
+ */
+attribute_deprecated
+int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
+                         int *got_picture_ptr,
+                         const AVPacket *avpkt);
+
+/**
+ * Decode a subtitle message.
+ * Return a negative value on error, otherwise return the number of bytes used.
+ * If no subtitle could be decompressed, got_sub_ptr is zero.
+ * Otherwise, the subtitle is stored in *sub.
+ * Note that AV_CODEC_CAP_DR1 is not available for subtitle codecs. This is for
+ * simplicity, because the performance difference is expected to be negligible
+ * and reusing a get_buffer written for video codecs would probably perform badly
+ * due to a potentially very different allocation pattern.
+ *
+ * Some decoders (those marked with AV_CODEC_CAP_DELAY) have a delay between input
+ * and output. This means that for some packets they will not immediately
+ * produce decoded output and need to be flushed at the end of decoding to get
+ * all the decoded data. Flushing is done by calling this function with packets
+ * with avpkt->data set to NULL and avpkt->size set to 0 until it stops
+ * returning subtitles. It is safe to flush even those decoders that are not
+ * marked with AV_CODEC_CAP_DELAY, then no subtitles will be returned.
+ *
+ * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
+ * before packets may be fed to the decoder.
+ *
+ * @param avctx the codec context
+ * @param[out] sub The preallocated AVSubtitle in which the decoded subtitle will be stored,
+ *                 must be freed with avsubtitle_free if *got_sub_ptr is set.
+ * @param[in,out] got_sub_ptr Zero if no subtitle could be decompressed, otherwise, it is nonzero.
+ * @param[in] avpkt The input AVPacket containing the input buffer.
+ */
+int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
+                            int *got_sub_ptr,
+                            AVPacket *avpkt);
+
+/**
+ * Supply raw packet data as input to a decoder.
+ *
+ * Internally, this call will copy relevant AVCodecContext fields, which can
+ * influence decoding per-packet, and apply them when the packet is actually
+ * decoded. (For example AVCodecContext.skip_frame, which might direct the
+ * decoder to drop the frame contained by the packet sent with this function.)
+ *
+ * @warning The input buffer, avpkt->data must be AV_INPUT_BUFFER_PADDING_SIZE
+ *          larger than the actual read bytes because some optimized bitstream
+ *          readers read 32 or 64 bits at once and could read over the end.
+ *
+ * @warning Do not mix this API with the legacy API (like avcodec_decode_video2())
+ *          on the same AVCodecContext. It will return unexpected results now
+ *          or in future libavcodec versions.
+ *
+ * @note The AVCodecContext MUST have been opened with @ref avcodec_open2()
+ *       before packets may be fed to the decoder.
+ *
+ * @param avctx codec context
+ * @param[in] avpkt The input AVPacket. Usually, this will be a single video
+ *                  frame, or several complete audio frames.
+ *                  Ownership of the packet remains with the caller, and the
+ *                  decoder will not write to the packet. The decoder may create
+ *                  a reference to the packet data (or copy it if the packet is
+ *                  not reference-counted).
+ *                  Unlike with older APIs, the packet is always fully consumed,
+ *                  and if it contains multiple frames (e.g. some audio codecs),
+ *                  will require you to call avcodec_receive_frame() multiple
+ *                  times afterwards before you can send a new packet.
+ *                  It can be NULL (or an AVPacket with data set to NULL and
+ *                  size set to 0); in this case, it is considered a flush
+ *                  packet, which signals the end of the stream. Sending the
+ *                  first flush packet will return success. Subsequent ones are
+ *                  unnecessary and will return AVERROR_EOF. If the decoder
+ *                  still has frames buffered, it will return them after sending
+ *                  a flush packet.
+ *
+ * @return 0 on success, otherwise negative error code:
+ *      AVERROR(EAGAIN):   input is not accepted in the current state - user
+ *                         must read output with avcodec_receive_frame() (once
+ *                         all output is read, the packet should be resent, and
+ *                         the call will not fail with EAGAIN).
+ *      AVERROR_EOF:       the decoder has been flushed, and no new packets can
+ *                         be sent to it (also returned if more than 1 flush
+ *                         packet is sent)
+ *      AVERROR(EINVAL):   codec not opened, it is an encoder, or requires flush
+ *      AVERROR(ENOMEM):   failed to add packet to internal queue, or similar
+ *      other errors: legitimate decoding errors
+ */
+int avcodec_send_packet(AVCodecContext *avctx, const AVPacket *avpkt);
+
+/**
+ * Return decoded output data from a decoder.
+ *
+ * @param avctx codec context
+ * @param frame This will be set to a reference-counted video or audio
+ *              frame (depending on the decoder type) allocated by the
+ *              decoder. Note that the function will always call
+ *              av_frame_unref(frame) before doing anything else.
+ *
+ * @return
+ *      0:                 success, a frame was returned
+ *      AVERROR(EAGAIN):   output is not available in this state - user must try
+ *                         to send new input
+ *      AVERROR_EOF:       the decoder has been fully flushed, and there will be
+ *                         no more output frames
+ *      AVERROR(EINVAL):   codec not opened, or it is an encoder
+ *      other negative values: legitimate decoding errors
+ */
+int avcodec_receive_frame(AVCodecContext *avctx, AVFrame *frame);
+
+/**
+ * Supply a raw video or audio frame to the encoder. Use avcodec_receive_packet()
+ * to retrieve buffered output packets.
+ *
+ * @param avctx     codec context
+ * @param[in] frame AVFrame containing the raw audio or video frame to be encoded.
+ *                  Ownership of the frame remains with the caller, and the
+ *                  encoder will not write to the frame. The encoder may create
+ *                  a reference to the frame data (or copy it if the frame is
+ *                  not reference-counted).
+ *                  It can be NULL, in which case it is considered a flush
+ *                  packet.  This signals the end of the stream. If the encoder
+ *                  still has packets buffered, it will return them after this
+ *                  call. Once flushing mode has been entered, additional flush
+ *                  packets are ignored, and sending frames will return
+ *                  AVERROR_EOF.
+ *
+ *                  For audio:
+ *                  If AV_CODEC_CAP_VARIABLE_FRAME_SIZE is set, then each frame
+ *                  can have any number of samples.
+ *                  If it is not set, frame->nb_samples must be equal to
+ *                  avctx->frame_size for all frames except the last.
+ *                  The final frame may be smaller than avctx->frame_size.
+ * @return 0 on success, otherwise negative error code:
+ *      AVERROR(EAGAIN):   input is not accepted in the current state - user
+ *                         must read output with avcodec_receive_packet() (once
+ *                         all output is read, the packet should be resent, and
+ *                         the call will not fail with EAGAIN).
+ *      AVERROR_EOF:       the encoder has been flushed, and no new frames can
+ *                         be sent to it
+ *      AVERROR(EINVAL):   codec not opened, refcounted_frames not set, it is a
+ *                         decoder, or requires flush
+ *      AVERROR(ENOMEM):   failed to add packet to internal queue, or similar
+ *      other errors: legitimate encoding errors
+ */
+int avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame);
+
+/**
+ * Read encoded data from the encoder.
+ *
+ * @param avctx codec context
+ * @param avpkt This will be set to a reference-counted packet allocated by the
+ *              encoder. Note that the function will always call
+ *              av_packet_unref(avpkt) before doing anything else.
+ * @return 0 on success, otherwise negative error code:
+ *      AVERROR(EAGAIN):   output is not available in the current state - user
+ *                         must try to send input
+ *      AVERROR_EOF:       the encoder has been fully flushed, and there will be
+ *                         no more output packets
+ *      AVERROR(EINVAL):   codec not opened, or it is a decoder
+ *      other errors: legitimate encoding errors
+ */
+int avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt);
+
+
+/**
+ * @defgroup lavc_parsing Frame parsing
+ * @{
+ */
+
+/**
+ * How a picture is coded: as a whole frame or as a single (top or bottom) field.
+ */
+enum AVPictureStructure {
+    AV_PICTURE_STRUCTURE_UNKNOWN,      ///< unknown
+    AV_PICTURE_STRUCTURE_TOP_FIELD,    ///< coded as top field
+    AV_PICTURE_STRUCTURE_BOTTOM_FIELD, ///< coded as bottom field
+    AV_PICTURE_STRUCTURE_FRAME,        ///< coded as frame
+};
+
+typedef struct AVCodecParserContext {
+    void *priv_data;
+    struct AVCodecParser *parser;
+    int64_t frame_offset; /* offset of the current frame */
+    int64_t cur_offset; /* current offset
+                           (incremented by each av_parser_parse()) */
+    int64_t next_frame_offset; /* offset of the next frame */
+    /* video info */
+    int pict_type; /* XXX: Put it back in AVCodecContext. */
+    /**
+     * This field is used for proper frame duration computation in lavf.
+     * It signals, how much longer the frame duration of the current frame
+     * is compared to normal frame duration.
+     *
+     * frame_duration = (1 + repeat_pict) * time_base
+     *
+     * It is used by codecs like H.264 to display telecined material.
+     */
+    int repeat_pict; /* XXX: Put it back in AVCodecContext. */
+    int64_t pts;     /* pts of the current frame */
+    int64_t dts;     /* dts of the current frame */
+
+    /* private data */
+    int64_t last_pts;
+    int64_t last_dts;
+    int fetch_timestamp;
+
+#define AV_PARSER_PTS_NB 4
+    int cur_frame_start_index;
+    int64_t cur_frame_offset[AV_PARSER_PTS_NB];
+    int64_t cur_frame_pts[AV_PARSER_PTS_NB];
+    int64_t cur_frame_dts[AV_PARSER_PTS_NB];
+
+    int flags;
+#define PARSER_FLAG_COMPLETE_FRAMES           0x0001
+#define PARSER_FLAG_ONCE                      0x0002
+/// Set if the parser has a valid file offset
+#define PARSER_FLAG_FETCHED_OFFSET            0x0004
+#define PARSER_FLAG_USE_CODEC_TS              0x1000
+
+    int64_t offset;      ///< byte offset from starting packet start
+    int64_t cur_frame_end[AV_PARSER_PTS_NB];
+
+    /**
+     * Set by parser to 1 for key frames and 0 for non-key frames.
+     * It is initialized to -1, so if the parser doesn't set this flag,
+     * old-style fallback using AV_PICTURE_TYPE_I picture type as key frames
+     * will be used.
+     */
+    int key_frame;
+
+#if FF_API_CONVERGENCE_DURATION
+    /**
+     * @deprecated unused
+     */
+    attribute_deprecated
+    int64_t convergence_duration;
+#endif
+
+    // Timestamp generation support:
+    /**
+     * Synchronization point for start of timestamp generation.
+     *
+     * Set to >0 for sync point, 0 for no sync point and <0 for undefined
+     * (default).
+     *
+     * For example, this corresponds to presence of H.264 buffering period
+     * SEI message.
+     */
+    int dts_sync_point;
+
+    /**
+     * Offset of the current timestamp against last timestamp sync point in
+     * units of AVCodecContext.time_base.
+     *
+     * Set to INT_MIN when dts_sync_point unused. Otherwise, it must
+     * contain a valid timestamp offset.
+     *
+     * Note that the timestamp of sync point has usually a nonzero
+     * dts_ref_dts_delta, which refers to the previous sync point. Offset of
+     * the next frame after timestamp sync point will be usually 1.
+     *
+     * For example, this corresponds to H.264 cpb_removal_delay.
+     */
+    int dts_ref_dts_delta;
+
+    /**
+     * Presentation delay of current frame in units of AVCodecContext.time_base.
+     *
+     * Set to INT_MIN when dts_sync_point unused. Otherwise, it must
+     * contain valid non-negative timestamp delta (presentation time of a frame
+     * must not lie in the past).
+     *
+     * This delay represents the difference between decoding and presentation
+     * time of the frame.
+     *
+     * For example, this corresponds to H.264 dpb_output_delay.
+     */
+    int pts_dts_delta;
+
+    /**
+     * Position of the packet in file.
+     *
+     * Analogous to cur_frame_pts/dts
+     */
+    int64_t cur_frame_pos[AV_PARSER_PTS_NB];
+
+    /**
+     * Byte position of currently parsed frame in stream.
+     */
+    int64_t pos;
+
+    /**
+     * Previous frame byte position.
+     */
+    int64_t last_pos;
+
+    /**
+     * Duration of the current frame.
+     * For audio, this is in units of 1 / AVCodecContext.sample_rate.
+     * For all other types, this is in units of AVCodecContext.time_base.
+     */
+    int duration;
+
+    enum AVFieldOrder field_order;
+
+    /**
+     * Indicate whether a picture is coded as a frame, top field or bottom field.
+     *
+     * For example, H.264 field_pic_flag equal to 0 corresponds to
+     * AV_PICTURE_STRUCTURE_FRAME. An H.264 picture with field_pic_flag
+     * equal to 1 and bottom_field_flag equal to 0 corresponds to
+     * AV_PICTURE_STRUCTURE_TOP_FIELD.
+     */
+    enum AVPictureStructure picture_structure;
+
+    /**
+     * Picture number incremented in presentation or output order.
+     * This field may be reinitialized at the first picture of a new sequence.
+     *
+     * For example, this corresponds to H.264 PicOrderCnt.
+     */
+    int output_picture_number;
+
+    /**
+     * Dimensions of the decoded video intended for presentation.
+     */
+    int width;
+    int height;
+
+    /**
+     * Dimensions of the coded video.
+     */
+    int coded_width;
+    int coded_height;
+
+    /**
+     * The format of the coded data, corresponds to enum AVPixelFormat for video
+     * and to enum AVSampleFormat for audio.
+     *
+     * Note that a decoder can have considerable freedom in how exactly it
+     * decodes the data, so the format reported here might be different from the
+     * one returned by a decoder.
+     */
+    int format;
+} AVCodecParserContext;
+
+/**
+ * Descriptor of a parser: the codec IDs it may be used for, the size of its
+ * private data, and the init/parse/close/split callbacks that implement it.
+ */
+typedef struct AVCodecParser {
+    int codec_ids[5]; /* several codec IDs are permitted */
+    int priv_data_size;
+    int (*parser_init)(AVCodecParserContext *s);
+    /* This callback never returns an error, a negative value means that
+     * the frame start was in a previous packet. */
+    int (*parser_parse)(AVCodecParserContext *s,
+                        AVCodecContext *avctx,
+                        const uint8_t **poutbuf, int *poutbuf_size,
+                        const uint8_t *buf, int buf_size);
+    void (*parser_close)(AVCodecParserContext *s);
+    int (*split)(AVCodecContext *avctx, const uint8_t *buf, int buf_size);
+    /* NOTE(review): presumably links to the next registered parser (see
+     * av_parser_next()) -- confirm against the implementation. */
+    struct AVCodecParser *next;
+} AVCodecParser;
+
+AVCodecParser *av_parser_next(const AVCodecParser *c);
+
+void av_register_codec_parser(AVCodecParser *parser);
+AVCodecParserContext *av_parser_init(int codec_id);
+
+/**
+ * Parse a packet.
+ *
+ * @param s             parser context.
+ * @param avctx         codec context.
+ * @param poutbuf       set to pointer to parsed buffer or NULL if not yet finished.
+ * @param poutbuf_size  set to size of parsed buffer or zero if not yet finished.
+ * @param buf           input buffer.
+ * @param buf_size      buffer size in bytes without the padding. I.e. the full buffer
+                        size is assumed to be buf_size + AV_INPUT_BUFFER_PADDING_SIZE.
+                        To signal EOF, this should be 0 (so that the last frame
+                        can be output).
+ * @param pts           input presentation timestamp.
+ * @param dts           input decoding timestamp.
+ * @param pos           input byte position in stream.
+ * @return the number of bytes of the input bitstream used.
+ *
+ * Example:
+ * @code
+ *   while(in_len){
+ *       len = av_parser_parse2(myparser, AVCodecContext, &data, &size,
+ *                                        in_data, in_len,
+ *                                        pts, dts, pos);
+ *       in_data += len;
+ *       in_len  -= len;
+ *
+ *       if(size)
+ *          decode_frame(data, size);
+ *   }
+ * @endcode
+ */
+int av_parser_parse2(AVCodecParserContext *s,
+                     AVCodecContext *avctx,
+                     uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size,
+                     int64_t pts, int64_t dts,
+                     int64_t pos);
+
+/**
+ * @return 0 if the output buffer is a subset of the input, 1 if it is allocated and must be freed
+ * @deprecated use AVBitStreamFilter
+ */
+int av_parser_change(AVCodecParserContext *s,
+                     AVCodecContext *avctx,
+                     uint8_t **poutbuf, int *poutbuf_size,
+                     const uint8_t *buf, int buf_size, int keyframe);
+void av_parser_close(AVCodecParserContext *s);
+
+/**
+ * @}
+ * @}
+ */
+
+/**
+ * @addtogroup lavc_encoding
+ * @{
+ */
+
+/**
+ * Find a registered encoder with a matching codec ID.
+ *
+ * @param id AVCodecID of the requested encoder
+ * @return An encoder if one was found, NULL otherwise.
+ */
+AVCodec *avcodec_find_encoder(enum AVCodecID id);
+
+/**
+ * Find a registered encoder with the specified name.
+ *
+ * @param name name of the requested encoder
+ * @return An encoder if one was found, NULL otherwise.
+ */
+AVCodec *avcodec_find_encoder_by_name(const char *name);
+
+/**
+ * Encode a frame of audio.
+ *
+ * Takes input samples from frame and writes the next output packet, if
+ * available, to avpkt. The output packet does not necessarily contain data for
+ * the most recent frame, as encoders can delay, split, and combine input frames
+ * internally as needed.
+ *
+ * @param avctx     codec context
+ * @param avpkt     output AVPacket.
+ *                  The user can supply an output buffer by setting
+ *                  avpkt->data and avpkt->size prior to calling the
+ *                  function, but if the size of the user-provided data is not
+ *                  large enough, encoding will fail. If avpkt->data and
+ *                  avpkt->size are set, avpkt->destruct must also be set. All
+ *                  other AVPacket fields will be reset by the encoder using
+ *                  av_init_packet(). If avpkt->data is NULL, the encoder will
+ *                  allocate it. The encoder will set avpkt->size to the size
+ *                  of the output packet.
+ *
+ *                  If this function fails or produces no output, avpkt will be
+ *                  freed using av_packet_unref().
+ * @param[in] frame AVFrame containing the raw audio data to be encoded.
+ *                  May be NULL when flushing an encoder that has the
+ *                  AV_CODEC_CAP_DELAY capability set.
+ *                  If AV_CODEC_CAP_VARIABLE_FRAME_SIZE is set, then each frame
+ *                  can have any number of samples.
+ *                  If it is not set, frame->nb_samples must be equal to
+ *                  avctx->frame_size for all frames except the last.
+ *                  The final frame may be smaller than avctx->frame_size.
+ * @param[out] got_packet_ptr This field is set to 1 by libavcodec if the
+ *                            output packet is non-empty, and to 0 if it is
+ *                            empty. If the function returns an error, the
+ *                            packet can be assumed to be invalid, and the
+ *                            value of got_packet_ptr is undefined and should
+ *                            not be used.
+ * @return          0 on success, negative error code on failure
+ *
+ * @deprecated use avcodec_send_frame()/avcodec_receive_packet() instead
+ */
+attribute_deprecated
+int avcodec_encode_audio2(AVCodecContext *avctx, AVPacket *avpkt,
+                          const AVFrame *frame, int *got_packet_ptr);
+
+/**
+ * Encode a frame of video.
+ *
+ * Takes input raw video data from frame and writes the next output packet, if
+ * available, to avpkt. The output packet does not necessarily contain data for
+ * the most recent frame, as encoders can delay and reorder input frames
+ * internally as needed.
+ *
+ * @param avctx     codec context
+ * @param avpkt     output AVPacket.
+ *                  The user can supply an output buffer by setting
+ *                  avpkt->data and avpkt->size prior to calling the
+ *                  function, but if the size of the user-provided data is not
+ *                  large enough, encoding will fail. All other AVPacket fields
+ *                  will be reset by the encoder using av_init_packet(). If
+ *                  avpkt->data is NULL, the encoder will allocate it.
+ *                  The encoder will set avpkt->size to the size of the
+ *                  output packet. The returned data (if any) belongs to the
+ *                  caller, who is responsible for freeing it.
+ *
+ *                  If this function fails or produces no output, avpkt will be
+ *                  freed using av_packet_unref().
+ * @param[in] frame AVFrame containing the raw video data to be encoded.
+ *                  May be NULL when flushing an encoder that has the
+ *                  AV_CODEC_CAP_DELAY capability set.
+ * @param[out] got_packet_ptr This field is set to 1 by libavcodec if the
+ *                            output packet is non-empty, and to 0 if it is
+ *                            empty. If the function returns an error, the
+ *                            packet can be assumed to be invalid, and the
+ *                            value of got_packet_ptr is undefined and should
+ *                            not be used.
+ * @return          0 on success, negative error code on failure
+ *
+ * @deprecated use avcodec_send_frame()/avcodec_receive_packet() instead
+ */
+attribute_deprecated
+int avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
+                          const AVFrame *frame, int *got_packet_ptr);
+
+int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
+                            const AVSubtitle *sub);
+
+
+/**
+ * @}
+ */
+
+#if FF_API_AVCODEC_RESAMPLE
+/**
+ * @defgroup lavc_resample Audio resampling
+ * @ingroup libavc
+ * @deprecated use libswresample instead
+ *
+ * @{
+ */
+struct ReSampleContext;
+struct AVResampleContext;
+
+typedef struct ReSampleContext ReSampleContext;
+
+/**
+ *  Initialize audio resampling context.
+ *
+ * @param output_channels  number of output channels
+ * @param input_channels   number of input channels
+ * @param output_rate      output sample rate
+ * @param input_rate       input sample rate
+ * @param sample_fmt_out   requested output sample format
+ * @param sample_fmt_in    input sample format
+ * @param filter_length    length of each FIR filter in the filterbank relative to the cutoff frequency
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear           if 1 then the used FIR filter will be linearly interpolated
+                           between the 2 closest, if 0 the closest will be used
+ * @param cutoff           cutoff frequency, 1.0 corresponds to half the output sampling rate
+ * @return allocated ReSampleContext, NULL if error occurred
+ */
+attribute_deprecated
+ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
+                                        int output_rate, int input_rate,
+                                        enum AVSampleFormat sample_fmt_out,
+                                        enum AVSampleFormat sample_fmt_in,
+                                        int filter_length, int log2_phase_count,
+                                        int linear, double cutoff);
+
+attribute_deprecated
+int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples);
+
+/**
+ * Free resample context.
+ *
+ * @param s a non-NULL pointer to a resample context previously
+ *          created with av_audio_resample_init()
+ */
+attribute_deprecated
+void audio_resample_close(ReSampleContext *s);
+
+
+/**
+ * Initialize an audio resampler.
+ * Note, if either rate is not an integer then simply scale both rates up so they are.
+ * @param filter_length length of each FIR filter in the filterbank relative to the cutoff freq
+ * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
+ * @param linear If 1 then the used FIR filter will be linearly interpolated
+                 between the 2 closest, if 0 the closest will be used
+ * @param cutoff cutoff frequency, 1.0 corresponds to half the output sampling rate
+ */
+attribute_deprecated
+struct AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_length, int log2_phase_count, int linear, double cutoff);
+
+/**
+ * Resample an array of samples using a previously configured context.
+ * @param src an array of unconsumed samples
+ * @param consumed the number of samples of src which have been consumed are returned here
+ * @param src_size the number of unconsumed samples available
+ * @param dst_size the amount of space in samples available in dst
+ * @param update_ctx If this is 0 then the context will not be modified, that way several channels can be resampled with the same context.
+ * @return the number of samples written in dst or -1 if an error occurred
+ */
+attribute_deprecated
+int av_resample(struct AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx);
+
+
+/**
+ * Compensate samplerate/timestamp drift. The compensation is done by changing
+ * the resampler parameters, so no audible clicks or similar distortions occur
+ * @param compensation_distance distance in output samples over which the compensation should be performed
+ * @param sample_delta number of samples by which the output should be reduced
+ *
+ * example: av_resample_compensate(c, 10, 500)
+ * here instead of 510 samples only 500 samples would be output
+ *
+ * note, due to rounding the actual compensation might be slightly different,
+ * especially if the compensation_distance is large and the in_rate used during init is small
+ */
+attribute_deprecated
+void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int compensation_distance);
+attribute_deprecated
+void av_resample_close(struct AVResampleContext *c);
+
+/**
+ * @}
+ */
+#endif
+
+#if FF_API_AVPICTURE
+/**
+ * @addtogroup lavc_picture
+ * @{
+ */
+
+/**
+ * @deprecated unused
+ */
+attribute_deprecated
+int avpicture_alloc(AVPicture *picture, enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * @deprecated unused
+ */
+attribute_deprecated
+void avpicture_free(AVPicture *picture);
+
+/**
+ * @deprecated use av_image_fill_arrays() instead.
+ */
+attribute_deprecated
+int avpicture_fill(AVPicture *picture, const uint8_t *ptr,
+                   enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * @deprecated use av_image_copy_to_buffer() instead.
+ */
+attribute_deprecated
+int avpicture_layout(const AVPicture *src, enum AVPixelFormat pix_fmt,
+                     int width, int height,
+                     unsigned char *dest, int dest_size);
+
+/**
+ * @deprecated use av_image_get_buffer_size() instead.
+ */
+attribute_deprecated
+int avpicture_get_size(enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * @deprecated use av_image_copy() instead.
+ */
+attribute_deprecated
+void av_picture_copy(AVPicture *dst, const AVPicture *src,
+                     enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * @deprecated unused
+ */
+attribute_deprecated
+int av_picture_crop(AVPicture *dst, const AVPicture *src,
+                    enum AVPixelFormat pix_fmt, int top_band, int left_band);
+
+/**
+ * @deprecated unused
+ */
+attribute_deprecated
+int av_picture_pad(AVPicture *dst, const AVPicture *src, int height, int width, enum AVPixelFormat pix_fmt,
+            int padtop, int padbottom, int padleft, int padright, int *color);
+
+/**
+ * @}
+ */
+#endif
+
+/**
+ * @defgroup lavc_misc Utility functions
+ * @ingroup libavc
+ *
+ * Miscellaneous utility functions related to both encoding and decoding
+ * (or neither).
+ * @{
+ */
+
+/**
+ * @defgroup lavc_misc_pixfmt Pixel formats
+ *
+ * Functions for working with pixel formats.
+ * @{
+ */
+
+/**
+ * Utility function to access log2_chroma_w and log2_chroma_h from
+ * the pixel format AVPixFmtDescriptor.
+ *
+ * This function asserts that pix_fmt is valid. See av_pix_fmt_get_chroma_sub_sample
+ * for one that returns a failure code and continues in case of invalid
+ * pix_fmts.
+ *
+ * @param[in]  pix_fmt the pixel format
+ * @param[out] h_shift store log2_chroma_w
+ * @param[out] v_shift store log2_chroma_h
+ *
+ * @see av_pix_fmt_get_chroma_sub_sample
+ */
+
+void avcodec_get_chroma_sub_sample(enum AVPixelFormat pix_fmt, int *h_shift, int *v_shift);
+
+/**
+ * Return a value representing the fourCC code associated to the
+ * pixel format pix_fmt, or 0 if no associated fourCC code can be
+ * found.
+ */
+unsigned int avcodec_pix_fmt_to_codec_tag(enum AVPixelFormat pix_fmt);
+
+/**
+ * @deprecated see av_get_pix_fmt_loss()
+ */
+int avcodec_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt, enum AVPixelFormat src_pix_fmt,
+                             int has_alpha);
+
+/**
+ * Find the best pixel format to convert to given a certain source pixel
+ * format.  When converting from one pixel format to another, information loss
+ * may occur.  For example, when converting from RGB24 to GRAY, the color
+ * information will be lost. Similarly, other losses occur when converting from
+ * some formats to other formats. avcodec_find_best_pix_fmt_of_list() searches which of
+ * the given pixel formats should be used to suffer the least amount of loss.
+ * The pixel formats from which it chooses one, are determined by the
+ * pix_fmt_list parameter.
+ *
+ *
+ * @param[in] pix_fmt_list AV_PIX_FMT_NONE terminated array of pixel formats to choose from
+ * @param[in] src_pix_fmt source pixel format
+ * @param[in] has_alpha Whether the source pixel format alpha channel is used.
+ * @param[out] loss_ptr Combination of flags informing you what kind of losses will occur.
+ * @return The best pixel format to convert to or -1 if none was found.
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *pix_fmt_list,
+                                            enum AVPixelFormat src_pix_fmt,
+                                            int has_alpha, int *loss_ptr);
+
+/**
+ * @deprecated see av_find_best_pix_fmt_of_2()
+ */
+enum AVPixelFormat avcodec_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+attribute_deprecated
+enum AVPixelFormat avcodec_find_best_pix_fmt2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                            enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat * fmt);
+
+/**
+ * @}
+ */
+
+#if FF_API_SET_DIMENSIONS
+/**
+ * @deprecated this function is not supposed to be used from outside of lavc
+ */
+attribute_deprecated
+void avcodec_set_dimensions(AVCodecContext *s, int width, int height);
+#endif
+
+#if FF_API_TAG_STRING
+/**
+ * Put a string representing the codec tag codec_tag in buf.
+ *
+ * @param buf       buffer to place codec tag in
+ * @param buf_size size in bytes of buf
+ * @param codec_tag codec tag to assign
+ * @return the length of the string that would have been generated if
+ * enough space had been available, excluding the trailing null
+ *
+ * @deprecated see av_fourcc_make_string() and av_fourcc2str().
+ */
+attribute_deprecated
+size_t av_get_codec_tag_string(char *buf, size_t buf_size, unsigned int codec_tag);
+#endif
+
+void avcodec_string(char *buf, int buf_size, AVCodecContext *enc, int encode);
+
+/**
+ * Return a name for the specified profile, if available.
+ *
+ * @param codec the codec that is searched for the given profile
+ * @param profile the profile value for which a name is requested
+ * @return A name for the profile if found, NULL otherwise.
+ */
+const char *av_get_profile_name(const AVCodec *codec, int profile);
+
+/**
+ * Return a name for the specified profile, if available.
+ *
+ * @param codec_id the ID of the codec to which the requested profile belongs
+ * @param profile the profile value for which a name is requested
+ * @return A name for the profile if found, NULL otherwise.
+ *
+ * @note unlike av_get_profile_name(), which searches a list of profiles
+ *       supported by a specific decoder or encoder implementation, this
+ *       function searches the list of profiles from the AVCodecDescriptor
+ */
+const char *avcodec_profile_name(enum AVCodecID codec_id, int profile);
+
+int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2),void *arg, int *ret, int count, int size);
+int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int, int),void *arg, int *ret, int count);
+//FIXME func typedef
+
+/**
+ * Fill AVFrame audio data and linesize pointers.
+ *
+ * The buffer buf must be a preallocated buffer with a size big enough
+ * to contain the specified samples amount. The filled AVFrame data
+ * pointers will point to this buffer.
+ *
+ * AVFrame extended_data channel pointers are allocated if necessary for
+ * planar audio.
+ *
+ * @param frame       the AVFrame
+ *                    frame->nb_samples must be set prior to calling the
+ *                    function. This function fills in frame->data,
+ *                    frame->extended_data, frame->linesize[0].
+ * @param nb_channels channel count
+ * @param sample_fmt  sample format
+ * @param buf         buffer to use for frame data
+ * @param buf_size    size of buffer
+ * @param align       plane size sample alignment (0 = default)
+ * @return            >=0 on success, negative error code on failure
+ * @todo return the size in bytes required to store the samples in
+ * case of success, at the next libavutil bump
+ */
+int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+                             enum AVSampleFormat sample_fmt, const uint8_t *buf,
+                             int buf_size, int align);
+
+/**
+ * Reset the internal decoder state / flush internal buffers. Should be called
+ * e.g. when seeking or when switching to a different stream.
+ *
+ * @note when refcounted frames are not used (i.e. avctx->refcounted_frames is 0),
+ * this invalidates the frames previously returned from the decoder. When
+ * refcounted frames are used, the decoder just releases any references it might
+ * keep internally, but the caller's reference remains valid.
+ */
+void avcodec_flush_buffers(AVCodecContext *avctx);
+
+/**
+ * Return codec bits per sample.
+ *
+ * @param[in] codec_id the codec
+ * @return Number of bits per sample or zero if unknown for the given codec.
+ */
+int av_get_bits_per_sample(enum AVCodecID codec_id);
+
+/**
+ * Return the PCM codec associated with a sample format.
+ * @param be  endianness, 0 for little, 1 for big,
+ *            -1 (or anything else) for native
+ * @return  AV_CODEC_ID_PCM_* or AV_CODEC_ID_NONE
+ */
+enum AVCodecID av_get_pcm_codec(enum AVSampleFormat fmt, int be);
+
+/**
+ * Return codec bits per sample.
+ * Only return non-zero if the bits per sample is exactly correct, not an
+ * approximation.
+ *
+ * @param[in] codec_id the codec
+ * @return Number of bits per sample or zero if unknown for the given codec.
+ */
+int av_get_exact_bits_per_sample(enum AVCodecID codec_id);
+
+/**
+ * Return audio frame duration.
+ *
+ * @param avctx        codec context
+ * @param frame_bytes  size of the frame, or 0 if unknown
+ * @return             frame duration, in samples, if known. 0 if not able to
+ *                     determine.
+ */
+int av_get_audio_frame_duration(AVCodecContext *avctx, int frame_bytes);
+
+/**
+ * This function is the same as av_get_audio_frame_duration(), except it works
+ * with AVCodecParameters instead of an AVCodecContext.
+ */
+int av_get_audio_frame_duration2(AVCodecParameters *par, int frame_bytes);
+
+#if FF_API_OLD_BSF
+typedef struct AVBitStreamFilterContext { /* context type used by the deprecated av_bitstream_filter_*() functions */
+    void *priv_data;
+    const struct AVBitStreamFilter *filter; /* presumably the filter this context was created for -- TODO confirm */
+    AVCodecParserContext *parser;
+    struct AVBitStreamFilterContext *next;
+    /**
+     * Internal default arguments, used if NULL is passed to av_bitstream_filter_filter().
+     * Not for access by library users.
+     */
+    char *args;
+} AVBitStreamFilterContext;
+#endif
+
+typedef struct AVBSFInternal AVBSFInternal;
+
+/**
+ * The bitstream filter state.
+ *
+ * This struct must be allocated with av_bsf_alloc() and freed with
+ * av_bsf_free().
+ *
+ * The fields in the struct will only be changed (by the caller or by the
+ * filter) as described in their documentation, and are to be considered
+ * immutable otherwise.
+ */
+typedef struct AVBSFContext {
+    /**
+     * A class for logging and AVOptions.
+     */
+    const AVClass *av_class;
+
+    /**
+     * The bitstream filter this context is an instance of.
+     */
+    const struct AVBitStreamFilter *filter;
+
+    /**
+     * Opaque libavcodec internal data. Must not be touched by the caller in any
+     * way.
+     */
+    AVBSFInternal *internal;
+
+    /**
+     * Opaque filter-specific private data. If filter->priv_class is non-NULL,
+     * this is an AVOptions-enabled struct.
+     */
+    void *priv_data;
+
+    /**
+     * Parameters of the input stream. This field is allocated in
+     * av_bsf_alloc(); it needs to be filled by the caller before
+     * av_bsf_init().
+     */
+    AVCodecParameters *par_in;
+
+    /**
+     * Parameters of the output stream. This field is allocated in
+     * av_bsf_alloc(); it is set by the filter in av_bsf_init().
+     */
+    AVCodecParameters *par_out;
+
+    /**
+     * The timebase used for the timestamps of the input packets. Set by the
+     * caller before av_bsf_init().
+     */
+    AVRational time_base_in;
+
+    /**
+     * The timebase used for the timestamps of the output packets. Set by the
+     * filter in av_bsf_init().
+     */
+    AVRational time_base_out;
+} AVBSFContext;
+
+typedef struct AVBitStreamFilter { /* describes one bitstream filter, as returned by av_bsf_get_by_name() and av_bsf_next() */
+    const char *name; /* presumably the name matched by av_bsf_get_by_name() -- TODO confirm */
+
+    /**
+     * A list of codec ids supported by the filter, terminated by
+     * AV_CODEC_ID_NONE.
+     * May be NULL, in that case the bitstream filter works with any codec id.
+     */
+    const enum AVCodecID *codec_ids;
+
+    /**
+     * A class for the private data, used to declare bitstream filter private
+     * AVOptions. This field is NULL for bitstream filters that do not declare
+     * any options.
+     *
+     * If this field is non-NULL, the first member of the filter private data
+     * must be a pointer to AVClass, which will be set by libavcodec generic
+     * code to this class.
+     */
+    const AVClass *priv_class;
+
+    /*****************************************************************
+     * No fields below this line are part of the public API. They
+     * may not be used outside of libavcodec and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+
+    int priv_data_size; /* presumably the size of AVBSFContext.priv_data -- TODO confirm */
+    int (*init)(AVBSFContext *ctx); /* presumably invoked from av_bsf_init() -- TODO confirm */
+    int (*filter)(AVBSFContext *ctx, AVPacket *pkt); /* NOTE(review): relation to av_bsf_send_packet()/av_bsf_receive_packet() not visible here */
+    void (*close)(AVBSFContext *ctx); /* presumably invoked from av_bsf_free() -- TODO confirm */
+} AVBitStreamFilter;
+
+#if FF_API_OLD_BSF
+/**
+ * Register a bitstream filter.
+ *
+ * The filter will be accessible to the application code through
+ * av_bitstream_filter_next() or can be directly initialized with
+ * av_bitstream_filter_init().
+ *
+ * @see avcodec_register_all()
+ */
+attribute_deprecated
+void av_register_bitstream_filter(AVBitStreamFilter *bsf);
+
+/**
+ * Create and initialize a bitstream filter context given a bitstream
+ * filter name.
+ *
+ * The returned context must be freed with av_bitstream_filter_close().
+ *
+ * @param name    the name of the bitstream filter
+ * @return a bitstream filter context if a matching filter was found
+ * and successfully initialized, NULL otherwise
+ */
+attribute_deprecated
+AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
+
+/**
+ * Filter bitstream.
+ *
+ * This function filters the buffer buf with size buf_size, and places the
+ * filtered buffer in the buffer pointed to by poutbuf.
+ *
+ * The output buffer must be freed by the caller.
+ *
+ * @param bsfc            bitstream filter context created by av_bitstream_filter_init()
+ * @param avctx           AVCodecContext accessed by the filter, may be NULL.
+ *                        If specified, this must point to the encoder context of the
+ *                        output stream the packet is sent to.
+ * @param args            arguments which specify the filter configuration, may be NULL
+ * @param poutbuf         pointer which is updated to point to the filtered buffer
+ * @param poutbuf_size    pointer which is updated to the filtered buffer size in bytes
+ * @param buf             buffer containing the data to filter
+ * @param buf_size        size in bytes of buf
+ * @param keyframe        set to non-zero if the buffer to filter corresponds to a key-frame packet data
+ * @return >= 0 in case of success, or a negative error code in case of failure
+ *
+ * If the return value is positive, an output buffer is allocated and
+ * is available in *poutbuf, and is distinct from the input buffer.
+ *
+ * If the return value is 0, the output buffer is not allocated and
+ * should be considered identical to the input buffer, or in case
+ * *poutbuf was set it points to the input buffer (not necessarily to
+ * its starting address). A special case is if *poutbuf was set to NULL and
+ * *poutbuf_size was set to 0, which indicates the packet should be dropped.
+ */
+attribute_deprecated
+int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
+                               AVCodecContext *avctx, const char *args,
+                               uint8_t **poutbuf, int *poutbuf_size,
+                               const uint8_t *buf, int buf_size, int keyframe);
+
+/**
+ * Release bitstream filter context.
+ *
+ * @param bsf the bitstream filter context created with
+ * av_bitstream_filter_init(), can be NULL
+ */
+attribute_deprecated
+void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
+
+/**
+ * If f is NULL, return the first registered bitstream filter,
+ * if f is non-NULL, return the next registered bitstream filter
+ * after f, or NULL if f is the last one.
+ *
+ * This function can be used to iterate over all registered bitstream
+ * filters.
+ */
+attribute_deprecated
+AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
+#endif
+
+/**
+ * @return a bitstream filter with the specified name or NULL if no such
+ *         bitstream filter exists.
+ */
+const AVBitStreamFilter *av_bsf_get_by_name(const char *name);
+
+/**
+ * Iterate over all registered bitstream filters.
+ *
+ * @param opaque a pointer where libavcodec will store the iteration state. Must
+ *               point to NULL to start the iteration.
+ *
+ * @return the next registered bitstream filter or NULL when the iteration is
+ *         finished
+ */
+const AVBitStreamFilter *av_bsf_next(void **opaque);
+
+/**
+ * Allocate a context for a given bitstream filter. The caller must fill in the
+ * context parameters as described in the documentation and then call
+ * av_bsf_init() before sending any data to the filter.
+ *
+ * @param filter the filter for which to allocate an instance.
+ * @param ctx a pointer into which the pointer to the newly-allocated context
+ *            will be written. It must be freed with av_bsf_free() after the
+ *            filtering is done.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int av_bsf_alloc(const AVBitStreamFilter *filter, AVBSFContext **ctx);
+
+/**
+ * Prepare the filter for use, after all the parameters and options have been
+ * set.
+ */
+int av_bsf_init(AVBSFContext *ctx);
+
+/**
+ * Submit a packet for filtering.
+ *
+ * After sending each packet, the filter must be completely drained by calling
+ * av_bsf_receive_packet() repeatedly until it returns AVERROR(EAGAIN) or
+ * AVERROR_EOF.
+ *
+ * @param pkt the packet to filter. pkt must contain some payload (i.e. data or
+ * side data must be present in pkt). The bitstream filter will take ownership of
+ * the packet and reset the contents of pkt. pkt is not touched if an error occurs.
+ * This parameter may be NULL, which signals the end of the stream (i.e. no more
+ * packets will be sent). That will cause the filter to output any packets it
+ * may have buffered internally.
+ *
+ * @return 0 on success, a negative AVERROR on error.
+ */
+int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt);
+
+/**
+ * Retrieve a filtered packet.
+ *
+ * @param[out] pkt this struct will be filled with the contents of the filtered
+ *                 packet. It is owned by the caller and must be freed using
+ *                 av_packet_unref() when it is no longer needed.
+ *                 This parameter should be "clean" (i.e. freshly allocated
+ *                 with av_packet_alloc() or unreffed with av_packet_unref())
+ *                 when this function is called. If this function returns
+ *                 successfully, the contents of pkt will be completely
+ *                 overwritten by the returned data. On failure, pkt is not
+ *                 touched.
+ *
+ * @return 0 on success. AVERROR(EAGAIN) if more packets need to be sent to the
+ * filter (using av_bsf_send_packet()) to get more output. AVERROR_EOF if there
+ * will be no further output from the filter. Another negative AVERROR value if
+ * an error occurs.
+ *
+ * @note one input packet may result in several output packets, so after sending
+ * a packet with av_bsf_send_packet(), this function needs to be called
+ * repeatedly until it stops returning 0. It is also possible for a filter to
+ * output fewer packets than were sent to it, so this function may return
+ * AVERROR(EAGAIN) immediately after a successful av_bsf_send_packet() call.
+ */
+int av_bsf_receive_packet(AVBSFContext *ctx, AVPacket *pkt);
+
+/**
+ * Free a bitstream filter context and everything associated with it; write NULL
+ * into the supplied pointer.
+ */
+void av_bsf_free(AVBSFContext **ctx);
+
+/**
+ * Get the AVClass for AVBSFContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *av_bsf_get_class(void);
+
+/**
+ * Structure for chain/list of bitstream filters.
+ * Empty list can be allocated by av_bsf_list_alloc().
+ */
+typedef struct AVBSFList AVBSFList;
+
+/**
+ * Allocate empty list of bitstream filters.
+ * The list must be later freed by av_bsf_list_free()
+ * or finalized by av_bsf_list_finalize().
+ *
+ * @return Pointer to @ref AVBSFList on success, NULL in case of failure
+ */
+AVBSFList *av_bsf_list_alloc(void);
+
+/**
+ * Free list of bitstream filters.
+ *
+ * @param lst Pointer to pointer returned by av_bsf_list_alloc()
+ */
+void av_bsf_list_free(AVBSFList **lst);
+
+/**
+ * Append bitstream filter to the list of bitstream filters.
+ *
+ * @param lst List to append to
+ * @param bsf Filter context to be appended
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_append(AVBSFList *lst, AVBSFContext *bsf);
+
+/**
+ * Construct new bitstream filter context given its name and options
+ * and append it to the list of bitstream filters.
+ *
+ * @param lst      List to append to
+ * @param bsf_name Name of the bitstream filter
+ * @param options  Options for the bitstream filter, can be set to NULL
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_append2(AVBSFList *lst, const char * bsf_name, AVDictionary **options);
+/**
+ * Finalize list of bitstream filters.
+ *
+ * This function will transform @ref AVBSFList to single @ref AVBSFContext,
+ * so the whole chain of bitstream filters can be treated as single filter
+ * freshly allocated by av_bsf_alloc().
+ * If the call is successful, @ref AVBSFList structure is freed and lst
+ * will be set to NULL. In case of failure, caller is responsible for
+ * freeing the structure by av_bsf_list_free()
+ *
+ * @param      lst Filter list structure to be transformed
+ * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure
+ *                 representing the chain of bitstream filters
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_finalize(AVBSFList **lst, AVBSFContext **bsf);
+
+/**
+ * Parse string describing list of bitstream filters and create single
+ * @ref AVBSFContext describing the whole chain of bitstream filters.
+ * Resulting @ref AVBSFContext can be treated as any other @ref AVBSFContext freshly
+ * allocated by av_bsf_alloc().
+ *
+ * @param      str String describing chain of bitstream filters in format
+ *                 `bsf1[=opt1=val1:opt2=val2][,bsf2]`
+ * @param[out] bsf Pointer to be set to newly created @ref AVBSFContext structure
+ *                 representing the chain of bitstream filters
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_list_parse_str(const char *str, AVBSFContext **bsf);
+
+/**
+ * Get null/pass-through bitstream filter.
+ *
+ * @param[out] bsf Pointer to be set to new instance of pass-through bitstream filter
+ *
+ * @return >=0 on success, negative AVERROR in case of failure
+ */
+int av_bsf_get_null_filter(AVBSFContext **bsf);
+
+/* memory */
+
+/**
+ * Same behaviour as av_fast_malloc, but the buffer has additional
+ * AV_INPUT_BUFFER_PADDING_SIZE at the end which will always be 0.
+ *
+ * In addition the whole buffer will initially and after resizes
+ * be 0-initialized so that no uninitialized data will ever appear.
+ */
+void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size);
+
+/**
+ * Same behaviour as av_fast_padded_malloc, except that buffer will always
+ * be 0-initialized after call.
+ */
+void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size);
+
+/**
+ * Encode extradata length to a buffer. Used by xiph codecs.
+ *
+ * @param s buffer to write to; must be at least (v/255+1) bytes long
+ * @param v size of extradata in bytes
+ * @return number of bytes written to the buffer.
+ */
+unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
+
+#if FF_API_MISSING_SAMPLE
+/**
+ * Log a generic warning message about a missing feature. This function is
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
+ * only, and would normally not be used by applications.
+ * @param[in] avc a pointer to an arbitrary struct of which the first field is
+ * a pointer to an AVClass struct
+ * @param[in] feature string containing the name of the missing feature
+ * @param[in] want_sample indicates if samples are wanted which exhibit this feature.
+ * If want_sample is non-zero, additional verbiage will be added to the log
+ * message which tells the user how to report samples to the development
+ * mailing list.
+ * @deprecated Use avpriv_report_missing_feature() instead.
+ */
+attribute_deprecated
+void av_log_missing_feature(void *avc, const char *feature, int want_sample);
+
+/**
+ * Log a generic warning message asking for a sample. This function is
+ * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
+ * only, and would normally not be used by applications.
+ * @param[in] avc a pointer to an arbitrary struct of which the first field is
+ * a pointer to an AVClass struct
+ * @param[in] msg string containing an optional message, or NULL if no message
+ * @deprecated Use avpriv_request_sample() instead.
+ */
+attribute_deprecated
+void av_log_ask_for_sample(void *avc, const char *msg, ...) av_printf_format(2, 3);
+#endif /* FF_API_MISSING_SAMPLE */
+
+/**
+ * Register the hardware accelerator hwaccel.
+ */
+void av_register_hwaccel(AVHWAccel *hwaccel);
+
+/**
+ * If hwaccel is NULL, returns the first registered hardware accelerator,
+ * if hwaccel is non-NULL, returns the next registered hardware accelerator
+ * after hwaccel, or NULL if hwaccel is the last one.
+ */
+AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel);
+
+
+/**
+ * Lock operation used by lockmgr
+ */
+enum AVLockOp {
+  AV_LOCK_CREATE,  ///< Create a mutex; the stored mutex pointer is NULL on entry
+  AV_LOCK_OBTAIN,  ///< Lock the mutex
+  AV_LOCK_RELEASE, ///< Unlock the mutex
+  AV_LOCK_DESTROY, ///< Free mutex resources; destruction is always assumed to succeed
+};
+
+/**
+ * Register a user provided lock manager supporting the operations
+ * specified by AVLockOp. The "mutex" argument to the function points
+ * to a (void *) where the lockmgr should store/get a pointer to a user
+ * allocated mutex. It is NULL upon AV_LOCK_CREATE and equal to the
+ * value left by the last call for all other ops. If the lock manager is
+ * unable to perform the op then it should leave the mutex in the same
+ * state as when it was called and return a non-zero value. However,
+ * when called with AV_LOCK_DESTROY the mutex will always be assumed to
+ * have been successfully destroyed. If av_lockmgr_register succeeds
+ * it will return a non-negative value, if it fails it will return a
+ * negative value and destroy all mutex and unregister all callbacks.
+ * av_lockmgr_register is not thread-safe, it must be called from a
+ * single thread before any calls which make use of locking are used.
+ *
+ * @param cb User defined callback. av_lockmgr_register invokes calls
+ *           to this callback and the previously registered callback.
+ *           The callback will be used to create more than one mutex
+ *           each of which must be backed by its own underlying locking
+ *           mechanism (i.e. do not use a single static object to
+ *           implement your lock manager). If cb is set to NULL the
+ *           lockmgr will be unregistered.
+ */
+int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
+
+/**
+ * Get the type of the given codec.
+ */
+enum AVMediaType avcodec_get_type(enum AVCodecID codec_id);
+
+/**
+ * Get the name of a codec.
+ * @return  a static string identifying the codec; never NULL
+ */
+const char *avcodec_get_name(enum AVCodecID id);
+
+/**
+ * @return a positive value if s is open (i.e. avcodec_open2() was called on it
+ * with no corresponding avcodec_close()), 0 otherwise.
+ */
+int avcodec_is_open(AVCodecContext *s);
+
+/**
+ * @return a non-zero number if codec is an encoder, zero otherwise
+ */
+int av_codec_is_encoder(const AVCodec *codec);
+
+/**
+ * @return a non-zero number if codec is a decoder, zero otherwise
+ */
+int av_codec_is_decoder(const AVCodec *codec);
+
+/**
+ * @return descriptor for given codec ID or NULL if no descriptor exists.
+ */
+const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id);
+
+/**
+ * Iterate over all codec descriptors known to libavcodec.
+ *
+ * @param prev previous descriptor. NULL to get the first descriptor.
+ *
+ * @return next descriptor or NULL after the last descriptor
+ */
+const AVCodecDescriptor *avcodec_descriptor_next(const AVCodecDescriptor *prev);
+
+/**
+ * @return codec descriptor with the given name or NULL if no such descriptor
+ *         exists.
+ */
+const AVCodecDescriptor *avcodec_descriptor_get_by_name(const char *name);
+
+/**
+ * Allocate a CPB properties structure and initialize its fields to default
+ * values.
+ *
+ * @param size if non-NULL, the size of the allocated struct will be written
+ *             here. This is useful for embedding it in side data.
+ *
+ * @return the newly allocated struct or NULL on failure
+ */
+AVCPBProperties *av_cpb_properties_alloc(size_t *size);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_AVCODEC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avdct.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avdct.h
new file mode 100644
index 0000000..272422e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avdct.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVDCT_H
+#define AVCODEC_AVDCT_H
+
+#include "libavutil/opt.h"
+
+/**
+ * AVDCT context.
+ * @note function pointers can be NULL if the specific features have been
+ *       disabled at build time.
+ */
+typedef struct AVDCT {
+    const AVClass *av_class;
+
+    void (*idct)(int16_t *block /* align 16 */);
+
+    /**
+     * IDCT input permutation.
+     * Several optimized IDCTs need a permuted input (relative to the
+     * normal order of the reference IDCT).
+     * This permutation must be performed before the idct_put/add.
+     * Note: normally this can be merged with the zigzag/alternate scan.<br>
+     * An example to avoid confusion:
+     * - (-> decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...)
+     * - (x -> reference DCT -> reference IDCT -> x)
+     * - (x -> reference DCT -> simple_mmx_perm = idct_permutation
+     *    -> simple_idct_mmx -> x)
+     * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant
+     *    -> simple_idct_mmx -> ...)
+     */
+    uint8_t idct_permutation[64];
+
+    void (*fdct)(int16_t *block /* align 16 */);
+
+
+    /**
+     * DCT algorithm.
+     * Must be set through AVOptions.
+     */
+    int dct_algo;
+
+    /**
+     * IDCT algorithm.
+     * Must be set through AVOptions.
+     */
+    int idct_algo;
+
+    void (*get_pixels)(int16_t *block /* align 16 */,
+                       const uint8_t *pixels /* align 8 */,
+                       ptrdiff_t line_size);
+
+    int bits_per_sample;
+} AVDCT;
+
+/**
+ * Allocate an AVDCT context.
+ * This needs to be initialized with avcodec_dct_init() after optionally
+ * configuring it with AVOptions.
+ *
+ * To free it use av_free()
+ */
+AVDCT *avcodec_dct_alloc(void);
+int avcodec_dct_init(AVDCT *);
+
+const AVClass *avcodec_dct_get_class(void);
+
+#endif /* AVCODEC_AVDCT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avfft.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avfft.h
new file mode 100644
index 0000000..0c0f9b8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/avfft.h
@@ -0,0 +1,118 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVFFT_H
+#define AVCODEC_AVFFT_H
+
+/**
+ * @file
+ * @ingroup lavc_fft
+ * FFT functions
+ */
+
+/**
+ * @defgroup lavc_fft FFT functions
+ * @ingroup lavc_misc
+ *
+ * @{
+ */
+
+typedef float FFTSample;
+
+typedef struct FFTComplex {
+    FFTSample re, im; ///< real and imaginary parts of one complex sample
+} FFTComplex;
+
+typedef struct FFTContext FFTContext;
+
+/**
+ * Set up a complex FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param inverse         if 0 perform the forward transform, if 1 perform the inverse
+ */
+FFTContext *av_fft_init(int nbits, int inverse);
+
+/**
+ * Do the permutation needed BEFORE calling av_fft_calc().
+ */
+void av_fft_permute(FFTContext *s, FFTComplex *z);
+
+/**
+ * Do a complex FFT with the parameters defined in av_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ */
+void av_fft_calc(FFTContext *s, FFTComplex *z);
+
+void av_fft_end(FFTContext *s);
+
+FFTContext *av_mdct_init(int nbits, int inverse, double scale);
+void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_mdct_end(FFTContext *s);
+
+/* Real Discrete Fourier Transform */
+
+enum RDFTransformType { /* transform type selector passed to av_rdft_init() */
+    DFT_R2C,  /* NOTE(review): R2C/C2R presumably mean real-to-complex / complex-to-real -- confirm */
+    IDFT_C2R,
+    IDFT_R2C,
+    DFT_C2R,
+};
+
+typedef struct RDFTContext RDFTContext;
+
+/**
+ * Set up a real FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param trans           the type of transform
+ */
+RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans);
+void av_rdft_calc(RDFTContext *s, FFTSample *data);
+void av_rdft_end(RDFTContext *s);
+
+/* Discrete Cosine Transform */
+
+typedef struct DCTContext DCTContext;
+
+enum DCTTransformType {
+    DCT_II = 0, ///< DCT-II; input array size is (1 << nbits)
+    DCT_III,    ///< DCT-III; input array size is (1 << nbits)
+    DCT_I,      ///< DCT-I; input array size is (1 << nbits) + 1
+    DST_I,      ///< DST-I; input array size is (1 << nbits), first input element is ignored
+};
+
+/**
+ * Set up DCT.
+ *
+ * @param nbits           size of the input array:
+ *                        (1 << nbits)     for DCT-II, DCT-III and DST-I
+ *                        (1 << nbits) + 1 for DCT-I
+ * @param type            the type of transform
+ *
+ * @note the first element of the input of DST-I is ignored
+ */
+DCTContext *av_dct_init(int nbits, enum DCTTransformType type);
+void av_dct_calc(DCTContext *s, FFTSample *data);
+void av_dct_end (DCTContext *s);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_AVFFT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/d3d11va.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/d3d11va.h
new file mode 100644
index 0000000..6816b6c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/d3d11va.h
@@ -0,0 +1,112 @@
+/*
+ * Direct3D11 HW acceleration
+ *
+ * copyright (c) 2009 Laurent Aimar
+ * copyright (c) 2015 Steve Lhomme
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_D3D11VA_H
+#define AVCODEC_D3D11VA_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_d3d11va
+ * Public libavcodec D3D11VA header.
+ */
+
+#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0602
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0602
+#endif
+
+#include <stdint.h>
+#include <d3d11.h>
+
+/**
+ * @defgroup lavc_codec_hwaccel_d3d11va Direct3D11
+ * @ingroup lavc_codec_hwaccel
+ *
+ * @{
+ */
+
+#define FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG 1 ///< Work around for Direct3D11 and old UVD/UVD+ ATI video cards
+#define FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO    2 ///< Work around for Direct3D11 and old Intel GPUs with ClearVideo interface
+
+/**
+ * This structure is used to provide the necessary configurations and data
+ * to the Direct3D11 FFmpeg HWAccel implementation.
+ *
+ * The application must make it available as AVCodecContext.hwaccel_context.
+ *
+ * Use av_d3d11va_alloc_context() exclusively to allocate an AVD3D11VAContext.
+ */
+typedef struct AVD3D11VAContext {
+    /**
+     * D3D11 decoder object
+     */
+    ID3D11VideoDecoder *decoder;
+
+    /**
+     * D3D11 VideoContext
+     */
+    ID3D11VideoContext *video_context;
+
+    /**
+     * D3D11 configuration used to create the decoder
+     */
+    D3D11_VIDEO_DECODER_CONFIG *cfg;
+
+    /**
+     * The number of surfaces in the surface array
+     */
+    unsigned surface_count;
+
+    /**
+     * The array of Direct3D surfaces used to create the decoder
+     */
+    ID3D11VideoDecoderOutputView **surface;
+
+    /**
+     * A bit field configuring the workarounds needed for using the decoder
+     */
+    uint64_t workaround;
+
+    /**
+     * Private to the FFmpeg AVHWAccel implementation
+     */
+    unsigned report_id;
+
+    /**
+     * Mutex to access video_context
+     */
+    HANDLE  context_mutex;
+} AVD3D11VAContext;
+
+/**
+ * Allocate an AVD3D11VAContext.
+ *
+ * @return Newly-allocated AVD3D11VAContext or NULL on failure.
+ */
+AVD3D11VAContext *av_d3d11va_alloc_context(void);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_D3D11VA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dirac.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dirac.h
new file mode 100644
index 0000000..e6d9d34
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dirac.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2007 Marco Gerards <marco@gnu.org>
+ * Copyright (C) 2009 David Conrad
+ * Copyright (C) 2011 Jordi Ortiz
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DIRAC_H
+#define AVCODEC_DIRAC_H
+
+/**
+ * @file
+ * Interface to Dirac Decoder/Encoder
+ * @author Marco Gerards <marco@gnu.org>
+ * @author David Conrad
+ * @author Jordi Ortiz
+ */
+
+#include "avcodec.h"
+
+/**
+ * The spec limits the number of wavelet decompositions to 4 for both
+ * level 1 (VC-2) and 128 (long-gop default).
+ * 5 decompositions is the maximum before >16-bit buffers are needed.
+ * Schroedinger allows this for DD 9,7 and 13,7 wavelets only, limiting
+ * the others to 4 decompositions (or 3 for the fidelity filter).
+ *
+ * We use this instead of MAX_DECOMPOSITIONS to save some memory.
+ */
+#define MAX_DWT_LEVELS 5
+
+/**
+ * Parse code values:
+ *
+ * Dirac Specification ->
+ * 9.6.1  Table 9.1
+ *
+ * VC-2 Specification  ->
+ * 10.4.1 Table 10.1
+ */
+
+enum DiracParseCodes {
+    DIRAC_PCODE_SEQ_HEADER      = 0x00,
+    DIRAC_PCODE_END_SEQ         = 0x10,
+    DIRAC_PCODE_AUX             = 0x20,
+    DIRAC_PCODE_PAD             = 0x30,
+    DIRAC_PCODE_PICTURE_CODED   = 0x08,
+    DIRAC_PCODE_PICTURE_RAW     = 0x48,
+    DIRAC_PCODE_PICTURE_LOW_DEL = 0xC8,
+    DIRAC_PCODE_PICTURE_HQ      = 0xE8,
+    DIRAC_PCODE_INTER_NOREF_CO1 = 0x0A,
+    DIRAC_PCODE_INTER_NOREF_CO2 = 0x09,
+    DIRAC_PCODE_INTER_REF_CO1   = 0x0D,
+    DIRAC_PCODE_INTER_REF_CO2   = 0x0E,
+    DIRAC_PCODE_INTRA_REF_CO    = 0x0C,
+    DIRAC_PCODE_INTRA_REF_RAW   = 0x4C,
+    DIRAC_PCODE_INTRA_REF_PICT  = 0xCC,
+    DIRAC_PCODE_MAGIC           = 0x42424344, ///< the bytes 'B','B','C','D' in ASCII
+};
+
+typedef struct DiracVersionInfo {
+    int major; ///< major version number
+    int minor; ///< minor version number
+} DiracVersionInfo;
+
+typedef struct AVDiracSeqHeader {
+    unsigned width;
+    unsigned height;
+    uint8_t chroma_format;          ///< 0: 4:4:4  1: 4:2:2  2: 4:2:0
+
+    uint8_t interlaced;
+    uint8_t top_field_first;
+
+    uint8_t frame_rate_index;       ///< index into dirac_frame_rate[]
+    uint8_t aspect_ratio_index;     ///< index into dirac_aspect_ratio[]
+
+    uint16_t clean_width;
+    uint16_t clean_height;
+    uint16_t clean_left_offset;
+    uint16_t clean_right_offset;
+
+    uint8_t pixel_range_index;      ///< index into dirac_pixel_range_presets[]
+    uint8_t color_spec_index;       ///< index into dirac_color_spec_presets[]
+
+    int profile;
+    int level;
+
+    AVRational framerate;
+    AVRational sample_aspect_ratio;
+
+    enum AVPixelFormat pix_fmt;
+    enum AVColorRange color_range;
+    enum AVColorPrimaries color_primaries;
+    enum AVColorTransferCharacteristic color_trc;
+    enum AVColorSpace colorspace;
+
+    DiracVersionInfo version;       ///< major/minor version pair
+    int bit_depth;
+} AVDiracSeqHeader;
+
+/**
+ * Parse a Dirac sequence header.
+ *
+ * @param dsh this function will allocate and fill an AVDiracSeqHeader struct
+ *            and write it into this pointer. The caller must free it with
+ *            av_free().
+ * @param buf the data buffer
+ * @param buf_size the size of the data buffer in bytes
+ * @param log_ctx if non-NULL, this function will log errors here
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int av_dirac_parse_sequence_header(AVDiracSeqHeader **dsh,
+                                   const uint8_t *buf, size_t buf_size,
+                                   void *log_ctx);
+
+#endif /* AVCODEC_DIRAC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dv_profile.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dv_profile.h
new file mode 100644
index 0000000..9380a66
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dv_profile.h
@@ -0,0 +1,83 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DV_PROFILE_H
+#define AVCODEC_DV_PROFILE_H
+
+#include <stdint.h>
+
+#include "libavutil/pixfmt.h"
+#include "libavutil/rational.h"
+#include "avcodec.h"
+
+/* minimum number of bytes to read from a DV stream in order to
+ * determine the profile */
+#define DV_PROFILE_BYTES (6 * 80) /* 6 DIF blocks */
+
+
+/*
+ * AVDVProfile is used to express the differences between various
+ * DV flavors. For now it's primarily used for differentiating
+ * 525/60 and 625/50, but the plans are to use it for various
+ * DV specs as well (e.g. SMPTE314M vs. IEC 61834).
+ */
+typedef struct AVDVProfile {
+    int              dsf;                   /* value of the dsf in the DV header */
+    int              video_stype;           /* stype for VAUX source pack */
+    int              frame_size;            /* total size of one frame in bytes */
+    int              difseg_size;           /* number of DIF segments per DIF channel */
+    int              n_difchan;             /* number of DIF channels per frame */
+    AVRational       time_base;             /* 1/framerate */
+    int              ltc_divisor;           /* FPS from the LTC standpoint */
+    int              height;                /* picture height in pixels */
+    int              width;                 /* picture width in pixels */
+    AVRational       sar[2];                /* sample aspect ratios for 4:3 and 16:9 */
+    enum AVPixelFormat pix_fmt;             /* picture pixel format */
+    int              bpm;                   /* blocks per macroblock */
+    const uint8_t   *block_sizes;           /* AC block sizes, in bits */
+    int              audio_stride;          /* size of audio_shuffle table */
+    int              audio_min_samples[3];  /* min amount of audio samples */
+                                            /* for 48kHz, 44.1kHz and 32kHz */
+    int              audio_samples_dist[5]; /* how many samples are supposed to be */
+                                            /* in each frame in a 5 frames window */
+    const uint8_t  (*audio_shuffle)[9];     /* PCM shuffling table */
+} AVDVProfile;
+
+/**
+ * Get a DV profile for the provided compressed frame.
+ *
+ * @param sys the profile used for the previous frame, may be NULL
+ * @param frame the compressed data buffer
+ * @param buf_size size of the buffer in bytes
+ * @return the DV profile for the supplied data or NULL on failure
+ */
+const AVDVProfile *av_dv_frame_profile(const AVDVProfile *sys,
+                                       const uint8_t *frame, unsigned buf_size);
+
+/**
+ * Get a DV profile for the provided stream parameters.
+ */
+const AVDVProfile *av_dv_codec_profile(int width, int height, enum AVPixelFormat pix_fmt);
+
+/**
+ * Get a DV profile for the provided stream parameters.
+ * The frame rate is used as a best-effort parameter.
+ */
+const AVDVProfile *av_dv_codec_profile2(int width, int height, enum AVPixelFormat pix_fmt, AVRational frame_rate);
+
+#endif /* AVCODEC_DV_PROFILE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dxva2.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dxva2.h
new file mode 100644
index 0000000..22c9399
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/dxva2.h
@@ -0,0 +1,93 @@
+/*
+ * DXVA2 HW acceleration
+ *
+ * copyright (c) 2009 Laurent Aimar
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_DXVA2_H
+#define AVCODEC_DXVA2_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_dxva2
+ * Public libavcodec DXVA2 header.
+ */
+
+#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0602
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0602
+#endif
+
+#include <stdint.h>
+#include <d3d9.h>
+#include <dxva2api.h>
+
+/**
+ * @defgroup lavc_codec_hwaccel_dxva2 DXVA2
+ * @ingroup lavc_codec_hwaccel
+ *
+ * @{
+ */
+
+#define FF_DXVA2_WORKAROUND_SCALING_LIST_ZIGZAG 1 ///< Work around for DXVA2 and old UVD/UVD+ ATI video cards
+#define FF_DXVA2_WORKAROUND_INTEL_CLEARVIDEO    2 ///< Work around for DXVA2 and old Intel GPUs with ClearVideo interface
+
+/**
+ * This structure is used to provide the necessary configurations and data
+ * to the DXVA2 FFmpeg HWAccel implementation.
+ *
+ * The application must make it available as AVCodecContext.hwaccel_context.
+ */
+struct dxva_context {
+    /**
+     * DXVA2 decoder object
+     */
+    IDirectXVideoDecoder *decoder;
+
+    /**
+     * DXVA2 configuration used to create the decoder
+     */
+    const DXVA2_ConfigPictureDecode *cfg;
+
+    /**
+     * The number of surfaces in the surface array
+     */
+    unsigned surface_count;
+
+    /**
+     * The array of Direct3D surfaces used to create the decoder
+     */
+    LPDIRECT3DSURFACE9 *surface;
+
+    /**
+     * A bit field configuring the workarounds needed for using the decoder
+     */
+    uint64_t workaround;
+
+    /**
+     * Private to the FFmpeg AVHWAccel implementation
+     */
+    unsigned report_id;
+};
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_DXVA2_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/jni.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/jni.h
new file mode 100644
index 0000000..dd99e92
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/jni.h
@@ -0,0 +1,46 @@
+/*
+ * JNI public API functions
+ *
+ * Copyright (c) 2015-2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_JNI_H
+#define AVCODEC_JNI_H
+
+/**
+ * Manually set a Java virtual machine which will be used to retrieve the JNI
+ * environment. Once a Java VM is set it cannot be changed afterwards, meaning
+ * you can call av_jni_set_java_vm multiple times with the same Java VM pointer
+ * however it will error out if you try to set a different Java VM.
+ *
+ * @param vm Java virtual machine
+ * @param log_ctx context used for logging, can be NULL
+ * @return 0 on success, < 0 otherwise
+ */
+int av_jni_set_java_vm(void *vm, void *log_ctx);
+
+/**
+ * Get the Java virtual machine which has been set with av_jni_set_java_vm.
+ *
+ * @param log_ctx context used for logging
+ * @return a pointer to the Java virtual machine
+ */
+void *av_jni_get_java_vm(void *log_ctx);
+
+#endif /* AVCODEC_JNI_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/mediacodec.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/mediacodec.h
new file mode 100644
index 0000000..5606d24
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/mediacodec.h
@@ -0,0 +1,88 @@
+/*
+ * Android MediaCodec public API
+ *
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MEDIACODEC_H
+#define AVCODEC_MEDIACODEC_H
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This structure holds a reference to an android/view/Surface object that will
+ * be used as output by the decoder.
+ *
+ */
+typedef struct AVMediaCodecContext {
+
+    /**
+     * android/view/Surface object reference.
+     */
+    void *surface;
+
+} AVMediaCodecContext;
+
+/**
+ * Allocate and initialize a MediaCodec context.
+ *
+ * When decoding with MediaCodec is finished, the caller must free the
+ * MediaCodec context with av_mediacodec_default_free.
+ *
+ * @return a pointer to a newly allocated AVMediaCodecContext on success, NULL otherwise
+ */
+AVMediaCodecContext *av_mediacodec_alloc_context(void);
+
+/**
+ * Convenience function that sets up the MediaCodec context.
+ *
+ * @param avctx codec context
+ * @param ctx MediaCodec context to initialize
+ * @param surface reference to an android/view/Surface
+ * @return 0 on success, < 0 otherwise
+ */
+int av_mediacodec_default_init(AVCodecContext *avctx, AVMediaCodecContext *ctx, void *surface);
+
+/**
+ * This function must be called to free the MediaCodec context initialized with
+ * av_mediacodec_default_init().
+ *
+ * @param avctx codec context
+ */
+void av_mediacodec_default_free(AVCodecContext *avctx);
+
+/**
+ * Opaque structure representing a MediaCodec buffer to render.
+ */
+typedef struct MediaCodecBuffer AVMediaCodecBuffer;
+
+/**
+ * Release a MediaCodec buffer and render it to the surface that is associated
+ * with the decoder. This function should only be called once on a given
+ * buffer, once released the underlying buffer returns to the codec, thus
+ * subsequent calls to this function will have no effect.
+ *
+ * @param buffer the buffer to render
+ * @param render 1 to release and render the buffer to the surface or 0 to
+ * discard the buffer
+ * @return 0 on success, < 0 otherwise
+ */
+int av_mediacodec_release_buffer(AVMediaCodecBuffer *buffer, int render);
+
+#endif /* AVCODEC_MEDIACODEC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/qsv.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/qsv.h
new file mode 100644
index 0000000..b77158e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/qsv.h
@@ -0,0 +1,107 @@
+/*
+ * Intel MediaSDK QSV public API
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_QSV_H
+#define AVCODEC_QSV_H
+
+#include <mfx/mfxvideo.h>
+
+#include "libavutil/buffer.h"
+
+/**
+ * This struct is used for communicating QSV parameters between libavcodec and
+ * the caller. It is managed by the caller and must be assigned to
+ * AVCodecContext.hwaccel_context.
+ * - decoding: hwaccel_context must be set on return from the get_format()
+ *             callback
+ * - encoding: hwaccel_context must be set before avcodec_open2()
+ */
+typedef struct AVQSVContext {
+    /**
+     * If non-NULL, the session to use for encoding or decoding.
+     * Otherwise, libavcodec will try to create an internal session.
+     */
+    mfxSession session;
+
+    /**
+     * The IO pattern to use.
+     */
+    int iopattern;
+
+    /**
+     * Extra buffers to pass to encoder or decoder initialization.
+     */
+    mfxExtBuffer **ext_buffers;
+    int         nb_ext_buffers;
+
+    /**
+     * Encoding only. If this field is set to non-zero by the caller, libavcodec
+     * will create an mfxExtOpaqueSurfaceAlloc extended buffer and pass it to
+     * the encoder initialization. This only makes sense if iopattern is also
+     * set to MFX_IOPATTERN_IN_OPAQUE_MEMORY.
+     *
+     * The number of allocated opaque surfaces will be the sum of the number
+     * required by the encoder and the user-provided value nb_opaque_surfaces.
+     * The array of the opaque surfaces will be exported to the caller through
+     * the opaque_surfaces field.
+     */
+    int opaque_alloc;
+
+    /**
+     * Encoding only, and only if opaque_alloc is set to non-zero. Before
+     * calling avcodec_open2(), the caller should set this field to the number
+     * of extra opaque surfaces to allocate beyond what is required by the
+     * encoder.
+     *
+     * On return from avcodec_open2(), this field will be set by libavcodec to
+     * the total number of allocated opaque surfaces.
+     */
+    int nb_opaque_surfaces;
+
+    /**
+     * Encoding only, and only if opaque_alloc is set to non-zero. On return
+     * from avcodec_open2(), this field will be used by libavcodec to export the
+     * array of the allocated opaque surfaces to the caller, so they can be
+     * passed to other parts of the pipeline.
+     *
+     * The buffer reference exported here is owned and managed by libavcodec,
+     * the callers should make their own reference with av_buffer_ref() and free
+     * it with av_buffer_unref() when it is no longer needed.
+     *
+     * The buffer data is an nb_opaque_surfaces-sized array of mfxFrameSurface1.
+     */
+    AVBufferRef *opaque_surfaces;
+
+    /**
+     * Encoding only, and only if opaque_alloc is set to non-zero. On return
+     * from avcodec_open2(), this field will be set to the surface type used in
+     * the opaque allocation request.
+     */
+    int opaque_alloc_type;
+} AVQSVContext;
+
+/**
+ * Allocate a new context.
+ *
+ * It must be freed by the caller with av_free().
+ */
+AVQSVContext *av_qsv_alloc_context(void);
+
+#endif /* AVCODEC_QSV_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vaapi.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vaapi.h
new file mode 100644
index 0000000..bb28455
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vaapi.h
@@ -0,0 +1,195 @@
+/*
+ * Video Acceleration API (shared data between FFmpeg and the video player)
+ * HW decode acceleration for MPEG-2, MPEG-4, H.264 and VC-1
+ *
+ * Copyright (C) 2008-2009 Splitted-Desktop Systems
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VAAPI_H
+#define AVCODEC_VAAPI_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_vaapi
+ * Public libavcodec VA API header.
+ */
+
+#include <stdint.h>
+#include "libavutil/attributes.h"
+#include "version.h"
+
+#if FF_API_STRUCT_VAAPI_CONTEXT
+
+/**
+ * @defgroup lavc_codec_hwaccel_vaapi VA API Decoding
+ * @ingroup lavc_codec_hwaccel
+ * @{
+ */
+
+/**
+ * This structure is used to share data between the FFmpeg library and
+ * the client video application.
+ * This shall be zero-allocated and available as
+ * AVCodecContext.hwaccel_context. All user members can be set once
+ * during initialization or through each AVCodecContext.get_buffer()
+ * function call. In any case, they must be valid prior to calling
+ * decoding functions.
+ *
+ * Deprecated: use AVCodecContext.hw_frames_ctx instead.
+ */
+struct attribute_deprecated vaapi_context {
+    /**
+     * Window system dependent data
+     *
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    void *display;
+
+    /**
+     * Configuration ID
+     *
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    uint32_t config_id;
+
+    /**
+     * Context ID (video decode pipeline)
+     *
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    uint32_t context_id;
+
+#if FF_API_VAAPI_CONTEXT
+    /**
+     * VAPictureParameterBuffer ID
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t pic_param_buf_id;
+
+    /**
+     * VAIQMatrixBuffer ID
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t iq_matrix_buf_id;
+
+    /**
+     * VABitPlaneBuffer ID (for VC-1 decoding)
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t bitplane_buf_id;
+
+    /**
+     * Slice parameter/data buffer IDs
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t *slice_buf_ids;
+
+    /**
+     * Number of effective slice buffer IDs to send to the HW
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    unsigned int n_slice_buf_ids;
+
+    /**
+     * Size of pre-allocated slice_buf_ids
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    unsigned int slice_buf_ids_alloc;
+
+    /**
+     * Pointer to VASliceParameterBuffers
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    void *slice_params;
+
+    /**
+     * Size of a VASliceParameterBuffer element
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    unsigned int slice_param_size;
+
+    /**
+     * Size of pre-allocated slice_params
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    unsigned int slice_params_alloc;
+
+    /**
+     * Number of slices currently filled in
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    unsigned int slice_count;
+
+    /**
+     * Pointer to slice data buffer base
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    const uint8_t *slice_data;
+
+    /**
+     * Current size of slice data
+     *
+     * - encoding: unused
+     * - decoding: Set by libavcodec
+     */
+    attribute_deprecated
+    uint32_t slice_data_size;
+#endif
+};
+
+/** @} */
+
+#endif /* FF_API_STRUCT_VAAPI_CONTEXT */
+
+#endif /* AVCODEC_VAAPI_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vda.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vda.h
new file mode 100644
index 0000000..bde14e3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vda.h
@@ -0,0 +1,230 @@
+/*
+ * VDA HW acceleration
+ *
+ * copyright (c) 2011 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDA_H
+#define AVCODEC_VDA_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_vda
+ * Public libavcodec VDA header.
+ */
+
+#include "libavcodec/avcodec.h"
+
+#include <stdint.h>
+
+// emmintrin.h is unable to compile with -std=c99 -Werror=missing-prototypes
+// http://openradar.appspot.com/8026390
+#undef __GNUC_STDC_INLINE__
+
+#define Picture QuickdrawPicture
+#include <VideoDecodeAcceleration/VDADecoder.h>
+#undef Picture
+
+#include "libavcodec/version.h"
+
+// extra flags not defined in VDADecoder.h
+enum {
+    kVDADecodeInfo_Asynchronous = 1UL << 0,
+    kVDADecodeInfo_FrameDropped = 1UL << 1
+};
+
+/**
+ * @defgroup lavc_codec_hwaccel_vda VDA
+ * @ingroup lavc_codec_hwaccel
+ *
+ * @{
+ */
+
+/**
+ * This structure is used to provide the necessary configurations and data
+ * to the VDA FFmpeg HWAccel implementation.
+ *
+ * The application must make it available as AVCodecContext.hwaccel_context.
+ */
+struct vda_context {
+    /**
+     * VDA decoder object.
+     *
+     * - encoding: unused
+     * - decoding: Set/Unset by libavcodec.
+     */
+    VDADecoder          decoder;
+
+    /**
+     * The Core Video pixel buffer that contains the current image data.
+     *
+     * encoding: unused
+     * decoding: Set by libavcodec. Unset by user.
+     */
+    CVPixelBufferRef    cv_buffer;
+
+    /**
+     * Use the hardware decoder in synchronous mode.
+     *
+     * encoding: unused
+     * decoding: Set by user.
+     */
+    int                 use_sync_decoding;
+
+    /**
+     * The frame width.
+     *
+     * - encoding: unused
+     * - decoding: Set/Unset by user.
+     */
+    int                 width;
+
+    /**
+     * The frame height.
+     *
+     * - encoding: unused
+     * - decoding: Set/Unset by user.
+     */
+    int                 height;
+
+    /**
+     * The frame format.
+     *
+     * - encoding: unused
+     * - decoding: Set/Unset by user.
+     */
+    int                 format;
+
+    /**
+     * The pixel format for output image buffers.
+     *
+     * - encoding: unused
+     * - decoding: Set/Unset by user.
+     */
+    OSType              cv_pix_fmt_type;
+
+    /**
+     * unused
+     */
+    uint8_t             *priv_bitstream;
+
+    /**
+     * unused
+     */
+    int                 priv_bitstream_size;
+
+    /**
+     * unused
+     */
+    int                 priv_allocated_size;
+
+    /**
+     * Use av_buffer to manage buffer.
+     * When the flag is set, the CVPixelBuffers returned by the decoder will
+     * be released automatically, so you have to retain them if necessary.
+     * Not setting this flag may cause memory leak.
+     *
+     * encoding: unused
+     * decoding: Set by user.
+     */
+    int                 use_ref_buffer;
+};
+
+/** Create the video decoder. */
+int ff_vda_create_decoder(struct vda_context *vda_ctx,
+                          uint8_t *extradata,
+                          int extradata_size);
+
+/** Destroy the video decoder. */
+int ff_vda_destroy_decoder(struct vda_context *vda_ctx);
+
+/**
+ * This struct holds all the information that needs to be passed
+ * between the caller and libavcodec for initializing VDA decoding.
+ * Its size is not a part of the public ABI, it must be allocated with
+ * av_vda_alloc_context() and freed with av_free().
+ */
+typedef struct AVVDAContext {
+    /**
+     * VDA decoder object. Created and freed by the caller.
+     */
+    VDADecoder decoder;
+
+    /**
+     * The output callback that must be passed to VDADecoderCreate.
+     * Set by av_vda_alloc_context().
+     */
+    VDADecoderOutputCallback output_callback;
+
+    /**
+     * CVPixelBuffer Format Type that VDA will use for decoded frames; set by
+     * the caller.
+     */
+    OSType cv_pix_fmt_type;
+} AVVDAContext;
+
+/**
+ * Allocate and initialize a VDA context.
+ *
+ * This function should be called from the get_format() callback when the caller
+ * selects the AV_PIX_FMT_VDA format. The caller must then create the decoder
+ * object (using the output callback provided by libavcodec) that will be used
+ * for VDA-accelerated decoding.
+ *
+ * When decoding with VDA is finished, the caller must destroy the decoder
+ * object and free the VDA context using av_free().
+ *
+ * @return the newly allocated context or NULL on failure
+ */
+AVVDAContext *av_vda_alloc_context(void);
+
+/**
+ * This is a convenience function that creates and sets up the VDA context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_vda_default_init(AVCodecContext *avctx);
+
+/**
+ * This is a convenience function that creates and sets up the VDA context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ * @param vdactx the VDA context to use
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_vda_default_init2(AVCodecContext *avctx, AVVDAContext *vdactx);
+
+/**
+ * This function must be called to free the VDA context initialized with
+ * av_vda_default_init().
+ *
+ * @param avctx the corresponding codec context
+ */
+void av_vda_default_free(AVCodecContext *avctx);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_VDA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vdpau.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vdpau.h
new file mode 100644
index 0000000..e85e4d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vdpau.h
@@ -0,0 +1,253 @@
+/*
+ * The Video Decode and Presentation API for UNIX (VDPAU) is used for
+ * hardware-accelerated decoding of MPEG-1/2, H.264 and VC-1.
+ *
+ * Copyright (C) 2008 NVIDIA
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VDPAU_H
+#define AVCODEC_VDPAU_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_vdpau
+ * Public libavcodec VDPAU header.
+ */
+
+
+/**
+ * @defgroup lavc_codec_hwaccel_vdpau VDPAU Decoder and Renderer
+ * @ingroup lavc_codec_hwaccel
+ *
+ * VDPAU hardware acceleration has two modules
+ * - VDPAU decoding
+ * - VDPAU presentation
+ *
+ * The VDPAU decoding module parses all headers using FFmpeg
+ * parsing mechanisms and uses VDPAU for the actual decoding.
+ *
+ * As per the current implementation, the actual decoding
+ * and rendering (API calls) are done as part of the VDPAU
+ * presentation (vo_vdpau.c) module.
+ *
+ * @{
+ */
+
+#include <vdpau/vdpau.h>
+#include <vdpau/vdpau_x11.h>
+#include "libavutil/avconfig.h"
+#include "libavutil/attributes.h"
+
+#include "avcodec.h"
+#include "version.h"
+
+#if FF_API_BUFS_VDPAU
+union AVVDPAUPictureInfo {
+    VdpPictureInfoH264        h264;
+    VdpPictureInfoMPEG1Or2    mpeg;
+    VdpPictureInfoVC1          vc1;
+    VdpPictureInfoMPEG4Part2 mpeg4;
+};
+#endif
+
+struct AVCodecContext;
+struct AVFrame;
+
+typedef int (*AVVDPAU_Render2)(struct AVCodecContext *, struct AVFrame *,
+                               const VdpPictureInfo *, uint32_t,
+                               const VdpBitstreamBuffer *);
+
+/**
+ * This structure is used to share data between the libavcodec library and
+ * the client video application.
+ * The user shall allocate the structure via the av_vdpau_alloc_context()
+ * function and make it available as
+ * AVCodecContext.hwaccel_context. Members can be set by the user once
+ * during initialization or through each AVCodecContext.get_buffer()
+ * function call. In any case, they must be valid prior to calling
+ * decoding functions.
+ *
+ * The size of this structure is not a part of the public ABI and must not
+ * be used outside of libavcodec. Use av_vdpau_alloc_context() to allocate an
+ * AVVDPAUContext.
+ */
+typedef struct AVVDPAUContext {
+    /**
+     * VDPAU decoder handle
+     *
+     * Set by user.
+     */
+    VdpDecoder decoder;
+
+    /**
+     * VDPAU decoder render callback
+     *
+     * Set by the user.
+     */
+    VdpDecoderRender *render;
+
+#if FF_API_BUFS_VDPAU
+    /**
+     * VDPAU picture information
+     *
+     * Set by libavcodec.
+     */
+    attribute_deprecated
+    union AVVDPAUPictureInfo info;
+
+    /**
+     * Allocated size of the bitstream_buffers table.
+     *
+     * Set by libavcodec.
+     */
+    attribute_deprecated
+    int bitstream_buffers_allocated;
+
+    /**
+     * Useful bitstream buffers in the bitstream buffers table.
+     *
+     * Set by libavcodec.
+     */
+    attribute_deprecated
+    int bitstream_buffers_used;
+
+   /**
+     * Table of bitstream buffers.
+     * The user is responsible for freeing this buffer using av_freep().
+     *
+     * Set by libavcodec.
+     */
+    attribute_deprecated
+    VdpBitstreamBuffer *bitstream_buffers;
+#endif
+    AVVDPAU_Render2 render2;
+} AVVDPAUContext;
+
+/**
+ * @brief allocation function for AVVDPAUContext
+ *
+ * Allows extending the struct without breaking API/ABI
+ */
+AVVDPAUContext *av_alloc_vdpaucontext(void);
+
+AVVDPAU_Render2 av_vdpau_hwaccel_get_render2(const AVVDPAUContext *);
+void av_vdpau_hwaccel_set_render2(AVVDPAUContext *, AVVDPAU_Render2);
+
+/**
+ * Associate a VDPAU device with a codec context for hardware acceleration.
+ * This function is meant to be called from the get_format() codec callback,
+ * or earlier. It can also be called after avcodec_flush_buffers() to change
+ * the underlying VDPAU device mid-stream (e.g. to recover from non-transparent
+ * display preemption).
+ *
+ * @note get_format() must return AV_PIX_FMT_VDPAU if this function completes
+ * successfully.
+ *
+ * @param avctx decoding context whose get_format() callback is invoked
+ * @param device VDPAU device handle to use for hardware acceleration
+ * @param get_proc_address VDPAU device driver
+ * @param flags zero or more OR'd AV_HWACCEL_FLAG_* flags
+ *
+ * @return 0 on success, an AVERROR code on failure.
+ */
+int av_vdpau_bind_context(AVCodecContext *avctx, VdpDevice device,
+                          VdpGetProcAddress *get_proc_address, unsigned flags);
+
+/**
+ * Gets the parameters to create an adequate VDPAU video surface for the codec
+ * context using VDPAU hardware decoding acceleration.
+ *
+ * @note Behavior is undefined if the context was not successfully bound to a
+ * VDPAU device using av_vdpau_bind_context().
+ *
+ * @param avctx the codec context being used for decoding the stream
+ * @param type storage space for the VDPAU video surface chroma type
+ *              (or NULL to ignore)
+ * @param width storage space for the VDPAU video surface pixel width
+ *              (or NULL to ignore)
+ * @param height storage space for the VDPAU video surface pixel height
+ *              (or NULL to ignore)
+ *
+ * @return 0 on success, a negative AVERROR code on failure.
+ */
+int av_vdpau_get_surface_parameters(AVCodecContext *avctx, VdpChromaType *type,
+                                    uint32_t *width, uint32_t *height);
+
+/**
+ * Allocate an AVVDPAUContext.
+ *
+ * @return Newly-allocated AVVDPAUContext or NULL on failure.
+ */
+AVVDPAUContext *av_vdpau_alloc_context(void);
+
+#if FF_API_VDPAU_PROFILE
+/**
+ * Get a decoder profile that should be used for initializing a VDPAU decoder.
+ * Should be called from the AVCodecContext.get_format() callback.
+ *
+ * @deprecated Use av_vdpau_bind_context() instead.
+ *
+ * @param avctx the codec context being used for decoding the stream
+ * @param profile a pointer into which the result will be written on success.
+ *                The contents of profile are undefined if this function returns
+ *                an error.
+ *
+ * @return 0 on success (non-negative), a negative AVERROR on failure.
+ */
+attribute_deprecated
+int av_vdpau_get_profile(AVCodecContext *avctx, VdpDecoderProfile *profile);
+#endif
+
+#if FF_API_CAP_VDPAU
+/** @brief The videoSurface is used for rendering. */
+#define FF_VDPAU_STATE_USED_FOR_RENDER 1
+
+/**
+ * @brief The videoSurface is needed for reference/prediction.
+ * The codec manipulates this.
+ */
+#define FF_VDPAU_STATE_USED_FOR_REFERENCE 2
+
+/**
+ * @brief This structure is used as a callback between the FFmpeg
+ * decoder (vd_) and presentation (vo_) module.
+ * This is used for defining a video frame containing surface,
+ * picture parameter, bitstream information etc which are passed
+ * between the FFmpeg decoder and its clients.
+ */
+struct vdpau_render_state {
+    VdpVideoSurface surface; ///< Used as rendered surface, never changed.
+
+    int state; ///< Holds FF_VDPAU_STATE_* values.
+
+    /** picture parameter information for all supported codecs */
+    union AVVDPAUPictureInfo info;
+
+    /** Describe size/location of the compressed video data.
+        Set to 0 when freeing bitstream_buffers. */
+    int bitstream_buffers_allocated;
+    int bitstream_buffers_used;
+    /** The user is responsible for freeing this buffer using av_freep(). */
+    VdpBitstreamBuffer *bitstream_buffers;
+};
+#endif
+
+/** @} */
+
+#endif /* AVCODEC_VDPAU_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/version.h
new file mode 100644
index 0000000..51df9e0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/version.h
@@ -0,0 +1,243 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VERSION_H
+#define AVCODEC_VERSION_H
+
+/**
+ * @file
+ * @ingroup libavc
+ * Libavcodec version macros.
+ */
+
+#include "libavutil/version.h"
+
+#define LIBAVCODEC_VERSION_MAJOR  57
+#define LIBAVCODEC_VERSION_MINOR  89
+#define LIBAVCODEC_VERSION_MICRO 100
+
+#define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
+                                               LIBAVCODEC_VERSION_MINOR, \
+                                               LIBAVCODEC_VERSION_MICRO)
+#define LIBAVCODEC_VERSION      AV_VERSION(LIBAVCODEC_VERSION_MAJOR,    \
+                                           LIBAVCODEC_VERSION_MINOR,    \
+                                           LIBAVCODEC_VERSION_MICRO)
+#define LIBAVCODEC_BUILD        LIBAVCODEC_VERSION_INT
+
+#define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ *
+ * @note When bumping the major version, it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
+ */
+
+#ifndef FF_API_VIMA_DECODER
+#define FF_API_VIMA_DECODER     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AUDIO_CONVERT
+#define FF_API_AUDIO_CONVERT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AVCODEC_RESAMPLE
+#define FF_API_AVCODEC_RESAMPLE  FF_API_AUDIO_CONVERT
+#endif
+#ifndef FF_API_GETCHROMA
+#define FF_API_GETCHROMA         (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_MISSING_SAMPLE
+#define FF_API_MISSING_SAMPLE    (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LOWRES
+#define FF_API_LOWRES            (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_CAP_VDPAU
+#define FF_API_CAP_VDPAU         (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_BUFS_VDPAU
+#define FF_API_BUFS_VDPAU        (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_VOXWARE
+#define FF_API_VOXWARE           (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_SET_DIMENSIONS
+#define FF_API_SET_DIMENSIONS    (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_DEBUG_MV
+#define FF_API_DEBUG_MV          (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AC_VLC
+#define FF_API_AC_VLC            (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_OLD_MSMPEG4
+#define FF_API_OLD_MSMPEG4       (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_ASPECT_EXTENDED
+#define FF_API_ASPECT_EXTENDED   (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_ARCH_ALPHA
+#define FF_API_ARCH_ALPHA        (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_XVMC
+#define FF_API_XVMC              (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_ERROR_RATE
+#define FF_API_ERROR_RATE        (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_QSCALE_TYPE
+#define FF_API_QSCALE_TYPE       (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_MB_TYPE
+#define FF_API_MB_TYPE           (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_MAX_BFRAMES
+#define FF_API_MAX_BFRAMES       (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_NEG_LINESIZES
+#define FF_API_NEG_LINESIZES     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_EMU_EDGE
+#define FF_API_EMU_EDGE          (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_ARCH_SH4
+#define FF_API_ARCH_SH4          (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_ARCH_SPARC
+#define FF_API_ARCH_SPARC        (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_UNUSED_MEMBERS
+#define FF_API_UNUSED_MEMBERS    (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_IDCT_XVIDMMX
+#define FF_API_IDCT_XVIDMMX      (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_INPUT_PRESERVED
+#define FF_API_INPUT_PRESERVED   (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_NORMALIZE_AQP
+#define FF_API_NORMALIZE_AQP     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_GMC
+#define FF_API_GMC               (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_MV0
+#define FF_API_MV0               (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_CODEC_NAME
+#define FF_API_CODEC_NAME        (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AFD
+#define FF_API_AFD               (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_VISMV
+/* XXX: don't forget to drop the -vismv documentation */
+#define FF_API_VISMV             (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AUDIOENC_DELAY
+#define FF_API_AUDIOENC_DELAY    (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_VAAPI_CONTEXT
+#define FF_API_VAAPI_CONTEXT     (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_MERGE_SD
+#define FF_API_MERGE_SD          (LIBAVCODEC_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_AVCTX_TIMEBASE
+#define FF_API_AVCTX_TIMEBASE    (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_MPV_OPT
+#define FF_API_MPV_OPT           (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_STREAM_CODEC_TAG
+#define FF_API_STREAM_CODEC_TAG  (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_QUANT_BIAS
+#define FF_API_QUANT_BIAS        (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_RC_STRATEGY
+#define FF_API_RC_STRATEGY       (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_CODED_FRAME
+#define FF_API_CODED_FRAME       (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_MOTION_EST
+#define FF_API_MOTION_EST        (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_WITHOUT_PREFIX
+#define FF_API_WITHOUT_PREFIX    (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_SIDEDATA_ONLY_PKT
+#define FF_API_SIDEDATA_ONLY_PKT (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_VDPAU_PROFILE
+#define FF_API_VDPAU_PROFILE     (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_CONVERGENCE_DURATION
+#define FF_API_CONVERGENCE_DURATION (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_AVPICTURE
+#define FF_API_AVPICTURE         (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_AVPACKET_OLD_API
+#define FF_API_AVPACKET_OLD_API (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_RTP_CALLBACK
+#define FF_API_RTP_CALLBACK      (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_VBV_DELAY
+#define FF_API_VBV_DELAY         (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_CODER_TYPE
+#define FF_API_CODER_TYPE        (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_STAT_BITS
+#define FF_API_STAT_BITS         (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_PRIVATE_OPT
+#define FF_API_PRIVATE_OPT      (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_ASS_TIMING
+#define FF_API_ASS_TIMING       (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_OLD_BSF
+#define FF_API_OLD_BSF          (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_COPY_CONTEXT
+#define FF_API_COPY_CONTEXT     (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_GET_CONTEXT_DEFAULTS
+#define FF_API_GET_CONTEXT_DEFAULTS (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_NVENC_OLD_NAME
+#define FF_API_NVENC_OLD_NAME    (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_STRUCT_VAAPI_CONTEXT
+#define FF_API_STRUCT_VAAPI_CONTEXT (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_MERGE_SD_API
+#define FF_API_MERGE_SD_API      (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_TAG_STRING
+#define FF_API_TAG_STRING        (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+
+
+#endif /* AVCODEC_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/videotoolbox.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/videotoolbox.h
new file mode 100644
index 0000000..af2db0d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/videotoolbox.h
@@ -0,0 +1,127 @@
+/*
+ * Videotoolbox hardware acceleration
+ *
+ * copyright (c) 2012 Sebastien Zwickert
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VIDEOTOOLBOX_H
+#define AVCODEC_VIDEOTOOLBOX_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_videotoolbox
+ * Public libavcodec Videotoolbox header.
+ */
+
+#include <stdint.h>
+
+#define Picture QuickdrawPicture
+#include <VideoToolbox/VideoToolbox.h>
+#undef Picture
+
+#include "libavcodec/avcodec.h"
+
+/**
+ * This struct holds all the information that needs to be passed
+ * between the caller and libavcodec for initializing Videotoolbox decoding.
+ * Its size is not a part of the public ABI, it must be allocated with
+ * av_videotoolbox_alloc_context() and freed with av_free().
+ */
+typedef struct AVVideotoolboxContext {
+    /**
+     * Videotoolbox decompression session object.
+     * Created and freed by the caller.
+     */
+    VTDecompressionSessionRef session;
+
+    /**
+     * The output callback that must be passed to the session.
+     * Set by av_videotoolbox_default_init().
+     */
+    VTDecompressionOutputCallback output_callback;
+
+    /**
+     * CVPixelBuffer Format Type that Videotoolbox will use for decoded frames.
+     * Set by the caller. If this is set to 0, then no specific format is
+     * requested from the decoder, and its native format is output.
+     */
+    OSType cv_pix_fmt_type;
+
+    /**
+     * CoreMedia Format Description that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    CMVideoFormatDescriptionRef cm_fmt_desc;
+
+    /**
+     * CoreMedia codec type that Videotoolbox will use to create the decompression session.
+     * Set by the caller.
+     */
+    int cm_codec_type;
+} AVVideotoolboxContext;
+
+/**
+ * Allocate and initialize a Videotoolbox context.
+ *
+ * This function should be called from the get_format() callback when the caller
+ * selects the AV_PIX_FMT_VIDEOTOOLBOX format. The caller must then create
+ * the decoder object (using the output callback provided by libavcodec) that
+ * will be used for Videotoolbox-accelerated decoding.
+ *
+ * When decoding with Videotoolbox is finished, the caller must destroy the decoder
+ * object and free the Videotoolbox context using av_free().
+ *
+ * @return the newly allocated context or NULL on failure
+ */
+AVVideotoolboxContext *av_videotoolbox_alloc_context(void);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init(AVCodecContext *avctx);
+
+/**
+ * This is a convenience function that creates and sets up the Videotoolbox context using
+ * an internal implementation.
+ *
+ * @param avctx the corresponding codec context
+ * @param vtctx the Videotoolbox context to use
+ *
+ * @return >= 0 on success, a negative AVERROR code on failure
+ */
+int av_videotoolbox_default_init2(AVCodecContext *avctx, AVVideotoolboxContext *vtctx);
+
+/**
+ * This function must be called to free the Videotoolbox context initialized with
+ * av_videotoolbox_default_init().
+ *
+ * @param avctx the corresponding codec context
+ */
+void av_videotoolbox_default_free(AVCodecContext *avctx);
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_VIDEOTOOLBOX_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vorbis_parser.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vorbis_parser.h
new file mode 100644
index 0000000..9205027
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/vorbis_parser.h
@@ -0,0 +1,77 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * A public API for Vorbis parsing
+ *
+ * Determines the duration for each packet.
+ */
+
+#ifndef AVCODEC_VORBIS_PARSER_H
+#define AVCODEC_VORBIS_PARSER_H
+
+#include <stdint.h>
+
+typedef struct AVVorbisParseContext AVVorbisParseContext;
+
+/**
+ * Allocate and initialize the Vorbis parser using headers in the extradata.
+ *
+ * @param extradata      buffer containing the Vorbis headers
+ * @param extradata_size size of the extradata buffer
+ */
+AVVorbisParseContext *av_vorbis_parse_init(const uint8_t *extradata,
+                                           int extradata_size);
+
+/**
+ * Free the parser and everything associated with it.
+ */
+void av_vorbis_parse_free(AVVorbisParseContext **s);
+
+#define VORBIS_FLAG_HEADER  0x00000001
+#define VORBIS_FLAG_COMMENT 0x00000002
+#define VORBIS_FLAG_SETUP   0x00000004
+
+/**
+ * Get the duration for a Vorbis packet.
+ *
+ * If @p flags is @c NULL,
+ * special frames are considered invalid.
+ *
+ * @param s        Vorbis parser context
+ * @param buf      buffer containing a Vorbis frame
+ * @param buf_size size of the buffer
+ * @param flags    flags for special frames
+ */
+int av_vorbis_parse_frame_flags(AVVorbisParseContext *s, const uint8_t *buf,
+                                int buf_size, int *flags);
+
+/**
+ * Get the duration for a Vorbis packet.
+ *
+ * @param s        Vorbis parser context
+ * @param buf      buffer containing a Vorbis frame
+ * @param buf_size size of the buffer
+ */
+int av_vorbis_parse_frame(AVVorbisParseContext *s, const uint8_t *buf,
+                          int buf_size);
+
+void av_vorbis_parse_reset(AVVorbisParseContext *s);
+
+#endif /* AVCODEC_VORBIS_PARSER_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/xvmc.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/xvmc.h
new file mode 100644
index 0000000..465ee78
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavcodec/xvmc.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2003 Ivan Kalvachev
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_XVMC_H
+#define AVCODEC_XVMC_H
+
+/**
+ * @file
+ * @ingroup lavc_codec_hwaccel_xvmc
+ * Public libavcodec XvMC header.
+ */
+
+#include <X11/extensions/XvMC.h>
+
+#include "libavutil/attributes.h"
+#include "version.h"
+#include "avcodec.h"
+
+/**
+ * @defgroup lavc_codec_hwaccel_xvmc XvMC
+ * @ingroup lavc_codec_hwaccel
+ *
+ * @{
+ */
+
+#define AV_XVMC_ID                    0x1DC711C0  /**< special value to ensure that regular pixel routines haven't corrupted the struct
+                                                       the number is 1337 speak for the letters IDCT MCo (motion compensation) */
+
+struct attribute_deprecated xvmc_pix_fmt {
+    /** The field contains the special constant value AV_XVMC_ID.
+        It is used as a test that the application correctly uses the API,
+        and that there is no corruption caused by pixel routines.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             xvmc_id;
+
+    /** Pointer to the block array allocated by XvMCCreateBlocks().
+        The array has to be freed by XvMCDestroyBlocks().
+        Each group of 64 values represents one data block of differential
+        pixel information (in MoCo mode) or coefficients for IDCT.
+        - application - set the pointer during initialization
+        - libavcodec  - fills coefficients/pixel data into the array
+    */
+    short*          data_blocks;
+
+    /** Pointer to the macroblock description array allocated by
+        XvMCCreateMacroBlocks() and freed by XvMCDestroyMacroBlocks().
+        - application - set the pointer during initialization
+        - libavcodec  - fills description data into the array
+    */
+    XvMCMacroBlock* mv_blocks;
+
+    /** Number of macroblock descriptions that can be stored in the mv_blocks
+        array.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             allocated_mv_blocks;
+
+    /** Number of blocks that can be stored at once in the data_blocks array.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             allocated_data_blocks;
+
+    /** Indicate that the hardware would interpret data_blocks as IDCT
+        coefficients and perform IDCT on them.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             idct;
+
+    /** In MoCo mode it indicates that intra macroblocks are assumed to be in
+        unsigned format; same as the XVMC_INTRA_UNSIGNED flag.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    int             unsigned_intra;
+
+    /** Pointer to the surface allocated by XvMCCreateSurface().
+        It has to be freed by XvMCDestroySurface() on application exit.
+        It identifies the frame and its state on the video hardware.
+        - application - set during initialization
+        - libavcodec  - unchanged
+    */
+    XvMCSurface*    p_surface;
+
+/** Set by the decoder before calling ff_draw_horiz_band(),
+    needed by the XvMCRenderSurface function. */
+//@{
+    /** Pointer to the surface used as past reference
+        - application - unchanged
+        - libavcodec  - set
+    */
+    XvMCSurface*    p_past_surface;
+
+    /** Pointer to the surface used as future reference
+        - application - unchanged
+        - libavcodec  - set
+    */
+    XvMCSurface*    p_future_surface;
+
+    /** top/bottom field or frame
+        - application - unchanged
+        - libavcodec  - set
+    */
+    unsigned int    picture_structure;
+
+    /** XVMC_SECOND_FIELD - 1st or 2nd field in the sequence
+        - application - unchanged
+        - libavcodec  - set
+    */
+    unsigned int    flags;
+//@}
+
+    /** Number of macroblock descriptions in the mv_blocks array
+        that have already been passed to the hardware.
+        - application - zeroes it on get_buffer().
+                        A successful ff_draw_horiz_band() may increment it
+                        with filled_mv_blocks_num or zero both.
+        - libavcodec  - unchanged
+    */
+    int             start_mv_blocks_num;
+
+    /** Number of new macroblock descriptions in the mv_blocks array (after
+        start_mv_blocks_num) that are filled by libavcodec and have to be
+        passed to the hardware.
+        - application - zeroes it on get_buffer() or after successful
+                        ff_draw_horiz_band().
+        - libavcodec  - incremented by one for each stored MB
+    */
+    int             filled_mv_blocks_num;
+
+    /** Number of the next free data block; one data block consists of
+        64 short values in the data_blocks array.
+        All blocks before this one have already been claimed by placing their
+        position into the corresponding block description structure field,
+        that are part of the mv_blocks array.
+        - application - zeroes it on get_buffer().
+                        A successful ff_draw_horiz_band() may zero it together
+                        with start_mv_blocks_num.
+        - libavcodec  - each decoded macroblock increases it by the number
+                        of coded blocks it contains.
+    */
+    int             next_free_data_block_num;
+};
+
+/**
+ * @}
+ */
+
+#endif /* AVCODEC_XVMC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/avdevice.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/avdevice.h
new file mode 100644
index 0000000..ee94624
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/avdevice.h
@@ -0,0 +1,514 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVDEVICE_AVDEVICE_H
+#define AVDEVICE_AVDEVICE_H
+
+#include "version.h"
+
+/**
+ * @file
+ * @ingroup lavd
+ * Main libavdevice API header
+ */
+
+/**
+ * @defgroup lavd libavdevice
+ * Special devices muxing/demuxing library.
+ *
+ * Libavdevice is a complementary library to @ref libavf "libavformat". It
+ * provides various "special" platform-specific muxers and demuxers, e.g. for
+ * grabbing devices, audio capture and playback etc. As a consequence, the
+ * (de)muxers in libavdevice are of the AVFMT_NOFILE type (they use their own
+ * I/O functions). The filename passed to avformat_open_input() often does not
+ * refer to an actually existing file, but has some special device-specific
+ * meaning - e.g. for xcbgrab it is the display name.
+ *
+ * To use libavdevice, simply call avdevice_register_all() to register all
+ * compiled muxers and demuxers. They all use standard libavformat API.
+ *
+ * @{
+ */
+
+#include "libavutil/log.h"
+#include "libavutil/opt.h"
+#include "libavutil/dict.h"
+#include "libavformat/avformat.h"
+
+/**
+ * Return the LIBAVDEVICE_VERSION_INT constant.
+ */
+unsigned avdevice_version(void);
+
+/**
+ * Return the libavdevice build-time configuration.
+ */
+const char *avdevice_configuration(void);
+
+/**
+ * Return the libavdevice license.
+ */
+const char *avdevice_license(void);
+
+/**
+ * Initialize libavdevice and register all the input and output devices.
+ */
+void avdevice_register_all(void);
+
+/**
+ * Audio input devices iterator.
+ *
+ * If d is NULL, returns the first registered input audio/video device,
+ * if d is non-NULL, returns the next registered input audio/video device after d
+ * or NULL if d is the last one.
+ */
+AVInputFormat *av_input_audio_device_next(AVInputFormat  *d);
+
+/**
+ * Video input devices iterator.
+ *
+ * If d is NULL, returns the first registered input audio/video device,
+ * if d is non-NULL, returns the next registered input audio/video device after d
+ * or NULL if d is the last one.
+ */
+AVInputFormat *av_input_video_device_next(AVInputFormat  *d);
+
+/**
+ * Audio output devices iterator.
+ *
+ * If d is NULL, returns the first registered output audio/video device,
+ * if d is non-NULL, returns the next registered output audio/video device after d
+ * or NULL if d is the last one.
+ */
+AVOutputFormat *av_output_audio_device_next(AVOutputFormat *d);
+
+/**
+ * Video output devices iterator.
+ *
+ * If d is NULL, returns the first registered output audio/video device,
+ * if d is non-NULL, returns the next registered output audio/video device after d
+ * or NULL if d is the last one.
+ */
+AVOutputFormat *av_output_video_device_next(AVOutputFormat *d);
+
+typedef struct AVDeviceRect {
+    int x;      /**< x coordinate of top left corner */
+    int y;      /**< y coordinate of top left corner */
+    int width;  /**< width */
+    int height; /**< height */
+} AVDeviceRect;
+
+/**
+ * Message types used by avdevice_app_to_dev_control_message().
+ */
+enum AVAppToDevMessageType {
+    /**
+     * Dummy message.
+     */
+    AV_APP_TO_DEV_NONE = MKBETAG('N','O','N','E'),
+
+    /**
+     * Window size change message.
+     *
+     * Message is sent to the device every time the application changes the size
+     * of the window the device renders to.
+     * Message should also be sent right after the window is created.
+     *
+     * data: AVDeviceRect: new window size.
+     */
+    AV_APP_TO_DEV_WINDOW_SIZE = MKBETAG('G','E','O','M'),
+
+    /**
+     * Repaint request message.
+     *
+     * Message is sent to the device when the window has to be repainted.
+     *
+     * data: AVDeviceRect: area required to be repainted.
+     *       NULL: whole area is required to be repainted.
+     */
+    AV_APP_TO_DEV_WINDOW_REPAINT = MKBETAG('R','E','P','A'),
+
+    /**
+     * Request pause/play.
+     *
+     * Application requests pause/unpause playback.
+     * Mostly usable with devices that have an internal buffer.
+     * By default devices are not paused.
+     *
+     * data: NULL
+     */
+    AV_APP_TO_DEV_PAUSE        = MKBETAG('P', 'A', 'U', ' '),
+    AV_APP_TO_DEV_PLAY         = MKBETAG('P', 'L', 'A', 'Y'),
+    AV_APP_TO_DEV_TOGGLE_PAUSE = MKBETAG('P', 'A', 'U', 'T'),
+
+    /**
+     * Volume control message.
+     *
+     * Set volume level. It may be device-dependent if volume
+     * is changed per stream or system wide. Per stream volume
+     * change is expected when possible.
+     *
+     * data: double: new volume with range of 0.0 - 1.0.
+     */
+    AV_APP_TO_DEV_SET_VOLUME = MKBETAG('S', 'V', 'O', 'L'),
+
+    /**
+     * Mute control messages.
+     *
+     * Change mute state. It may be device-dependent if mute status
+     * is changed per stream or system wide. Per stream mute status
+     * change is expected when possible.
+     *
+     * data: NULL.
+     */
+    AV_APP_TO_DEV_MUTE        = MKBETAG(' ', 'M', 'U', 'T'),
+    AV_APP_TO_DEV_UNMUTE      = MKBETAG('U', 'M', 'U', 'T'),
+    AV_APP_TO_DEV_TOGGLE_MUTE = MKBETAG('T', 'M', 'U', 'T'),
+
+    /**
+     * Get volume/mute messages.
+     *
+     * Force the device to send AV_DEV_TO_APP_VOLUME_LEVEL_CHANGED or
+     * AV_DEV_TO_APP_MUTE_STATE_CHANGED command respectively.
+     *
+     * data: NULL.
+     */
+    AV_APP_TO_DEV_GET_VOLUME = MKBETAG('G', 'V', 'O', 'L'),
+    AV_APP_TO_DEV_GET_MUTE   = MKBETAG('G', 'M', 'U', 'T'),
+};
+
+/**
+ * Message types used by avdevice_dev_to_app_control_message().
+ */
+enum AVDevToAppMessageType {
+    /**
+     * Dummy message.
+     */
+    AV_DEV_TO_APP_NONE = MKBETAG('N','O','N','E'),
+
+    /**
+     * Create window buffer message.
+     *
+     * Device requests to create a window buffer. Exact meaning is device-
+     * and application-dependent. Message is sent before rendering first
+     * frame and all one-shot initializations should be done here.
+     * Application is allowed to ignore preferred window buffer size.
+     *
+     * @note Application is obliged to report the window buffer size
+     *       with an AV_APP_TO_DEV_WINDOW_SIZE message.
+     *
+     * data: AVDeviceRect: preferred size of the window buffer.
+     *       NULL: no preferred size of the window buffer.
+     */
+    AV_DEV_TO_APP_CREATE_WINDOW_BUFFER = MKBETAG('B','C','R','E'),
+
+    /**
+     * Prepare window buffer message.
+     *
+     * Device requests to prepare a window buffer for rendering.
+     * Exact meaning is device- and application-dependent.
+     * Message is sent before rendering of each frame.
+     *
+     * data: NULL.
+     */
+    AV_DEV_TO_APP_PREPARE_WINDOW_BUFFER = MKBETAG('B','P','R','E'),
+
+    /**
+     * Display window buffer message.
+     *
+     * Device requests to display a window buffer.
+     * Message is sent when new frame is ready to be displayed.
+     * Usually buffers need to be swapped in handler of this message.
+     *
+     * data: NULL.
+     */
+    AV_DEV_TO_APP_DISPLAY_WINDOW_BUFFER = MKBETAG('B','D','I','S'),
+
+    /**
+     * Destroy window buffer message.
+     *
+     * Device requests to destroy a window buffer.
+     * Message is sent when device is about to be destroyed and window
+     * buffer is not required anymore.
+     *
+     * data: NULL.
+     */
+    AV_DEV_TO_APP_DESTROY_WINDOW_BUFFER = MKBETAG('B','D','E','S'),
+
+    /**
+     * Buffer fullness status messages.
+     *
+     * Device signals buffer overflow/underflow.
+     *
+     * data: NULL.
+     */
+    AV_DEV_TO_APP_BUFFER_OVERFLOW = MKBETAG('B','O','F','L'),
+    AV_DEV_TO_APP_BUFFER_UNDERFLOW = MKBETAG('B','U','F','L'),
+
+    /**
+     * Buffer readable/writable.
+     *
+     * Device informs that buffer is readable/writable.
+     * When possible, device informs how many bytes can be read/written.
+     *
+     * @warning Device may not inform when the number of bytes that can be read/written changes.
+     *
+     * data: int64_t: amount of bytes available to read/write.
+     *       NULL: amount of bytes available to read/write is not known.
+     */
+    AV_DEV_TO_APP_BUFFER_READABLE = MKBETAG('B','R','D',' '),
+    AV_DEV_TO_APP_BUFFER_WRITABLE = MKBETAG('B','W','R',' '),
+
+    /**
+     * Mute state change message.
+     *
+     * Device informs that mute state has changed.
+     *
+     * data: int: 0 for not muted state, non-zero for muted state.
+     */
+    AV_DEV_TO_APP_MUTE_STATE_CHANGED = MKBETAG('C','M','U','T'),
+
+    /**
+     * Volume level change message.
+     *
+     * Device informs that volume level has changed.
+     *
+     * data: double: new volume with range of 0.0 - 1.0.
+     */
+    AV_DEV_TO_APP_VOLUME_LEVEL_CHANGED = MKBETAG('C','V','O','L'),
+};
+
+/**
+ * Send control message from application to device.
+ *
+ * @param s         device context.
+ * @param type      message type.
+ * @param data      message data. Exact type depends on message type.
+ * @param data_size size of message data.
+ * @return >= 0 on success, negative on error.
+ *         AVERROR(ENOSYS) when device doesn't implement handler of the message.
+ */
+int avdevice_app_to_dev_control_message(struct AVFormatContext *s,
+                                        enum AVAppToDevMessageType type,
+                                        void *data, size_t data_size);
+
+/**
+ * Send control message from device to application.
+ *
+ * @param s         device context.
+ * @param type      message type.
+ * @param data      message data. Can be NULL.
+ * @param data_size size of message data.
+ * @return >= 0 on success, negative on error.
+ *         AVERROR(ENOSYS) when application doesn't implement handler of the message.
+ */
+int avdevice_dev_to_app_control_message(struct AVFormatContext *s,
+                                        enum AVDevToAppMessageType type,
+                                        void *data, size_t data_size);
+
+/**
+ * Following API allows user to probe device capabilities (supported codecs,
+ * pixel formats, sample formats, resolutions, channel counts, etc).
+ * It is built on top of the AVOption API.
+ * Queried capabilities make it possible to set up converters of video or audio
+ * parameters that fit to the device.
+ *
+ * List of capabilities that can be queried:
+ *  - Capabilities valid for both audio and video devices:
+ *    - codec:          supported audio/video codecs.
+ *                      type: AV_OPT_TYPE_INT (AVCodecID value)
+ *  - Capabilities valid for audio devices:
+ *    - sample_format:  supported sample formats.
+ *                      type: AV_OPT_TYPE_INT (AVSampleFormat value)
+ *    - sample_rate:    supported sample rates.
+ *                      type: AV_OPT_TYPE_INT
+ *    - channels:       supported number of channels.
+ *                      type: AV_OPT_TYPE_INT
+ *    - channel_layout: supported channel layouts.
+ *                      type: AV_OPT_TYPE_INT64
+ *  - Capabilities valid for video devices:
+ *    - pixel_format:   supported pixel formats.
+ *                      type: AV_OPT_TYPE_INT (AVPixelFormat value)
+ *    - window_size:    supported window sizes (describes size of the window presented to the user).
+ *                      type: AV_OPT_TYPE_IMAGE_SIZE
+ *    - frame_size:     supported frame sizes (describes size of provided video frames).
+ *                      type: AV_OPT_TYPE_IMAGE_SIZE
+ *    - fps:            supported fps values
+ *                      type: AV_OPT_TYPE_RATIONAL
+ *
+ * Value of the capability may be set by user using av_opt_set() function
+ * and AVDeviceCapabilitiesQuery object. Following queries will
+ * limit results to the values matching already set capabilities.
+ * For example, setting a codec may impact number of formats or fps values
+ * returned during next query. Setting invalid value may limit results to zero.
+ *
+ * Example of usage based on the opengl output device:
+ *
+ * @code
+ *  AVFormatContext *oc = NULL;
+ *  AVDeviceCapabilitiesQuery *caps = NULL;
+ *  AVOptionRanges *ranges;
+ *  int ret;
+ *
+ *  if ((ret = avformat_alloc_output_context2(&oc, NULL, "opengl", NULL)) < 0)
+ *      goto fail;
+ *  if (avdevice_capabilities_create(&caps, oc, NULL) < 0)
+ *      goto fail;
+ *
+ *  //query codecs
+ *  if (av_opt_query_ranges(&ranges, caps, "codec", AV_OPT_MULTI_COMPONENT_RANGE) < 0)
+ *      goto fail;
+ *  //pick codec here and set it
+ *  av_opt_set(caps, "codec", AV_CODEC_ID_RAWVIDEO, 0);
+ *
+ *  //query format
+ *  if (av_opt_query_ranges(&ranges, caps, "pixel_format", AV_OPT_MULTI_COMPONENT_RANGE) < 0)
+ *      goto fail;
+ *  //pick format here and set it
+ *  av_opt_set(caps, "pixel_format", AV_PIX_FMT_YUV420P, 0);
+ *
+ *  //query and set more capabilities
+ *
+ * fail:
+ *  //clean up code
+ *  avdevice_capabilities_free(&caps, oc);
+ *  avformat_free_context(oc);
+ * @endcode
+ */
+
+/**
+ * Structure describes device capabilities.
+ *
+ * It is used by devices in conjunction with av_device_capabilities AVOption table
+ * to implement capabilities probing API based on AVOption API. Should not be used directly.
+ */
+typedef struct AVDeviceCapabilitiesQuery {
+    const AVClass *av_class;             /**< class pointer; this object is accessed through the AVOption API */
+    AVFormatContext *device_context;     /**< device context; presumably the one given to avdevice_capabilities_create() - confirm */
+    enum AVCodecID codec;                /**< value for the "codec" capability */
+    enum AVSampleFormat sample_format;   /**< value for the "sample_format" capability (audio devices) */
+    enum AVPixelFormat pixel_format;     /**< value for the "pixel_format" capability (video devices) */
+    int sample_rate;                     /**< value for the "sample_rate" capability (audio devices) */
+    int channels;                        /**< value for the "channels" capability (audio devices) */
+    int64_t channel_layout;              /**< value for the "channel_layout" capability (audio devices) */
+    int window_width;                    /**< presumably the width part of the "window_size" capability - confirm */
+    int window_height;                   /**< presumably the height part of the "window_size" capability - confirm */
+    int frame_width;                     /**< presumably the width part of the "frame_size" capability - confirm */
+    int frame_height;                    /**< presumably the height part of the "frame_size" capability - confirm */
+    AVRational fps;                      /**< value for the "fps" capability (video devices) */
+} AVDeviceCapabilitiesQuery;
+
+/**
+ * AVOption table used by devices to implement device capabilities API. Should not be used by a user.
+ */
+extern const AVOption av_device_capabilities[];
+
+/**
+ * Initialize capabilities probing API based on AVOption API.
+ *
+ * avdevice_capabilities_free() must be called when query capabilities API is
+ * not used anymore.
+ *
+ * @param[out] caps      Device capabilities data. Pointer to a NULL pointer must be passed.
+ * @param s              Context of the device.
+ * @param device_options An AVDictionary filled with device-private options.
+ *                       On return this parameter will be destroyed and replaced with a dict
+ *                       containing options that were not found. May be NULL.
+ *                       The same options must be passed later to avformat_write_header() for output
+ *                       devices or avformat_open_input() for input devices, or at any other place
+ *                       that affects device-private options.
+ *
+ * @return >= 0 on success, negative otherwise.
+ */
+int avdevice_capabilities_create(AVDeviceCapabilitiesQuery **caps, AVFormatContext *s,
+                                 AVDictionary **device_options);
+
+/**
+ * Free resources created by avdevice_capabilities_create()
+ *
+ * @param caps Device capabilities data to be freed.
+ * @param s    Context of the device.
+ */
+void avdevice_capabilities_free(AVDeviceCapabilitiesQuery **caps, AVFormatContext *s);
+
+/**
+ * Structure describes basic parameters of the device.
+ */
+typedef struct AVDeviceInfo {
+    char *device_name;                   /**< device name, format depends on device */
+    char *device_description;            /**< human friendly name */
+} AVDeviceInfo;
+
+/**
+ * List of devices.
+ */
+typedef struct AVDeviceInfoList {
+    AVDeviceInfo **devices;              /**< list of autodetected devices */
+    int nb_devices;                      /**< number of autodetected devices */
+    int default_device;                  /**< index of default device or -1 if no default */
+} AVDeviceInfoList;
+
+/**
+ * List devices.
+ *
+ * Returns available device names and their parameters.
+ *
+ * @note: Some devices may accept system-dependent device names that cannot be
+ *        autodetected. The list returned by this function cannot be assumed to
+ *        be always complete.
+ *
+ * @param s                device context.
+ * @param[out] device_list list of autodetected devices.
+ * @return count of autodetected devices, negative on error.
+ */
+int avdevice_list_devices(struct AVFormatContext *s, AVDeviceInfoList **device_list);
+
+/**
+ * Convenience function to free result of avdevice_list_devices().
+ *
+ * @param device_list device list to be freed.
+ */
+void avdevice_free_list_devices(AVDeviceInfoList **device_list);
+
+/**
+ * List devices.
+ *
+ * Returns available device names and their parameters.
+ * These are convenient wrappers for avdevice_list_devices().
+ * Device context is allocated and deallocated internally.
+ *
+ * @param device           device format. May be NULL if device name is set.
+ * @param device_name      device name. May be NULL if device format is set.
+ * @param device_options   An AVDictionary filled with device-private options. May be NULL.
+ *                         The same options must be passed later to avformat_write_header() for output
+ *                         devices or avformat_open_input() for input devices, or at any other place
+ *                         that affects device-private options.
+ * @param[out] device_list list of autodetected devices
+ * @return count of autodetected devices, negative on error.
+ * @note device argument takes precedence over device_name when both are set.
+ */
+int avdevice_list_input_sources(struct AVInputFormat *device, const char *device_name,
+                                AVDictionary *device_options, AVDeviceInfoList **device_list);
+int avdevice_list_output_sinks(struct AVOutputFormat *device, const char *device_name,
+                               AVDictionary *device_options, AVDeviceInfoList **device_list);
+
+/**
+ * @}
+ */
+
+#endif /* AVDEVICE_AVDEVICE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/version.h
new file mode 100644
index 0000000..986ca98
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavdevice/version.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVDEVICE_VERSION_H
+#define AVDEVICE_VERSION_H
+
+/**
+ * @file
+ * @ingroup lavd
+ * Libavdevice version macros
+ */
+
+#include "libavutil/version.h"
+
+#define LIBAVDEVICE_VERSION_MAJOR  57
+#define LIBAVDEVICE_VERSION_MINOR   6
+#define LIBAVDEVICE_VERSION_MICRO 100
+
+#define LIBAVDEVICE_VERSION_INT AV_VERSION_INT(LIBAVDEVICE_VERSION_MAJOR, \
+                                               LIBAVDEVICE_VERSION_MINOR, \
+                                               LIBAVDEVICE_VERSION_MICRO)
+#define LIBAVDEVICE_VERSION     AV_VERSION(LIBAVDEVICE_VERSION_MAJOR, \
+                                           LIBAVDEVICE_VERSION_MINOR, \
+                                           LIBAVDEVICE_VERSION_MICRO)
+#define LIBAVDEVICE_BUILD       LIBAVDEVICE_VERSION_INT
+
+#define LIBAVDEVICE_IDENT       "Lavd" AV_STRINGIFY(LIBAVDEVICE_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ */
+
+#endif /* AVDEVICE_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfilter.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfilter.h
new file mode 100644
index 0000000..60662c1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfilter.h
@@ -0,0 +1,1182 @@
+/*
+ * filter layer
+ * Copyright (c) 2007 Bobby Bingham
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_AVFILTER_H
+#define AVFILTER_AVFILTER_H
+
+/**
+ * @file
+ * @ingroup lavfi
+ * Main libavfilter public API header
+ */
+
+/**
+ * @defgroup lavfi libavfilter
+ * Graph-based frame editing library.
+ *
+ * @{
+ */
+
+#include <stddef.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/avutil.h"
+#include "libavutil/buffer.h"
+#include "libavutil/dict.h"
+#include "libavutil/frame.h"
+#include "libavutil/log.h"
+#include "libavutil/samplefmt.h"
+#include "libavutil/pixfmt.h"
+#include "libavutil/rational.h"
+
+#include "libavfilter/version.h"
+
+/**
+ * Return the LIBAVFILTER_VERSION_INT constant.
+ */
+unsigned avfilter_version(void);
+
+/**
+ * Return the libavfilter build-time configuration.
+ */
+const char *avfilter_configuration(void);
+
+/**
+ * Return the libavfilter license.
+ */
+const char *avfilter_license(void);
+
+typedef struct AVFilterContext AVFilterContext;
+typedef struct AVFilterLink    AVFilterLink;
+typedef struct AVFilterPad     AVFilterPad;
+typedef struct AVFilterFormats AVFilterFormats;
+
+/**
+ * Get the number of elements in a NULL-terminated array of AVFilterPads (e.g.
+ * AVFilter.inputs/outputs).
+ */
+int avfilter_pad_count(const AVFilterPad *pads);
+
+/**
+ * Get the name of an AVFilterPad.
+ *
+ * @param pads an array of AVFilterPads
+ * @param pad_idx index of the pad in the array; it is the caller's
+ *                responsibility to ensure the index is valid
+ *
+ * @return name of the pad_idx'th pad in pads
+ */
+const char *avfilter_pad_get_name(const AVFilterPad *pads, int pad_idx);
+
+/**
+ * Get the type of an AVFilterPad.
+ *
+ * @param pads an array of AVFilterPads
+ * @param pad_idx index of the pad in the array; it is the caller's
+ *                responsibility to ensure the index is valid
+ *
+ * @return type of the pad_idx'th pad in pads
+ */
+enum AVMediaType avfilter_pad_get_type(const AVFilterPad *pads, int pad_idx);
+
+/**
+ * The number of the filter inputs is not determined just by AVFilter.inputs.
+ * The filter might add additional inputs during initialization depending on the
+ * options supplied to it.
+ */
+#define AVFILTER_FLAG_DYNAMIC_INPUTS        (1 << 0)
+/**
+ * The number of the filter outputs is not determined just by AVFilter.outputs.
+ * The filter might add additional outputs during initialization depending on
+ * the options supplied to it.
+ */
+#define AVFILTER_FLAG_DYNAMIC_OUTPUTS       (1 << 1)
+/**
+ * The filter supports multithreading by splitting frames into multiple parts
+ * and processing them concurrently.
+ */
+#define AVFILTER_FLAG_SLICE_THREADS         (1 << 2)
+/**
+ * Some filters support a generic "enable" expression option that can be used
+ * to enable or disable a filter in the timeline. Filters supporting this
+ * option have this flag set. When the enable expression is false, the default
+ * no-op filter_frame() function is called in place of the filter_frame()
+ * callback defined on each input pad, thus the frame is passed unchanged to
+ * the next filters.
+ */
+#define AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC  (1 << 16)
+/**
+ * Same as AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, except that the filter will
+ * have its filter_frame() callback(s) called as usual even when the enable
+ * expression is false. The filter will disable filtering within the
+ * filter_frame() callback(s) itself, for example executing code depending on
+ * the AVFilterContext->is_disabled value.
+ */
+#define AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL (1 << 17)
+/**
+ * Handy mask to test whether or not the filter supports the timeline feature
+ * (internally or generically).
+ */
+#define AVFILTER_FLAG_SUPPORT_TIMELINE (AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL)
+
+/**
+ * Filter definition. This defines the pads a filter contains, and all the
+ * callback functions used to interact with the filter.
+ */
+typedef struct AVFilter {
+    /**
+     * Filter name. Must be non-NULL and unique among filters.
+     */
+    const char *name;
+
+    /**
+     * A description of the filter. May be NULL.
+     *
+     * You should use the NULL_IF_CONFIG_SMALL() macro to define it.
+     */
+    const char *description;
+
+    /**
+     * List of inputs, terminated by a zeroed element.
+     *
+     * NULL if there are no (static) inputs. Instances of filters with
+     * AVFILTER_FLAG_DYNAMIC_INPUTS set may have more inputs than present in
+     * this list.
+     */
+    const AVFilterPad *inputs;
+    /**
+     * List of outputs, terminated by a zeroed element.
+     *
+     * NULL if there are no (static) outputs. Instances of filters with
+     * AVFILTER_FLAG_DYNAMIC_OUTPUTS set may have more outputs than present in
+     * this list.
+     */
+    const AVFilterPad *outputs;
+
+    /**
+     * A class for the private data, used to declare filter private AVOptions.
+     * This field is NULL for filters that do not declare any options.
+     *
+     * If this field is non-NULL, the first member of the filter private data
+     * must be a pointer to AVClass, which will be set by libavfilter generic
+     * code to this class.
+     */
+    const AVClass *priv_class;
+
+    /**
+     * A combination of AVFILTER_FLAG_*
+     */
+    int flags;
+
+    /*****************************************************************
+     * All fields below this line are not part of the public API. They
+     * may not be used outside of libavfilter and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+
+    /**
+     * Filter initialization function.
+     *
+     * This callback will be called only once during the filter lifetime, after
+     * all the options have been set, but before links between filters are
+     * established and format negotiation is done.
+     *
+     * Basic filter initialization should be done here. Filters with dynamic
+     * inputs and/or outputs should create those inputs/outputs here based on
+     * provided options. No more changes to this filter's inputs/outputs can be
+     * done after this callback.
+     *
+     * This callback must not assume that the filter links exist or frame
+     * parameters are known.
+     *
+     * @ref AVFilter.uninit "uninit" is guaranteed to be called even if
+     * initialization fails, so this callback does not have to clean up on
+     * failure.
+     *
+     * @return 0 on success, a negative AVERROR on failure
+     */
+    int (*init)(AVFilterContext *ctx);
+
+    /**
+     * Should be set instead of @ref AVFilter.init "init" by the filters that
+     * want to pass a dictionary of AVOptions to nested contexts that are
+     * allocated during init.
+     *
+     * On return, the options dict should be freed and replaced with one that
+     * contains all the options which could not be processed by this filter (or
+     * with NULL if all the options were processed).
+     *
+     * Otherwise the semantics is the same as for @ref AVFilter.init "init".
+     */
+    int (*init_dict)(AVFilterContext *ctx, AVDictionary **options);
+
+    /**
+     * Filter uninitialization function.
+     *
+     * Called only once right before the filter is freed. Should deallocate any
+     * memory held by the filter, release any buffer references, etc. It does
+     * not need to deallocate the AVFilterContext.priv memory itself.
+     *
+     * This callback may be called even if @ref AVFilter.init "init" was not
+     * called or failed, so it must be prepared to handle such a situation.
+     */
+    void (*uninit)(AVFilterContext *ctx);
+
+    /**
+     * Query formats supported by the filter on its inputs and outputs.
+     *
+     * This callback is called after the filter is initialized (so the inputs
+     * and outputs are fixed), shortly before the format negotiation. This
+     * callback may be called more than once.
+     *
+     * This callback must set AVFilterLink.out_formats on every input link and
+     * AVFilterLink.in_formats on every output link to a list of pixel/sample
+     * formats that the filter supports on that link. For audio links, this
+     * filter must also set @ref AVFilterLink.in_samplerates "in_samplerates" /
+     * @ref AVFilterLink.out_samplerates "out_samplerates" and
+     * @ref AVFilterLink.in_channel_layouts "in_channel_layouts" /
+     * @ref AVFilterLink.out_channel_layouts "out_channel_layouts" analogously.
+     *
+     * This callback may be NULL for filters with one input, in which case
+     * libavfilter assumes that it supports all input formats and preserves
+     * them on output.
+     *
+     * @return zero on success, a negative value corresponding to an
+     * AVERROR code otherwise
+     */
+    int (*query_formats)(AVFilterContext *);
+
+    int priv_size;      ///< size of private data to allocate for the filter
+
+    int flags_internal; ///< Additional flags for avfilter internal use only.
+
+    /**
+     * Used by the filter registration system. Must not be touched by any other
+     * code.
+     */
+    struct AVFilter *next;
+
+    /**
+     * Make the filter instance process a command.
+     *
+     * @param cmd    the command to process, for handling simplicity all commands must be alphanumeric only
+     * @param arg    the argument for the command
+     * @param res    a buffer with size res_len where the filter(s) can return a response. This must not change when the command is not supported.
+     * @param flags  if AVFILTER_CMD_FLAG_FAST is set and the command would be
+     *               time consuming then a filter should treat it like an unsupported command
+     *
+     * @returns >=0 on success otherwise an error code.
+     *          AVERROR(ENOSYS) on unsupported commands
+     */
+    int (*process_command)(AVFilterContext *, const char *cmd, const char *arg, char *res, int res_len, int flags);
+
+    /**
+     * Filter initialization function, alternative to the init()
+     * callback. Args contains the user-supplied parameters, opaque is
+     * used for providing binary data.
+     */
+    int (*init_opaque)(AVFilterContext *ctx, void *opaque);
+
+    /**
+     * Filter activation function.
+     *
+     * Called when any processing is needed from the filter, instead of any
+     * filter_frame and request_frame on pads.
+     *
+     * The function must examine inlinks and outlinks and perform a single
+     * step of processing. If there is nothing to do, the function must do
+     * nothing and not return an error. If more steps are or may be
+     * possible, it must use ff_filter_set_ready() to schedule another
+     * activation.
+     */
+    int (*activate)(AVFilterContext *ctx);
+} AVFilter;
+
+/**
+ * Process multiple parts of the frame concurrently.
+ */
+#define AVFILTER_THREAD_SLICE (1 << 0)
+
+typedef struct AVFilterInternal AVFilterInternal;
+
+/** An instance of a filter */
+struct AVFilterContext {
+    const AVClass *av_class;        ///< needed for av_log() and filters common options
+
+    const AVFilter *filter;         ///< the AVFilter of which this is an instance
+
+    char *name;                     ///< name of this filter instance
+
+    AVFilterPad   *input_pads;      ///< array of input pads
+    AVFilterLink **inputs;          ///< array of pointers to input links
+    unsigned    nb_inputs;          ///< number of input pads
+
+    AVFilterPad   *output_pads;     ///< array of output pads
+    AVFilterLink **outputs;         ///< array of pointers to output links
+    unsigned    nb_outputs;         ///< number of output pads
+
+    void *priv;                     ///< private data for use by the filter
+
+    struct AVFilterGraph *graph;    ///< filtergraph this filter belongs to
+
+    /**
+     * Type of multithreading being allowed/used. A combination of
+     * AVFILTER_THREAD_* flags.
+     *
+     * May be set by the caller before initializing the filter to forbid some
+     * or all kinds of multithreading for this filter. The default is allowing
+     * everything.
+     *
+     * When the filter is initialized, this field is combined using bit AND with
+     * AVFilterGraph.thread_type to get the final mask used for determining
+     * allowed threading types. I.e. a threading type needs to be set in both
+     * to be allowed.
+     *
+     * After the filter is initialized, libavfilter sets this field to the
+     * threading type that is actually used (0 for no multithreading).
+     */
+    int thread_type;
+
+    /**
+     * An opaque struct for libavfilter internal use.
+     */
+    AVFilterInternal *internal;
+
+    struct AVFilterCommand *command_queue;
+
+    char *enable_str;               ///< enable expression string
+    void *enable;                   ///< parsed expression (AVExpr*)
+    double *var_values;             ///< variable values for the enable expression
+    int is_disabled;                ///< the enabled state from the last expression evaluation
+
+    /**
+     * For filters which will create hardware frames, sets the device the
+     * filter should create them in.  All other filters will ignore this field:
+     * in particular, a filter which consumes or processes hardware frames will
+     * instead use the hw_frames_ctx field in AVFilterLink to carry the
+     * hardware context information.
+     */
+    AVBufferRef *hw_device_ctx;
+
+    /**
+     * Max number of threads allowed in this filter instance.
+     * If <= 0, its value is ignored.
+     * Overrides global number of threads set per filter graph.
+     */
+    int nb_threads;
+
+    /**
+     * Ready status of the filter.
+     * A non-0 value means that the filter needs activating;
+     * a higher value suggests a more urgent activation.
+     */
+    unsigned ready;
+};
+
+/**
+ * A link between two filters. This contains pointers to the source and
+ * destination filters between which this link exists, and the indexes of
+ * the pads involved. In addition, this link also contains the parameters
+ * which have been negotiated and agreed upon between the filters, such as
+ * image dimensions, format, etc.
+ *
+ * Applications must not normally access the link structure directly.
+ * Use the buffersrc and buffersink API instead.
+ * In the future, access to the header may be reserved for filters
+ * implementation.
+ */
+struct AVFilterLink {
+    AVFilterContext *src;       ///< source filter
+    AVFilterPad *srcpad;        ///< output pad on the source filter
+
+    AVFilterContext *dst;       ///< dest filter
+    AVFilterPad *dstpad;        ///< input pad on the dest filter
+
+    enum AVMediaType type;      ///< filter media type
+
+    /* These parameters apply only to video */
+    int w;                      ///< agreed upon image width
+    int h;                      ///< agreed upon image height
+    AVRational sample_aspect_ratio; ///< agreed upon sample aspect ratio
+    /* These parameters apply only to audio */
+    uint64_t channel_layout;    ///< channel layout of current buffer (see libavutil/channel_layout.h)
+    int sample_rate;            ///< samples per second
+
+    int format;                 ///< agreed upon media format
+
+    /**
+     * Define the time base used by the PTS of the frames/samples
+     * which will pass through this link.
+     * During the configuration stage, each filter is supposed to
+     * change only the output timebase, while the timebase of the
+     * input link is assumed to be an unchangeable property.
+     */
+    AVRational time_base;
+
+    /*****************************************************************
+     * All fields below this line are not part of the public API. They
+     * may not be used outside of libavfilter and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    /**
+     * Lists of formats and channel layouts supported by the input and output
+     * filters respectively. These lists are used for negotiating the format
+     * to actually be used, which will be loaded into the format and
+     * channel_layout members, above, when chosen.
+     *
+     */
+    AVFilterFormats *in_formats;
+    AVFilterFormats *out_formats;
+
+    /**
+     * Lists of channel layouts and sample rates used for automatic
+     * negotiation.
+     */
+    AVFilterFormats  *in_samplerates;
+    AVFilterFormats *out_samplerates;
+    struct AVFilterChannelLayouts  *in_channel_layouts;
+    struct AVFilterChannelLayouts *out_channel_layouts;
+
+    /**
+     * Audio only, the destination filter sets this to a non-zero value to
+     * request that buffers with the given number of samples should be sent to
+     * it. AVFilterPad.needs_fifo must also be set on the corresponding input
+     * pad.
+     * Last buffer before EOF will be padded with silence.
+     */
+    int request_samples;
+
+    /** stage of the initialization of the link properties (dimensions, etc) */
+    enum {
+        AVLINK_UNINIT = 0,      ///< not started
+        AVLINK_STARTINIT,       ///< started, but incomplete
+        AVLINK_INIT             ///< complete
+    } init_state;
+
+    /**
+     * Graph the filter belongs to.
+     */
+    struct AVFilterGraph *graph;
+
+    /**
+     * Current timestamp of the link, as defined by the most recent
+     * frame(s), in link time_base units.
+     */
+    int64_t current_pts;
+
+    /**
+     * Current timestamp of the link, as defined by the most recent
+     * frame(s), in AV_TIME_BASE units.
+     */
+    int64_t current_pts_us;
+
+    /**
+     * Index in the age array.
+     */
+    int age_index;
+
+    /**
+     * Frame rate of the stream on the link, or 1/0 if unknown or variable;
+     * if left to 0/0, will be automatically copied from the first input
+     * of the source filter if it exists.
+     *
+     * Sources should set it to the best estimation of the real frame rate.
+     * If the source frame rate is unknown or variable, set this to 1/0.
+     * Filters should update it if necessary depending on their function.
+     * Sinks can use it to set a default output frame rate.
+     * It is similar to the r_frame_rate field in AVStream.
+     */
+    AVRational frame_rate;
+
+    /**
+     * Buffer partially filled with samples to achieve a fixed/minimum size.
+     */
+    AVFrame *partial_buf;
+
+    /**
+     * Size of the partial buffer to allocate.
+     * Must be between min_samples and max_samples.
+     */
+    int partial_buf_size;
+
+    /**
+     * Minimum number of samples to filter at once. If filter_frame() is
+     * called with fewer samples, it will accumulate them in partial_buf.
+     * This field and the related ones must not be changed after filtering
+     * has started.
+     * If 0, all related fields are ignored.
+     */
+    int min_samples;
+
+    /**
+     * Maximum number of samples to filter at once. If filter_frame() is
+     * called with more samples, it will split them.
+     */
+    int max_samples;
+
+    /**
+     * Number of channels.
+     */
+    int channels;
+
+    /**
+     * Link processing flags.
+     */
+    unsigned flags;
+
+    /**
+     * Number of past frames sent through the link.
+     */
+    int64_t frame_count_in, frame_count_out;
+
+    /**
+     * A pointer to a FFFramePool struct.
+     */
+    void *frame_pool;
+
+    /**
+     * True if a frame is currently wanted on the output of this filter.
+     * Set when ff_request_frame() is called by the output,
+     * cleared when a frame is filtered.
+     */
+    int frame_wanted_out;
+
+    /**
+     * For hwaccel pixel formats, this should be a reference to the
+     * AVHWFramesContext describing the frames.
+     */
+    AVBufferRef *hw_frames_ctx;
+
+#ifndef FF_INTERNAL_FIELDS
+
+    /**
+     * Internal structure members.
+     * The fields below this limit are internal for libavfilter's use
+     * and must in no way be accessed by applications.
+     */
+    char reserved[0xF000];
+
+#else /* FF_INTERNAL_FIELDS */
+
+    /**
+     * Queue of frames waiting to be filtered.
+     */
+    FFFrameQueue fifo;
+
+    /**
+     * If set, the source filter can not generate a frame as is.
+     * The goal is to avoid repeatedly calling the request_frame() method on
+     * the same link.
+     */
+    int frame_blocked_in;
+
+    /**
+     * Link input status.
+     * If not zero, all attempts of filter_frame will fail with the
+     * corresponding code.
+     */
+    int status_in;
+
+    /**
+     * Timestamp of the input status change.
+     */
+    int64_t status_in_pts;
+
+    /**
+     * Link output status.
+     * If not zero, all attempts of request_frame will fail with the
+     * corresponding code.
+     */
+    int status_out;
+
+#endif /* FF_INTERNAL_FIELDS */
+
+};
+
+/**
+ * Link two filters together.
+ *
+ * @param src    the source filter
+ * @param srcpad index of the output pad on the source filter
+ * @param dst    the destination filter
+ * @param dstpad index of the input pad on the destination filter
+ * @return       zero on success
+ */
+int avfilter_link(AVFilterContext *src, unsigned srcpad,
+                  AVFilterContext *dst, unsigned dstpad);
+
+/**
+ * Free the link in *link, and set its pointer to NULL.
+ */
+void avfilter_link_free(AVFilterLink **link);
+
+/**
+ * Get the number of channels of a link.
+ */
+int avfilter_link_get_channels(AVFilterLink *link);
+
+/**
+ * Set the closed field of a link.
+ * @deprecated applications are not supposed to mess with links, they should
+ * close the sinks.
+ */
+attribute_deprecated
+void avfilter_link_set_closed(AVFilterLink *link, int closed);
+
+/**
+ * Negotiate the media format, dimensions, etc of all inputs to a filter.
+ *
+ * @param filter the filter to negotiate the properties for its inputs
+ * @return       zero on successful negotiation
+ */
+int avfilter_config_links(AVFilterContext *filter);
+
+#define AVFILTER_CMD_FLAG_ONE   1 ///< Stop once a filter understood the command (for target=all for example), fast filters are favored automatically
+#define AVFILTER_CMD_FLAG_FAST  2 ///< Only execute command when it is fast (like a video out that supports contrast adjustment in hw)
+
+/**
+ * Make the filter instance process a command.
+ * It is recommended to use avfilter_graph_send_command().
+ */
+int avfilter_process_command(AVFilterContext *filter, const char *cmd, const char *arg, char *res, int res_len, int flags);
+
+/** Initialize the filter system. Register all builtin filters. */
+void avfilter_register_all(void);
+
+#if FF_API_OLD_FILTER_REGISTER
+/** Uninitialize the filter system. Unregister all filters. */
+attribute_deprecated
+void avfilter_uninit(void);
+#endif
+
+/**
+ * Register a filter. This is only needed if you plan to use
+ * avfilter_get_by_name later to lookup the AVFilter structure by name. A
+ * filter can still be instantiated with avfilter_graph_alloc_filter even if it
+ * is not registered.
+ *
+ * @param filter the filter to register
+ * @return 0 if the registration was successful, a negative value
+ * otherwise
+ */
+int avfilter_register(AVFilter *filter);
+
+/**
+ * Get a filter definition matching the given name.
+ *
+ * @param name the filter name to find
+ * @return     the filter definition, if any matching one is registered.
+ *             NULL if none found.
+ */
+#if !FF_API_NOCONST_GET_NAME
+const
+#endif
+AVFilter *avfilter_get_by_name(const char *name);
+
+/**
+ * Iterate over all registered filters.
+ * @return If prev is non-NULL, next registered filter after prev or NULL if
+ * prev is the last filter. If prev is NULL, return the first registered filter.
+ */
+const AVFilter *avfilter_next(const AVFilter *prev);
+
+#if FF_API_OLD_FILTER_REGISTER
+/**
+ * If filter is NULL, returns a pointer to the first registered filter pointer,
+ * if filter is non-NULL, returns the next pointer after filter.
+ * If the returned pointer points to NULL, the last registered filter
+ * was already reached.
+ * @deprecated use avfilter_next()
+ */
+attribute_deprecated
+AVFilter **av_filter_next(AVFilter **filter);
+#endif
+
+#if FF_API_AVFILTER_OPEN
+/**
+ * Create a filter instance.
+ *
+ * @param filter_ctx put here a pointer to the created filter context
+ * on success, NULL on failure
+ * @param filter    the filter to create an instance of
+ * @param inst_name Name to give to the new instance. Can be NULL for none.
+ * @return >= 0 in case of success, a negative error code otherwise
+ * @deprecated use avfilter_graph_alloc_filter() instead
+ */
+attribute_deprecated
+int avfilter_open(AVFilterContext **filter_ctx, AVFilter *filter, const char *inst_name);
+#endif
+
+
+#if FF_API_AVFILTER_INIT_FILTER
+/**
+ * Initialize a filter.
+ *
+ * @param filter the filter to initialize
+ * @param args   A string of parameters to use when initializing the filter.
+ *               The format and meaning of this string varies by filter.
+ * @param opaque Any extra non-string data needed by the filter. The meaning
+ *               of this parameter varies by filter.
+ * @return       zero on success
+ */
+attribute_deprecated
+int avfilter_init_filter(AVFilterContext *filter, const char *args, void *opaque);
+#endif
+
+/**
+ * Initialize a filter with the supplied parameters.
+ *
+ * @param ctx  uninitialized filter context to initialize
+ * @param args Options to initialize the filter with. This must be a
+ *             ':'-separated list of options in the 'key=value' form.
+ *             May be NULL if the options have been set directly using the
+ *             AVOptions API or there are no options that need to be set.
+ * @return 0 on success, a negative AVERROR on failure
+ */
+int avfilter_init_str(AVFilterContext *ctx, const char *args);
+
+/**
+ * Initialize a filter with the supplied dictionary of options.
+ *
+ * @param ctx     uninitialized filter context to initialize
+ * @param options An AVDictionary filled with options for this filter. On
+ *                return this parameter will be destroyed and replaced with
+ *                a dict containing options that were not found. This dictionary
+ *                must be freed by the caller.
+ *                May be NULL, then this function is equivalent to
+ *                avfilter_init_str() with the second parameter set to NULL.
+ * @return 0 on success, a negative AVERROR on failure
+ *
+ * @note This function and avfilter_init_str() do essentially the same thing,
+ * the difference is in the manner in which the options are passed. It is up to the
+ * calling code to choose whichever is more preferable. The two functions also
+ * behave differently when some of the provided options are not declared as
+ * supported by the filter. In such a case, avfilter_init_str() will fail, but
+ * this function will leave those extra options in the options AVDictionary and
+ * continue as usual.
+ */
+int avfilter_init_dict(AVFilterContext *ctx, AVDictionary **options);
+
+/**
+ * Free a filter context. This will also remove the filter from its
+ * filtergraph's list of filters.
+ *
+ * @param filter the filter to free
+ */
+void avfilter_free(AVFilterContext *filter);
+
+/**
+ * Insert a filter in the middle of an existing link.
+ *
+ * @param link the link into which the filter should be inserted
+ * @param filt the filter to be inserted
+ * @param filt_srcpad_idx the input pad on the filter to connect
+ * @param filt_dstpad_idx the output pad on the filter to connect
+ * @return     zero on success
+ */
+int avfilter_insert_filter(AVFilterLink *link, AVFilterContext *filt,
+                           unsigned filt_srcpad_idx, unsigned filt_dstpad_idx);
+
+/**
+ * @return AVClass for AVFilterContext.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avfilter_get_class(void);
+
+typedef struct AVFilterGraphInternal AVFilterGraphInternal;
+
+/**
+ * A function pointer passed to the @ref AVFilterGraph.execute callback to be
+ * executed multiple times, possibly in parallel.
+ *
+ * @param ctx the filter context the job belongs to
+ * @param arg an opaque parameter passed through from @ref
+ *            AVFilterGraph.execute
+ * @param jobnr the index of the job being executed
+ * @param nb_jobs the total number of jobs
+ *
+ * @return 0 on success, a negative AVERROR on error
+ */
+typedef int (avfilter_action_func)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
+
+/**
+ * A function executing multiple jobs, possibly in parallel.
+ *
+ * @param ctx the filter context to which the jobs belong
+ * @param func the function to be called multiple times
+ * @param arg the argument to be passed to func
+ * @param ret a nb_jobs-sized array to be filled with return values from each
+ *            invocation of func
+ * @param nb_jobs the number of jobs to execute
+ *
+ * @return 0 on success, a negative AVERROR on error
+ */
+typedef int (avfilter_execute_func)(AVFilterContext *ctx, avfilter_action_func *func,
+                                    void *arg, int *ret, int nb_jobs);
+
+typedef struct AVFilterGraph {
+    const AVClass *av_class;
+    AVFilterContext **filters; ///< filter instances in this graph (see avfilter_graph_alloc_filter())
+    unsigned nb_filters;       ///< NOTE(review): presumably the number of entries in filters -- confirm
+
+    char *scale_sws_opts; ///< sws options to use for the auto-inserted scale filters
+#if FF_API_LAVR_OPTS
+    attribute_deprecated char *resample_lavr_opts;   ///< libavresample options to use for the auto-inserted resample filters
+#endif
+
+    /**
+     * Type of multithreading allowed for filters in this graph. A combination
+     * of AVFILTER_THREAD_* flags.
+     *
+     * May be set by the caller at any point, the setting will apply to all
+     * filters initialized after that. The default is allowing everything.
+     *
+     * When a filter in this graph is initialized, this field is combined using
+     * bit AND with AVFilterContext.thread_type to get the final mask used for
+     * determining allowed threading types. I.e. a threading type needs to be
+     * set in both to be allowed.
+     */
+    int thread_type;
+
+    /**
+     * Maximum number of threads used by filters in this graph. May be set by
+     * the caller before adding any filters to the filtergraph. Zero (the
+     * default) means that the number of threads is determined automatically.
+     */
+    int nb_threads;
+
+    /**
+     * Opaque object for libavfilter internal use.
+     */
+    AVFilterGraphInternal *internal;
+
+    /**
+     * Opaque user data. May be set by the caller to an arbitrary value, e.g. to
+     * be used from callbacks like @ref AVFilterGraph.execute.
+     * Libavfilter will not touch this field in any way.
+     */
+    void *opaque;
+
+    /**
+     * This callback may be set by the caller immediately after allocating the
+     * graph and before adding any filters to it, to provide a custom
+     * multithreading implementation.
+     *
+     * If set, filters with slice threading capability will call this callback
+     * to execute multiple jobs in parallel.
+     *
+     * If this field is left unset, libavfilter will use its internal
+     * implementation, which may or may not be multithreaded depending on the
+     * platform and build options.
+     */
+    avfilter_execute_func *execute;
+
+    char *aresample_swr_opts; ///< swr options to use for the auto-inserted aresample filters, Access ONLY through AVOptions
+
+    /**
+     * Private fields
+     *
+     * The following fields are for internal use only.
+     * Their type, offset, number and semantic can change without notice.
+     */
+
+    AVFilterLink **sink_links;
+    int sink_links_count;
+
+    unsigned disable_auto_convert;
+} AVFilterGraph;
+
+/**
+ * Allocate a filter graph.
+ *
+ * @return the allocated filter graph on success or NULL.
+ */
+AVFilterGraph *avfilter_graph_alloc(void);
+
+/**
+ * Create a new filter instance in a filter graph.
+ *
+ * @param graph graph in which the new filter will be used
+ * @param filter the filter to create an instance of
+ * @param name Name to give to the new instance (will be copied to
+ *             AVFilterContext.name). This may be used by the caller to identify
+ *             different filters, libavfilter itself assigns no semantics to
+ *             this parameter. May be NULL.
+ *
+ * @return the context of the newly created filter instance (note that it is
+ *         also retrievable directly through AVFilterGraph.filters or with
+ *         avfilter_graph_get_filter()) on success or NULL on failure.
+ */
+AVFilterContext *avfilter_graph_alloc_filter(AVFilterGraph *graph,
+                                             const AVFilter *filter,
+                                             const char *name);
+
+/**
+ * Get a filter instance identified by instance name from graph.
+ *
+ * @param graph filter graph to search through.
+ * @param name filter instance name (should be unique in the graph).
+ * @return the pointer to the found filter instance or NULL if it
+ * cannot be found.
+ */
+AVFilterContext *avfilter_graph_get_filter(AVFilterGraph *graph, const char *name);
+
+#if FF_API_AVFILTER_OPEN
+/**
+ * Add an existing filter instance to a filter graph.
+ *
+ * @param graphctx  the filter graph
+ * @param filter the filter to be added
+ *
+ * @deprecated use avfilter_graph_alloc_filter() to allocate a filter in a
+ * filter graph
+ */
+attribute_deprecated
+int avfilter_graph_add_filter(AVFilterGraph *graphctx, AVFilterContext *filter);
+#endif
+
+/**
+ * Create and add a filter instance into an existing graph.
+ * The filter instance is created from the filter filt and inited
+ * with the parameters args and opaque.
+ *
+ * In case of success put in *filt_ctx the pointer to the created
+ * filter instance, otherwise set *filt_ctx to NULL.
+ *
+ * @param name the instance name to give to the created filter instance
+ * @param graph_ctx the filter graph
+ * @return a negative AVERROR error code in case of failure, a non
+ * negative value otherwise
+ */
+int avfilter_graph_create_filter(AVFilterContext **filt_ctx, const AVFilter *filt,
+                                 const char *name, const char *args, void *opaque,
+                                 AVFilterGraph *graph_ctx);
+
+/**
+ * Enable or disable automatic format conversion inside the graph.
+ *
+ * Note that format conversion can still happen inside explicitly inserted
+ * scale and aresample filters.
+ *
+ * @param flags  any of the AVFILTER_AUTO_CONVERT_* constants
+ */
+void avfilter_graph_set_auto_convert(AVFilterGraph *graph, unsigned flags);
+
+enum {
+    AVFILTER_AUTO_CONVERT_ALL  =  0, /**< all automatic conversions enabled */
+    AVFILTER_AUTO_CONVERT_NONE = -1, /**< all automatic conversions disabled */
+};
+
+/**
+ * Check validity and configure all the links and formats in the graph.
+ *
+ * @param graphctx the filter graph
+ * @param log_ctx context used for logging
+ * @return >= 0 in case of success, a negative AVERROR code otherwise
+ */
+int avfilter_graph_config(AVFilterGraph *graphctx, void *log_ctx);
+
+/**
+ * Free a graph, destroy its links, and set *graph to NULL.
+ * If *graph is NULL, do nothing.
+ */
+void avfilter_graph_free(AVFilterGraph **graph);
+
+/**
+ * A linked-list of the inputs/outputs of the filter chain.
+ *
+ * This is mainly useful for avfilter_graph_parse() / avfilter_graph_parse2(),
+ * where it is used to communicate open (unlinked) inputs and outputs from and
+ * to the caller.
+ * This struct specifies, per each not connected pad contained in the graph, the
+ * filter context and the pad index required for establishing a link.
+ */
+typedef struct AVFilterInOut {
+    /** unique name for this input/output in the list */
+    char *name;
+
+    /** filter context associated to this input/output */
+    AVFilterContext *filter_ctx;
+
+    /** index of the filter_ctx pad to use for linking */
+    int pad_idx;
+
+    /** next input/output in the list, NULL if this is the last */
+    struct AVFilterInOut *next;
+} AVFilterInOut;
+
+/**
+ * Allocate a single AVFilterInOut entry.
+ * Must be freed with avfilter_inout_free().
+ * @return allocated AVFilterInOut on success, NULL on failure.
+ */
+AVFilterInOut *avfilter_inout_alloc(void);
+
+/**
+ * Free the supplied list of AVFilterInOut and set *inout to NULL.
+ * If *inout is NULL, do nothing.
+ */
+void avfilter_inout_free(AVFilterInOut **inout);
+
+/**
+ * Add a graph described by a string to a graph.
+ *
+ * @note The caller must provide the lists of inputs and outputs,
+ * which therefore must be known before calling the function.
+ *
+ * @note The inputs parameter describes inputs of the already existing
+ * part of the graph; i.e. from the point of view of the newly created
+ * part, they are outputs. Similarly the outputs parameter describes
+ * outputs of the already existing filters, which are provided as
+ * inputs to the parsed filters.
+ *
+ * @param graph   the filter graph where to link the parsed graph context
+ * @param filters string to be parsed
+ * @param inputs  linked list to the inputs of the graph
+ * @param outputs linked list to the outputs of the graph
+ * @return zero on success, a negative AVERROR code on error
+ */
+int avfilter_graph_parse(AVFilterGraph *graph, const char *filters,
+                         AVFilterInOut *inputs, AVFilterInOut *outputs,
+                         void *log_ctx);
+
+/**
+ * Add a graph described by a string to a graph.
+ *
+ * In the graph filters description, if the input label of the first
+ * filter is not specified, "in" is assumed; if the output label of
+ * the last filter is not specified, "out" is assumed.
+ *
+ * @param graph   the filter graph where to link the parsed graph context
+ * @param filters string to be parsed
+ * @param inputs  pointer to a linked list to the inputs of the graph, may be NULL.
+ *                If non-NULL, *inputs is updated to contain the list of open inputs
+ *                after the parsing, should be freed with avfilter_inout_free().
+ * @param outputs pointer to a linked list to the outputs of the graph, may be NULL.
+ *                If non-NULL, *outputs is updated to contain the list of open outputs
+ *                after the parsing, should be freed with avfilter_inout_free().
+ * @return non negative on success, a negative AVERROR code on error
+ */
+int avfilter_graph_parse_ptr(AVFilterGraph *graph, const char *filters,
+                             AVFilterInOut **inputs, AVFilterInOut **outputs,
+                             void *log_ctx);
+
+/**
+ * Add a graph described by a string to a graph.
+ *
+ * @param[in]  graph   the filter graph where to link the parsed graph context
+ * @param[in]  filters string to be parsed
+ * @param[out] inputs  a linked list of all free (unlinked) inputs of the
+ *                     parsed graph will be returned here. It is to be freed
+ *                     by the caller using avfilter_inout_free().
+ * @param[out] outputs a linked list of all free (unlinked) outputs of the
+ *                     parsed graph will be returned here. It is to be freed by the
+ *                     caller using avfilter_inout_free().
+ * @return zero on success, a negative AVERROR code on error
+ *
+ * @note This function returns the inputs and outputs that are left
+ * unlinked after parsing the graph and the caller then deals with
+ * them.
+ * @note This function makes no reference whatsoever to already
+ * existing parts of the graph and the inputs parameter will on return
+ * contain inputs of the newly parsed part of the graph.  Analogously
+ * the outputs parameter will contain outputs of the newly created
+ * filters.
+ */
+int avfilter_graph_parse2(AVFilterGraph *graph, const char *filters,
+                          AVFilterInOut **inputs,
+                          AVFilterInOut **outputs);
+
+/**
+ * Send a command to one or more filter instances.
+ *
+ * @param graph  the filter graph
+ * @param target the filter(s) to which the command should be sent
+ *               "all" sends to all filters
+ *               otherwise it can be a filter or filter instance name
+ *               which will send the command to all matching filters.
+ * @param cmd    the command to send, for handling simplicity all commands must be alphanumeric only
+ * @param arg    the argument for the command
+ * @param res    a buffer with size res_len where the filter(s) can return a response.
+ *
+ * @returns >=0 on success otherwise an error code.
+ *              AVERROR(ENOSYS) on unsupported commands
+ */
+int avfilter_graph_send_command(AVFilterGraph *graph, const char *target, const char *cmd, const char *arg, char *res, int res_len, int flags);
+
+/**
+ * Queue a command for one or more filter instances.
+ *
+ * @param graph  the filter graph
+ * @param target the filter(s) to which the command should be sent
+ *               "all" sends to all filters
+ *               otherwise it can be a filter or filter instance name
+ *               which will send the command to all matching filters.
+ * @param cmd    the command to send, for handling simplicity all commands must be alphanumeric only
+ * @param arg    the argument for the command
+ * @param ts     time at which the command should be sent to the filter
+ *
+ * @note As this executes commands after this function returns, no return code
+ *       from the filter is provided, also AVFILTER_CMD_FLAG_ONE is not supported.
+ */
+int avfilter_graph_queue_command(AVFilterGraph *graph, const char *target, const char *cmd, const char *arg, int flags, double ts);
+
+
+/**
+ * Dump a graph into a human-readable string representation.
+ *
+ * @param graph    the graph to dump
+ * @param options  formatting options; currently ignored
+ * @return  a string, or NULL in case of memory allocation failure;
+ *          the string must be freed using av_free
+ */
+char *avfilter_graph_dump(AVFilterGraph *graph, const char *options);
+
+/**
+ * Request a frame on the oldest sink link.
+ *
+ * If the request returns AVERROR_EOF, try the next.
+ *
+ * Note that this function is not meant to be the sole scheduling mechanism
+ * of a filtergraph, only a convenience function to help drain a filtergraph
+ * in a balanced way under normal circumstances.
+ *
+ * Also note that AVERROR_EOF does not mean that frames did not arrive on
+ * some of the sinks during the process.
+ * When there are multiple sink links, in case the requested link
+ * returns an EOF, this may cause a filter to flush pending frames
+ * which are sent to another sink link, although unrequested.
+ *
+ * @return  the return value of ff_request_frame(),
+ *          or AVERROR_EOF if all links returned AVERROR_EOF
+ */
+int avfilter_graph_request_oldest(AVFilterGraph *graph);
+
+/**
+ * @}
+ */
+
+#endif /* AVFILTER_AVFILTER_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfiltergraph.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfiltergraph.h
new file mode 100644
index 0000000..b31d581
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/avfiltergraph.h
@@ -0,0 +1,28 @@
+/*
+ * Filter graphs
+ * copyright (c) 2007 Bobby Bingham
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_AVFILTERGRAPH_H
+#define AVFILTER_AVFILTERGRAPH_H
+
+#include "avfilter.h"
+#include "libavutil/log.h"
+
+#endif /* AVFILTER_AVFILTERGRAPH_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersink.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersink.h
new file mode 100644
index 0000000..f51fa7c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersink.h
@@ -0,0 +1,165 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BUFFERSINK_H
+#define AVFILTER_BUFFERSINK_H
+
+/**
+ * @file
+ * @ingroup lavfi_buffersink
+ * memory buffer sink API for audio and video
+ */
+
+#include "avfilter.h"
+
+/**
+ * @defgroup lavfi_buffersink Buffer sink API
+ * @ingroup lavfi
+ * @{
+ */
+
+/**
+ * Get a frame with filtered data from sink and put it in frame.
+ *
+ * @param ctx    pointer to a buffersink or abuffersink filter context.
+ * @param frame  pointer to an allocated frame that will be filled with data.
+ *               The data must be freed using av_frame_unref() / av_frame_free()
+ * @param flags  a combination of AV_BUFFERSINK_FLAG_* flags
+ *
+ * @return  >= 0 for success, a negative AVERROR code for failure.
+ */
+int av_buffersink_get_frame_flags(AVFilterContext *ctx, AVFrame *frame, int flags);
+
+/**
+ * Tell av_buffersink_get_frame_flags() to read video/samples buffer
+ * reference, but not remove it from the buffer. This is useful if you
+ * need only to read a video/samples buffer, without removing it.
+ */
+#define AV_BUFFERSINK_FLAG_PEEK 1
+
+/**
+ * Tell av_buffersink_get_frame_flags() not to request a frame from its input.
+ * If a frame is already buffered, it is read (and removed from the buffer),
+ * but if no frame is present, return AVERROR(EAGAIN).
+ */
+#define AV_BUFFERSINK_FLAG_NO_REQUEST 2
+
+/**
+ * Struct to use for initializing a buffersink context.
+ */
+typedef struct {
+    const enum AVPixelFormat *pixel_fmts; ///< list of allowed pixel formats, terminated by AV_PIX_FMT_NONE
+} AVBufferSinkParams;
+
+/**
+ * Create an AVBufferSinkParams structure.
+ *
+ * Must be freed with av_free().
+ */
+AVBufferSinkParams *av_buffersink_params_alloc(void);
+
+/**
+ * Struct to use for initializing an abuffersink context.
+ */
+typedef struct {
+    const enum AVSampleFormat *sample_fmts; ///< list of allowed sample formats, terminated by AV_SAMPLE_FMT_NONE
+    const int64_t *channel_layouts;         ///< list of allowed channel layouts, terminated by -1
+    const int *channel_counts;              ///< list of allowed channel counts, terminated by -1
+    int all_channel_counts;                 ///< if not 0, accept any channel count or layout
+    int *sample_rates;                      ///< list of allowed sample rates, terminated by -1
+} AVABufferSinkParams;
+
+/**
+ * Create an AVABufferSinkParams structure.
+ *
+ * Must be freed with av_free().
+ */
+AVABufferSinkParams *av_abuffersink_params_alloc(void);
+
+/**
+ * Set the frame size for an audio buffer sink.
+ *
+ * All calls to av_buffersink_get_buffer_ref will return a buffer with
+ * exactly the specified number of samples, or AVERROR(EAGAIN) if there is
+ * not enough. The last buffer at EOF will be padded with 0.
+ */
+void av_buffersink_set_frame_size(AVFilterContext *ctx, unsigned frame_size);
+
+/**
+ * @defgroup lavfi_buffersink_accessors Buffer sink accessors
+ * Get the properties of the stream
+ * @{
+ */
+
+enum AVMediaType av_buffersink_get_type                (const AVFilterContext *ctx);
+AVRational       av_buffersink_get_time_base           (const AVFilterContext *ctx);
+int              av_buffersink_get_format              (const AVFilterContext *ctx);
+
+AVRational       av_buffersink_get_frame_rate          (const AVFilterContext *ctx);
+int              av_buffersink_get_w                   (const AVFilterContext *ctx);
+int              av_buffersink_get_h                   (const AVFilterContext *ctx);
+AVRational       av_buffersink_get_sample_aspect_ratio (const AVFilterContext *ctx);
+
+int              av_buffersink_get_channels            (const AVFilterContext *ctx);
+uint64_t         av_buffersink_get_channel_layout      (const AVFilterContext *ctx);
+int              av_buffersink_get_sample_rate         (const AVFilterContext *ctx);
+
+AVBufferRef *    av_buffersink_get_hw_frames_ctx       (const AVFilterContext *ctx);
+
+/** @} */
+
+/**
+ * Get a frame with filtered data from sink and put it in frame.
+ *
+ * @param ctx pointer to a context of a buffersink or abuffersink AVFilter.
+ * @param frame pointer to an allocated frame that will be filled with data.
+ *              The data must be freed using av_frame_unref() / av_frame_free()
+ *
+ * @return
+ *         - >= 0 if a frame was successfully returned.
+ *         - AVERROR(EAGAIN) if no frames are available at this point; more
+ *           input frames must be added to the filtergraph to get more output.
+ *         - AVERROR_EOF if there will be no more output frames on this sink.
+ *         - A different negative AVERROR code in other failure cases.
+ */
+int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
+
+/**
+ * Same as av_buffersink_get_frame(), but with the ability to specify the number
+ * of samples read. This function is less efficient than
+ * av_buffersink_get_frame(), because it copies the data around.
+ *
+ * @param ctx pointer to a context of the abuffersink AVFilter.
+ * @param frame pointer to an allocated frame that will be filled with data.
+ *              The data must be freed using av_frame_unref() / av_frame_free()
+ *              frame will contain exactly nb_samples audio samples, except at
+ *              the end of stream, when it can contain less than nb_samples.
+ *
+ * @return The return codes have the same meaning as for
+ *         av_buffersink_get_frame().
+ *
+ * @warning do not mix this function with av_buffersink_get_frame(). Use only one or
+ * the other with a single sink, not both.
+ */
+int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
+
+/**
+ * @}
+ */
+
+#endif /* AVFILTER_BUFFERSINK_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersrc.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersrc.h
new file mode 100644
index 0000000..e42c781
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/buffersrc.h
@@ -0,0 +1,201 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BUFFERSRC_H
+#define AVFILTER_BUFFERSRC_H
+
+/**
+ * @file
+ * @ingroup lavfi_buffersrc
+ * Memory buffer source API.
+ */
+
+#include "avfilter.h"
+
+/**
+ * @defgroup lavfi_buffersrc Buffer source API
+ * @ingroup lavfi
+ * @{
+ */
+
+enum {
+
+    /**
+     * Do not check for format changes.
+     */
+    AV_BUFFERSRC_FLAG_NO_CHECK_FORMAT = 1,
+
+    /**
+     * Immediately push the frame to the output.
+     */
+    AV_BUFFERSRC_FLAG_PUSH = 4,
+
+    /**
+     * Keep a reference to the frame.
+     * If the frame is reference-counted, create a new reference; otherwise
+     * copy the frame data.
+     */
+    AV_BUFFERSRC_FLAG_KEEP_REF = 8,
+
+};
+
+/**
+ * Get the number of failed requests.
+ *
+ * A failed request is when the request_frame method is called while no
+ * frame is present in the buffer.
+ * The number is reset when a frame is added.
+ */
+unsigned av_buffersrc_get_nb_failed_requests(AVFilterContext *buffer_src);
+
+/**
+ * This structure contains the parameters describing the frames that will be
+ * passed to this filter.
+ *
+ * It should be allocated with av_buffersrc_parameters_alloc() and freed with
+ * av_free(). All the allocated fields in it remain owned by the caller.
+ */
+typedef struct AVBufferSrcParameters {
+    /**
+     * video: the pixel format, value corresponds to enum AVPixelFormat
+     * audio: the sample format, value corresponds to enum AVSampleFormat
+     */
+    int format;
+    /**
+     * The timebase to be used for the timestamps on the input frames.
+     */
+    AVRational time_base;
+
+    /**
+     * Video only, the display dimensions of the input frames.
+     */
+    int width, height;
+
+    /**
+     * Video only, the sample (pixel) aspect ratio.
+     */
+    AVRational sample_aspect_ratio;
+
+    /**
+     * Video only, the frame rate of the input video. This field must only be
+     * set to a non-zero value if input stream has a known constant framerate
+     * and should be left at its initial value if the framerate is variable or
+     * unknown.
+     */
+    AVRational frame_rate;
+
+    /**
+     * Video with a hwaccel pixel format only. This should be a reference to an
+     * AVHWFramesContext instance describing the input frames.
+     */
+    AVBufferRef *hw_frames_ctx;
+
+    /**
+     * Audio only, the audio sampling rate in samples per second.
+     */
+    int sample_rate;
+
+    /**
+     * Audio only, the audio channel layout.
+     */
+    uint64_t channel_layout;
+} AVBufferSrcParameters;
+
+/**
+ * Allocate a new AVBufferSrcParameters instance. It should be freed by the
+ * caller with av_free().
+ */
+AVBufferSrcParameters *av_buffersrc_parameters_alloc(void);
+
+/**
+ * Initialize the buffersrc or abuffersrc filter with the provided parameters.
+ * This function may be called multiple times, the later calls override the
+ * previous ones. Some of the parameters may also be set through AVOptions, then
+ * whatever method is used last takes precedence.
+ *
+ * @param ctx an instance of the buffersrc or abuffersrc filter
+ * @param param the stream parameters. The frames later passed to this filter
+ *              must conform to those parameters. All the allocated fields in
+ *              param remain owned by the caller, libavfilter will make internal
+ *              copies or references when necessary.
+ * @return 0 on success, a negative AVERROR code on failure.
+ */
+int av_buffersrc_parameters_set(AVFilterContext *ctx, AVBufferSrcParameters *param);
+
+/**
+ * Add a frame to the buffer source.
+ *
+ * @param ctx   an instance of the buffersrc filter
+ * @param frame frame to be added. If the frame is reference counted, this
+ * function will make a new reference to it. Otherwise the frame data will be
+ * copied.
+ *
+ * @return 0 on success, a negative AVERROR on error
+ *
+ * This function is equivalent to av_buffersrc_add_frame_flags() with the
+ * AV_BUFFERSRC_FLAG_KEEP_REF flag.
+ */
+av_warn_unused_result
+int av_buffersrc_write_frame(AVFilterContext *ctx, const AVFrame *frame);
+
+/**
+ * Add a frame to the buffer source.
+ *
+ * @param ctx   an instance of the buffersrc filter
+ * @param frame frame to be added. If the frame is reference counted, this
+ * function will take ownership of the reference(s) and reset the frame.
+ * Otherwise the frame data will be copied. If this function returns an error,
+ * the input frame is not touched.
+ *
+ * @return 0 on success, a negative AVERROR on error.
+ *
+ * @note the difference between this function and av_buffersrc_write_frame() is
+ * that av_buffersrc_write_frame() creates a new reference to the input frame,
+ * while this function takes ownership of the reference passed to it.
+ *
+ * This function is equivalent to av_buffersrc_add_frame_flags() without the
+ * AV_BUFFERSRC_FLAG_KEEP_REF flag.
+ */
+av_warn_unused_result
+int av_buffersrc_add_frame(AVFilterContext *ctx, AVFrame *frame);
+
+/**
+ * Add a frame to the buffer source.
+ *
+ * By default, if the frame is reference-counted, this function will take
+ * ownership of the reference(s) and reset the frame. This can be controlled
+ * using the flags.
+ *
+ * If this function returns an error, the input frame is not touched.
+ *
+ * @param buffer_src  pointer to a buffer source context
+ * @param frame       a frame, or NULL to mark EOF
+ * @param flags       a combination of AV_BUFFERSRC_FLAG_*
+ * @return            >= 0 in case of success, a negative AVERROR code
+ *                    in case of failure
+ */
+av_warn_unused_result
+int av_buffersrc_add_frame_flags(AVFilterContext *buffer_src,
+                                 AVFrame *frame, int flags);
+
+
+/**
+ * @}
+ */
+
+#endif /* AVFILTER_BUFFERSRC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/version.h
new file mode 100644
index 0000000..4cbd185
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavfilter/version.h
@@ -0,0 +1,74 @@
+/*
+ * Version macros.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_VERSION_H
+#define AVFILTER_VERSION_H
+
+/**
+ * @file
+ * @ingroup lavfi
+ * Libavfilter version macros
+ */
+
+#include "libavutil/version.h"
+
+#define LIBAVFILTER_VERSION_MAJOR   6
+#define LIBAVFILTER_VERSION_MINOR  82
+#define LIBAVFILTER_VERSION_MICRO 100
+
+#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
+                                               LIBAVFILTER_VERSION_MINOR, \
+                                               LIBAVFILTER_VERSION_MICRO)
+#define LIBAVFILTER_VERSION     AV_VERSION(LIBAVFILTER_VERSION_MAJOR,   \
+                                           LIBAVFILTER_VERSION_MINOR,   \
+                                           LIBAVFILTER_VERSION_MICRO)
+#define LIBAVFILTER_BUILD       LIBAVFILTER_VERSION_INT
+
+#define LIBAVFILTER_IDENT       "Lavfi" AV_STRINGIFY(LIBAVFILTER_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ */
+
+#ifndef FF_API_OLD_FILTER_OPTS
+#define FF_API_OLD_FILTER_OPTS              (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_OLD_FILTER_OPTS_ERROR
+#define FF_API_OLD_FILTER_OPTS_ERROR        (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_AVFILTER_OPEN
+#define FF_API_AVFILTER_OPEN                (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_AVFILTER_INIT_FILTER
+#define FF_API_AVFILTER_INIT_FILTER         (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_OLD_FILTER_REGISTER
+#define FF_API_OLD_FILTER_REGISTER          (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_NOCONST_GET_NAME
+#define FF_API_NOCONST_GET_NAME             (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+#ifndef FF_API_LAVR_OPTS
+#define FF_API_LAVR_OPTS                    (LIBAVFILTER_VERSION_MAJOR < 7)
+#endif
+
+#endif /* AVFILTER_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avformat.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avformat.h
new file mode 100644
index 0000000..4ab217d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avformat.h
@@ -0,0 +1,3008 @@
+/*
+ * copyright (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFORMAT_AVFORMAT_H
+#define AVFORMAT_AVFORMAT_H
+
+/**
+ * @file
+ * @ingroup libavf
+ * Main libavformat public API header
+ */
+
+/**
+ * @defgroup libavf libavformat
+ * I/O and Muxing/Demuxing Library
+ *
+ * Libavformat (lavf) is a library for dealing with various media container
+ * formats. Its main two purposes are demuxing - i.e. splitting a media file
+ * into component streams, and the reverse process of muxing - writing supplied
+ * data in a specified container format. It also has an @ref lavf_io
+ * "I/O module" which supports a number of protocols for accessing the data (e.g.
+ * file, tcp, http and others). Before using lavf, you need to call
+ * av_register_all() to register all compiled muxers, demuxers and protocols.
+ * Unless you are absolutely sure you won't use libavformat's network
+ * capabilities, you should also call avformat_network_init().
+ *
+ * A supported input format is described by an AVInputFormat struct, conversely
+ * an output format is described by AVOutputFormat. You can iterate over all
+ * registered input/output formats using the av_iformat_next() /
+ * av_oformat_next() functions. The protocols layer is not part of the public
+ * API, so you can only get the names of supported protocols with the
+ * avio_enum_protocols() function.
+ *
+ * Main lavf structure used for both muxing and demuxing is AVFormatContext,
+ * which exports all information about the file being read or written. As with
+ * most Libavformat structures, its size is not part of public ABI, so it cannot be
+ * allocated on stack or directly with av_malloc(). To create an
+ * AVFormatContext, use avformat_alloc_context() (some functions, like
+ * avformat_open_input() might do that for you).
+ *
+ * Most importantly an AVFormatContext contains:
+ * @li the @ref AVFormatContext.iformat "input" or @ref AVFormatContext.oformat
+ * "output" format. It is either autodetected or set by user for input;
+ * always set by user for output.
+ * @li an @ref AVFormatContext.streams "array" of AVStreams, which describe all
+ * elementary streams stored in the file. AVStreams are typically referred to
+ * using their index in this array.
+ * @li an @ref AVFormatContext.pb "I/O context". It is either opened by lavf or
+ * set by user for input, always set by user for output (unless you are dealing
+ * with an AVFMT_NOFILE format).
+ *
+ * @section lavf_options Passing options to (de)muxers
+ * It is possible to configure lavf muxers and demuxers using the @ref avoptions
+ * mechanism. Generic (format-independent) libavformat options are provided by
+ * AVFormatContext, they can be examined from a user program by calling
+ * av_opt_next() / av_opt_find() on an allocated AVFormatContext (or its AVClass
+ * from avformat_get_class()). Private (format-specific) options are provided by
+ * AVFormatContext.priv_data if and only if AVInputFormat.priv_class /
+ * AVOutputFormat.priv_class of the corresponding format struct is non-NULL.
+ * Further options may be provided by the @ref AVFormatContext.pb "I/O context",
+ * if its AVClass is non-NULL, and the protocols layer. See the discussion on
+ * nesting in @ref avoptions documentation to learn how to access those.
+ *
+ * @section urls
+ * URL strings in libavformat are made of a scheme/protocol, a ':', and a
+ * scheme specific string. URLs without a scheme and ':' used for local files
+ * are supported but deprecated. "file:" should be used for local files.
+ *
+ * It is important that the scheme string is not taken from untrusted
+ * sources without checks.
+ *
+ * Note that some schemes/protocols are quite powerful, allowing access to
+ * both local and remote files, parts of them, concatenations of them, local
+ * audio and video devices and so on.
+ *
+ * @{
+ *
+ * @defgroup lavf_decoding Demuxing
+ * @{
+ * Demuxers read a media file and split it into chunks of data (@em packets). A
+ * @ref AVPacket "packet" contains one or more encoded frames which belong to a
+ * single elementary stream. In the lavf API this process is represented by the
+ * avformat_open_input() function for opening a file, av_read_frame() for
+ * reading a single packet and finally avformat_close_input(), which does the
+ * cleanup.
+ *
+ * @section lavf_decoding_open Opening a media file
+ * The minimum information required to open a file is its URL, which
+ * is passed to avformat_open_input(), as in the following code:
+ * @code
+ * const char    *url = "file:in.mp3";
+ * AVFormatContext *s = NULL;
+ * int ret = avformat_open_input(&s, url, NULL, NULL);
+ * if (ret < 0)
+ *     abort();
+ * @endcode
+ * The above code attempts to allocate an AVFormatContext, open the
+ * specified file (autodetecting the format) and read the header, exporting the
+ * information stored there into s. Some formats do not have a header or do not
+ * store enough information there, so it is recommended that you call the
+ * avformat_find_stream_info() function which tries to read and decode a few
+ * frames to find missing information.
+ *
+ * In some cases you might want to preallocate an AVFormatContext yourself with
+ * avformat_alloc_context() and do some tweaking on it before passing it to
+ * avformat_open_input(). One such case is when you want to use custom functions
+ * for reading input data instead of lavf internal I/O layer.
+ * To do that, create your own AVIOContext with avio_alloc_context(), passing
+ * your reading callbacks to it. Then set the @em pb field of your
+ * AVFormatContext to newly created AVIOContext.
+ *
+ * Since the format of the opened file is in general not known until after
+ * avformat_open_input() has returned, it is not possible to set demuxer private
+ * options on a preallocated context. Instead, the options should be passed to
+ * avformat_open_input() wrapped in an AVDictionary:
+ * @code
+ * AVDictionary *options = NULL;
+ * av_dict_set(&options, "video_size", "640x480", 0);
+ * av_dict_set(&options, "pixel_format", "rgb24", 0);
+ *
+ * if (avformat_open_input(&s, url, NULL, &options) < 0)
+ *     abort();
+ * av_dict_free(&options);
+ * @endcode
+ * This code passes the private options 'video_size' and 'pixel_format' to the
+ * demuxer. They would be necessary for e.g. the rawvideo demuxer, since it
+ * cannot know how to interpret raw video data otherwise. If the format turns
+ * out to be something different than raw video, those options will not be
+ * recognized by the demuxer and therefore will not be applied. Such unrecognized
+ * options are then returned in the options dictionary (recognized options are
+ * consumed). The calling program can handle such unrecognized options as it
+ * wishes, e.g.
+ * @code
+ * AVDictionaryEntry *e;
+ * if (e = av_dict_get(options, "", NULL, AV_DICT_IGNORE_SUFFIX)) {
+ *     fprintf(stderr, "Option %s not recognized by the demuxer.\n", e->key);
+ *     abort();
+ * }
+ * @endcode
+ *
+ * After you have finished reading the file, you must close it with
+ * avformat_close_input(). It will free everything associated with the file.
+ *
+ * @section lavf_decoding_read Reading from an opened file
+ * Reading data from an opened AVFormatContext is done by repeatedly calling
+ * av_read_frame() on it. Each call, if successful, will return an AVPacket
+ * containing encoded data for one AVStream, identified by
+ * AVPacket.stream_index. This packet may be passed straight into the libavcodec
+ * decoding functions avcodec_send_packet() or avcodec_decode_subtitle2() if the
+ * caller wishes to decode the data.
+ *
+ * AVPacket.pts, AVPacket.dts and AVPacket.duration timing information will be
+ * set if known. They may also be unset (i.e. AV_NOPTS_VALUE for
+ * pts/dts, 0 for duration) if the stream does not provide them. The timing
+ * information will be in AVStream.time_base units, i.e. it has to be
+ * multiplied by the timebase to convert them to seconds.
+ *
+ * If AVPacket.buf is set on the returned packet, then the packet is
+ * allocated dynamically and the user may keep it indefinitely.
+ * Otherwise, if AVPacket.buf is NULL, the packet data is backed by a
+ * static storage somewhere inside the demuxer and the packet is only valid
+ * until the next av_read_frame() call or closing the file. If the caller
+ * requires a longer lifetime, av_dup_packet() will make an av_malloc()ed copy
+ * of it.
+ * In both cases, the packet must be freed with av_packet_unref() when it is no
+ * longer needed.
+ *
+ * @section lavf_decoding_seek Seeking
+ * @}
+ *
+ * @defgroup lavf_encoding Muxing
+ * @{
+ * Muxers take encoded data in the form of @ref AVPacket "AVPackets" and write
+ * it into files or other output bytestreams in the specified container format.
+ *
+ * The main API functions for muxing are avformat_write_header() for writing the
+ * file header, av_write_frame() / av_interleaved_write_frame() for writing the
+ * packets and av_write_trailer() for finalizing the file.
+ *
+ * At the beginning of the muxing process, the caller must first call
+ * avformat_alloc_context() to create a muxing context. The caller then sets up
+ * the muxer by filling the various fields in this context:
+ *
+ * - The @ref AVFormatContext.oformat "oformat" field must be set to select the
+ *   muxer that will be used.
+ * - Unless the format is of the AVFMT_NOFILE type, the @ref AVFormatContext.pb
+ *   "pb" field must be set to an opened IO context, either returned from
+ *   avio_open2() or a custom one.
+ * - Unless the format is of the AVFMT_NOSTREAMS type, at least one stream must
+ *   be created with the avformat_new_stream() function. The caller should fill
+ *   the @ref AVStream.codecpar "stream codec parameters" information, such as the
+ *   codec @ref AVCodecParameters.codec_type "type", @ref AVCodecParameters.codec_id
+ *   "id" and other parameters (e.g. width / height, the pixel or sample format,
+ *   etc.) as known. The @ref AVStream.time_base "stream timebase" should
+ *   be set to the timebase that the caller desires to use for this stream (note
+ *   that the timebase actually used by the muxer can be different, as will be
+ *   described later).
+ * - It is advised to manually initialize only the relevant fields in
+ *   AVCodecParameters, rather than using @ref avcodec_parameters_copy() during
+ *   remuxing: there is no guarantee that the codec context values remain valid
+ *   for both input and output format contexts.
+ * - The caller may fill in additional information, such as @ref
+ *   AVFormatContext.metadata "global" or @ref AVStream.metadata "per-stream"
+ *   metadata, @ref AVFormatContext.chapters "chapters", @ref
+ *   AVFormatContext.programs "programs", etc. as described in the
+ *   AVFormatContext documentation. Whether such information will actually be
+ *   stored in the output depends on what the container format and the muxer
+ *   support.
+ *
+ * When the muxing context is fully set up, the caller must call
+ * avformat_write_header() to initialize the muxer internals and write the file
+ * header. Whether anything actually is written to the IO context at this step
+ * depends on the muxer, but this function must always be called. Any muxer
+ * private options must be passed in the options parameter to this function.
+ *
+ * The data is then sent to the muxer by repeatedly calling av_write_frame() or
+ * av_interleaved_write_frame() (consult those functions' documentation for
+ * discussion on the difference between them; only one of them may be used with
+ * a single muxing context, they should not be mixed). Do note that the timing
+ * information on the packets sent to the muxer must be in the corresponding
+ * AVStream's timebase. That timebase is set by the muxer (in the
+ * avformat_write_header() step) and may be different from the timebase
+ * requested by the caller.
+ *
+ * Once all the data has been written, the caller must call av_write_trailer()
+ * to flush any buffered packets and finalize the output file, then close the IO
+ * context (if any) and finally free the muxing context with
+ * avformat_free_context().
+ * @}
+ *
+ * @defgroup lavf_io I/O Read/Write
+ * @{
+ * @section lavf_io_dirlist Directory listing
+ * The directory listing API makes it possible to list files on remote servers.
+ *
+ * Some possible use cases:
+ * - an "open file" dialog to choose files from a remote location,
+ * - a recursive media finder providing a player with an ability to play all
+ * files from a given directory.
+ *
+ * @subsection lavf_io_dirlist_open Opening a directory
+ * At first, a directory needs to be opened by calling avio_open_dir()
+ * supplied with a URL and, optionally, ::AVDictionary containing
+ * protocol-specific parameters. The function returns zero or positive
+ * integer and allocates AVIODirContext on success.
+ *
+ * @code
+ * AVIODirContext *ctx = NULL;
+ * if (avio_open_dir(&ctx, "smb://example.com/some_dir", NULL) < 0) {
+ *     fprintf(stderr, "Cannot open directory.\n");
+ *     abort();
+ * }
+ * @endcode
+ *
+ * This code tries to open a sample directory using smb protocol without
+ * any additional parameters.
+ *
+ * @subsection lavf_io_dirlist_read Reading entries
+ * Each directory's entry (i.e. file, another directory, anything else
+ * within ::AVIODirEntryType) is represented by AVIODirEntry.
+ * Reading consecutive entries from an opened AVIODirContext is done by
+ * repeatedly calling avio_read_dir() on it. Each call returns zero or
+ * positive integer if successful. Reading can be stopped right after the
+ * NULL entry has been read -- it means there are no entries left to be
+ * read. The following code reads all entries from a directory associated
+ * with ctx and prints their names to standard output.
+ * @code
+ * AVIODirEntry *entry = NULL;
+ * for (;;) {
+ *     if (avio_read_dir(ctx, &entry) < 0) {
+ *         fprintf(stderr, "Cannot list directory.\n");
+ *         abort();
+ *     }
+ *     if (!entry)
+ *         break;
+ *     printf("%s\n", entry->name);
+ *     avio_free_directory_entry(&entry);
+ * }
+ * @endcode
+ * @}
+ *
+ * @defgroup lavf_codec Demuxers
+ * @{
+ * @defgroup lavf_codec_native Native Demuxers
+ * @{
+ * @}
+ * @defgroup lavf_codec_wrappers External library wrappers
+ * @{
+ * @}
+ * @}
+ * @defgroup lavf_protos I/O Protocols
+ * @{
+ * @}
+ * @defgroup lavf_internal Internal
+ * @{
+ * @}
+ * @}
+ */
+
+#include <time.h>
+#include <stdio.h>  /* FILE */
+#include "libavcodec/avcodec.h"
+#include "libavutil/dict.h"
+#include "libavutil/log.h"
+
+#include "avio.h"
+#include "libavformat/version.h"
+
+struct AVFormatContext;
+
+struct AVDeviceInfoList;
+struct AVDeviceCapabilitiesQuery;
+
+/**
+ * @defgroup metadata_api Public Metadata API
+ * @{
+ * @ingroup libavf
+ * The metadata API allows libavformat to export metadata tags to a client
+ * application when demuxing. Conversely it allows a client application to
+ * set metadata when muxing.
+ *
+ * Metadata is exported or set as pairs of key/value strings in the 'metadata'
+ * fields of the AVFormatContext, AVStream, AVChapter and AVProgram structs
+ * using the @ref lavu_dict "AVDictionary" API. Like all strings in FFmpeg,
+ * metadata is assumed to be UTF-8 encoded Unicode. Note that metadata
+ * exported by demuxers isn't checked to be valid UTF-8 in most cases.
+ *
+ * Important concepts to keep in mind:
+ * -  Keys are unique; there can never be 2 tags with the same key. This is
+ *    also meant semantically, i.e., a demuxer should not knowingly produce
+ *    several keys that are literally different but semantically identical.
+ *    E.g., key=Author5, key=Author6. In this example, all authors must be
+ *    placed in the same tag.
+ * -  Metadata is flat, not hierarchical; there are no subtags. If you
+ *    want to store, e.g., the email address of the child of producer Alice
+ *    and actor Bob, that could have key=alice_and_bobs_childs_email_address.
+ * -  Several modifiers can be applied to the tag name. This is done by
+ *    appending a dash character ('-') and the modifier name in the order
+ *    they appear in the list below -- e.g. foo-eng-sort, not foo-sort-eng.
+ *    -  language -- a tag whose value is localized for a particular language
+ *       is appended with the ISO 639-2/B 3-letter language code.
+ *       For example: Author-ger=Michael, Author-eng=Mike
+ *       The original/default language is in the unqualified "Author" tag.
+ *       A demuxer should set a default if it sets any translated tag.
+ *    -  sorting  -- a modified version of a tag that should be used for
+ *       sorting will have '-sort' appended. E.g. artist="The Beatles",
+ *       artist-sort="Beatles, The".
+ * - Some protocols and demuxers support metadata updates. After a successful
+ *   call to av_read_frame(), AVFormatContext.event_flags or AVStream.event_flags
+ *   will be updated to indicate if metadata changed. In order to detect metadata
+ *   changes on a stream, you need to loop through all streams in the AVFormatContext
+ *   and check their individual event_flags.
+ *
+ * -  Demuxers attempt to export metadata in a generic format, however tags
+ *    with no generic equivalents are left as they are stored in the container.
+ *    Follows a list of generic tag names:
+ *
+ @verbatim
+ album        -- name of the set this work belongs to
+ album_artist -- main creator of the set/album, if different from artist.
+                 e.g. "Various Artists" for compilation albums.
+ artist       -- main creator of the work
+ comment      -- any additional description of the file.
+ composer     -- who composed the work, if different from artist.
+ copyright    -- name of copyright holder.
+ creation_time-- date when the file was created, preferably in ISO 8601.
+ date         -- date when the work was created, preferably in ISO 8601.
+ disc         -- number of a subset, e.g. disc in a multi-disc collection.
+ encoder      -- name/settings of the software/hardware that produced the file.
+ encoded_by   -- person/group who created the file.
+ filename     -- original name of the file.
+ genre        -- <self-evident>.
+ language     -- main language in which the work is performed, preferably
+                 in ISO 639-2 format. Multiple languages can be specified by
+                 separating them with commas.
+ performer    -- artist who performed the work, if different from artist.
+                 E.g. for "Also sprach Zarathustra", artist would be "Richard
+                 Strauss" and performer "London Philharmonic Orchestra".
+ publisher    -- name of the label/publisher.
+ service_name     -- name of the service in broadcasting (channel name).
+ service_provider -- name of the service provider in broadcasting.
+ title        -- name of the work.
+ track        -- number of this work in the set, can be in form current/total.
+ variant_bitrate -- the total bitrate of the bitrate variant that the current stream is part of
+ @endverbatim
+ *
+ * Look in the examples section for an application example how to use the Metadata API.
+ *
+ * @}
+ */
+
+/* packet functions */
+
+
+/**
+ * Allocate and read the payload of a packet and initialize its
+ * fields with default values.
+ *
+ * @param s    associated IO context
+ * @param pkt packet
+ * @param size desired payload size
+ * @return >0 (read size) if OK, AVERROR_xxx otherwise
+ */
+int av_get_packet(AVIOContext *s, AVPacket *pkt, int size);
+
+
+/**
+ * Read data and append it to the current content of the AVPacket.
+ * If pkt->size is 0 this is identical to av_get_packet.
+ * Note that this uses av_grow_packet and thus involves a realloc
+ * which is inefficient. Thus this function should only be used
+ * when there is no reasonable way to know (an upper bound of)
+ * the final size.
+ *
+ * @param s    associated IO context
+ * @param pkt packet
+ * @param size amount of data to read
+ * @return >0 (read size) if OK, AVERROR_xxx otherwise, previous data
+ *         will not be lost even if an error occurs.
+ */
+int av_append_packet(AVIOContext *s, AVPacket *pkt, int size);
+
+#if FF_API_LAVF_FRAC
+/*************************************************/
+/* fractional numbers for exact pts handling */
+
+/**
+ * The exact value of the fractional number is: 'val + num / den'.
+ * num is assumed to be 0 <= num < den.
+ */
+typedef struct AVFrac {
+    int64_t val, num, den;
+} AVFrac;
+#endif
+
+/*************************************************/
+/* input/output formats */
+
+struct AVCodecTag;
+
+/**
+ * This structure contains the data a format has to probe a file.
+ */
+typedef struct AVProbeData {
+    const char *filename;
+    unsigned char *buf; /**< Buffer must have AVPROBE_PADDING_SIZE of extra allocated bytes filled with zero. */
+    int buf_size;       /**< Size of buf except extra allocated bytes */
+    const char *mime_type; /**< mime_type, when known. */
+} AVProbeData;
+
+#define AVPROBE_SCORE_RETRY (AVPROBE_SCORE_MAX/4)
+#define AVPROBE_SCORE_STREAM_RETRY (AVPROBE_SCORE_MAX/4-1)
+
+#define AVPROBE_SCORE_EXTENSION  50 ///< score for file extension
+#define AVPROBE_SCORE_MIME       75 ///< score for file mime type
+#define AVPROBE_SCORE_MAX       100 ///< maximum score
+
+#define AVPROBE_PADDING_SIZE 32             ///< extra allocated bytes at the end of the probe buffer
+
+/// Demuxer will use avio_open, no opened file should be provided by the caller.
+#define AVFMT_NOFILE        0x0001
+#define AVFMT_NEEDNUMBER    0x0002 /**< Needs '%d' in filename. */
+#define AVFMT_SHOW_IDS      0x0008 /**< Show format stream IDs numbers. */
+#if FF_API_LAVF_FMT_RAWPICTURE
+#define AVFMT_RAWPICTURE    0x0020 /**< Format wants AVPicture structure for
+                                      raw picture data. @deprecated Not used anymore */
+#endif
+#define AVFMT_GLOBALHEADER  0x0040 /**< Format wants global header. */
+#define AVFMT_NOTIMESTAMPS  0x0080 /**< Format does not need / have any timestamps. */
+#define AVFMT_GENERIC_INDEX 0x0100 /**< Use generic index building code. */
+#define AVFMT_TS_DISCONT    0x0200 /**< Format allows timestamp discontinuities. Note, muxers always require valid (monotone) timestamps */
+#define AVFMT_VARIABLE_FPS  0x0400 /**< Format allows variable fps. */
+#define AVFMT_NODIMENSIONS  0x0800 /**< Format does not need width/height */
+#define AVFMT_NOSTREAMS     0x1000 /**< Format does not require any streams */
+#define AVFMT_NOBINSEARCH   0x2000 /**< Format does not allow to fall back on binary search via read_timestamp */
+#define AVFMT_NOGENSEARCH   0x4000 /**< Format does not allow to fall back on generic search */
+#define AVFMT_NO_BYTE_SEEK  0x8000 /**< Format does not allow seeking by bytes */
+#define AVFMT_ALLOW_FLUSH  0x10000 /**< Format allows flushing. If not set, the muxer will not receive a NULL packet in the write_packet function. */
+#define AVFMT_TS_NONSTRICT 0x20000 /**< Format does not require strictly
+                                        increasing timestamps, but they must
+                                        still be monotonic */
+#define AVFMT_TS_NEGATIVE  0x40000 /**< Format allows muxing negative
+                                        timestamps. If not set the timestamp
+                                        will be shifted in av_write_frame and
+                                        av_interleaved_write_frame so they
+                                        start from 0.
+                                        The user or muxer can override this through
+                                        AVFormatContext.avoid_negative_ts
+                                        */
+
+#define AVFMT_SEEK_TO_PTS   0x4000000 /**< Seeking is based on PTS */
+
+/**
+ * @addtogroup lavf_encoding
+ * @{
+ */
+typedef struct AVOutputFormat {
+    const char *name;
+    /**
+     * Descriptive name for the format, meant to be more human-readable
+     * than name. You should use the NULL_IF_CONFIG_SMALL() macro
+     * to define it.
+     */
+    const char *long_name;
+    const char *mime_type;
+    const char *extensions; /**< comma-separated filename extensions */
+    /* output support */
+    enum AVCodecID audio_codec;    /**< default audio codec */
+    enum AVCodecID video_codec;    /**< default video codec */
+    enum AVCodecID subtitle_codec; /**< default subtitle codec */
+    /**
+     * Can use flags: AVFMT_NOFILE, AVFMT_NEEDNUMBER,
+     * AVFMT_GLOBALHEADER, AVFMT_NOTIMESTAMPS, AVFMT_VARIABLE_FPS,
+     * AVFMT_NODIMENSIONS, AVFMT_NOSTREAMS, AVFMT_ALLOW_FLUSH,
+     * AVFMT_TS_NONSTRICT, AVFMT_TS_NEGATIVE
+     */
+    int flags;
+
+    /**
+     * List of supported codec_id-codec_tag pairs, ordered by "better
+     * choice first". The arrays are all terminated by AV_CODEC_ID_NONE.
+     */
+    const struct AVCodecTag * const *codec_tag;
+
+
+    const AVClass *priv_class; ///< AVClass for the private context
+
+    /*****************************************************************
+     * No fields below this line are part of the public API. They
+     * may not be used outside of libavformat and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    struct AVOutputFormat *next;
+    /**
+     * Size of private data so that it can be allocated in the wrapper.
+     */
+    int priv_data_size;
+
+    int (*write_header)(struct AVFormatContext *);
+    /**
+     * Write a packet. If AVFMT_ALLOW_FLUSH is set in flags,
+     * pkt can be NULL in order to flush data buffered in the muxer.
+     * When flushing, return 0 if there still is more data to flush,
+     * or 1 if everything was flushed and there is no more buffered
+     * data.
+     */
+    int (*write_packet)(struct AVFormatContext *, AVPacket *pkt);
+    int (*write_trailer)(struct AVFormatContext *);
+    /**
+     * Format-specific packet interleaving hook. NOTE(review): the earlier text here (about setting a non-YUV420P pixel format) did not match this signature; semantics presumed from the parameters - confirm.
+     */
+    int (*interleave_packet)(struct AVFormatContext *, AVPacket *out,
+                             AVPacket *in, int flush);
+    /**
+     * Test if the given codec can be stored in this container.
+     *
+     * @return 1 if the codec is supported, 0 if it is not.
+     *         A negative number if unknown.
+     *         MKTAG('A', 'P', 'I', 'C') if the codec is only supported as AV_DISPOSITION_ATTACHED_PIC
+     */
+    int (*query_codec)(enum AVCodecID id, int std_compliance);
+
+    void (*get_output_timestamp)(struct AVFormatContext *s, int stream,
+                                 int64_t *dts, int64_t *wall);
+    /**
+     * Allows sending messages from application to device.
+     */
+    int (*control_message)(struct AVFormatContext *s, int type,
+                           void *data, size_t data_size);
+
+    /**
+     * Write an uncoded AVFrame.
+     *
+     * See av_write_uncoded_frame() for details.
+     *
+     * The library will free *frame afterwards, but the muxer can prevent it
+     * by setting the pointer to NULL.
+     */
+    int (*write_uncoded_frame)(struct AVFormatContext *, int stream_index,
+                               AVFrame **frame, unsigned flags);
+    /**
+     * Returns device list with its properties.
+     * @see avdevice_list_devices() for more details.
+     */
+    int (*get_device_list)(struct AVFormatContext *s, struct AVDeviceInfoList *device_list);
+    /**
+     * Initialize device capabilities submodule.
+     * @see avdevice_capabilities_create() for more details.
+     */
+    int (*create_device_capabilities)(struct AVFormatContext *s, struct AVDeviceCapabilitiesQuery *caps);
+    /**
+     * Free device capabilities submodule.
+     * @see avdevice_capabilities_free() for more details.
+     */
+    int (*free_device_capabilities)(struct AVFormatContext *s, struct AVDeviceCapabilitiesQuery *caps);
+    enum AVCodecID data_codec; /**< default data codec */
+    /**
+     * Initialize format. May allocate data here, and set any AVFormatContext or
+     * AVStream parameters that need to be set before packets are sent.
+     * This method must not write output.
+     *
+     * Return 0 if streams were fully configured, 1 if not, negative AVERROR on failure
+     *
+     * Any allocations made here must be freed in deinit().
+     */
+    int (*init)(struct AVFormatContext *);
+    /**
+     * Deinitialize format. If present, this is called whenever the muxer is being
+     * destroyed, regardless of whether or not the header has been written.
+     *
+     * If a trailer is being written, this is called after write_trailer().
+     *
+     * This is called if init() fails as well.
+     */
+    void (*deinit)(struct AVFormatContext *);
+    /**
+     * Set up any necessary bitstream filtering and extract any extra data needed
+     * for the global header.
+     * Return 0 if more packets from this stream must be checked; 1 if not.
+     */
+    int (*check_bitstream)(struct AVFormatContext *, const AVPacket *pkt);
+} AVOutputFormat;
+/**
+ * @}
+ */
+
+/**
+ * @addtogroup lavf_decoding
+ * @{
+ */
+typedef struct AVInputFormat {
+    /**
+     * A comma separated list of short names for the format. New names
+     * may be appended with a minor bump.
+     */
+    const char *name;
+
+    /**
+     * Descriptive name for the format, meant to be more human-readable
+     * than name. You should use the NULL_IF_CONFIG_SMALL() macro
+     * to define it.
+     */
+    const char *long_name;
+
+    /**
+     * Can use flags: AVFMT_NOFILE, AVFMT_NEEDNUMBER, AVFMT_SHOW_IDS,
+     * AVFMT_GENERIC_INDEX, AVFMT_TS_DISCONT, AVFMT_NOBINSEARCH,
+     * AVFMT_NOGENSEARCH, AVFMT_NO_BYTE_SEEK, AVFMT_SEEK_TO_PTS.
+     */
+    int flags;
+
+    /**
+     * If extensions are defined, then no probe is done. You should
+     * usually not use extension format guessing because it is not
+     * reliable enough.
+     */
+    const char *extensions;
+
+    const struct AVCodecTag * const *codec_tag;
+
+    const AVClass *priv_class; ///< AVClass for the private context
+
+    /**
+     * Comma-separated list of mime types.
+     * It is used to check for matching mime types while probing.
+     * @see av_probe_input_format2
+     */
+    const char *mime_type;
+
+    /*****************************************************************
+     * No fields below this line are part of the public API. They
+     * may not be used outside of libavformat and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    struct AVInputFormat *next;
+
+    /**
+     * Raw demuxers store their codec ID here.
+     */
+    int raw_codec_id;
+
+    /**
+     * Size of private data so that it can be allocated in the wrapper.
+     */
+    int priv_data_size;
+
+    /**
+     * Tell if a given file has a chance of being parsed as this format.
+     * The buffer provided is guaranteed to be AVPROBE_PADDING_SIZE bytes
+     * big so you do not have to check for that unless you need more.
+     */
+    int (*read_probe)(AVProbeData *);
+
+    /**
+     * Read the format header and initialize the AVFormatContext
+     * structure. Return 0 if OK. 'avformat_new_stream' should be
+     * called to create new streams.
+     */
+    int (*read_header)(struct AVFormatContext *);
+
+    /**
+     * Read one packet and put it in 'pkt'. pts and flags are also
+     * set. 'avformat_new_stream' can be called only if the flag
+     * AVFMTCTX_NOHEADER is used and only in the calling thread (not in a
+     * background thread).
+     * @return 0 on success, < 0 on error.
+     *         When returning an error, pkt must not have been allocated
+     *         or must be freed before returning
+     */
+    int (*read_packet)(struct AVFormatContext *, AVPacket *pkt);
+
+    /**
+     * Close the stream. The AVFormatContext and AVStreams are not
+     * freed by this function
+     */
+    int (*read_close)(struct AVFormatContext *);
+
+    /**
+     * Seek to a given timestamp relative to the frames in
+     * stream component stream_index.
+     * @param stream_index Must not be -1.
+     * @param flags Selects which direction should be preferred if no exact
+     *              match is available.
+     * @return >= 0 on success (but not necessarily the new offset)
+     */
+    int (*read_seek)(struct AVFormatContext *,
+                     int stream_index, int64_t timestamp, int flags);
+
+    /**
+     * Get the next timestamp in stream[stream_index].time_base units.
+     * @return the timestamp or AV_NOPTS_VALUE if an error occurred
+     */
+    int64_t (*read_timestamp)(struct AVFormatContext *s, int stream_index,
+                              int64_t *pos, int64_t pos_limit);
+
+    /**
+     * Start/resume playing - only meaningful if using a network-based format
+     * (RTSP).
+     */
+    int (*read_play)(struct AVFormatContext *);
+
+    /**
+     * Pause playing - only meaningful if using a network-based format
+     * (RTSP).
+     */
+    int (*read_pause)(struct AVFormatContext *);
+
+    /**
+     * Seek to timestamp ts.
+     * Seeking will be done so that the point from which all active streams
+     * can be presented successfully will be closest to ts and within min/max_ts.
+     * Active streams are all streams that have AVStream.discard < AVDISCARD_ALL.
+     */
+    int (*read_seek2)(struct AVFormatContext *s, int stream_index, int64_t min_ts, int64_t ts, int64_t max_ts, int flags);
+
+    /**
+     * Returns device list with its properties.
+     * @see avdevice_list_devices() for more details.
+     */
+    int (*get_device_list)(struct AVFormatContext *s, struct AVDeviceInfoList *device_list);
+
+    /**
+     * Initialize device capabilities submodule.
+     * @see avdevice_capabilities_create() for more details.
+     */
+    int (*create_device_capabilities)(struct AVFormatContext *s, struct AVDeviceCapabilitiesQuery *caps);
+
+    /**
+     * Free device capabilities submodule.
+     * @see avdevice_capabilities_free() for more details.
+     */
+    int (*free_device_capabilities)(struct AVFormatContext *s, struct AVDeviceCapabilitiesQuery *caps);
+} AVInputFormat;
+/**
+ * @}
+ */
+
+enum AVStreamParseType {
+    AVSTREAM_PARSE_NONE,
+    AVSTREAM_PARSE_FULL,       /**< full parsing and repack */
+    AVSTREAM_PARSE_HEADERS,    /**< Only parse headers, do not repack. */
+    AVSTREAM_PARSE_TIMESTAMPS, /**< full parsing and interpolation of timestamps for frames not starting on a packet boundary */
+    AVSTREAM_PARSE_FULL_ONCE,  /**< full parsing and repack of the first frame only, only implemented for H.264 currently */
+    AVSTREAM_PARSE_FULL_RAW=MKTAG(0,'R','A','W'),       /**< full parsing and repack with timestamp and position generation by parser for raw
+                                                             this assumes that each packet in the file contains no demuxer level headers and
+                                                             just codec level data, otherwise position generation would fail */
+};
+
+typedef struct AVIndexEntry {
+    int64_t pos;
+    int64_t timestamp;        /**<
+                               * Timestamp in AVStream.time_base units, preferably the time from which on correctly decoded frames are available
+                               * when seeking to this entry. That means preferably PTS on keyframe based formats.
+                               * But demuxers can choose to store a different timestamp, if it is more convenient for the implementation or nothing better
+                               * is known.
+                               */
+#define AVINDEX_KEYFRAME 0x0001
+#define AVINDEX_DISCARD_FRAME  0x0002    /**
+                                          * Flag is used to indicate which frame should be discarded after decoding.
+                                          */
+    int flags:2;
+    int size:30; //Yeah, trying to keep the size of this small to reduce memory requirements (it is 24 vs. 32 bytes due to possible 8-byte alignment).
+    int min_distance;         /**< Minimum distance between this and the previous keyframe, used to avoid unneeded searching. */
+} AVIndexEntry;
+
+#define AV_DISPOSITION_DEFAULT   0x0001
+#define AV_DISPOSITION_DUB       0x0002
+#define AV_DISPOSITION_ORIGINAL  0x0004
+#define AV_DISPOSITION_COMMENT   0x0008
+#define AV_DISPOSITION_LYRICS    0x0010
+#define AV_DISPOSITION_KARAOKE   0x0020
+
+/**
+ * Track should be used during playback by default.
+ * Useful for subtitle tracks that should be displayed
+ * even when the user did not explicitly ask for subtitles.
+ */
+#define AV_DISPOSITION_FORCED    0x0040
+#define AV_DISPOSITION_HEARING_IMPAIRED  0x0080  /**< stream for hearing impaired audiences */
+#define AV_DISPOSITION_VISUAL_IMPAIRED   0x0100  /**< stream for visually impaired audiences */
+#define AV_DISPOSITION_CLEAN_EFFECTS     0x0200  /**< stream without voice */
+/**
+ * The stream is stored in the file as an attached picture/"cover art" (e.g.
+ * APIC frame in ID3v2). The first (usually only) packet associated with it
+ * will be returned among the first few packets read from the file unless
+ * seeking takes place. It can also be accessed at any time in
+ * AVStream.attached_pic.
+ */
+#define AV_DISPOSITION_ATTACHED_PIC      0x0400
+/**
+ * The stream is sparse, and contains thumbnail images, often corresponding
+ * to chapter markers. Only ever used with AV_DISPOSITION_ATTACHED_PIC.
+ */
+#define AV_DISPOSITION_TIMED_THUMBNAILS  0x0800
+
+typedef struct AVStreamInternal AVStreamInternal;
+
+/**
+ * To specify text track kind (different from subtitles default).
+ */
+#define AV_DISPOSITION_CAPTIONS     0x10000
+#define AV_DISPOSITION_DESCRIPTIONS 0x20000
+#define AV_DISPOSITION_METADATA     0x40000
+
+/**
+ * Options for behavior on timestamp wrap detection.
+ */
+#define AV_PTS_WRAP_IGNORE      0   ///< ignore the wrap
+#define AV_PTS_WRAP_ADD_OFFSET  1   ///< add the format specific offset on wrap detection
+#define AV_PTS_WRAP_SUB_OFFSET  -1  ///< subtract the format specific offset on wrap detection
+
+/**
+ * Stream structure.
+ * New fields can be added to the end with minor version bumps.
+ * Removal, reordering and changes to existing fields require a major
+ * version bump.
+ * sizeof(AVStream) must not be used outside libav*.
+ */
+typedef struct AVStream {
+    int index;    /**< stream index in AVFormatContext */
+    /**
+     * Format-specific stream ID.
+     * decoding: set by libavformat
+     * encoding: set by the user, replaced by libavformat if left unset
+     */
+    int id;
+#if FF_API_LAVF_AVCTX
+    /**
+     * @deprecated use the codecpar struct instead
+     */
+    attribute_deprecated
+    AVCodecContext *codec;
+#endif
+    void *priv_data;
+
+#if FF_API_LAVF_FRAC
+    /**
+     * @deprecated this field is unused
+     */
+    attribute_deprecated
+    struct AVFrac pts;
+#endif
+
+    /**
+     * This is the fundamental unit of time (in seconds) in terms
+     * of which frame timestamps are represented.
+     *
+     * decoding: set by libavformat
+     * encoding: May be set by the caller before avformat_write_header() to
+     *           provide a hint to the muxer about the desired timebase. In
+     *           avformat_write_header(), the muxer will overwrite this field
+     *           with the timebase that will actually be used for the timestamps
+     *           written into the file (which may or may not be related to the
+     *           user-provided one, depending on the format).
+     */
+    AVRational time_base;
+
+    /**
+     * Decoding: pts of the first frame of the stream in presentation order, in stream time base.
+     * Only set this if you are absolutely 100% sure that the value you set
+     * it to really is the pts of the first frame.
+     * This may be undefined (AV_NOPTS_VALUE).
+     * @note The ASF header does NOT contain a correct start_time, so the ASF
+     * demuxer must NOT set this.
+     */
+    int64_t start_time;
+
+    /**
+     * Decoding: duration of the stream, in stream time base.
+     * If a source file does not specify a duration, but does specify
+     * a bitrate, this value will be estimated from bitrate and file size.
+     *
+     * Encoding: May be set by the caller before avformat_write_header() to
+     * provide a hint to the muxer about the estimated duration.
+     */
+    int64_t duration;
+
+    int64_t nb_frames;                 ///< number of frames in this stream if known or 0
+
+    int disposition; /**< AV_DISPOSITION_* bit field */
+
+    enum AVDiscard discard; ///< Selects which packets can be discarded at will and do not need to be demuxed.
+
+    /**
+     * sample aspect ratio (0 if unknown)
+     * - encoding: Set by user.
+     * - decoding: Set by libavformat.
+     */
+    AVRational sample_aspect_ratio;
+
+    AVDictionary *metadata;
+
+    /**
+     * Average framerate
+     *
+     * - demuxing: May be set by libavformat when creating the stream or in
+     *             avformat_find_stream_info().
+     * - muxing: May be set by the caller before avformat_write_header().
+     */
+    AVRational avg_frame_rate;
+
+    /**
+     * For streams with AV_DISPOSITION_ATTACHED_PIC disposition, this packet
+     * will contain the attached picture.
+     *
+     * decoding: set by libavformat, must not be modified by the caller.
+     * encoding: unused
+     */
+    AVPacket attached_pic;
+
+    /**
+     * An array of side data that applies to the whole stream (i.e. the
+     * container does not allow it to change between packets).
+     *
+     * There may be no overlap between the side data in this array and side data
+     * in the packets. I.e. a given side data is either exported by the muxer
+     * (demuxing) / set by the caller (muxing) in this array, then it never
+     * appears in the packets, or the side data is exported / sent through
+     * the packets (always in the first packet where the value becomes known or
+     * changes), then it does not appear in this array.
+     *
+     * - demuxing: Set by libavformat when the stream is created.
+     * - muxing: May be set by the caller before avformat_write_header().
+     *
+     * Freed by libavformat in avformat_free_context().
+     *
+     * @see av_format_inject_global_side_data()
+     */
+    AVPacketSideData *side_data;
+    /**
+     * The number of elements in the AVStream.side_data array.
+     */
+    int            nb_side_data;
+
+    /**
+     * Flags for the user to detect events happening on the stream. Flags must
+     * be cleared by the user once the event has been handled.
+     * A combination of AVSTREAM_EVENT_FLAG_*.
+     */
+    int event_flags;
+#define AVSTREAM_EVENT_FLAG_METADATA_UPDATED 0x0001 ///< The call resulted in updated metadata.
+
+    /*****************************************************************
+     * All fields below this line are not part of the public API. They
+     * may not be used outside of libavformat and can be changed and
+     * removed at will.
+     * Internal note: be aware that physically removing these fields
+     * will break ABI. Replace removed fields with dummy fields, and
+     * add new fields to AVStreamInternal.
+     *****************************************************************
+     */
+
+    /**
+     * Stream information used internally by avformat_find_stream_info()
+     */
+#define MAX_STD_TIMEBASES (30*12+30+3+6)
+    struct {
+        int64_t last_dts;
+        int64_t duration_gcd;
+        int duration_count;
+        int64_t rfps_duration_sum;
+        double (*duration_error)[2][MAX_STD_TIMEBASES];
+        int64_t codec_info_duration;
+        int64_t codec_info_duration_fields;
+
+        /**
+         * 0  -> decoder has not been searched for yet.
+         * >0 -> decoder found
+         * <0 -> decoder with codec_id == -found_decoder has not been found
+         */
+        int found_decoder;
+
+        int64_t last_duration;
+
+        /**
+         * Those are used for average framerate estimation.
+         */
+        int64_t fps_first_dts;
+        int     fps_first_dts_idx;
+        int64_t fps_last_dts;
+        int     fps_last_dts_idx;
+
+    } *info;
+
+    int pts_wrap_bits; /**< number of bits in pts (used for wrapping control) */
+
+    // Timestamp generation support:
+    /**
+     * Timestamp corresponding to the last dts sync point.
+     *
+     * Initialized when AVCodecParserContext.dts_sync_point >= 0 and
+     * a DTS is received from the underlying container. Otherwise set to
+     * AV_NOPTS_VALUE by default.
+     */
+    int64_t first_dts;
+    int64_t cur_dts;
+    int64_t last_IP_pts;
+    int last_IP_duration;
+
+    /**
+     * Number of packets to buffer for codec probing
+     */
+    int probe_packets;
+
+    /**
+     * Number of frames that have been demuxed during avformat_find_stream_info()
+     */
+    int codec_info_nb_frames;
+
+    /* av_read_frame() support */
+    enum AVStreamParseType need_parsing;
+    struct AVCodecParserContext *parser;
+
+    /**
+     * last packet in packet_buffer for this stream when muxing.
+     */
+    struct AVPacketList *last_in_packet_buffer;
+    AVProbeData probe_data;
+#define MAX_REORDER_DELAY 16
+    int64_t pts_buffer[MAX_REORDER_DELAY+1];
+
+    AVIndexEntry *index_entries; /**< Only used if the format does not
+                                    support seeking natively. */
+    int nb_index_entries;
+    unsigned int index_entries_allocated_size;
+
+    /**
+     * Real base framerate of the stream.
+     * This is the lowest framerate with which all timestamps can be
+     * represented accurately (it is the least common multiple of all
+     * framerates in the stream). Note, this value is just a guess!
+     * For example, if the time base is 1/90000 and all frames have either
+     * approximately 3600 or 1800 timer ticks, then r_frame_rate will be 50/1.
+     *
+     * Code outside avformat should access this field using:
+     * av_stream_get/set_r_frame_rate(stream)
+     */
+    AVRational r_frame_rate;
+
+    /**
+     * Stream Identifier
+     * This is the MPEG-TS stream identifier +1
+     * 0 means unknown
+     */
+    int stream_identifier;
+
+    int64_t interleaver_chunk_size;
+    int64_t interleaver_chunk_duration;
+
+    /**
+     * stream probing state
+     * -1   -> probing finished
+     *  0   -> no probing requested
+     * rest -> perform probing with request_probe being the minimum score to accept.
+     * NOT PART OF PUBLIC API
+     */
+    int request_probe;
+    /**
+     * Indicates that everything up to the next keyframe
+     * should be discarded.
+     */
+    int skip_to_keyframe;
+
+    /**
+     * Number of samples to skip at the start of the frame decoded from the next packet.
+     */
+    int skip_samples;
+
+    /**
+     * If not 0, the number of samples that should be skipped from the start of
+     * the stream (the samples are removed from packets with pts==0, which also
+     * assumes negative timestamps do not happen).
+     * Intended for use with formats such as mp3 with ad-hoc gapless audio
+     * support.
+     */
+    int64_t start_skip_samples;
+
+    /**
+     * If not 0, the first audio sample that should be discarded from the stream.
+     * This is broken by design (needs global sample count), but can't be
+     * avoided for broken by design formats such as mp3 with ad-hoc gapless
+     * audio support.
+     */
+    int64_t first_discard_sample;
+
+    /**
+     * The sample after last sample that is intended to be discarded after
+     * first_discard_sample. Works on frame boundaries only. Used to prevent
+     * early EOF if the gapless info is broken (considered concatenated mp3s).
+     */
+    int64_t last_discard_sample;
+
+    /**
+     * Number of internally decoded frames, used internally in libavformat; do not access.
+     * Its lifetime differs from info, which is why it is not in that structure.
+     */
+    int nb_decoded_frames;
+
+    /**
+     * Timestamp offset added to timestamps before muxing
+     * NOT PART OF PUBLIC API
+     */
+    int64_t mux_ts_offset;
+
+    /**
+     * Internal data to check for wrapping of the time stamp
+     */
+    int64_t pts_wrap_reference;
+
+    /**
+     * Options for behavior, when a wrap is detected.
+     *
+     * Defined by AV_PTS_WRAP_ values.
+     *
+     * If correction is enabled, there are two possibilities:
+     * If the first time stamp is near the wrap point, the wrap offset
+     * will be subtracted, which will create negative time stamps.
+     * Otherwise the offset will be added.
+     */
+    int pts_wrap_behavior;
+
+    /**
+     * Internal data to prevent doing update_initial_durations() twice
+     */
+    int update_initial_durations_done;
+
+    /**
+     * Internal data to generate dts from pts
+     */
+    int64_t pts_reorder_error[MAX_REORDER_DELAY+1];
+    uint8_t pts_reorder_error_count[MAX_REORDER_DELAY+1];
+
+    /**
+     * Internal data to analyze DTS and detect faulty mpeg streams
+     */
+    int64_t last_dts_for_order_check;
+    uint8_t dts_ordered;
+    uint8_t dts_misordered;
+
+    /**
+     * Internal data to inject global side data
+     */
+    int inject_global_side_data;
+
+    /*****************************************************************
+     * All fields above this line are not part of the public API.
+     * Fields below are part of the public API and ABI again.
+     *****************************************************************
+     */
+
+    /**
+     * String containing pairs of keys and values describing recommended encoder configuration.
+     * Pairs are separated by ','.
+     * Keys are separated from values by '='.
+     */
+    char *recommended_encoder_configuration;
+
+    /**
+     * display aspect ratio (0 if unknown)
+     * - encoding: unused
+     * - decoding: Set by libavformat to calculate sample_aspect_ratio internally
+     */
+    AVRational display_aspect_ratio;
+
+    struct FFFrac *priv_pts;
+
+    /**
+     * An opaque field for libavformat internal usage.
+     * Must not be accessed in any way by callers.
+     */
+    AVStreamInternal *internal;
+
+    /**
+     * Codec parameters associated with this stream. Allocated and freed by
+     * libavformat in avformat_new_stream() and avformat_free_context()
+     * respectively.
+     *
+     * - demuxing: filled by libavformat on stream creation or in
+     *             avformat_find_stream_info()
+     * - muxing: filled by the caller before avformat_write_header()
+     */
+    AVCodecParameters *codecpar;
+} AVStream;
+
+AVRational av_stream_get_r_frame_rate(const AVStream *s);
+void       av_stream_set_r_frame_rate(AVStream *s, AVRational r);
+struct AVCodecParserContext *av_stream_get_parser(const AVStream *s);
+char* av_stream_get_recommended_encoder_configuration(const AVStream *s);
+void  av_stream_set_recommended_encoder_configuration(AVStream *s, char *configuration);
+
+/**
+ * Returns the pts of the last muxed packet + its duration.
+ *
+ * The returned value is undefined when used with a demuxer.
+ */
+int64_t    av_stream_get_end_pts(const AVStream *st);
+
+#define AV_PROGRAM_RUNNING 1
+
+/**
+ * New fields can be added to the end with minor version bumps.
+ * Removal, reordering and changes to existing fields require a major
+ * version bump.
+ * sizeof(AVProgram) must not be used outside libav*.
+ */
+typedef struct AVProgram {
+    int            id;
+    int            flags;
+    enum AVDiscard discard;        ///< selects which program to discard and which to feed to the caller
+    unsigned int   *stream_index;
+    unsigned int   nb_stream_indexes;
+    AVDictionary *metadata;
+
+    int program_num;
+    int pmt_pid;
+    int pcr_pid;
+
+    /*****************************************************************
+     * All fields below this line are not part of the public API. They
+     * may not be used outside of libavformat and can be changed and
+     * removed at will.
+     * New public fields should be added right above.
+     *****************************************************************
+     */
+    int64_t start_time;
+    int64_t end_time;
+
+    int64_t pts_wrap_reference;    ///< reference dts for wrap detection
+    int pts_wrap_behavior;         ///< behavior on wrap detection
+} AVProgram;
+
+#define AVFMTCTX_NOHEADER      0x0001 /**< signal that no header is present
+                                         (streams are added dynamically) */
+
+typedef struct AVChapter {
+    int id;                 ///< unique ID to identify the chapter
+    AVRational time_base;   ///< time base in which the start/end timestamps are specified
+    int64_t start, end;     ///< chapter start/end time in time_base units
+    AVDictionary *metadata;
+} AVChapter;
+
+
+/**
+ * Callback used by devices to communicate with application.
+ */
+typedef int (*av_format_control_message)(struct AVFormatContext *s, int type,
+                                         void *data, size_t data_size);
+
+typedef int (*AVOpenCallback)(struct AVFormatContext *s, AVIOContext **pb, const char *url, int flags,
+                              const AVIOInterruptCB *int_cb, AVDictionary **options);
+
+/**
+ * The duration of a video can be estimated through various ways, and this enum can be used
+ * to know how the duration was estimated.
+ */
+enum AVDurationEstimationMethod {
+    AVFMT_DURATION_FROM_PTS,    ///< Duration accurately estimated from PTSes
+    AVFMT_DURATION_FROM_STREAM, ///< Duration estimated from a stream with a known duration
+    AVFMT_DURATION_FROM_BITRATE ///< Duration estimated from bitrate (less accurate)
+};
+
+typedef struct AVFormatInternal AVFormatInternal;
+
+/**
+ * Format I/O context.
+ * New fields can be added to the end with minor version bumps.
+ * Removal, reordering and changes to existing fields require a major
+ * version bump.
+ * sizeof(AVFormatContext) must not be used outside libav*, use
+ * avformat_alloc_context() to create an AVFormatContext.
+ *
+ * Fields can be accessed through AVOptions (av_opt*),
+ * the name string used matches the associated command line parameter name and
+ * can be found in libavformat/options_table.h.
+ * The AVOption/command line parameter names differ in some cases from the C
+ * structure field names for historic reasons or brevity.
+ */
+typedef struct AVFormatContext {
+    /**
+     * A class for logging and @ref avoptions. Set by avformat_alloc_context().
+     * Exports (de)muxer private options if they exist.
+     */
+    const AVClass *av_class;
+
+    /**
+     * The input container format.
+     *
+     * Demuxing only, set by avformat_open_input().
+     */
+    struct AVInputFormat *iformat;
+
+    /**
+     * The output container format.
+     *
+     * Muxing only, must be set by the caller before avformat_write_header().
+     */
+    struct AVOutputFormat *oformat;
+
+    /**
+     * Format private data. This is an AVOptions-enabled struct
+     * if and only if iformat/oformat.priv_class is not NULL.
+     *
+     * - muxing: set by avformat_write_header()
+     * - demuxing: set by avformat_open_input()
+     */
+    void *priv_data;
+
+    /**
+     * I/O context.
+     *
+     * - demuxing: either set by the user before avformat_open_input() (then
+     *             the user must close it manually) or set by avformat_open_input().
+     * - muxing: set by the user before avformat_write_header(). The caller must
+     *           take care of closing / freeing the IO context.
+     *
+     * Do NOT set this field if AVFMT_NOFILE flag is set in
+     * iformat/oformat.flags. In such a case, the (de)muxer will handle
+     * I/O in some other way and this field will be NULL.
+     */
+    AVIOContext *pb;
+
+    /* stream info */
+    /**
+     * Flags signalling stream properties. A combination of AVFMTCTX_*.
+     * Set by libavformat.
+     */
+    int ctx_flags;
+
+    /**
+     * Number of elements in AVFormatContext.streams.
+     *
+     * Set by avformat_new_stream(), must not be modified by any other code.
+     */
+    unsigned int nb_streams;
+    /**
+     * A list of all streams in the file. New streams are created with
+     * avformat_new_stream().
+     *
+     * - demuxing: streams are created by libavformat in avformat_open_input().
+     *             If AVFMTCTX_NOHEADER is set in ctx_flags, then new streams may also
+     *             appear in av_read_frame().
+     * - muxing: streams are created by the user before avformat_write_header().
+     *
+     * Freed by libavformat in avformat_free_context().
+     */
+    AVStream **streams;
+
+    /**
+     * input or output filename
+     *
+     * - demuxing: set by avformat_open_input()
+     * - muxing: may be set by the caller before avformat_write_header()
+     */
+    char filename[1024];
+
+    /**
+     * Position of the first frame of the component, in
+     * AV_TIME_BASE fractional seconds. NEVER set this value directly:
+     * It is deduced from the AVStream values.
+     *
+     * Demuxing only, set by libavformat.
+     */
+    int64_t start_time;
+
+    /**
+     * Duration of the stream, in AV_TIME_BASE fractional
+     * seconds. Only set this value if you know none of the individual stream
+     * durations and also do not set any of them. This is deduced from the
+     * AVStream values if not set.
+     *
+     * Demuxing only, set by libavformat.
+     */
+    int64_t duration;
+
+    /**
+     * Total stream bitrate in bit/s, 0 if not
+     * available. Never set it directly if the file_size and the
+     * duration are known as FFmpeg can compute it automatically.
+     */
+    int64_t bit_rate;
+
+    unsigned int packet_size;
+    int max_delay;
+
+    /**
+     * Flags modifying the (de)muxer behaviour. A combination of AVFMT_FLAG_*.
+     * Set by the user before avformat_open_input() / avformat_write_header().
+     */
+    int flags;
+#define AVFMT_FLAG_GENPTS       0x0001 ///< Generate missing pts even if it requires parsing future frames.
+#define AVFMT_FLAG_IGNIDX       0x0002 ///< Ignore index.
+#define AVFMT_FLAG_NONBLOCK     0x0004 ///< Do not block when reading packets from input.
+#define AVFMT_FLAG_IGNDTS       0x0008 ///< Ignore DTS on frames that contain both DTS & PTS
+#define AVFMT_FLAG_NOFILLIN     0x0010 ///< Do not infer any values from other values, just return what is stored in the container
+#define AVFMT_FLAG_NOPARSE      0x0020 ///< Do not use AVParsers, you also must set AVFMT_FLAG_NOFILLIN as the fillin code works on frames and no parsing -> no frames. Also seeking to frames can not work if parsing to find frame boundaries has been disabled
+#define AVFMT_FLAG_NOBUFFER     0x0040 ///< Do not buffer frames when possible
+#define AVFMT_FLAG_CUSTOM_IO    0x0080 ///< The caller has supplied a custom AVIOContext, don't avio_close() it.
+#define AVFMT_FLAG_DISCARD_CORRUPT  0x0100 ///< Discard frames marked corrupted
+#define AVFMT_FLAG_FLUSH_PACKETS    0x0200 ///< Flush the AVIOContext every packet.
+/**
+ * When muxing, try to avoid writing any random/volatile data to the output.
+ * This includes any random IDs, real-time timestamps/dates, muxer version, etc.
+ *
+ * This flag is mainly intended for testing.
+ */
+#define AVFMT_FLAG_BITEXACT         0x0400
+#define AVFMT_FLAG_MP4A_LATM    0x8000 ///< Enable RTP MP4A-LATM payload
+#define AVFMT_FLAG_SORT_DTS    0x10000 ///< try to interleave outputted packets by dts (using this flag can slow demuxing down)
+#define AVFMT_FLAG_PRIV_OPT    0x20000 ///< Enable use of private options by delaying codec open (this could be made default once all code is converted)
+#if FF_API_LAVF_KEEPSIDE_FLAG
+#define AVFMT_FLAG_KEEP_SIDE_DATA 0x40000 ///< Don't merge side data but keep it separate. Deprecated, will be the default.
+#endif
+#define AVFMT_FLAG_FAST_SEEK   0x80000 ///< Enable fast, but inaccurate seeks for some formats
+#define AVFMT_FLAG_SHORTEST   0x100000 ///< Stop muxing when the shortest stream stops.
+#define AVFMT_FLAG_AUTO_BSF   0x200000 ///< Wait for packet data before writing a header, and add bitstream filters as requested by the muxer
+
+    /**
+     * Maximum size of the data read from input for determining
+     * the input container format.
+     * Demuxing only, set by the caller before avformat_open_input().
+     */
+    int64_t probesize;
+
+    /**
+     * Maximum duration (in AV_TIME_BASE units) of the data read
+     * from input in avformat_find_stream_info().
+     * Demuxing only, set by the caller before avformat_find_stream_info().
+     * Can be set to 0 to let avformat choose using a heuristic.
+     */
+    int64_t max_analyze_duration;
+
+    const uint8_t *key;
+    int keylen;
+
+    unsigned int nb_programs;
+    AVProgram **programs;
+
+    /**
+     * Forced video codec_id.
+     * Demuxing: Set by user.
+     */
+    enum AVCodecID video_codec_id;
+
+    /**
+     * Forced audio codec_id.
+     * Demuxing: Set by user.
+     */
+    enum AVCodecID audio_codec_id;
+
+    /**
+     * Forced subtitle codec_id.
+     * Demuxing: Set by user.
+     */
+    enum AVCodecID subtitle_codec_id;
+
+    /**
+     * Maximum amount of memory in bytes to use for the index of each stream.
+     * If the index exceeds this size, entries will be discarded as
+     * needed to maintain a smaller size. This can lead to slower or less
+     * accurate seeking (depends on demuxer).
+     * Demuxers for which a full in-memory index is mandatory will ignore
+     * this.
+     * - muxing: unused
+     * - demuxing: set by user
+     */
+    unsigned int max_index_size;
+
+    /**
+     * Maximum amount of memory in bytes to use for buffering frames
+     * obtained from realtime capture devices.
+     */
+    unsigned int max_picture_buffer;
+
+    /**
+     * Number of chapters in AVChapter array.
+     * When muxing, chapters are normally written in the file header,
+     * so nb_chapters should normally be initialized before write_header
+     * is called. Some muxers (e.g. mov and mkv) can also write chapters
+     * in the trailer.  To write chapters in the trailer, nb_chapters
+     * must be zero when write_header is called and non-zero when
+     * write_trailer is called.
+     * - muxing: set by user
+     * - demuxing: set by libavformat
+     */
+    unsigned int nb_chapters;
+    AVChapter **chapters;
+
+    /**
+     * Metadata that applies to the whole file.
+     *
+     * - demuxing: set by libavformat in avformat_open_input()
+     * - muxing: may be set by the caller before avformat_write_header()
+     *
+     * Freed by libavformat in avformat_free_context().
+     */
+    AVDictionary *metadata;
+
+    /**
+     * Start time of the stream in real world time, in microseconds
+     * since the Unix epoch (00:00 1st January 1970). That is, pts=0 in the
+     * stream was captured at this real world time.
+     * - muxing: Set by the caller before avformat_write_header(). If set to
+     *           either 0 or AV_NOPTS_VALUE, then the current wall-time will
+     *           be used.
+     * - demuxing: Set by libavformat. AV_NOPTS_VALUE if unknown. Note that
+     *             the value may become known after some number of frames
+     *             have been received.
+     */
+    int64_t start_time_realtime;
+
+    /**
+     * The number of frames used for determining the framerate in
+     * avformat_find_stream_info().
+     * Demuxing only, set by the caller before avformat_find_stream_info().
+     */
+    int fps_probe_size;
+
+    /**
+     * Error recognition; higher values will detect more errors but may
+     * misdetect some more or less valid parts as errors.
+     * Demuxing only, set by the caller before avformat_open_input().
+     */
+    int error_recognition;
+
+    /**
+     * Custom interrupt callbacks for the I/O layer.
+     *
+     * demuxing: set by the user before avformat_open_input().
+     * muxing: set by the user before avformat_write_header()
+     * (mainly useful for AVFMT_NOFILE formats). The callback
+     * should also be passed to avio_open2() if it's used to
+     * open the file.
+     */
+    AVIOInterruptCB interrupt_callback;
+
+    /**
+     * Flags to enable debugging.
+     */
+    int debug;
+#define FF_FDEBUG_TS        0x0001
+
+    /**
+     * Maximum buffering duration for interleaving.
+     *
+     * To ensure all the streams are interleaved correctly,
+     * av_interleaved_write_frame() will wait until it has at least one packet
+     * for each stream before actually writing any packets to the output file.
+     * When some streams are "sparse" (i.e. there are large gaps between
+     * successive packets), this can result in excessive buffering.
+     *
+     * This field specifies the maximum difference between the timestamps of the
+     * first and the last packet in the muxing queue, above which libavformat
+     * will output a packet regardless of whether it has queued a packet for all
+     * the streams.
+     *
+     * Muxing only, set by the caller before avformat_write_header().
+     */
+    int64_t max_interleave_delta;
+
+    /**
+     * Allow non-standard and experimental extension
+     * @see AVCodecContext.strict_std_compliance
+     */
+    int strict_std_compliance;
+
+    /**
+     * Flags for the user to detect events happening on the file. Flags must
+     * be cleared by the user once the event has been handled.
+     * A combination of AVFMT_EVENT_FLAG_*.
+     */
+    int event_flags;
+#define AVFMT_EVENT_FLAG_METADATA_UPDATED 0x0001 ///< The call resulted in updated metadata.
+
+    /**
+     * Maximum number of packets to read while waiting for the first timestamp.
+     * Decoding only.
+     */
+    int max_ts_probe;
+
+    /**
+     * Avoid negative timestamps during muxing.
+     * Any value of the AVFMT_AVOID_NEG_TS_* constants.
+     * Note, this only works when using av_interleaved_write_frame. (interleave_packet_per_dts is in use)
+     * - muxing: Set by user
+     * - demuxing: unused
+     */
+    int avoid_negative_ts;
+#define AVFMT_AVOID_NEG_TS_AUTO             -1 ///< Enabled when required by target format
+#define AVFMT_AVOID_NEG_TS_MAKE_NON_NEGATIVE 1 ///< Shift timestamps so they are non negative
+#define AVFMT_AVOID_NEG_TS_MAKE_ZERO         2 ///< Shift timestamps so that they start at 0
+
+    /**
+     * Transport stream id.
+     * This will be moved into demuxer private options. Thus no API/ABI compatibility
+     */
+    int ts_id;
+
+    /**
+     * Audio preload in microseconds.
+     * Note, not all formats support this and unpredictable things may happen if it is used when not supported.
+     * - encoding: Set by user
+     * - decoding: unused
+     */
+    int audio_preload;
+
+    /**
+     * Max chunk time in microseconds.
+     * Note, not all formats support this and unpredictable things may happen if it is used when not supported.
+     * - encoding: Set by user
+     * - decoding: unused
+     */
+    int max_chunk_duration;
+
+    /**
+     * Max chunk size in bytes
+     * Note, not all formats support this and unpredictable things may happen if it is used when not supported.
+     * - encoding: Set by user
+     * - decoding: unused
+     */
+    int max_chunk_size;
+
+    /**
+     * forces the use of wallclock timestamps as pts/dts of packets
+     * This has undefined results in the presence of B frames.
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    int use_wallclock_as_timestamps;
+
+    /**
+     * avio flags, used to force AVIO_FLAG_DIRECT.
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    int avio_flags;
+
+    /**
+     * The duration field can be estimated through various ways, and this field can be used
+     * to know how the duration was estimated.
+     * - encoding: unused
+     * - decoding: Read by user
+     */
+    enum AVDurationEstimationMethod duration_estimation_method;
+
+    /**
+     * Skip initial bytes when opening stream
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    int64_t skip_initial_bytes;
+
+    /**
+     * Correct single timestamp overflows
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    unsigned int correct_ts_overflow;
+
+    /**
+     * Force seeking to any (also non key) frames.
+     * - encoding: unused
+     * - decoding: Set by user
+     */
+    int seek2any;
+
+    /**
+     * Flush the I/O context after each packet.
+     * - encoding: Set by user
+     * - decoding: unused
+     */
+    int flush_packets;
+
+    /**
+     * format probing score.
+     * The maximal score is AVPROBE_SCORE_MAX; it is set when the demuxer probes
+     * the format.
+     * - encoding: unused
+     * - decoding: set by avformat, read by user
+     */
+    int probe_score;
+
+    /**
+     * number of bytes to read maximally to identify format.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    int format_probesize;
+
+    /**
+     * ',' separated list of allowed decoders.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *codec_whitelist;
+
+    /**
+     * ',' separated list of allowed demuxers.
+     * If NULL then all are allowed
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *format_whitelist;
+
+    /**
+     * An opaque field for libavformat internal usage.
+     * Must not be accessed in any way by callers.
+     */
+    AVFormatInternal *internal;
+
+    /**
+     * IO repositioned flag.
+     * This is set by avformat when the underlying IO context read pointer
+     * is repositioned, for example when doing byte based seeking.
+     * Demuxers can use the flag to detect such changes.
+     */
+    int io_repositioned;
+
+    /**
+     * Forced video codec.
+     * This allows forcing a specific decoder, even when there are multiple with
+     * the same codec_id.
+     * Demuxing: Set by user
+     */
+    AVCodec *video_codec;
+
+    /**
+     * Forced audio codec.
+     * This allows forcing a specific decoder, even when there are multiple with
+     * the same codec_id.
+     * Demuxing: Set by user
+     */
+    AVCodec *audio_codec;
+
+    /**
+     * Forced subtitle codec.
+     * This allows forcing a specific decoder, even when there are multiple with
+     * the same codec_id.
+     * Demuxing: Set by user
+     */
+    AVCodec *subtitle_codec;
+
+    /**
+     * Forced data codec.
+     * This allows forcing a specific decoder, even when there are multiple with
+     * the same codec_id.
+     * Demuxing: Set by user
+     */
+    AVCodec *data_codec;
+
+    /**
+     * Number of bytes to be written as padding in a metadata header.
+     * Demuxing: Unused.
+     * Muxing: Set by user via av_format_set_metadata_header_padding.
+     */
+    int metadata_header_padding;
+
+    /**
+     * User data.
+     * This is a place for some private data of the user.
+     */
+    void *opaque;
+
+    /**
+     * Callback used by devices to communicate with application.
+     */
+    av_format_control_message control_message_cb;
+
+    /**
+     * Output timestamp offset, in microseconds.
+     * Muxing: set by user
+     */
+    int64_t output_ts_offset;
+
+    /**
+     * dump format separator.
+     * can be ", " or "\n      " or anything else
+     * - muxing: Set by user.
+     * - demuxing: Set by user.
+     */
+    uint8_t *dump_separator;
+
+    /**
+     * Forced Data codec_id.
+     * Demuxing: Set by user.
+     */
+    enum AVCodecID data_codec_id;
+
+#if FF_API_OLD_OPEN_CALLBACKS
+    /**
+     * Called to open further IO contexts when needed for demuxing.
+     *
+     * This can be set by the user application to perform security checks on
+     * the URLs before opening them.
+     * The function should behave like avio_open2(), AVFormatContext is provided
+     * as contextual information and to reach AVFormatContext.opaque.
+     *
+     * If NULL then some simple checks are used together with avio_open2().
+     *
+     * Must not be accessed directly from outside avformat.
+     * @see av_format_set_open_cb()
+     *
+     * Demuxing: Set by user.
+     *
+     * @deprecated Use io_open and io_close.
+     */
+    attribute_deprecated
+    int (*open_cb)(struct AVFormatContext *s, AVIOContext **p, const char *url, int flags, const AVIOInterruptCB *int_cb, AVDictionary **options);
+#endif
+
+    /**
+     * ',' separated list of allowed protocols.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *protocol_whitelist;
+
+    /**
+     * A callback for opening new IO streams.
+     *
+     * Whenever a muxer or a demuxer needs to open an IO stream (typically from
+     * avformat_open_input() for demuxers, but for certain formats can happen at
+     * other times as well), it will call this callback to obtain an IO context.
+     *
+     * @param s the format context
+     * @param pb on success, the newly opened IO context should be returned here
+     * @param url the url to open
+     * @param flags a combination of AVIO_FLAG_*
+     * @param options a dictionary of additional options, with the same
+     *                semantics as in avio_open2()
+     * @return 0 on success, a negative AVERROR code on failure
+     *
+     * @note Certain muxers and demuxers do nesting, i.e. they open one or more
+     * additional internal format contexts. Thus the AVFormatContext pointer
+     * passed to this callback may be different from the one facing the caller.
+     * It will, however, have the same 'opaque' field.
+     */
+    int (*io_open)(struct AVFormatContext *s, AVIOContext **pb, const char *url,
+                   int flags, AVDictionary **options);
+
+    /**
+     * A callback for closing the streams opened with AVFormatContext.io_open().
+     */
+    void (*io_close)(struct AVFormatContext *s, AVIOContext *pb);
+
+    /**
+     * ',' separated list of disallowed protocols.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    char *protocol_blacklist;
+
+    /**
+     * The maximum number of streams.
+     * - encoding: unused
+     * - decoding: set by user
+     */
+    int max_streams;
+} AVFormatContext;
+
+/**
+ * Accessors for some AVFormatContext fields. These used to be provided for ABI
+ * compatibility, and do not need to be used anymore.
+ */
+int av_format_get_probe_score(const AVFormatContext *s);
+AVCodec * av_format_get_video_codec(const AVFormatContext *s);
+void      av_format_set_video_codec(AVFormatContext *s, AVCodec *c);
+AVCodec * av_format_get_audio_codec(const AVFormatContext *s);
+void      av_format_set_audio_codec(AVFormatContext *s, AVCodec *c);
+AVCodec * av_format_get_subtitle_codec(const AVFormatContext *s);
+void      av_format_set_subtitle_codec(AVFormatContext *s, AVCodec *c);
+AVCodec * av_format_get_data_codec(const AVFormatContext *s);
+void      av_format_set_data_codec(AVFormatContext *s, AVCodec *c);
+int       av_format_get_metadata_header_padding(const AVFormatContext *s);
+void      av_format_set_metadata_header_padding(AVFormatContext *s, int c);
+void *    av_format_get_opaque(const AVFormatContext *s);
+void      av_format_set_opaque(AVFormatContext *s, void *opaque);
+av_format_control_message av_format_get_control_message_cb(const AVFormatContext *s);
+void      av_format_set_control_message_cb(AVFormatContext *s, av_format_control_message callback);
+#if FF_API_OLD_OPEN_CALLBACKS
+attribute_deprecated AVOpenCallback av_format_get_open_cb(const AVFormatContext *s);
+attribute_deprecated void av_format_set_open_cb(AVFormatContext *s, AVOpenCallback callback);
+#endif
+
+/**
+ * This function will cause global side data to be injected in the next packet
+ * of each stream as well as after any subsequent seek.
+ */
+void av_format_inject_global_side_data(AVFormatContext *s);
+
+/**
+ * Returns the method used to set ctx->duration.
+ *
+ * @return AVFMT_DURATION_FROM_PTS, AVFMT_DURATION_FROM_STREAM, or AVFMT_DURATION_FROM_BITRATE.
+ */
+enum AVDurationEstimationMethod av_fmt_ctx_get_duration_estimation_method(const AVFormatContext* ctx);
+
+typedef struct AVPacketList {
+    AVPacket pkt;
+    struct AVPacketList *next;
+} AVPacketList;
+
+
+/**
+ * @defgroup lavf_core Core functions
+ * @ingroup libavf
+ *
+ * Functions for querying libavformat capabilities, allocating core structures,
+ * etc.
+ * @{
+ */
+
+/**
+ * Return the LIBAVFORMAT_VERSION_INT constant.
+ */
+unsigned avformat_version(void);
+
+/**
+ * Return the libavformat build-time configuration.
+ */
+const char *avformat_configuration(void);
+
+/**
+ * Return the libavformat license.
+ */
+const char *avformat_license(void);
+
+/**
+ * Initialize libavformat and register all the muxers, demuxers and
+ * protocols. If you do not call this function, then you can select
+ * exactly which formats you want to support.
+ *
+ * @see av_register_input_format()
+ * @see av_register_output_format()
+ */
+void av_register_all(void);
+
+void av_register_input_format(AVInputFormat *format);
+void av_register_output_format(AVOutputFormat *format);
+
+/**
+ * Do global initialization of network components. This is optional,
+ * but recommended, since it avoids the overhead of implicitly
+ * doing the setup for each session.
+ *
+ * Calling this function will become mandatory if using network
+ * protocols at some major version bump.
+ */
+int avformat_network_init(void);
+
+/**
+ * Undo the initialization done by avformat_network_init.
+ */
+int avformat_network_deinit(void);
+
+/**
+ * If f is NULL, returns the first registered input format,
+ * if f is non-NULL, returns the next registered input format after f
+ * or NULL if f is the last one.
+ */
+AVInputFormat  *av_iformat_next(const AVInputFormat  *f);
+
+/**
+ * If f is NULL, returns the first registered output format,
+ * if f is non-NULL, returns the next registered output format after f
+ * or NULL if f is the last one.
+ */
+AVOutputFormat *av_oformat_next(const AVOutputFormat *f);
+
+/**
+ * Allocate an AVFormatContext.
+ * avformat_free_context() can be used to free the context and everything
+ * allocated by the framework within it.
+ */
+AVFormatContext *avformat_alloc_context(void);
+
+/**
+ * Free an AVFormatContext and all its streams.
+ * @param s context to free
+ */
+void avformat_free_context(AVFormatContext *s);
+
+/**
+ * Get the AVClass for AVFormatContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *avformat_get_class(void);
+
+/**
+ * Add a new stream to a media file.
+ *
+ * When demuxing, it is called by the demuxer in read_header(). If the
+ * flag AVFMTCTX_NOHEADER is set in s.ctx_flags, then it may also
+ * be called in read_packet().
+ *
+ * When muxing, should be called by the user before avformat_write_header().
+ *
+ * User is required to call avcodec_close() and avformat_free_context() to
+ * clean up the allocation by avformat_new_stream().
+ *
+ * @param s media file handle
+ * @param c If non-NULL, the AVCodecContext corresponding to the new stream
+ * will be initialized to use this codec. This is needed for e.g. codec-specific
+ * defaults to be set, so codec should be provided if it is known.
+ *
+ * @return newly created stream or NULL on error.
+ */
+AVStream *avformat_new_stream(AVFormatContext *s, const AVCodec *c);
+
+/**
+ * Wrap an existing array as stream side data.
+ *
+ * @param st stream
+ * @param type side information type
+ * @param data the side data array. It must be allocated with the av_malloc()
+ *             family of functions. The ownership of the data is transferred to
+ *             st.
+ * @param size side information size
+ * @return zero on success, a negative AVERROR code on failure. On failure,
+ *         the stream is unchanged and the data remains owned by the caller.
+ */
+int av_stream_add_side_data(AVStream *st, enum AVPacketSideDataType type,
+                            uint8_t *data, size_t size);
+
+/**
+ * Allocate new side information for a stream.
+ *
+ * @param stream stream
+ * @param type desired side information type
+ * @param size side information size
+ * @return pointer to freshly allocated data or NULL otherwise
+ */
+uint8_t *av_stream_new_side_data(AVStream *stream,
+                                 enum AVPacketSideDataType type, int size);
+/**
+ * Get side information from stream.
+ *
+ * @param stream stream
+ * @param type desired side information type
+ * @param size pointer for side information size to store (optional)
+ * @return pointer to data if present or NULL otherwise
+ */
+#if FF_API_NOCONST_GET_SIDE_DATA
+uint8_t *av_stream_get_side_data(AVStream *stream,
+                                 enum AVPacketSideDataType type, int *size);
+#else
+uint8_t *av_stream_get_side_data(const AVStream *stream,
+                                 enum AVPacketSideDataType type, int *size);
+#endif
+
+AVProgram *av_new_program(AVFormatContext *s, int id);
+
+/**
+ * @}
+ */
+
+
+/**
+ * Allocate an AVFormatContext for an output format.
+ * avformat_free_context() can be used to free the context and
+ * everything allocated by the framework within it.
+ *
+ * @param ctx pointee is set to the created format context, or to NULL in
+ * case of failure
+ * @param oformat format to use for allocating the context, if NULL
+ * format_name and filename are used instead
+ * @param format_name the name of output format to use for allocating the
+ * context, if NULL filename is used instead
+ * @param filename the name of the file to use for allocating the
+ * context, may be NULL
+ * @return >= 0 in case of success, a negative AVERROR code in case of
+ * failure
+ */
+int avformat_alloc_output_context2(AVFormatContext **ctx, AVOutputFormat *oformat,
+                                   const char *format_name, const char *filename);
+
+/**
+ * @addtogroup lavf_decoding
+ * @{
+ */
+
+/**
+ * Find AVInputFormat based on the short name of the input format.
+ */
+AVInputFormat *av_find_input_format(const char *short_name);
+
+/**
+ * Guess the file format.
+ *
+ * @param pd        data to be probed
+ * @param is_opened Whether the file is already opened; determines whether
+ *                  demuxers with or without AVFMT_NOFILE are probed.
+ */
+AVInputFormat *av_probe_input_format(AVProbeData *pd, int is_opened);
+
+/**
+ * Guess the file format.
+ *
+ * @param pd        data to be probed
+ * @param is_opened Whether the file is already opened; determines whether
+ *                  demuxers with or without AVFMT_NOFILE are probed.
+ * @param score_max A probe score larger than this is required to accept a
+ *                  detection, the variable is set to the actual detection
+ *                  score afterwards.
+ *                  If the score is <= AVPROBE_SCORE_MAX / 4 it is recommended
+ *                  to retry with a larger probe buffer.
+ */
+AVInputFormat *av_probe_input_format2(AVProbeData *pd, int is_opened, int *score_max);
+
+/**
+ * Guess the file format.
+ *
+ * @param is_opened Whether the file is already opened; determines whether
+ *                  demuxers with or without AVFMT_NOFILE are probed.
+ * @param score_ret The score of the best detection.
+ */
+AVInputFormat *av_probe_input_format3(AVProbeData *pd, int is_opened, int *score_ret);
+
+/**
+ * Probe a bytestream to determine the input format. Each time a probe returns
+ * with a score that is too low, the probe buffer size is increased and another
+ * attempt is made. When the maximum probe size is reached, the input format
+ * with the highest score is returned.
+ *
+ * @param pb the bytestream to probe
+ * @param fmt the input format is put here
+ * @param url the url of the stream
+ * @param logctx the log context
+ * @param offset the offset within the bytestream to probe from
+ * @param max_probe_size the maximum probe buffer size (zero for default)
+ * @return the score in case of success (the maximal score is
+ *         AVPROBE_SCORE_MAX), a negative value corresponding to an
+ *         AVERROR code otherwise
+ */
+int av_probe_input_buffer2(AVIOContext *pb, AVInputFormat **fmt,
+                           const char *url, void *logctx,
+                           unsigned int offset, unsigned int max_probe_size);
+
+/**
+ * Like av_probe_input_buffer2() but returns 0 on success
+ */
+int av_probe_input_buffer(AVIOContext *pb, AVInputFormat **fmt,
+                          const char *url, void *logctx,
+                          unsigned int offset, unsigned int max_probe_size);
+
+/**
+ * Open an input stream and read the header. The codecs are not opened.
+ * The stream must be closed with avformat_close_input().
+ *
+ * @param ps Pointer to user-supplied AVFormatContext (allocated by avformat_alloc_context).
+ *           May be a pointer to NULL, in which case an AVFormatContext is allocated by this
+ *           function and written into ps.
+ *           Note that a user-supplied AVFormatContext will be freed on failure.
+ * @param url URL of the stream to open.
+ * @param fmt If non-NULL, this parameter forces a specific input format.
+ *            Otherwise the format is autodetected.
+ * @param options  A dictionary filled with AVFormatContext and demuxer-private options.
+ *                 On return this parameter will be destroyed and replaced with a dict containing
+ *                 options that were not found. May be NULL.
+ *
+ * @return 0 on success, a negative AVERROR on failure.
+ *
+ * @note If you want to use custom IO, preallocate the format context and set its pb field.
+ */
+int avformat_open_input(AVFormatContext **ps, const char *url, AVInputFormat *fmt, AVDictionary **options);
+
+attribute_deprecated
+int av_demuxer_open(AVFormatContext *ic);
+
+/**
+ * Read packets of a media file to get stream information. This
+ * is useful for file formats with no headers such as MPEG. This
+ * function also computes the real framerate in case of MPEG-2 repeat
+ * frame mode.
+ * The logical file position is not changed by this function;
+ * examined packets may be buffered for later processing.
+ *
+ * @param ic media file handle
+ * @param options  If non-NULL, an ic.nb_streams long array of pointers to
+ *                 dictionaries, where i-th member contains options for
+ *                 codec corresponding to i-th stream.
+ *                 On return each dictionary will be filled with options that were not found.
+ * @return >=0 if OK, AVERROR_xxx on error
+ *
+ * @note this function isn't guaranteed to open all the codecs, so
+ *       options being non-empty at return is a perfectly normal behavior.
+ *
+ * @todo Let the user decide somehow what information is needed so that
+ *       we do not waste time getting stuff the user does not need.
+ */
+int avformat_find_stream_info(AVFormatContext *ic, AVDictionary **options);
+
+/**
+ * Find the programs which belong to a given stream.
+ *
+ * @param ic    media file handle
+ * @param last  the last found program, the search will start after this
+ *              program, or from the beginning if it is NULL
+ * @param s     stream index
+ * @return the next program which belongs to s, NULL if no program is found or
+ *         the last program is not among the programs of ic.
+ */
+AVProgram *av_find_program_from_stream(AVFormatContext *ic, AVProgram *last, int s);
+
+void av_program_add_stream_index(AVFormatContext *ac, int progid, unsigned int idx);
+
+/**
+ * Find the "best" stream in the file.
+ * The best stream is determined according to various heuristics as the most
+ * likely to be what the user expects.
+ * If the decoder parameter is non-NULL, av_find_best_stream will find the
+ * default decoder for the stream's codec; streams for which no decoder can
+ * be found are ignored.
+ *
+ * @param ic                media file handle
+ * @param type              stream type: video, audio, subtitles, etc.
+ * @param wanted_stream_nb  user-requested stream number,
+ *                          or -1 for automatic selection
+ * @param related_stream    try to find a stream related (eg. in the same
+ *                          program) to this one, or -1 if none
+ * @param decoder_ret       if non-NULL, returns the decoder for the
+ *                          selected stream
+ * @param flags             flags; none are currently defined
+ * @return  the non-negative stream number in case of success,
+ *          AVERROR_STREAM_NOT_FOUND if no stream with the requested type
+ *          could be found, AVERROR_DECODER_NOT_FOUND if streams were found
+ *          but no decoder could be found for any of them
+ * @note  If av_find_best_stream returns successfully and decoder_ret is not
+ *        NULL, then *decoder_ret is guaranteed to be set to a valid AVCodec.
+ */
+int av_find_best_stream(AVFormatContext *ic,
+                        enum AVMediaType type,
+                        int wanted_stream_nb,
+                        int related_stream,
+                        AVCodec **decoder_ret,
+                        int flags);
+
+/**
+ * Return the next frame of a stream.
+ * This function returns what is stored in the file, and does not validate
+ * that what is there are valid frames for the decoder. It will split what is
+ * stored in the file into frames and return one for each call. It will not
+ * omit invalid data between valid frames so as to give the decoder the maximum
+ * information possible for decoding.
+ *
+ * If pkt->buf is NULL, then the packet is valid until the next
+ * av_read_frame() or until avformat_close_input(). Otherwise the packet
+ * is valid indefinitely. In both cases the packet must be freed with
+ * av_packet_unref when it is no longer needed. For video, the packet contains
+ * exactly one frame. For audio, it contains an integer number of frames if each
+ * frame has a known fixed size (e.g. PCM or ADPCM data). If the audio frames
+ * have a variable size (e.g. MPEG audio), then it contains one frame.
+ *
+ * pkt->pts, pkt->dts and pkt->duration are always set to correct
+ * values in AVStream.time_base units (and guessed if the format cannot
+ * provide them). pkt->pts can be AV_NOPTS_VALUE if the video format
+ * has B-frames, so it is better to rely on pkt->dts if you do not
+ * decompress the payload.
+ *
+ * @return 0 if OK, < 0 on error or end of file
+ */
+int av_read_frame(AVFormatContext *s, AVPacket *pkt);
+
+/**
+ * Seek to the keyframe at timestamp
+ * 'timestamp' in 'stream_index'.
+ *
+ * @param s media file handle
+ * @param stream_index If stream_index is (-1), a default
+ * stream is selected, and timestamp is automatically converted
+ * from AV_TIME_BASE units to the stream specific time_base.
+ * @param timestamp Timestamp in AVStream.time_base units
+ *        or, if no stream is specified, in AV_TIME_BASE units.
+ * @param flags flags which select direction and seeking mode
+ * @return >= 0 on success
+ */
+int av_seek_frame(AVFormatContext *s, int stream_index, int64_t timestamp,
+                  int flags);
+
+/**
+ * Seek to timestamp ts.
+ * Seeking will be done so that the point from which all active streams
+ * can be presented successfully will be closest to ts and within min/max_ts.
+ * Active streams are all streams that have AVStream.discard < AVDISCARD_ALL.
+ *
+ * If flags contain AVSEEK_FLAG_BYTE, then all timestamps are in bytes and
+ * are the file position (this may not be supported by all demuxers).
+ * If flags contain AVSEEK_FLAG_FRAME, then all timestamps are in frames
+ * in the stream with stream_index (this may not be supported by all demuxers).
+ * Otherwise all timestamps are in units of the stream selected by stream_index
+ * or if stream_index is -1, in AV_TIME_BASE units.
+ * If flags contain AVSEEK_FLAG_ANY, then non-keyframes are treated as
+ * keyframes (this may not be supported by all demuxers).
+ * If flags contain AVSEEK_FLAG_BACKWARD, it is ignored.
+ *
+ * @param s media file handle
+ * @param stream_index index of the stream which is used as time base reference
+ * @param min_ts smallest acceptable timestamp
+ * @param ts target timestamp
+ * @param max_ts largest acceptable timestamp
+ * @param flags flags
+ * @return >=0 on success, error code otherwise
+ *
+ * @note This is part of the new seek API which is still under construction.
+ *       Thus do not use this yet. It may change at any time, do not expect
+ *       ABI compatibility yet!
+ */
+int avformat_seek_file(AVFormatContext *s, int stream_index, int64_t min_ts, int64_t ts, int64_t max_ts, int flags);
+
+/**
+ * Discard all internally buffered data. This can be useful when dealing with
+ * discontinuities in the byte stream. Generally works only with formats that
+ * can resync. This includes headerless formats like MPEG-TS/TS but should also
+ * work with NUT, Ogg and in a limited way AVI for example.
+ *
+ * The set of streams, the detected duration, stream parameters and codecs do
+ * not change when calling this function. If you want a complete reset, it's
+ * better to open a new AVFormatContext.
+ *
+ * This does not flush the AVIOContext (s->pb). If necessary, call
+ * avio_flush(s->pb) before calling this function.
+ *
+ * @param s media file handle
+ * @return >=0 on success, error code otherwise
+ */
+int avformat_flush(AVFormatContext *s);
+
+/**
+ * Start playing a network-based stream (e.g. RTSP stream) at the
+ * current position.
+ */
+int av_read_play(AVFormatContext *s);
+
+/**
+ * Pause a network-based stream (e.g. RTSP stream).
+ *
+ * Use av_read_play() to resume it.
+ */
+int av_read_pause(AVFormatContext *s);
+
+/**
+ * Close an opened input AVFormatContext. Free it and all its contents
+ * and set *s to NULL.
+ */
+void avformat_close_input(AVFormatContext **s);
+/**
+ * @}
+ */
+
+#define AVSEEK_FLAG_BACKWARD 1 ///< seek backward
+#define AVSEEK_FLAG_BYTE     2 ///< seeking based on position in bytes
+#define AVSEEK_FLAG_ANY      4 ///< seek to any frame, even non-keyframes
+#define AVSEEK_FLAG_FRAME    8 ///< seeking based on frame number
+
+/**
+ * @addtogroup lavf_encoding
+ * @{
+ */
+
+#define AVSTREAM_INIT_IN_WRITE_HEADER 0 ///< stream parameters initialized in avformat_write_header
+#define AVSTREAM_INIT_IN_INIT_OUTPUT  1 ///< stream parameters initialized in avformat_init_output
+
+/**
+ * Allocate the stream private data and write the stream header to
+ * an output media file.
+ *
+ * @param s Media file handle, must be allocated with avformat_alloc_context().
+ *          Its oformat field must be set to the desired output format;
+ *          Its pb field must be set to an already opened AVIOContext.
+ * @param options  An AVDictionary filled with AVFormatContext and muxer-private options.
+ *                 On return this parameter will be destroyed and replaced with a dict containing
+ *                 options that were not found. May be NULL.
+ *
+ * @return AVSTREAM_INIT_IN_WRITE_HEADER on success if the codec had not already been fully initialized in avformat_init_output(),
+ *         AVSTREAM_INIT_IN_INIT_OUTPUT  on success if the codec had already been fully initialized in avformat_init_output(),
+ *         negative AVERROR on failure.
+ *
+ * @see av_opt_find, av_dict_set, avio_open, av_oformat_next, avformat_init_output.
+ */
+av_warn_unused_result
+int avformat_write_header(AVFormatContext *s, AVDictionary **options);
+
+/**
+ * Allocate the stream private data and initialize the codec, but do not write the header.
+ * May optionally be used before avformat_write_header to initialize stream parameters
+ * before actually writing the header.
+ * If using this function, do not pass the same options to avformat_write_header.
+ *
+ * @param s Media file handle, must be allocated with avformat_alloc_context().
+ *          Its oformat field must be set to the desired output format;
+ *          Its pb field must be set to an already opened AVIOContext.
+ * @param options  An AVDictionary filled with AVFormatContext and muxer-private options.
+ *                 On return this parameter will be destroyed and replaced with a dict containing
+ *                 options that were not found. May be NULL.
+ *
+ * @return AVSTREAM_INIT_IN_WRITE_HEADER on success if the codec requires avformat_write_header to fully initialize,
+ *         AVSTREAM_INIT_IN_INIT_OUTPUT  on success if the codec has been fully initialized,
+ *         negative AVERROR on failure.
+ *
+ * @see av_opt_find, av_dict_set, avio_open, av_oformat_next, avformat_write_header.
+ */
+av_warn_unused_result
+int avformat_init_output(AVFormatContext *s, AVDictionary **options);
+
+/**
+ * Write a packet to an output media file.
+ *
+ * This function passes the packet directly to the muxer, without any buffering
+ * or reordering. The caller is responsible for correctly interleaving the
+ * packets if the format requires it. Callers that want libavformat to handle
+ * the interleaving should call av_interleaved_write_frame() instead of this
+ * function.
+ *
+ * @param s media file handle
+ * @param pkt The packet containing the data to be written. Note that unlike
+ *            av_interleaved_write_frame(), this function does not take
+ *            ownership of the packet passed to it (though some muxers may make
+ *            an internal reference to the input packet).
+ *            <br>
+ *            This parameter can be NULL (at any time, not just at the end), in
+ *            order to immediately flush data buffered within the muxer, for
+ *            muxers that buffer up data internally before writing it to the
+ *            output.
+ *            <br>
+ *            Packet's @ref AVPacket.stream_index "stream_index" field must be
+ *            set to the index of the corresponding stream in @ref
+ *            AVFormatContext.streams "s->streams".
+ *            <br>
+ *            The timestamps (@ref AVPacket.pts "pts", @ref AVPacket.dts "dts")
+ *            must be set to correct values in the stream's timebase (unless the
+ *            output format is flagged with the AVFMT_NOTIMESTAMPS flag, then
+ *            they can be set to AV_NOPTS_VALUE).
+ *            The dts for subsequent packets passed to this function must be strictly
+ *            increasing when compared in their respective timebases (unless the
+ *            output format is flagged with the AVFMT_TS_NONSTRICT, then they
+ *            merely have to be nondecreasing).  @ref AVPacket.duration
+ *            "duration") should also be set if known.
+ * @return < 0 on error, = 0 if OK, 1 if flushed and there is no more data to flush
+ *
+ * @see av_interleaved_write_frame()
+ */
+int av_write_frame(AVFormatContext *s, AVPacket *pkt);
+
+/**
+ * Write a packet to an output media file ensuring correct interleaving.
+ *
+ * This function will buffer the packets internally as needed to make sure the
+ * packets in the output file are properly interleaved in the order of
+ * increasing dts. Callers doing their own interleaving should call
+ * av_write_frame() instead of this function.
+ *
+ * Using this function instead of av_write_frame() can give muxers advance
+ * knowledge of future packets, improving e.g. the behaviour of the mp4
+ * muxer for VFR content in fragmenting mode.
+ *
+ * @param s media file handle
+ * @param pkt The packet containing the data to be written.
+ *            <br>
+ *            If the packet is reference-counted, this function will take
+ *            ownership of this reference and unreference it later when it sees
+ *            fit.
+ *            The caller must not access the data through this reference after
+ *            this function returns. If the packet is not reference-counted,
+ *            libavformat will make a copy.
+ *            <br>
+ *            This parameter can be NULL (at any time, not just at the end), to
+ *            flush the interleaving queues.
+ *            <br>
+ *            Packet's @ref AVPacket.stream_index "stream_index" field must be
+ *            set to the index of the corresponding stream in @ref
+ *            AVFormatContext.streams "s->streams".
+ *            <br>
+ *            The timestamps (@ref AVPacket.pts "pts", @ref AVPacket.dts "dts")
+ *            must be set to correct values in the stream's timebase (unless the
+ *            output format is flagged with the AVFMT_NOTIMESTAMPS flag, then
+ *            they can be set to AV_NOPTS_VALUE).
+ *            The dts for subsequent packets in one stream must be strictly
+ *            increasing (unless the output format is flagged with the
+ *            AVFMT_TS_NONSTRICT, then they merely have to be nondecreasing).
+ *            @ref AVPacket.duration "duration") should also be set if known.
+ *
+ * @return 0 on success, a negative AVERROR on error. Libavformat will always
+ *         take care of freeing the packet, even if this function fails.
+ *
+ * @see av_write_frame(), AVFormatContext.max_interleave_delta
+ */
+int av_interleaved_write_frame(AVFormatContext *s, AVPacket *pkt);
+
+/**
+ * Write an uncoded frame to an output media file.
+ *
+ * The frame must be correctly interleaved according to the container
+ * specification; if not, av_interleaved_write_uncoded_frame() must be used.
+ *
+ * See av_interleaved_write_uncoded_frame() for details.
+ */
+int av_write_uncoded_frame(AVFormatContext *s, int stream_index,
+                           AVFrame *frame);
+
+/**
+ * Write an uncoded frame to an output media file.
+ *
+ * If the muxer supports it, this function makes it possible to write an AVFrame
+ * structure directly, without encoding it into a packet.
+ * It is mostly useful for devices and similar special muxers that use raw
+ * video or PCM data and will not serialize it into a byte stream.
+ *
+ * To test whether it is possible to use it with a given muxer and stream,
+ * use av_write_uncoded_frame_query().
+ *
+ * The caller gives up ownership of the frame and must not access it
+ * afterwards.
+ *
+ * @return  >=0 for success, a negative code on error
+ */
+int av_interleaved_write_uncoded_frame(AVFormatContext *s, int stream_index,
+                                       AVFrame *frame);
+
+/**
+ * Test whether a muxer supports uncoded frame.
+ *
+ * @return  >=0 if an uncoded frame can be written to that muxer and stream,
+ *          <0 if not
+ */
+int av_write_uncoded_frame_query(AVFormatContext *s, int stream_index);
+
+/**
+ * Write the stream trailer to an output media file and free the
+ * file private data.
+ *
+ * May only be called after a successful call to avformat_write_header.
+ *
+ * @param s media file handle
+ * @return 0 if OK, AVERROR_xxx on error
+ */
+int av_write_trailer(AVFormatContext *s);
+
+/**
+ * Return the output format in the list of registered output formats
+ * which best matches the provided parameters, or return NULL if
+ * there is no match.
+ *
+ * @param short_name if non-NULL checks if short_name matches with the
+ * names of the registered formats
+ * @param filename if non-NULL checks if filename terminates with the
+ * extensions of the registered formats
+ * @param mime_type if non-NULL checks if mime_type matches with the
+ * MIME type of the registered formats
+ */
+AVOutputFormat *av_guess_format(const char *short_name,
+                                const char *filename,
+                                const char *mime_type);
+
+/**
+ * Guess the codec ID based upon muxer and filename.
+ */
+enum AVCodecID av_guess_codec(AVOutputFormat *fmt, const char *short_name,
+                            const char *filename, const char *mime_type,
+                            enum AVMediaType type);
+
+/**
+ * Get timing information for the data currently output.
+ * The exact meaning of "currently output" depends on the format.
+ * It is mostly relevant for devices that have an internal buffer and/or
+ * work in real time.
+ * @param s          media file handle
+ * @param stream     stream in the media file
+ * @param[out] dts   DTS of the last packet output for the stream, in stream
+ *                   time_base units
+ * @param[out] wall  absolute time when that packet was output,
+ *                   in microseconds
+ * @return  0 if OK, AVERROR(ENOSYS) if the format does not support it
+ * Note: some formats or devices may not allow dts and wall to be measured
+ * atomically.
+ */
+int av_get_output_timestamp(struct AVFormatContext *s, int stream,
+                            int64_t *dts, int64_t *wall);
+
+
+/**
+ * @}
+ */
+
+
+/**
+ * @defgroup lavf_misc Utility functions
+ * @ingroup libavf
+ * @{
+ *
+ * Miscellaneous utility functions related to both muxing and demuxing
+ * (or neither).
+ */
+
+/**
+ * Send a nice hexadecimal dump of a buffer to the specified file stream.
+ *
+ * @param f The file stream pointer where the dump should be sent to.
+ * @param buf buffer
+ * @param size buffer size
+ *
+ * @see av_hex_dump_log, av_pkt_dump2, av_pkt_dump_log2
+ */
+void av_hex_dump(FILE *f, const uint8_t *buf, int size);
+
+/**
+ * Send a nice hexadecimal dump of a buffer to the log.
+ *
+ * @param avcl A pointer to an arbitrary struct of which the first field is a
+ * pointer to an AVClass struct.
+ * @param level The importance level of the message, lower values signifying
+ * higher importance.
+ * @param buf buffer
+ * @param size buffer size
+ *
+ * @see av_hex_dump, av_pkt_dump2, av_pkt_dump_log2
+ */
+void av_hex_dump_log(void *avcl, int level, const uint8_t *buf, int size);
+
+/**
+ * Send a nice dump of a packet to the specified file stream.
+ *
+ * @param f The file stream pointer where the dump should be sent to.
+ * @param pkt packet to dump
+ * @param dump_payload True if the payload must be displayed, too.
+ * @param st AVStream that the packet belongs to
+ */
+void av_pkt_dump2(FILE *f, const AVPacket *pkt, int dump_payload, const AVStream *st);
+
+
+/**
+ * Send a nice dump of a packet to the log.
+ *
+ * @param avcl A pointer to an arbitrary struct of which the first field is a
+ * pointer to an AVClass struct.
+ * @param level The importance level of the message, lower values signifying
+ * higher importance.
+ * @param pkt packet to dump
+ * @param dump_payload True if the payload must be displayed, too.
+ * @param st AVStream that the packet belongs to
+ */
+void av_pkt_dump_log2(void *avcl, int level, const AVPacket *pkt, int dump_payload,
+                      const AVStream *st);
+
+/**
+ * Get the AVCodecID for the given codec tag tag.
+ * If no codec id is found returns AV_CODEC_ID_NONE.
+ *
+ * @param tags list of supported codec_id-codec_tag pairs, as stored
+ * in AVInputFormat.codec_tag and AVOutputFormat.codec_tag
+ * @param tag  codec tag to match to a codec ID
+ */
+enum AVCodecID av_codec_get_id(const struct AVCodecTag * const *tags, unsigned int tag);
+
+/**
+ * Get the codec tag for the given codec id id.
+ * If no codec tag is found returns 0.
+ *
+ * @param tags list of supported codec_id-codec_tag pairs, as stored
+ * in AVInputFormat.codec_tag and AVOutputFormat.codec_tag
+ * @param id   codec ID to match to a codec tag
+ */
+unsigned int av_codec_get_tag(const struct AVCodecTag * const *tags, enum AVCodecID id);
+
+/**
+ * Get the codec tag for the given codec id.
+ *
+ * @param tags list of supported codec_id - codec_tag pairs, as stored
+ * in AVInputFormat.codec_tag and AVOutputFormat.codec_tag
+ * @param id codec id that should be searched for in the list
+ * @param tag A pointer to the found tag
+ * @return 0 if id was not found in tags, > 0 if it was found
+ */
+int av_codec_get_tag2(const struct AVCodecTag * const *tags, enum AVCodecID id,
+                      unsigned int *tag);
+
+int av_find_default_stream_index(AVFormatContext *s);
+
+/**
+ * Get the index for a specific timestamp.
+ *
+ * @param st        stream that the timestamp belongs to
+ * @param timestamp timestamp to retrieve the index for
+ * @param flags if AVSEEK_FLAG_BACKWARD then the returned index will correspond
+ *                 to the timestamp which is <= the requested one, if backward
+ *                 is 0, then it will be >=
+ *              if AVSEEK_FLAG_ANY seek to any frame, only keyframes otherwise
+ * @return < 0 if no such timestamp could be found
+ */
+int av_index_search_timestamp(AVStream *st, int64_t timestamp, int flags);
+
+/**
+ * Add an index entry into a sorted list. Update the entry if the list
+ * already contains it.
+ *
+ * @param timestamp timestamp in the time base of the given stream
+ */
+int av_add_index_entry(AVStream *st, int64_t pos, int64_t timestamp,
+                       int size, int distance, int flags);
+
+
+/**
+ * Split a URL string into components.
+ *
+ * The pointers to buffers for storing individual components may be null,
+ * in order to ignore that component. Buffers for components not found are
+ * set to empty strings. If the port is not found, it is set to a negative
+ * value.
+ *
+ * @param proto the buffer for the protocol
+ * @param proto_size the size of the proto buffer
+ * @param authorization the buffer for the authorization
+ * @param authorization_size the size of the authorization buffer
+ * @param hostname the buffer for the host name
+ * @param hostname_size the size of the hostname buffer
+ * @param port_ptr a pointer to store the port number in
+ * @param path the buffer for the path
+ * @param path_size the size of the path buffer
+ * @param url the URL to split
+ */
+void av_url_split(char *proto,         int proto_size,
+                  char *authorization, int authorization_size,
+                  char *hostname,      int hostname_size,
+                  int *port_ptr,
+                  char *path,          int path_size,
+                  const char *url);
+
+
+/**
+ * Print detailed information about the input or output format, such as
+ * duration, bitrate, streams, container, programs, metadata, side data,
+ * codec and time base.
+ *
+ * @param ic        the context to analyze
+ * @param index     index of the stream to dump information about
+ * @param url       the URL to print, such as source or destination file
+ * @param is_output Select whether the specified context is an input(0) or output(1)
+ */
+void av_dump_format(AVFormatContext *ic,
+                    int index,
+                    const char *url,
+                    int is_output);
+
+
+#define AV_FRAME_FILENAME_FLAGS_MULTIPLE 1 ///< Allow multiple %d
+
+/**
+ * Return in 'buf' the path with '%d' replaced by a number.
+ *
+ * Also handles the '%0nd' format where 'n' is the total number
+ * of digits and '%%'.
+ *
+ * @param buf destination buffer
+ * @param buf_size destination buffer size
+ * @param path numbered sequence string
+ * @param number frame number
+ * @param flags AV_FRAME_FILENAME_FLAGS_*
+ * @return 0 if OK, -1 on format error
+ */
+int av_get_frame_filename2(char *buf, int buf_size,
+                          const char *path, int number, int flags);
+
+int av_get_frame_filename(char *buf, int buf_size,
+                          const char *path, int number);
+
+/**
+ * Check whether filename actually is a numbered sequence generator.
+ *
+ * @param filename possible numbered sequence string
+ * @return 1 if a valid numbered sequence string, 0 otherwise
+ */
+int av_filename_number_test(const char *filename);
+
+/**
+ * Generate an SDP for an RTP session.
+ *
+ * Note, this overwrites the id values of AVStreams in the muxer contexts
+ * for getting unique dynamic payload types.
+ *
+ * @param ac array of AVFormatContexts describing the RTP streams. If the
+ *           array is composed by only one context, such context can contain
+ *           multiple AVStreams (one AVStream per RTP stream). Otherwise,
+ *           all the contexts in the array (an AVFormatContext per RTP stream)
+ *           must contain only one AVStream.
+ * @param n_files number of AVFormatContexts contained in ac
+ * @param buf buffer where the SDP will be stored (must be allocated by
+ *            the caller)
+ * @param size the size of the buffer
+ * @return 0 if OK, AVERROR_xxx on error
+ */
+int av_sdp_create(AVFormatContext *ac[], int n_files, char *buf, int size);
+
+/**
+ * Return a positive value if the given filename has one of the given
+ * extensions, 0 otherwise.
+ *
+ * @param filename   file name to check against the given extensions
+ * @param extensions a comma-separated list of filename extensions
+ */
+int av_match_ext(const char *filename, const char *extensions);
+
+/**
+ * Test if the given container can store a codec.
+ *
+ * @param ofmt           container to check for compatibility
+ * @param codec_id       codec to potentially store in container
+ * @param std_compliance standards compliance level, one of FF_COMPLIANCE_*
+ *
+ * @return 1 if codec with ID codec_id can be stored in ofmt, 0 if it cannot.
+ *         A negative number if this information is not available.
+ */
+int avformat_query_codec(const AVOutputFormat *ofmt, enum AVCodecID codec_id,
+                         int std_compliance);
+
+/**
+ * @defgroup riff_fourcc RIFF FourCCs
+ * @{
+ * Get the tables mapping RIFF FourCCs to libavcodec AVCodecIDs. The tables are
+ * meant to be passed to av_codec_get_id()/av_codec_get_tag() as in the
+ * following code:
+ * @code
+ * uint32_t tag = MKTAG('H', '2', '6', '4');
+ * const struct AVCodecTag *table[] = { avformat_get_riff_video_tags(), 0 };
+ * enum AVCodecID id = av_codec_get_id(table, tag);
+ * @endcode
+ */
+/**
+ * @return the table mapping RIFF FourCCs for video to libavcodec AVCodecID.
+ */
+const struct AVCodecTag *avformat_get_riff_video_tags(void);
+/**
+ * @return the table mapping RIFF FourCCs for audio to AVCodecID.
+ */
+const struct AVCodecTag *avformat_get_riff_audio_tags(void);
+/**
+ * @return the table mapping MOV FourCCs for video to libavcodec AVCodecID.
+ */
+const struct AVCodecTag *avformat_get_mov_video_tags(void);
+/**
+ * @return the table mapping MOV FourCCs for audio to AVCodecID.
+ */
+const struct AVCodecTag *avformat_get_mov_audio_tags(void);
+
+/**
+ * @}
+ */
+
+/**
+ * Guess the sample aspect ratio of a frame, based on both the stream and the
+ * frame aspect ratio.
+ *
+ * Since the frame aspect ratio is set by the codec but the stream aspect ratio
+ * is set by the demuxer, these two may not be equal. This function tries to
+ * return the value that you should use if you would like to display the frame.
+ *
+ * Basic logic is to use the stream aspect ratio if it is set to something sane
+ * otherwise use the frame aspect ratio. This way a container setting, which is
+ * usually easy to modify can override the coded value in the frames.
+ *
+ * @param format the format context which the stream is part of
+ * @param stream the stream which the frame is part of
+ * @param frame the frame with the aspect ratio to be determined
+ * @return the guessed (valid) sample_aspect_ratio, 0/1 if no idea
+ */
+AVRational av_guess_sample_aspect_ratio(AVFormatContext *format, AVStream *stream, AVFrame *frame);
+
+/**
+ * Guess the frame rate, based on both the container and codec information.
+ *
+ * @param ctx the format context which the stream is part of
+ * @param stream the stream which the frame is part of
+ * @param frame the frame for which the frame rate should be determined, may be NULL
+ * @return the guessed (valid) frame rate, 0/1 if no idea
+ */
+AVRational av_guess_frame_rate(AVFormatContext *ctx, AVStream *stream, AVFrame *frame);
+
+/**
+ * Check if the stream st contained in s is matched by the stream specifier
+ * spec.
+ *
+ * See the "stream specifiers" chapter in the documentation for the syntax
+ * of spec.
+ *
+ * @return  >0 if st is matched by spec;
+ *          0  if st is not matched by spec;
+ *          AVERROR code if spec is invalid
+ *
+ * @note  A stream specifier can match several streams in the format.
+ */
+int avformat_match_stream_specifier(AVFormatContext *s, AVStream *st,
+                                    const char *spec);
+
+int avformat_queue_attached_pictures(AVFormatContext *s);
+
+/**
+ * Apply a list of bitstream filters to a packet.
+ *
+ * @param codec AVCodecContext, usually from an AVStream
+ * @param pkt the packet to apply filters to. If, on success, the returned
+ *        packet has size == 0 and side_data_elems == 0, it indicates that
+ *        the packet should be dropped
+ * @param bsfc a NULL-terminated list of filters to apply
+ * @return  >=0 on success;
+ *          AVERROR code on failure
+ */
+#if FF_API_OLD_BSF
+attribute_deprecated
+int av_apply_bitstream_filters(AVCodecContext *codec, AVPacket *pkt,
+                               AVBitStreamFilterContext *bsfc);
+#endif
+
+enum AVTimebaseSource { /* type of the copy_tb argument of avformat_transfer_internal_stream_timing_info() */
+    AVFMT_TBCF_AUTO = -1, ///< NOTE(review): presumably lets libavformat choose the source -- confirm
+    AVFMT_TBCF_DECODER, ///< implicitly 0, since it follows AVFMT_TBCF_AUTO = -1
+    AVFMT_TBCF_DEMUXER,
+#if FF_API_R_FRAME_RATE
+    AVFMT_TBCF_R_FRAMERATE, ///< only declared while FF_API_R_FRAME_RATE is enabled
+#endif
+};
+
+/**
+ * Transfer internal timing information from one stream to another.
+ *
+ * This function is useful when doing stream copy.
+ *
+ * @param ofmt     target output format for ost
+ * @param ost      output stream which needs timings copy and adjustments
+ * @param ist      reference input stream to copy timings from
+ * @param copy_tb  define from where the stream codec timebase needs to be imported
+ */
+int avformat_transfer_internal_stream_timing_info(const AVOutputFormat *ofmt,
+                                                  AVStream *ost, const AVStream *ist,
+                                                  enum AVTimebaseSource copy_tb);
+
+/**
+ * Get the internal codec timebase from a stream.
+ *
+ * @param st  input stream to extract the timebase from
+ */
+AVRational av_stream_get_codec_timebase(const AVStream *st);
+
+/**
+ * @}
+ */
+
+#endif /* AVFORMAT_AVFORMAT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avio.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avio.h
new file mode 100644
index 0000000..6f4ed84
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/avio.h
@@ -0,0 +1,827 @@
+/*
+ * copyright (c) 2001 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVFORMAT_AVIO_H
+#define AVFORMAT_AVIO_H
+
+/**
+ * @file
+ * @ingroup lavf_io
+ * Buffered I/O operations
+ */
+
+#include <stdint.h>
+
+#include "libavutil/common.h"
+#include "libavutil/dict.h"
+#include "libavutil/log.h"
+
+#include "libavformat/version.h"
+
+/**
+ * Seeking works like for a local file.
+ */
+#define AVIO_SEEKABLE_NORMAL (1 << 0)
+
+/**
+ * Seeking by timestamp with avio_seek_time() is possible.
+ */
+#define AVIO_SEEKABLE_TIME   (1 << 1)
+
+/**
+ * Callback for checking whether to abort blocking functions.
+ * AVERROR_EXIT is returned in this case by the interrupted
+ * function. During blocking operations, callback is called with
+ * opaque as parameter. If the callback returns 1, the
+ * blocking operation will be aborted.
+ *
+ * No members can be added to this struct without a major bump, if
+ * new elements have been added after this struct in AVFormatContext
+ * or AVIOContext.
+ */
+typedef struct AVIOInterruptCB {
+    int (*callback)(void*); /**< called with opaque during blocking operations; returning 1 aborts the operation */
+    void *opaque; /**< value handed to callback as its only argument */
+} AVIOInterruptCB;
+
+/**
+ * Directory entry types.
+ */
+enum AVIODirEntryType {
+    AVIO_ENTRY_UNKNOWN, /* first enumerator, so its value is 0 */
+    AVIO_ENTRY_BLOCK_DEVICE,
+    AVIO_ENTRY_CHARACTER_DEVICE,
+    AVIO_ENTRY_DIRECTORY,
+    AVIO_ENTRY_NAMED_PIPE,
+    AVIO_ENTRY_SYMBOLIC_LINK,
+    AVIO_ENTRY_SOCKET,
+    AVIO_ENTRY_FILE,
+    AVIO_ENTRY_SERVER,
+    AVIO_ENTRY_SHARE,
+    AVIO_ENTRY_WORKGROUP,
+};
+
+/**
+ * Describes a single directory entry.
+ *
+ * Only the name and type fields are guaranteed to be set.
+ * The remaining fields are protocol and/or platform dependent and might be unknown.
+ */
+typedef struct AVIODirEntry {
+    char *name;                           /**< Filename */
+    int type;                             /**< Type of the entry (presumably an AVIODirEntryType value; TODO confirm) */
+    int utf8;                             /**< Set to 1 when name is encoded with UTF-8, 0 otherwise.
+                                               Name can be encoded with UTF-8 even though 0 is set. */
+    int64_t size;                         /**< File size in bytes, -1 if unknown. */
+    int64_t modification_timestamp;       /**< Time of last modification in microseconds since unix
+                                               epoch, -1 if unknown. */
+    int64_t access_timestamp;             /**< Time of last access in microseconds since unix epoch,
+                                               -1 if unknown. */
+    int64_t status_change_timestamp;      /**< Time of last status change in microseconds since unix
+                                               epoch, -1 if unknown. */
+    int64_t user_id;                      /**< User ID of owner, -1 if unknown. */
+    int64_t group_id;                     /**< Group ID of owner, -1 if unknown. */
+    int64_t filemode;                     /**< Unix file mode, -1 if unknown. */
+} AVIODirEntry;
+
+typedef struct AVIODirContext { /* directory read context taken by avio_open_dir(), avio_read_dir() and avio_close_dir() */
+    struct URLContext *url_context;
+} AVIODirContext;
+
+/**
+ * Different data types that can be returned via the AVIO
+ * write_data_type callback.
+ */
+enum AVIODataMarkerType {
+    /**
+     * Header data; this needs to be present for the stream to be decodable.
+     */
+    AVIO_DATA_MARKER_HEADER,
+    /**
+     * A point in the output bytestream where a decoder can start decoding
+     * (i.e. a keyframe). A demuxer/decoder given the data flagged with
+     * AVIO_DATA_MARKER_HEADER, followed by any AVIO_DATA_MARKER_SYNC_POINT,
+     * should give decodable results.
+     */
+    AVIO_DATA_MARKER_SYNC_POINT,
+    /**
+     * A point in the output bytestream where a demuxer can start parsing
+     * (for non self synchronizing bytestream formats). That is, any
+     * non-keyframe packet start point.
+     */
+    AVIO_DATA_MARKER_BOUNDARY_POINT,
+    /**
+     * This is any unlabelled data. It can come from a muxer not marking
+     * any positions at all, it can be an actual boundary/sync point
+     * that the muxer chooses not to mark, or a later part of a packet/fragment
+     * that is cut into multiple write callbacks due to limited IO buffer size.
+     */
+    AVIO_DATA_MARKER_UNKNOWN,
+    /**
+     * Trailer data, which doesn't contain actual content, but only for
+     * finalizing the output file.
+     */
+    AVIO_DATA_MARKER_TRAILER
+};
+
+/**
+ * Bytestream IO Context.
+ * New fields can be added to the end with minor version bumps.
+ * Removal, reordering and changes to existing fields require a major
+ * version bump.
+ * sizeof(AVIOContext) must not be used outside libav*.
+ *
+ * @note None of the function pointers in AVIOContext should be called
+ *       directly, they should only be set by the client application
+ *       when implementing custom I/O. Normally these are set to the
+ *       function pointers specified in avio_alloc_context()
+ */
+typedef struct AVIOContext {
+    /**
+     * A class for private options.
+     *
+     * If this AVIOContext is created by avio_open2(), av_class is set and
+     * passes the options down to protocols.
+     *
+     * If this AVIOContext is manually allocated, then av_class may be set by
+     * the caller.
+     *
+     * warning -- this field can be NULL, be sure to not pass this AVIOContext
+     * to any av_opt_* functions in that case.
+     */
+    const AVClass *av_class;
+
+    /*
+     * The following shows the relationship between buffer, buf_ptr, buf_end, buf_size,
+     * and pos, when reading and when writing (since AVIOContext is used for both):
+     *
+     **********************************************************************************
+     *                                   READING
+     **********************************************************************************
+     *
+     *                            |              buffer_size              |
+     *                            |---------------------------------------|
+     *                            |                                       |
+     *
+     *                         buffer          buf_ptr       buf_end
+     *                            +---------------+-----------------------+
+     *                            |/ / / / / / / /|/ / / / / / /|         |
+     *  read buffer:              |/ / consumed / | to be read /|         |
+     *                            |/ / / / / / / /|/ / / / / / /|         |
+     *                            +---------------+-----------------------+
+     *
+     *                                                         pos
+     *              +-------------------------------------------+-----------------+
+     *  input file: |                                           |                 |
+     *              +-------------------------------------------+-----------------+
+     *
+     *
+     **********************************************************************************
+     *                                   WRITING
+     **********************************************************************************
+     *
+     *                                          |          buffer_size          |
+     *                                          |-------------------------------|
+     *                                          |                               |
+     *
+     *                                       buffer              buf_ptr     buf_end
+     *                                          +-------------------+-----------+
+     *                                          |/ / / / / / / / / /|           |
+     *  write buffer:                           | / to be flushed / |           |
+     *                                          |/ / / / / / / / / /|           |
+     *                                          +-------------------+-----------+
+     *
+     *                                         pos
+     *               +--------------------------+-----------------------------------+
+     *  output file: |                          |                                   |
+     *               +--------------------------+-----------------------------------+
+     *
+     */
+    unsigned char *buffer;  /**< Start of the buffer. */
+    int buffer_size;        /**< Maximum buffer size */
+    unsigned char *buf_ptr; /**< Current position in the buffer */
+    unsigned char *buf_end; /**< End of the data, may be less than
+                                 buffer+buffer_size if the read function returned
+                                 less data than requested, e.g. for streams where
+                                 no more data has been received yet. */
+    void *opaque;           /**< A private pointer, passed to the read/write/seek/...
+                                 functions. */
+    int (*read_packet)(void *opaque, uint8_t *buf, int buf_size); /**< normally the read_packet given to avio_alloc_context(): refills the buffer */
+    int (*write_packet)(void *opaque, uint8_t *buf, int buf_size); /**< normally the write_packet given to avio_alloc_context(): writes the buffer contents */
+    int64_t (*seek)(void *opaque, int64_t offset, int whence); /**< normally the seek given to avio_alloc_context(): seeks to a byte position */
+    int64_t pos;            /**< position in the file of the current buffer */
+    int must_flush;         /**< true if the next seek should flush */
+    int eof_reached;        /**< true if eof reached */
+    int write_flag;         /**< true if open for writing */
+    int max_packet_size;
+    unsigned long checksum;
+    unsigned char *checksum_ptr;
+    unsigned long (*update_checksum)(unsigned long checksum, const uint8_t *buf, unsigned int size);
+    int error;              /**< contains the error code or 0 if no error happened */
+    /**
+     * Pause or resume playback for network streaming protocols - e.g. MMS.
+     */
+    int (*read_pause)(void *opaque, int pause);
+    /**
+     * Seek to a given timestamp in stream with the specified stream_index.
+     * Needed for some network streaming protocols which don't support seeking
+     * to byte position.
+     */
+    int64_t (*read_seek)(void *opaque, int stream_index,
+                         int64_t timestamp, int flags);
+    /**
+     * A combination of AVIO_SEEKABLE_ flags or 0 when the stream is not seekable.
+     */
+    int seekable;
+
+    /**
+     * max filesize, used to limit allocations
+     * This field is internal to libavformat and access from outside is not allowed.
+     */
+    int64_t maxsize;
+
+    /**
+     * avio_read and avio_write should if possible be satisfied directly
+     * instead of going through a buffer, and avio_seek will always
+     * call the underlying seek function directly.
+     */
+    int direct;
+
+    /**
+     * Bytes read statistic
+     * This field is internal to libavformat and access from outside is not allowed.
+     */
+    int64_t bytes_read;
+
+    /**
+     * seek statistic
+     * This field is internal to libavformat and access from outside is not allowed.
+     */
+    int seek_count;
+
+    /**
+     * writeout statistic
+     * This field is internal to libavformat and access from outside is not allowed.
+     */
+    int writeout_count;
+
+    /**
+     * Original buffer size
+     * used internally after probing and ensure seekback to reset the buffer size
+     * This field is internal to libavformat and access from outside is not allowed.
+     */
+    int orig_buffer_size;
+
+    /**
+     * Threshold to favor readahead over seek.
+     * This is currently internal only, do not use from outside.
+     */
+    int short_seek_threshold;
+
+    /**
+     * ',' separated list of allowed protocols.
+     */
+    const char *protocol_whitelist;
+
+    /**
+     * ',' separated list of disallowed protocols.
+     */
+    const char *protocol_blacklist;
+
+    /**
+     * A callback that is used instead of write_packet.
+     */
+    int (*write_data_type)(void *opaque, uint8_t *buf, int buf_size,
+                           enum AVIODataMarkerType type, int64_t time);
+    /**
+     * If set, don't call write_data_type separately for AVIO_DATA_MARKER_BOUNDARY_POINT,
+     * but ignore them and treat them as AVIO_DATA_MARKER_UNKNOWN (to avoid needlessly
+     * small chunks of data returned from the callback).
+     */
+    int ignore_boundary_point;
+
+    /**
+     * Internal, not meant to be used from outside of AVIOContext.
+     */
+    enum AVIODataMarkerType current_type;
+    int64_t last_time;
+
+    /**
+     * A callback that is used instead of short_seek_threshold.
+     * This is currently internal only, do not use from outside.
+     */
+    int (*short_seek_get)(void *opaque);
+} AVIOContext;
+
+/**
+ * Return the name of the protocol that will handle the passed URL.
+ *
+ * NULL is returned if no protocol could be found for the given URL.
+ *
+ * @return Name of the protocol or NULL.
+ */
+const char *avio_find_protocol_name(const char *url);
+
+/**
+ * Return AVIO_FLAG_* access flags corresponding to the access permissions
+ * of the resource in url, or a negative value corresponding to an
+ * AVERROR code in case of failure. The returned access flags are
+ * masked by the value in flags.
+ *
+ * @note This function is intrinsically unsafe, in the sense that the
+ * checked resource may change its existence or permission status from
+ * one call to another. Thus you should not trust the returned value,
+ * unless you are sure that no other processes are accessing the
+ * checked resource.
+ */
+int avio_check(const char *url, int flags);
+
+/**
+ * Move or rename a resource.
+ *
+ * @note url_src and url_dst should share the same protocol and authority.
+ *
+ * @param url_src url to resource to be moved
+ * @param url_dst new url to resource if the operation succeeded
+ * @return >=0 on success or negative on error.
+ */
+int avpriv_io_move(const char *url_src, const char *url_dst);
+
+/**
+ * Delete a resource.
+ *
+ * @param url resource to be deleted.
+ * @return >=0 on success or negative on error.
+ */
+int avpriv_io_delete(const char *url);
+
+/**
+ * Open directory for reading.
+ *
+ * @param s       directory read context. Pointer to a NULL pointer must be passed.
+ * @param url     directory to be listed.
+ * @param options A dictionary filled with protocol-private options. On return
+ *                this parameter will be destroyed and replaced with a dictionary
+ *                containing options that were not found. May be NULL.
+ * @return >=0 on success or negative on error.
+ */
+int avio_open_dir(AVIODirContext **s, const char *url, AVDictionary **options);
+
+/**
+ * Get next directory entry.
+ *
+ * Returned entry must be freed with avio_free_directory_entry(). In particular
+ * it may outlive AVIODirContext.
+ *
+ * @param s         directory read context.
+ * @param[out] next next entry or NULL when no more entries.
+ * @return >=0 on success or negative on error. End of list is not considered an
+ *             error.
+ */
+int avio_read_dir(AVIODirContext *s, AVIODirEntry **next);
+
+/**
+ * Close directory.
+ *
+ * @note Entries created using avio_read_dir() are not deleted and must be
+ * freed with avio_free_directory_entry().
+ *
+ * @param s         directory read context.
+ * @return >=0 on success or negative on error.
+ */
+int avio_close_dir(AVIODirContext **s);
+
+/**
+ * Free entry allocated by avio_read_dir().
+ *
+ * @param entry entry to be freed.
+ */
+void avio_free_directory_entry(AVIODirEntry **entry);
+
+/**
+ * Allocate and initialize an AVIOContext for buffered I/O. It must be later
+ * freed with av_free().
+ *
+ * @param buffer Memory block for input/output operations via AVIOContext.
+ *        The buffer must be allocated with av_malloc() and friends.
+ *        It may be freed and replaced with a new buffer by libavformat.
+ *        AVIOContext.buffer holds the buffer currently in use,
+ *        which must be later freed with av_free().
+ * @param buffer_size The buffer size is very important for performance.
+ *        For protocols with fixed blocksize it should be set to this blocksize.
+ *        For others a typical size is a cache page, e.g. 4kb.
+ * @param write_flag Set to 1 if the buffer should be writable, 0 otherwise.
+ * @param opaque An opaque pointer to user-specific data.
+ * @param read_packet  A function for refilling the buffer, may be NULL.
+ * @param write_packet A function for writing the buffer contents, may be NULL.
+ *        The function may not change the input buffer's content.
+ * @param seek A function for seeking to specified byte position, may be NULL.
+ *
+ * @return Allocated AVIOContext or NULL on failure.
+ */
+AVIOContext *avio_alloc_context(
+                  unsigned char *buffer,
+                  int buffer_size,
+                  int write_flag,
+                  void *opaque,
+                  int (*read_packet)(void *opaque, uint8_t *buf, int buf_size),
+                  int (*write_packet)(void *opaque, uint8_t *buf, int buf_size),
+                  int64_t (*seek)(void *opaque, int64_t offset, int whence));
+
+void avio_w8(AVIOContext *s, int b);
+void avio_write(AVIOContext *s, const unsigned char *buf, int size);
+void avio_wl64(AVIOContext *s, uint64_t val);
+void avio_wb64(AVIOContext *s, uint64_t val);
+void avio_wl32(AVIOContext *s, unsigned int val);
+void avio_wb32(AVIOContext *s, unsigned int val);
+void avio_wl24(AVIOContext *s, unsigned int val);
+void avio_wb24(AVIOContext *s, unsigned int val);
+void avio_wl16(AVIOContext *s, unsigned int val);
+void avio_wb16(AVIOContext *s, unsigned int val);
+
+/**
+ * Write a NULL-terminated string.
+ * @return number of bytes written.
+ */
+int avio_put_str(AVIOContext *s, const char *str);
+
+/**
+ * Convert a UTF-8 string to UTF-16LE and write it.
+ * @param s the AVIOContext
+ * @param str NULL-terminated UTF-8 string
+ *
+ * @return number of bytes written.
+ */
+int avio_put_str16le(AVIOContext *s, const char *str);
+
+/**
+ * Convert a UTF-8 string to UTF-16BE and write it.
+ * @param s the AVIOContext
+ * @param str NULL-terminated UTF-8 string
+ *
+ * @return number of bytes written.
+ */
+int avio_put_str16be(AVIOContext *s, const char *str);
+
+/**
+ * Mark the written bytestream as a specific type.
+ *
+ * Zero-length ranges are omitted from the output.
+ *
+ * @param time the stream time the current bytestream pos corresponds to
+ *             (in AV_TIME_BASE units), or AV_NOPTS_VALUE if unknown or not
+ *             applicable
+ * @param type the kind of data written starting at the current pos
+ */
+void avio_write_marker(AVIOContext *s, int64_t time, enum AVIODataMarkerType type);
+
+/**
+ * ORing this as the "whence" parameter to a seek function causes it to
+ * return the filesize without seeking anywhere. Supporting this is optional.
+ * If it is not supported then the seek function will return <0.
+ */
+#define AVSEEK_SIZE 0x10000
+
+/**
+ * Passing this flag as the "whence" parameter to a seek function causes it to
+ * seek by any means (like reopening and linear reading) or other normally unreasonable
+ * means that can be extremely slow.
+ * This may be ignored by the seek code.
+ */
+#define AVSEEK_FORCE 0x20000
+
+/**
+ * fseek() equivalent for AVIOContext.
+ * @return new position or AVERROR.
+ */
+int64_t avio_seek(AVIOContext *s, int64_t offset, int whence);
+
+/**
+ * Skip given number of bytes forward
+ * @return new position or AVERROR.
+ */
+int64_t avio_skip(AVIOContext *s, int64_t offset);
+
+/**
+ * ftell() equivalent for AVIOContext, implemented as avio_seek(s, 0, SEEK_CUR).
+ * @return position or AVERROR.
+ */
+static av_always_inline int64_t avio_tell(AVIOContext *s)
+{
+    return avio_seek(s, 0, SEEK_CUR);
+}
+
+/**
+ * Get the filesize.
+ * @return filesize or AVERROR
+ */
+int64_t avio_size(AVIOContext *s);
+
+/**
+ * feof() equivalent for AVIOContext.
+ * @return non zero if and only if end of file
+ */
+int avio_feof(AVIOContext *s);
+#if FF_API_URL_FEOF
+/**
+ * @deprecated use avio_feof()
+ */
+attribute_deprecated
+int url_feof(AVIOContext *s);
+#endif
+
+/** @warning Writes up to 4 KiB per call */
+int avio_printf(AVIOContext *s, const char *fmt, ...) av_printf_format(2, 3);
+
+/**
+ * Force flushing of buffered data.
+ *
+ * For write streams, force the buffered data to be immediately written to the output,
+ * without waiting for the internal buffer to fill.
+ *
+ * For read streams, discard all currently buffered data, and advance the
+ * reported file position to that of the underlying stream. This does not
+ * read new data, and does not perform any seeks.
+ */
+void avio_flush(AVIOContext *s);
+
+/**
+ * Read size bytes from AVIOContext into buf.
+ * @return number of bytes read or AVERROR
+ */
+int avio_read(AVIOContext *s, unsigned char *buf, int size);
+
+/**
+ * @name Functions for reading from AVIOContext
+ * @{
+ *
+ * @note return 0 if EOF, so you cannot use it if EOF handling is
+ *       necessary
+ */
+int          avio_r8  (AVIOContext *s);
+unsigned int avio_rl16(AVIOContext *s);
+unsigned int avio_rl24(AVIOContext *s);
+unsigned int avio_rl32(AVIOContext *s);
+uint64_t     avio_rl64(AVIOContext *s);
+unsigned int avio_rb16(AVIOContext *s);
+unsigned int avio_rb24(AVIOContext *s);
+unsigned int avio_rb32(AVIOContext *s);
+uint64_t     avio_rb64(AVIOContext *s);
+/**
+ * @}
+ */
+
+/**
+ * Read a string from pb into buf. The reading will terminate when either
+ * a NULL character was encountered, maxlen bytes have been read, or nothing
+ * more can be read from pb. The result is guaranteed to be NULL-terminated, it
+ * will be truncated if buf is too small.
+ * Note that the string is not interpreted or validated in any way, it
+ * might get truncated in the middle of a sequence for multi-byte encodings.
+ *
+ * @return number of bytes read (is always <= maxlen).
+ * If reading ends on EOF or error, the return value will be one more than
+ * bytes actually read.
+ */
+int avio_get_str(AVIOContext *pb, int maxlen, char *buf, int buflen);
+
+/**
+ * Read a UTF-16 string from pb and convert it to UTF-8.
+ * The reading will terminate when either a null or invalid character was
+ * encountered or maxlen bytes have been read.
+ * @return number of bytes read (is always <= maxlen)
+ */
+int avio_get_str16le(AVIOContext *pb, int maxlen, char *buf, int buflen);
+int avio_get_str16be(AVIOContext *pb, int maxlen, char *buf, int buflen);
+
+
+/**
+ * @name URL open modes
+ * The flags argument to avio_open must be one of the following
+ * constants, optionally ORed with other flags.
+ * @{
+ */
+#define AVIO_FLAG_READ  1                                      /**< read-only */
+#define AVIO_FLAG_WRITE 2                                      /**< write-only */
+#define AVIO_FLAG_READ_WRITE (AVIO_FLAG_READ|AVIO_FLAG_WRITE)  /**< read-write pseudo flag */
+/**
+ * @}
+ */
+
+/**
+ * Use non-blocking mode.
+ * If this flag is set, operations on the context will return
+ * AVERROR(EAGAIN) if they can not be performed immediately.
+ * If this flag is not set, operations on the context will never return
+ * AVERROR(EAGAIN).
+ * Note that this flag does not affect the opening/connecting of the
+ * context. Connecting a protocol will always block if necessary (e.g. on
+ * network protocols) but never hang (e.g. on busy devices).
+ * Warning: non-blocking protocol support is a work in progress; this flag may be
+ * silently ignored.
+ */
+#define AVIO_FLAG_NONBLOCK 8
+
+/**
+ * Use direct mode.
+ * avio_read and avio_write should if possible be satisfied directly
+ * instead of going through a buffer, and avio_seek will always
+ * call the underlying seek function directly.
+ */
+#define AVIO_FLAG_DIRECT 0x8000
+
+/**
+ * Create and initialize an AVIOContext for accessing the
+ * resource indicated by url.
+ * @note When the resource indicated by url has been opened in
+ * read+write mode, the AVIOContext can be used only for writing.
+ *
+ * @param s Used to return the pointer to the created AVIOContext.
+ * In case of failure the pointed to value is set to NULL.
+ * @param url resource to access
+ * @param flags flags which control how the resource indicated by url
+ * is to be opened
+ * @return >= 0 in case of success, a negative value corresponding to an
+ * AVERROR code in case of failure
+ */
+int avio_open(AVIOContext **s, const char *url, int flags);
+
+/**
+ * Create and initialize an AVIOContext for accessing the
+ * resource indicated by url.
+ * @note When the resource indicated by url has been opened in
+ * read+write mode, the AVIOContext can be used only for writing.
+ *
+ * @param s Used to return the pointer to the created AVIOContext.
+ * In case of failure the pointed to value is set to NULL.
+ * @param url resource to access
+ * @param flags flags which control how the resource indicated by url
+ * is to be opened
+ * @param int_cb an interrupt callback to be used at the protocols level
+ * @param options  A dictionary filled with protocol-private options. On return
+ * this parameter will be destroyed and replaced with a dict containing options
+ * that were not found. May be NULL.
+ * @return >= 0 in case of success, a negative value corresponding to an
+ * AVERROR code in case of failure
+ */
+int avio_open2(AVIOContext **s, const char *url, int flags,
+               const AVIOInterruptCB *int_cb, AVDictionary **options);
+
+/**
+ * Close the resource accessed by the AVIOContext s and free it.
+ * This function can only be used if s was opened by avio_open().
+ *
+ * The internal buffer is automatically flushed before closing the
+ * resource.
+ *
+ * @return 0 on success, an AVERROR < 0 on error.
+ * @see avio_closep
+ */
+int avio_close(AVIOContext *s);
+
+/**
+ * Close the resource accessed by the AVIOContext *s, free it
+ * and set the pointer pointing to it to NULL.
+ * This function can only be used if s was opened by avio_open().
+ *
+ * The internal buffer is automatically flushed before closing the
+ * resource.
+ *
+ * @return 0 on success, an AVERROR < 0 on error.
+ * @see avio_close
+ */
+int avio_closep(AVIOContext **s);
+
+
+/**
+ * Open a write only memory stream.
+ *
+ * @param s new IO context
+ * @return zero if no error.
+ */
+int avio_open_dyn_buf(AVIOContext **s);
+
+/**
+ * Return the written size and a pointer to the buffer.
+ * The AVIOContext stream is left intact.
+ * The buffer must NOT be freed.
+ * No padding is added to the buffer.
+ *
+ * @param s IO context
+ * @param pbuffer pointer to a byte buffer
+ * @return the length of the byte buffer
+ */
+int avio_get_dyn_buf(AVIOContext *s, uint8_t **pbuffer);
+
+/**
+ * Return the written size and a pointer to the buffer. The buffer
+ * must be freed with av_free().
+ * Padding of AV_INPUT_BUFFER_PADDING_SIZE is added to the buffer.
+ *
+ * @param s IO context
+ * @param pbuffer pointer to a byte buffer
+ * @return the length of the byte buffer
+ */
+int avio_close_dyn_buf(AVIOContext *s, uint8_t **pbuffer);
+
+/**
+ * Iterate through names of available protocols.
+ *
+ * @param opaque A private pointer representing current protocol.
+ *        It must be a pointer to NULL on first iteration and will
+ *        be updated by successive calls to avio_enum_protocols.
+ * @param output If set to 1, iterate over output protocols,
+ *               otherwise over input protocols.
+ *
+ * @return A static string containing the name of current protocol or NULL
+ */
+const char *avio_enum_protocols(void **opaque, int output);
+
+/**
+ * Pause and resume playing - only meaningful if using a network streaming
+ * protocol (e.g. MMS).
+ *
+ * @param h     IO context from which to call the read_pause function pointer
+ * @param pause 1 for pause, 0 for resume
+ */
+int     avio_pause(AVIOContext *h, int pause);
+
+/**
+ * Seek to a given timestamp relative to some component stream.
+ * Only meaningful if using a network streaming protocol (e.g. MMS).
+ *
+ * @param h IO context from which to call the seek function pointers
+ * @param stream_index The stream index that the timestamp is relative to.
+ *        If stream_index is (-1) the timestamp should be in AV_TIME_BASE
+ *        units from the beginning of the presentation.
+ *        If a stream_index >= 0 is used and the protocol does not support
+ *        seeking based on component streams, the call will fail.
+ * @param timestamp timestamp in AVStream.time_base units
+ *        or if there is no stream specified then in AV_TIME_BASE units.
+ * @param flags Optional combination of AVSEEK_FLAG_BACKWARD, AVSEEK_FLAG_BYTE
+ *        and AVSEEK_FLAG_ANY. The protocol may silently ignore
+ *        AVSEEK_FLAG_BACKWARD and AVSEEK_FLAG_ANY, but AVSEEK_FLAG_BYTE will
+ *        fail if used and not supported.
+ * @return >= 0 on success
+ * @see AVInputFormat::read_seek
+ */
+int64_t avio_seek_time(AVIOContext *h, int stream_index,
+                       int64_t timestamp, int flags);
+
+/* Avoid a warning. The header can not be included because it breaks c++. */
+struct AVBPrint;
+
+/**
+ * Read contents of h into print buffer, up to max_size bytes, or up to EOF.
+ *
+ * @return 0 for success (max_size bytes read or EOF reached), negative error
+ * code otherwise
+ */
+int avio_read_to_bprint(AVIOContext *h, struct AVBPrint *pb, size_t max_size);
+
+/**
+ * Accept and allocate a client context on a server context.
+ * @param  s the server context
+ * @param  c the client context, must be unallocated
+ * @return   >= 0 on success or a negative value corresponding
+ *           to an AVERROR on failure
+ */
+int avio_accept(AVIOContext *s, AVIOContext **c);
+
+/**
+ * Perform one step of the protocol handshake to accept a new client.
+ * This function must be called on a client returned by avio_accept() before
+ * using it as a read/write context.
+ * It is separate from avio_accept() because it may block.
+ * A step of the handshake is defined by places where the application may
+ * decide to change the proceedings.
+ * For example, on a protocol with a request header and a reply header, each
+ * one can constitute a step because the application may use the parameters
+ * from the request to change parameters in the reply; or each individual
+ * chunk of the request can constitute a step.
+ * If the handshake is already finished, avio_handshake() does nothing and
+ * returns 0 immediately.
+ *
+ * @param  c the client context to perform the handshake on
+ * @return   0   on a complete and successful handshake
+ *           > 0 if the handshake progressed, but is not complete
+ *           < 0 for an AVERROR code
+ */
+int avio_handshake(AVIOContext *c);
+#endif /* AVFORMAT_AVIO_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/version.h
new file mode 100644
index 0000000..fc054ee
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavformat/version.h
@@ -0,0 +1,105 @@
+/*
+ * Version macros.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFORMAT_VERSION_H
+#define AVFORMAT_VERSION_H
+
+/**
+ * @file
+ * @ingroup libavf
+ * Libavformat version macros
+ */
+
+#include "libavutil/version.h"
+
+// Major bumping may affect Ticket5467, 5421, 5451(compatibility with Chromium)
+// Also please add any ticket numbers that you believe might be affected here
+#define LIBAVFORMAT_VERSION_MAJOR  57
+#define LIBAVFORMAT_VERSION_MINOR  71
+#define LIBAVFORMAT_VERSION_MICRO 100
+
+#define LIBAVFORMAT_VERSION_INT AV_VERSION_INT(LIBAVFORMAT_VERSION_MAJOR, \
+                                               LIBAVFORMAT_VERSION_MINOR, \
+                                               LIBAVFORMAT_VERSION_MICRO)
+#define LIBAVFORMAT_VERSION     AV_VERSION(LIBAVFORMAT_VERSION_MAJOR,   \
+                                           LIBAVFORMAT_VERSION_MINOR,   \
+                                           LIBAVFORMAT_VERSION_MICRO)
+#define LIBAVFORMAT_BUILD       LIBAVFORMAT_VERSION_INT
+
+#define LIBAVFORMAT_IDENT       "Lavf" AV_STRINGIFY(LIBAVFORMAT_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ *
+ * @note, when bumping the major version it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
+ *
+ */
+#ifndef FF_API_LAVF_BITEXACT
+#define FF_API_LAVF_BITEXACT            (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_FRAC
+#define FF_API_LAVF_FRAC                (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_CODEC_TB
+#define FF_API_LAVF_CODEC_TB            (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_URL_FEOF
+#define FF_API_URL_FEOF                 (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_FMT_RAWPICTURE
+#define FF_API_LAVF_FMT_RAWPICTURE      (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_COMPUTE_PKT_FIELDS2
+#define FF_API_COMPUTE_PKT_FIELDS2      (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_OLD_OPEN_CALLBACKS
+#define FF_API_OLD_OPEN_CALLBACKS       (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_AVCTX
+#define FF_API_LAVF_AVCTX               (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_NOCONST_GET_SIDE_DATA
+#define FF_API_NOCONST_GET_SIDE_DATA    (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_HTTP_USER_AGENT
+#define FF_API_HTTP_USER_AGENT          (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_HLS_WRAP
+#define FF_API_HLS_WRAP                 (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_MERGE_SD
+#define FF_API_LAVF_MERGE_SD            (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_LAVF_KEEPSIDE_FLAG
+#define FF_API_LAVF_KEEPSIDE_FLAG       (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+#ifndef FF_API_OLD_ROTATE_API
+#define FF_API_OLD_ROTATE_API           (LIBAVFORMAT_VERSION_MAJOR < 58)
+#endif
+
+
+#ifndef FF_API_R_FRAME_RATE
+#define FF_API_R_FRAME_RATE            1
+#endif
+#endif /* AVFORMAT_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/adler32.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/adler32.h
new file mode 100644
index 0000000..a1f035b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/adler32.h
@@ -0,0 +1,60 @@
+/*
+ * copyright (c) 2006 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_adler32
+ * Public header for Adler-32 hash function implementation.
+ */
+
+#ifndef AVUTIL_ADLER32_H
+#define AVUTIL_ADLER32_H
+
+#include <stdint.h>
+#include "attributes.h"
+
+/**
+ * @defgroup lavu_adler32 Adler-32
+ * @ingroup lavu_hash
+ * Adler-32 hash function implementation.
+ *
+ * @{
+ */
+
+/**
+ * Calculate the Adler32 checksum of a buffer.
+ *
+ * Passing the return value to a subsequent av_adler32_update() call
+ * allows the checksum of multiple buffers to be calculated as though
+ * they were concatenated.
+ *
+ * @param adler initial checksum value
+ * @param buf   pointer to input buffer
+ * @param len   size of input buffer
+ * @return      updated checksum
+ */
+unsigned long av_adler32_update(unsigned long adler, const uint8_t *buf,
+                                unsigned int len) av_pure;
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_ADLER32_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes.h
new file mode 100644
index 0000000..09efbda
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes.h
@@ -0,0 +1,65 @@
+/*
+ * copyright (c) 2007 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AES_H
+#define AVUTIL_AES_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_aes AES
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+extern const int av_aes_size;
+
+struct AVAES;
+
+/**
+ * Allocate an AVAES context.
+ */
+struct AVAES *av_aes_alloc(void);
+
+/**
+ * Initialize an AVAES context.
+ * @param key_bits 128, 192 or 256
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+int av_aes_init(struct AVAES *a, const uint8_t *key, int key_bits, int decrypt);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context.
+ * @param count number of 16 byte blocks
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param iv initialization vector for CBC mode, if NULL then ECB will be used
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_aes_crypt(struct AVAES *a, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_AES_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes_ctr.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes_ctr.h
new file mode 100644
index 0000000..f596fa6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/aes_ctr.h
@@ -0,0 +1,83 @@
+/*
+ * AES-CTR cipher
+ * Copyright (c) 2015 Eran Kornblau <erankor at gmail dot com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AES_CTR_H
+#define AVUTIL_AES_CTR_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+#define AES_CTR_KEY_SIZE (16)
+#define AES_CTR_IV_SIZE (8)
+
+struct AVAESCTR;
+
+/**
+ * Allocate an AVAESCTR context.
+ */
+struct AVAESCTR *av_aes_ctr_alloc(void);
+
+/**
+ * Initialize an AVAESCTR context.
+ * @param key encryption key, must have a length of AES_CTR_KEY_SIZE
+ */
+int av_aes_ctr_init(struct AVAESCTR *a, const uint8_t *key);
+
+/**
+ * Release an AVAESCTR context.
+ */
+void av_aes_ctr_free(struct AVAESCTR *a);
+
+/**
+ * Process a buffer using a previously initialized context.
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param size the size of src and dst
+ */
+void av_aes_ctr_crypt(struct AVAESCTR *a, uint8_t *dst, const uint8_t *src, int size);
+
+/**
+ * Get the current iv
+ */
+const uint8_t* av_aes_ctr_get_iv(struct AVAESCTR *a);
+
+/**
+ * Generate a random iv
+ */
+void av_aes_ctr_set_random_iv(struct AVAESCTR *a);
+
+/**
+ * Forcefully change the iv
+ */
+void av_aes_ctr_set_iv(struct AVAESCTR *a, const uint8_t* iv);
+
+/**
+ * Increment the top 64 bits of the iv (performed after each frame)
+ */
+void av_aes_ctr_increment_iv(struct AVAESCTR *a);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_AES_CTR_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/attributes.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/attributes.h
new file mode 100644
index 0000000..54d1901
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/attributes.h
@@ -0,0 +1,167 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Macro definitions for various function/variable attributes
+ */
+
+#ifndef AVUTIL_ATTRIBUTES_H
+#define AVUTIL_ATTRIBUTES_H
+
+#ifdef __GNUC__
+#    define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
+#    define AV_GCC_VERSION_AT_MOST(x,y)  (__GNUC__ < (x) || (__GNUC__ == (x) && __GNUC_MINOR__ <= (y)))
+#else /* __GNUC__ not defined: every version check evaluates to false */
+#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
+#    define AV_GCC_VERSION_AT_MOST(x,y)  0
+#endif /* __GNUC__ */
+
+#ifndef av_always_inline
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_always_inline __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#    define av_always_inline __forceinline
+#else
+#    define av_always_inline inline
+#endif
+#endif
+
+#ifndef av_extern_inline
+#if defined(__ICL) && __ICL >= 1210 || defined(__GNUC_STDC_INLINE__)
+#    define av_extern_inline extern inline
+#else
+#    define av_extern_inline inline
+#endif
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4)
+#    define av_warn_unused_result __attribute__((warn_unused_result))
+#else
+#    define av_warn_unused_result
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_noinline __attribute__((noinline))
+#elif defined(_MSC_VER)
+#    define av_noinline __declspec(noinline)
+#else
+#    define av_noinline
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_pure __attribute__((pure))
+#else
+#    define av_pure
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(2,6)
+#    define av_const __attribute__((const))
+#else
+#    define av_const
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(4,3)
+#    define av_cold __attribute__((cold))
+#else
+#    define av_cold
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(4,1) && !defined(__llvm__)
+#    define av_flatten __attribute__((flatten))
+#else
+#    define av_flatten
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define attribute_deprecated __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#    define attribute_deprecated __declspec(deprecated)
+#else
+#    define attribute_deprecated
+#endif
+
+/**
+ * Disable warnings about deprecated features
+ * This is useful for sections of code kept for backward compatibility and
+ * scheduled for removal.
+ */
+#ifndef AV_NOWARN_DEPRECATED
+#if AV_GCC_VERSION_AT_LEAST(4,6)
+#    define AV_NOWARN_DEPRECATED(code) \
+        _Pragma("GCC diagnostic push") \
+        _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") \
+        code \
+        _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#    define AV_NOWARN_DEPRECATED(code) \
+        __pragma(warning(push)) \
+        __pragma(warning(disable : 4996)) \
+        code; \
+        __pragma(warning(pop))
+#else
+#    define AV_NOWARN_DEPRECATED(code) code
+#endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#    define av_unused __attribute__((unused))
+#else
+#    define av_unused
+#endif
+
+/**
+ * Mark a variable as used and prevent the compiler from optimizing it
+ * away.  This is useful for variables accessed only from inline
+ * assembler without the compiler being aware.
+ */
+#if AV_GCC_VERSION_AT_LEAST(3,1) || defined(__clang__)
+#    define av_used __attribute__((used))
+#else
+#    define av_used
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,3)
+#   define av_alias __attribute__((may_alias))
+#else
+#   define av_alias
+#endif
+
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#    define av_uninit(x) x=x
+#else
+#    define av_uninit(x) x
+#endif
+
+#ifdef __GNUC__
+#    define av_builtin_constant_p __builtin_constant_p
+#    define av_printf_format(fmtpos, attrpos) __attribute__((__format__(__printf__, fmtpos, attrpos)))
+#else
+#    define av_builtin_constant_p(x) 0
+#    define av_printf_format(fmtpos, attrpos)
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(2,5)
+#    define av_noreturn __attribute__((noreturn))
+#else
+#    define av_noreturn
+#endif
+
+#endif /* AVUTIL_ATTRIBUTES_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/audio_fifo.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/audio_fifo.h
new file mode 100644
index 0000000..d8a9194
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/audio_fifo.h
@@ -0,0 +1,187 @@
+/*
+ * Audio FIFO
+ * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Audio FIFO Buffer
+ */
+
+#ifndef AVUTIL_AUDIO_FIFO_H
+#define AVUTIL_AUDIO_FIFO_H
+
+#include "avutil.h"
+#include "fifo.h"
+#include "samplefmt.h"
+
+/**
+ * @addtogroup lavu_audio
+ * @{
+ *
+ * @defgroup lavu_audiofifo Audio FIFO Buffer
+ * @{
+ */
+
+/**
+ * Context for an Audio FIFO Buffer.
+ *
+ * - Operates at the sample level rather than the byte level.
+ * - Supports multiple channels with either planar or packed sample format.
+ * - Automatic reallocation when writing to a full buffer.
+ */
+typedef struct AVAudioFifo AVAudioFifo;
+
+/**
+ * Free an AVAudioFifo.
+ *
+ * @param af  AVAudioFifo to free
+ */
+void av_audio_fifo_free(AVAudioFifo *af);
+
+/**
+ * Allocate an AVAudioFifo.
+ *
+ * @param sample_fmt  sample format
+ * @param channels    number of channels
+ * @param nb_samples  initial allocation size, in samples
+ * @return            newly allocated AVAudioFifo, or NULL on error
+ */
+AVAudioFifo *av_audio_fifo_alloc(enum AVSampleFormat sample_fmt, int channels,
+                                 int nb_samples);
+
+/**
+ * Reallocate an AVAudioFifo.
+ *
+ * @param af          AVAudioFifo to reallocate
+ * @param nb_samples  new allocation size, in samples
+ * @return            0 if OK, or negative AVERROR code on failure
+ */
+av_warn_unused_result
+int av_audio_fifo_realloc(AVAudioFifo *af, int nb_samples);
+
+/**
+ * Write data to an AVAudioFifo.
+ *
+ * The AVAudioFifo will be reallocated automatically if the available space
+ * is less than nb_samples.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param af          AVAudioFifo to write to
+ * @param data        audio data plane pointers
+ * @param nb_samples  number of samples to write
+ * @return            number of samples actually written, or negative AVERROR
+ *                    code on failure. If successful, the number of samples
+ *                    actually written will always be nb_samples.
+ */
+int av_audio_fifo_write(AVAudioFifo *af, void **data, int nb_samples);
+
+/**
+ * Peek data from an AVAudioFifo.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param af          AVAudioFifo to read from
+ * @param data        audio data plane pointers
+ * @param nb_samples  number of samples to peek
+ * @return            number of samples actually peeked, or negative AVERROR code
+ *                    on failure. The number of samples actually peeked will not
+ *                    be greater than nb_samples, and will only be less than
+ *                    nb_samples if av_audio_fifo_size is less than nb_samples.
+ */
+int av_audio_fifo_peek(AVAudioFifo *af, void **data, int nb_samples);
+
+/**
+ * Peek data from an AVAudioFifo, starting at an offset from the read position.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param af          AVAudioFifo to read from
+ * @param data        audio data plane pointers
+ * @param nb_samples  number of samples to peek
+ * @param offset      offset from current read position
+ * @return            number of samples actually peeked, or negative AVERROR code
+ *                    on failure. The number of samples actually peeked will not
+ *                    be greater than nb_samples, and will only be less than
+ *                    nb_samples if av_audio_fifo_size is less than nb_samples.
+ */
+int av_audio_fifo_peek_at(AVAudioFifo *af, void **data, int nb_samples, int offset);
+
+/**
+ * Read data from an AVAudioFifo.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param af          AVAudioFifo to read from
+ * @param data        audio data plane pointers
+ * @param nb_samples  number of samples to read
+ * @return            number of samples actually read, or negative AVERROR code
+ *                    on failure. The number of samples actually read will not
+ *                    be greater than nb_samples, and will only be less than
+ *                    nb_samples if av_audio_fifo_size is less than nb_samples.
+ */
+int av_audio_fifo_read(AVAudioFifo *af, void **data, int nb_samples);
+
+/**
+ * Drain data from an AVAudioFifo.
+ *
+ * Removes the data without reading it.
+ *
+ * @param af          AVAudioFifo to drain
+ * @param nb_samples  number of samples to drain
+ * @return            0 if OK, or negative AVERROR code on failure
+ */
+int av_audio_fifo_drain(AVAudioFifo *af, int nb_samples);
+
+/**
+ * Reset the AVAudioFifo buffer.
+ *
+ * This empties all data in the buffer.
+ *
+ * @param af  AVAudioFifo to reset
+ */
+void av_audio_fifo_reset(AVAudioFifo *af);
+
+/**
+ * Get the current number of samples in the AVAudioFifo available for reading.
+ *
+ * @param af  the AVAudioFifo to query
+ * @return    number of samples available for reading
+ */
+int av_audio_fifo_size(AVAudioFifo *af);
+
+/**
+ * Get the current number of samples in the AVAudioFifo available for writing.
+ *
+ * @param af  the AVAudioFifo to query
+ * @return    number of samples available for writing
+ */
+int av_audio_fifo_space(AVAudioFifo *af);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_AUDIO_FIFO_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avassert.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avassert.h
new file mode 100644
index 0000000..46f3fea
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avassert.h
@@ -0,0 +1,75 @@
+/*
+ * copyright (c) 2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * simple assert() macros that are a bit more flexible than ISO C assert().
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVUTIL_AVASSERT_H
+#define AVUTIL_AVASSERT_H
+
+#include <stdlib.h>
+#include "avutil.h"
+#include "log.h"
+
+/**
+ * Always-enabled assert(): logs the condition at AV_LOG_PANIC, then abort()s.
+ */
+#define av_assert0(cond) do {                                           \
+    if (!(cond)) {                                                      \
+        av_log(NULL, AV_LOG_PANIC, "Assertion %s failed at %s:%d\n",    \
+               AV_STRINGIFY(cond), __FILE__, __LINE__);                 \
+        abort();                                                        \
+    }                                                                   \
+} while (0)
+
+
+/**
+ * assert() equivalent for code that is not speed critical. It forwards to
+ * av_assert0() only when ASSERT_LEVEL > 0 and is a no-op otherwise.
+ */
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 0
+#define av_assert1(cond) av_assert0(cond)
+#else
+#define av_assert1(cond) ((void)0)
+#endif
+
+
+/**
+ * assert() for speed critical code; active only when ASSERT_LEVEL > 1.
+ */
+#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
+#define av_assert2(cond) av_assert0(cond)
+#define av_assert2_fpu() av_assert0_fpu()
+#else
+#define av_assert2(cond) ((void)0)
+#define av_assert2_fpu() ((void)0)
+#endif
+
+/**
+ * Assert that floating point operations can be executed.
+ *
+ * This will av_assert0() that the cpu is not in MMX state on X86
+ */
+void av_assert0_fpu(void);
+
+#endif /* AVUTIL_AVASSERT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avconfig.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avconfig.h
new file mode 100644
index 0000000..f10aa61
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avconfig.h
@@ -0,0 +1,6 @@
+/* Generated by ffconf */
+#ifndef AVUTIL_AVCONFIG_H
+#define AVUTIL_AVCONFIG_H
+#define AV_HAVE_BIGENDIAN 0
+#define AV_HAVE_FAST_UNALIGNED 1
+#endif /* AVUTIL_AVCONFIG_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avstring.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avstring.h
new file mode 100644
index 0000000..dd28769
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avstring.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2007 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AVSTRING_H
+#define AVUTIL_AVSTRING_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "attributes.h"
+
+/**
+ * @addtogroup lavu_string
+ * @{
+ */
+
+/**
+ * Return non-zero if pfx is a prefix of str. If it is, *ptr is set to
+ * the address of the first character in str after the prefix.
+ *
+ * @param str input string
+ * @param pfx prefix to test
+ * @param ptr updated if the prefix is matched inside str
+ * @return non-zero if the prefix matches, zero otherwise
+ */
+int av_strstart(const char *str, const char *pfx, const char **ptr);
+
+/**
+ * Return non-zero if pfx is a prefix of str independent of case. If
+ * it is, *ptr is set to the address of the first character in str
+ * after the prefix.
+ *
+ * @param str input string
+ * @param pfx prefix to test
+ * @param ptr updated if the prefix is matched inside str
+ * @return non-zero if the prefix matches, zero otherwise
+ */
+int av_stristart(const char *str, const char *pfx, const char **ptr);
+
+/**
+ * Locate the first case-independent occurrence in the string haystack
+ * of the string needle.  A zero-length string needle is considered to
+ * match at the start of haystack.
+ *
+ * This function is a case-insensitive version of the standard strstr().
+ *
+ * @param haystack string to search in
+ * @param needle   string to search for
+ * @return         pointer to the located match within haystack
+ *                 or a null pointer if no match
+ */
+char *av_stristr(const char *haystack, const char *needle);
+
+/**
+ * Locate the first occurrence of the string needle in the string haystack
+ * where not more than hay_length characters are searched. A zero-length
+ * string needle is considered to match at the start of haystack.
+ *
+ * This function is a length-limited version of the standard strstr().
+ *
+ * @param haystack   string to search in
+ * @param needle     string to search for
+ * @param hay_length length of string to search in
+ * @return           pointer to the located match within haystack
+ *                   or a null pointer if no match
+ */
+char *av_strnstr(const char *haystack, const char *needle, size_t hay_length);
+
+/**
+ * Copy the string src to dst, but no more than size - 1 bytes, and
+ * null-terminate dst.
+ *
+ * This function is the same as BSD strlcpy().
+ *
+ * @param dst destination buffer
+ * @param src source string
+ * @param size size of destination buffer
+ * @return the length of src
+ *
+ * @warning since the return value is the length of src, src absolutely
+ * _must_ be a properly 0-terminated string, otherwise this will read beyond
+ * the end of the buffer and possibly crash.
+ */
+size_t av_strlcpy(char *dst, const char *src, size_t size);
+
+/**
+ * Append the string src to the string dst, but to a total length of
+ * no more than size - 1 bytes, and null-terminate dst.
+ *
+ * This function is similar to BSD strlcat(), but differs when
+ * size <= strlen(dst).
+ *
+ * @param dst destination buffer
+ * @param src source string
+ * @param size size of destination buffer
+ * @return the total length of src and dst
+ *
+ * @warning since the return value uses the lengths of src and dst, these
+ * absolutely _must_ be properly 0-terminated strings, otherwise this
+ * will read beyond the end of the buffer and possibly crash.
+ */
+size_t av_strlcat(char *dst, const char *src, size_t size);
+
+/**
+ * Append output to a string, according to a format. Never write out of
+ * the destination buffer, and always put a terminating 0 within
+ * the buffer.
+ * @param dst destination buffer (string to which the output is
+ *  appended)
+ * @param size total size of the destination buffer
+ * @param fmt printf-compatible format string, specifying how the
+ *  following parameters are used
+ * @return the length of the string that would have been generated
+ *  if enough space had been available
+ */
+size_t av_strlcatf(char *dst, size_t size, const char *fmt, ...) av_printf_format(3, 4);
+
+/**
+ * Count the leading non-zero chars of s, looking at no more than len of them.
+ *
+ * @param len upper bound on the number of characters examined, which is
+ *            therefore also the largest value the function can return
+ */
+static inline size_t av_strnlen(const char *s, size_t len)
+{
+    size_t n = 0;
+    while (n < len && s[n])
+        n++;
+    return n;
+}
+
+/**
+ * Print arguments following specified format into a large enough auto
+ * allocated buffer. It is similar to GNU asprintf().
+ * @param fmt printf-compatible format string, specifying how the
+ *            following parameters are used.
+ * @return the allocated string
+ * @note You have to free the string yourself with av_free().
+ */
+char *av_asprintf(const char *fmt, ...) av_printf_format(1, 2);
+
+/**
+ * Convert a number to an av_malloced string.
+ */
+char *av_d2str(double d);
+
+/**
+ * Unescape the given string until a non escaped terminating char,
+ * and return the token corresponding to the unescaped string.
+ *
+ * The normal \ and ' escaping is supported. Leading and trailing
+ * whitespaces are removed, unless they are escaped with '\' or are
+ * enclosed between ''.
+ *
+ * @param buf the buffer to parse, buf will be updated to point to the
+ * terminating char
+ * @param term a 0-terminated list of terminating chars
+ * @return the malloced unescaped string, which must be av_freed by
+ * the user, NULL in case of allocation failure
+ */
+char *av_get_token(const char **buf, const char *term);
+
+/**
+ * Split the string into several tokens which can be accessed by
+ * successive calls to av_strtok().
+ *
+ * A token is defined as a sequence of characters not belonging to the
+ * set specified in delim.
+ *
+ * On the first call to av_strtok(), s should point to the string to
+ * parse, and the value of saveptr is ignored. In subsequent calls, s
+ * should be NULL, and saveptr should be unchanged since the previous
+ * call.
+ *
+ * This function is similar to strtok_r() defined in POSIX.1.
+ *
+ * @param s the string to parse, may be NULL
+ * @param delim 0-terminated list of token delimiters, must be non-NULL
+ * @param saveptr user-provided pointer which points to stored
+ * information necessary for av_strtok() to continue scanning the same
+ * string. saveptr is updated to point to the next character after the
+ * first delimiter found, or to NULL if the string was terminated
+ * @return the found token, or NULL when no token is found
+ */
+char *av_strtok(char *s, const char *delim, char **saveptr);
+
+/**
+ * Locale-independent ASCII isdigit(): true only for '0' through '9'.
+ */
+static inline av_const int av_isdigit(int c)
+{
+    return !(c < '0' || c > '9');
+}
+
+/**
+ * Locale-independent ASCII isgraph(): true only for codes 33 through 126.
+ */
+static inline av_const int av_isgraph(int c)
+{
+    return 33 <= c && c <= 126;
+}
+
+/**
+ * Locale-independent ASCII isspace(): tab, newline, vertical tab, form feed, carriage return, space.
+ */
+static inline av_const int av_isspace(int c)
+{
+    return c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r' ||
+           c == ' ';
+}
+
+/**
+ * Locale-independent ASCII toupper(): only 'a' through 'z' are changed.
+ */
+static inline av_const int av_toupper(int c)
+{
+    /* flipping bit 0x20 of a lowercase ASCII letter gives its uppercase
+     * form; every other value is returned untouched */
+    return ('a' <= c && c <= 'z') ? (c ^ 0x20) : c;
+}
+
+/**
+ * Locale-independent ASCII tolower(): only 'A' through 'Z' are changed.
+ */
+static inline av_const int av_tolower(int c)
+{
+    /* flipping bit 0x20 of an uppercase ASCII letter gives its lowercase
+     * form; every other value is returned untouched */
+    return ('A' <= c && c <= 'Z') ? (c ^ 0x20) : c;
+}
+
+/**
+ * Locale-independent ASCII isxdigit(): '0'-'9', 'a'-'f' and 'A'-'F'.
+ */
+static inline av_const int av_isxdigit(int c)
+{
+    const int lc = av_tolower(c); /* fold 'A'..'F' onto 'a'..'f' */
+    return ('a' <= lc && lc <= 'f') || av_isdigit(lc);
+}
+
+/**
+ * Locale-independent case-insensitive compare.
+ * @note This means only ASCII-range characters are case-insensitive
+ */
+int av_strcasecmp(const char *a, const char *b);
+
+/**
+ * Locale-independent case-insensitive compare.
+ * @note This means only ASCII-range characters are case-insensitive
+ */
+int av_strncasecmp(const char *a, const char *b, size_t n);
+
+
+/**
+ * Thread safe basename.
+ * @param path the path, on DOS both \ and / are considered separators.
+ * @return pointer to the basename substring.
+ */
+const char *av_basename(const char *path);
+
+/**
+ * Thread safe dirname.
+ * @param path the path, on DOS both \ and / are considered separators.
+ * @return the path with the separator replaced by the string terminator or ".".
+ * @note the function may change the input string.
+ */
+const char *av_dirname(char *path);
+
+/**
+ * Match instances of a name in a comma-separated list of names.
+ * List entries are checked from the start to the end of the names list,
+ * the first match ends further processing. If an entry prefixed with '-'
+ * matches, then 0 is returned. The "ALL" list entry is considered to
+ * match all names.
+ *
+ * @param name  Name to look for.
+ * @param names List of names.
+ * @return 1 on match, 0 otherwise.
+ */
+int av_match_name(const char *name, const char *names);
+
+/**
+ * Append path component to the existing path.
+ * Path separator '/' is placed between when needed.
+ * The resulting string has to be freed with av_free().
+ * @param path      base path
+ * @param component component to be appended
+ * @return new path or NULL on error.
+ */
+char *av_append_path_component(const char *path, const char *component);
+
+enum AVEscapeMode {
+    AV_ESCAPE_MODE_AUTO      = 0, ///< Escaping mode is selected automatically.
+    AV_ESCAPE_MODE_BACKSLASH = 1, ///< Escape with backslashes.
+    AV_ESCAPE_MODE_QUOTE     = 2, ///< Escape with single quotes.
+};
+
+/**
+ * Consider spaces special and escape them even in the middle of the
+ * string.
+ *
+ * This is equivalent to adding the whitespace characters to the special
+ * characters lists, except it is guaranteed to use the exact same list
+ * of whitespace characters as the rest of libavutil.
+ */
+#define AV_ESCAPE_FLAG_WHITESPACE (1 << 0)
+
+/**
+ * Escape only specified special characters.
+ * Without this flag, escape also any characters that may be considered
+ * special by av_get_token(), such as the single quote.
+ */
+#define AV_ESCAPE_FLAG_STRICT (1 << 1)
+
+/**
+ * Escape string in src, and put the escaped string in an allocated
+ * string in *dst, which must be freed with av_free().
+ *
+ * @param dst           pointer where an allocated string is put
+ * @param src           string to escape, must be non-NULL
+ * @param special_chars string containing the special characters which
+ *                      need to be escaped, can be NULL
+ * @param mode          escape mode to employ, see AV_ESCAPE_MODE_* macros.
+ *                      Any unknown value for mode will be considered equivalent to
+ *                      AV_ESCAPE_MODE_BACKSLASH, but this behaviour can change without
+ *                      notice.
+ * @param flags         flags which control how to escape, see AV_ESCAPE_FLAG_ macros
+ * @return the length of the allocated string, or a negative error code in case of error
+ * @see av_bprint_escape()
+ */
+av_warn_unused_result
+int av_escape(char **dst, const char *src, const char *special_chars,
+              enum AVEscapeMode mode, int flags);
+
+#define AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES          1 ///< accept codepoints over 0x10FFFF
+#define AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS             2 ///< accept non-characters - 0xFFFE and 0xFFFF
+#define AV_UTF8_FLAG_ACCEPT_SURROGATES                 4 ///< accept UTF-16 surrogate codes
+#define AV_UTF8_FLAG_EXCLUDE_XML_INVALID_CONTROL_CODES 8 ///< exclude control codes not accepted by XML
+/** All three ACCEPT flags OR-ed; parenthesized so "x & AV_UTF8_FLAG_ACCEPT_ALL" masks against the whole set. */
+#define AV_UTF8_FLAG_ACCEPT_ALL \
+    (AV_UTF8_FLAG_ACCEPT_INVALID_BIG_CODES|AV_UTF8_FLAG_ACCEPT_NON_CHARACTERS|AV_UTF8_FLAG_ACCEPT_SURROGATES)
+
+/**
+ * Read and decode a single UTF-8 code point (character) from the
+ * buffer in *buf, and update *buf to point to the next byte to
+ * decode.
+ *
+ * In case of an invalid byte sequence, the pointer will be updated to
+ * the next byte after the invalid sequence and the function will
+ * return an error code.
+ *
+ * Depending on the specified flags, the function will also fail in
+ * case the decoded code point does not belong to a valid range.
+ *
+ * @note For speed-relevant code a carefully implemented use of
+ * GET_UTF8() may be preferred.
+ *
+ * @param codep   pointer used to return the parsed code in case of success.
+ *                The value in *codep is set even in case the range check fails.
+ * @param bufp    pointer to the address the first byte of the sequence
+ *                to decode, updated by the function to point to the
+ *                byte next after the decoded sequence
+ * @param buf_end pointer to the end of the buffer, points to the next
+ *                byte past the last in the buffer. This is used to
+ *                avoid buffer overreads (in case of an unfinished
+ *                UTF-8 sequence towards the end of the buffer).
+ * @param flags   a collection of AV_UTF8_FLAG_* flags
+ * @return >= 0 in case a sequence was successfully read, a negative
+ * value in case of invalid sequence
+ */
+av_warn_unused_result
+int av_utf8_decode(int32_t *codep, const uint8_t **bufp, const uint8_t *buf_end,
+                   unsigned int flags);
+
+/**
+ * Check if a name is in a list.
+ * @returns 0 if not found, or the 1 based index where it has been found in the
+ *            list.
+ */
+int av_match_list(const char *name, const char *list, char separator);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_AVSTRING_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avutil.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avutil.h
new file mode 100644
index 0000000..4d63315
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/avutil.h
@@ -0,0 +1,365 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AVUTIL_H
+#define AVUTIL_AVUTIL_H
+
+/**
+ * @file
+ * @ingroup lavu
+ * Convenience header that includes @ref lavu "libavutil"'s core.
+ */
+
+/**
+ * @mainpage
+ *
+ * @section ffmpeg_intro Introduction
+ *
+ * This document describes the usage of the different libraries
+ * provided by FFmpeg.
+ *
+ * @li @ref libavc "libavcodec" encoding/decoding library
+ * @li @ref lavfi "libavfilter" graph-based frame editing library
+ * @li @ref libavf "libavformat" I/O and muxing/demuxing library
+ * @li @ref lavd "libavdevice" special devices muxing/demuxing library
+ * @li @ref lavu "libavutil" common utility library
+ * @li @ref lswr "libswresample" audio resampling, format conversion and mixing
+ * @li @ref lpp  "libpostproc" post processing library
+ * @li @ref libsws "libswscale" color conversion and scaling library
+ *
+ * @section ffmpeg_versioning Versioning and compatibility
+ *
+ * Each of the FFmpeg libraries contains a version.h header, which defines a
+ * major, minor and micro version number with the
+ * <em>LIBRARYNAME_VERSION_{MAJOR,MINOR,MICRO}</em> macros. The major version
+ * number is incremented with backward incompatible changes - e.g. removing
+ * parts of the public API, reordering public struct members, etc. The minor
+ * version number is incremented for backward compatible API changes or major
+ * new features - e.g. adding a new public function or a new decoder. The micro
+ * version number is incremented for smaller changes that a calling program
+ * might still want to check for - e.g. changing behavior in a previously
+ * unspecified situation.
+ *
+ * FFmpeg guarantees backward API and ABI compatibility for each library as long
+ * as its major version number is unchanged. This means that no public symbols
+ * will be removed or renamed. Types and names of the public struct members and
+ * values of public macros and enums will remain the same (unless they were
+ * explicitly declared as not part of the public API). Documented behavior will
+ * not change.
+ *
+ * In other words, any correct program that works with a given FFmpeg snapshot
+ * should work just as well without any changes with any later snapshot with the
+ * same major versions. This applies to both rebuilding the program against new
+ * FFmpeg versions or to replacing the dynamic FFmpeg libraries that a program
+ * links against.
+ *
+ * However, new public symbols may be added and new members may be appended to
+ * public structs whose size is not part of public ABI (most public structs in
+ * FFmpeg). New macros and enum values may be added. Behavior in undocumented
+ * situations may change slightly (and be documented). All those are accompanied
+ * by an entry in doc/APIchanges and incrementing either the minor or micro
+ * version number.
+ */
+
+/**
+ * @defgroup lavu libavutil
+ * Common code shared across all FFmpeg libraries.
+ *
+ * @note
+ * libavutil is designed to be modular. In most cases, in order to use the
+ * functions provided by one component of libavutil you must explicitly include
+ * the specific header containing that feature. If you are only using
+ * media-related components, you could simply include libavutil/avutil.h, which
+ * brings in most of the "core" components.
+ *
+ * @{
+ *
+ * @defgroup lavu_crypto Crypto and Hashing
+ *
+ * @{
+ * @}
+ *
+ * @defgroup lavu_math Mathematics
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_string String Manipulation
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_mem Memory Management
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_data Data Structures
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_video Video related
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_audio Audio related
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_error Error Codes
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_log Logging Facility
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup lavu_misc Other
+ *
+ * @{
+ *
+ * @defgroup preproc_misc Preprocessor String Macros
+ *
+ * @{
+ *
+ * @}
+ *
+ * @defgroup version_utils Library Version Macros
+ *
+ * @{
+ *
+ * @}
+ */
+
+
+/**
+ * @addtogroup lavu_ver
+ * @{
+ */
+
+/**
+ * Return the LIBAVUTIL_VERSION_INT constant.
+ */
+unsigned avutil_version(void);
+
+/**
+ * Return an informative version string. This usually is the actual release
+ * version number or a git commit description. This string has no fixed format
+ * and can change any time. It should never be parsed by code.
+ */
+const char *av_version_info(void);
+
+/**
+ * Return the libavutil build-time configuration.
+ */
+const char *avutil_configuration(void);
+
+/**
+ * Return the libavutil license.
+ */
+const char *avutil_license(void);
+
+/**
+ * @}
+ */
+
+/**
+ * @addtogroup lavu_media Media Type
+ * @brief Media Type
+ */
+
+enum AVMediaType {
+    AVMEDIA_TYPE_UNKNOWN    = -1, ///< Usually treated as AVMEDIA_TYPE_DATA
+    AVMEDIA_TYPE_VIDEO      =  0,
+    AVMEDIA_TYPE_AUDIO      =  1,
+    AVMEDIA_TYPE_DATA       =  2, ///< Opaque data information usually continuous
+    AVMEDIA_TYPE_SUBTITLE   =  3,
+    AVMEDIA_TYPE_ATTACHMENT =  4, ///< Opaque data information usually sparse
+    AVMEDIA_TYPE_NB         =  5  ///< Count of the types above (UNKNOWN excluded), not a type itself
+};
+
+/**
+ * Return a string describing the media_type enum, NULL if media_type
+ * is unknown.
+ */
+const char *av_get_media_type_string(enum AVMediaType media_type);
+
+/**
+ * @defgroup lavu_const Constants
+ * @{
+ *
+ * @defgroup lavu_enc Encoding specific
+ *
+ * @note these definitions should move to avcodec
+ * @{
+ */
+
+#define FF_LAMBDA_SHIFT 7
+#define FF_LAMBDA_SCALE (1<<FF_LAMBDA_SHIFT)
+#define FF_QP2LAMBDA 118 ///< factor to convert from H.263 QP to lambda
+#define FF_LAMBDA_MAX (256*128-1)
+
+#define FF_QUALITY_SCALE FF_LAMBDA_SCALE //FIXME maybe remove
+
+/**
+ * @}
+ * @defgroup lavu_time Timestamp specific
+ *
+ * FFmpeg internal timebase and timestamp definitions
+ *
+ * @{
+ */
+
+/**
+ * @brief Undefined timestamp value
+ *
+ * Usually reported by demuxers that work on containers that do not provide
+ * either pts or dts.
+ */
+
+#define AV_NOPTS_VALUE          ((int64_t)UINT64_C(0x8000000000000000))
+
+/**
+ * Internal time base represented as integer
+ */
+
+#define AV_TIME_BASE            1000000
+
+/**
+ * Internal time base represented as fractional value
+ */
+
+#define AV_TIME_BASE_Q          (AVRational){1, AV_TIME_BASE}
+
+/**
+ * @}
+ * @}
+ * @defgroup lavu_picture Image related
+ *
+ * AVPicture types, pixel formats and basic image planes manipulation.
+ *
+ * @{
+ */
+
+enum AVPictureType {
+    AV_PICTURE_TYPE_NONE = 0, ///< Undefined
+    AV_PICTURE_TYPE_I    = 1, ///< Intra
+    AV_PICTURE_TYPE_P    = 2, ///< Predicted
+    AV_PICTURE_TYPE_B    = 3, ///< Bi-dir predicted
+    AV_PICTURE_TYPE_S    = 4, ///< S(GMC)-VOP MPEG-4
+    AV_PICTURE_TYPE_SI   = 5, ///< Switching Intra
+    AV_PICTURE_TYPE_SP   = 6, ///< Switching Predicted
+    AV_PICTURE_TYPE_BI   = 7, ///< BI type
+};
+
+/**
+ * Return a single letter to describe the given picture type
+ * pict_type.
+ *
+ * @param[in] pict_type the picture type
+ * @return a single character representing the picture type, '?' if pict_type is unknown
+ */
+char av_get_picture_type_char(enum AVPictureType pict_type);
+
+/**
+ * @}
+ */
+
+#include "common.h"
+#include "error.h"
+#include "rational.h"
+#include "version.h"
+#include "macros.h"
+#include "mathematics.h"
+#include "log.h"
+#include "pixfmt.h"
+
+/**
+ * Return p, or the default pointer x in case p is NULL.
+ */
+static inline void *av_x_if_null(const void *p, const void *x)
+{
+    return (void *)(intptr_t)(!p ? x : p);
+}
+
+/**
+ * Compute the length of an integer list.
+ *
+ * @param elsize  size in bytes of each list element (only 1, 2, 4 or 8)
+ * @param term    list terminator (usually 0 or -1)
+ * @param list    pointer to the list
+ * @return  length of the list, in elements, not counting the terminator
+ */
+unsigned av_int_list_length_for_size(unsigned elsize,
+                                     const void *list, uint64_t term) av_pure;
+
+/**
+ * Compute the length of an integer list.
+ *
+ * @param term  list terminator (usually 0 or -1)
+ * @param list  pointer to the list
+ * @return  length of the list, in elements, not counting the terminator
+ */
+#define av_int_list_length(list, term) \
+    av_int_list_length_for_size(sizeof(*(list)), list, term)
+
+/**
+ * Open a file using a UTF-8 filename.
+ * The API of this function matches POSIX fopen(), errors are returned through
+ * errno.
+ */
+FILE *av_fopen_utf8(const char *path, const char *mode);
+
+/**
+ * Return the fractional representation of the internal time base.
+ */
+AVRational av_get_time_base_q(void);
+
+#define AV_FOURCC_MAX_STRING_SIZE 32
+
+#define av_fourcc2str(fourcc) av_fourcc_make_string((char[AV_FOURCC_MAX_STRING_SIZE]){0}, fourcc)
+
+/**
+ * Fill the provided buffer with a string containing a FourCC (four-character
+ * code) representation.
+ *
+ * @param buf    a buffer with size in bytes of at least AV_FOURCC_MAX_STRING_SIZE
+ * @param fourcc the fourcc to represent
+ * @return the buffer in input
+ */
+char *av_fourcc_make_string(char *buf, uint32_t fourcc);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_AVUTIL_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/base64.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/base64.h
new file mode 100644
index 0000000..2954c12
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/base64.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2006 Ryan Martell. (rdm4@martellventures.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_BASE64_H
+#define AVUTIL_BASE64_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_base64 Base64
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+/**
+ * Decode a base64-encoded string.
+ *
+ * @param out      buffer for decoded data
+ * @param in       null-terminated input string
+ * @param out_size size in bytes of the out buffer, must be at
+ *                 least 3/4 of the length of in, that is AV_BASE64_DECODE_SIZE(strlen(in))
+ * @return         number of bytes written, or a negative value in case of
+ *                 invalid input
+ */
+int av_base64_decode(uint8_t *out, const char *in, int out_size);
+
+/**
+ * Calculate the output size in bytes needed to decode a base64 string
+ * with length x to a data buffer.
+ */
+#define AV_BASE64_DECODE_SIZE(x) ((x) * 3LL / 4)
+
+/**
+ * Encode data to base64 and null-terminate.
+ *
+ * @param out      buffer for encoded data
+ * @param out_size size in bytes of the out buffer (including the
+ *                 null terminator), must be at least AV_BASE64_SIZE(in_size)
+ * @param in       input buffer containing the data to encode
+ * @param in_size  size in bytes of the in buffer
+ * @return         out or NULL in case of error
+ */
+char *av_base64_encode(char *out, int out_size, const uint8_t *in, int in_size);
+
+/**
+ * Calculate the output size needed to base64-encode x bytes to a
+ * null-terminated string.
+ */
+#define AV_BASE64_SIZE(x)  (((x)+2) / 3 * 4 + 1)
+
+ /**
+  * @}
+  */
+
+#endif /* AVUTIL_BASE64_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/blowfish.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/blowfish.h
new file mode 100644
index 0000000..9e289a4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/blowfish.h
@@ -0,0 +1,82 @@
+/*
+ * Blowfish algorithm
+ * Copyright (c) 2012 Samuel Pitoiset
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_BLOWFISH_H
+#define AVUTIL_BLOWFISH_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_blowfish Blowfish
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+#define AV_BF_ROUNDS 16
+
+typedef struct AVBlowfish {
+    uint32_t p[AV_BF_ROUNDS + 2]; /* AV_BF_ROUNDS + 2 32-bit words; presumably the Blowfish P-array subkeys */
+    uint32_t s[4][256];           /* four tables of 256 32-bit words; presumably the Blowfish S-boxes */
+} AVBlowfish;
+
+/**
+ * Allocate an AVBlowfish context.
+ */
+AVBlowfish *av_blowfish_alloc(void);
+
+/**
+ * Initialize an AVBlowfish context.
+ *
+ * @param ctx an AVBlowfish context
+ * @param key a key
+ * @param key_len length of the key
+ */
+void av_blowfish_init(struct AVBlowfish *ctx, const uint8_t *key, int key_len);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context.
+ *
+ * @param ctx an AVBlowfish context
+ * @param xl left four-byte half of the block to be encrypted or decrypted
+ * @param xr right four-byte half of the block to be encrypted or decrypted
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_blowfish_crypt_ecb(struct AVBlowfish *ctx, uint32_t *xl, uint32_t *xr,
+                           int decrypt);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context.
+ *
+ * @param ctx an AVBlowfish context
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param count number of 8 byte blocks
+ * @param iv initialization vector for CBC mode, if NULL ECB will be used
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_blowfish_crypt(struct AVBlowfish *ctx, uint8_t *dst, const uint8_t *src,
+                       int count, uint8_t *iv, int decrypt);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_BLOWFISH_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bprint.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bprint.h
new file mode 100644
index 0000000..c09b1ac
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bprint.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2012 Nicolas George
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_BPRINT_H
+#define AVUTIL_BPRINT_H
+
+#include <stdarg.h>
+
+#include "attributes.h"
+#include "avstring.h"
+
+/**
+ * Define a structure with extra padding to a fixed size.
+ * This helps ensure binary compatibility with future versions.
+ * size is parenthesized so that any constant expression can be passed for it.
+ */
+#define FF_PAD_STRUCTURE(name, size, ...) \
+struct ff_pad_helper_##name { __VA_ARGS__ }; \
+typedef struct name { \
+    __VA_ARGS__ \
+    char reserved_padding[(size) - sizeof(struct ff_pad_helper_##name)]; \
+} name;
+
+/**
+ * Buffer to print data progressively
+ *
+ * The string buffer grows as necessary and is always 0-terminated.
+ * The content of the string is never accessed, and thus is
+ * encoding-agnostic and can even hold binary data.
+ *
+ * Small buffers are kept in the structure itself, and thus require no
+ * memory allocation at all (unless the contents of the buffer are needed
+ * after the structure goes out of scope). This is almost as lightweight as
+ * declaring a local "char buf[512]".
+ *
+ * The length of the string can go beyond the allocated size: the buffer is
+ * then truncated, but the functions still keep account of the actual total
+ * length.
+ *
+ * In other words, buf->len can be greater than buf->size and records the
+ * total length of what would have been written to the buffer if there had
+ * been enough memory.
+ *
+ * Append operations do not need to be tested for failure: if a memory
+ * allocation fails, data stop being appended to the buffer, but the length
+ * is still updated. This situation can be tested with
+ * av_bprint_is_complete().
+ *
+ * The size_max field determines several possible behaviours:
+ *
+ * size_max = -1 (= UINT_MAX) or any large value will let the buffer be
+ * reallocated as necessary, with an amortized linear cost.
+ *
+ * size_max = 0 prevents writing anything to the buffer: only the total
+ * length is computed. The write operations can then possibly be repeated in
+ * a buffer with exactly the necessary size
+ * (using size_init = size_max = len + 1).
+ *
+ * size_max = 1 is automatically replaced by the exact size available in the
+ * structure itself, thus ensuring no dynamic memory allocation. The
+ * internal buffer is large enough to hold a reasonable paragraph of text,
+ * such as the current paragraph.
+ */
+
+FF_PAD_STRUCTURE(AVBPrint, 1024,
+    char *str;         /**< string so far */
+    unsigned len;      /**< length so far */
+    unsigned size;     /**< allocated memory */
+    unsigned size_max; /**< maximum allocated memory */
+    char reserved_internal_buffer[1];
+)
+
+/**
+ * Convenience macros for special values for av_bprint_init() size_max
+ * parameter.
+ */
+#define AV_BPRINT_SIZE_UNLIMITED  ((unsigned)-1)
+#define AV_BPRINT_SIZE_AUTOMATIC  1
+#define AV_BPRINT_SIZE_COUNT_ONLY 0
+
+/**
+ * Init a print buffer.
+ *
+ * @param buf        buffer to init
+ * @param size_init  initial size (including the final 0)
+ * @param size_max   maximum size;
+ *                   0 means do not write anything, just count the length;
+ *                   1 is replaced by the maximum value for automatic storage;
+ *                   any large value means that the internal buffer will be
+ *                   reallocated as needed up to that limit; -1 is converted to
+ *                   UINT_MAX, the largest limit possible.
+ *                   Check also AV_BPRINT_SIZE_* macros.
+ */
+void av_bprint_init(AVBPrint *buf, unsigned size_init, unsigned size_max);
+
+/**
+ * Init a print buffer using a pre-existing buffer.
+ *
+ * The buffer will not be reallocated.
+ *
+ * @param buf     buffer structure to init
+ * @param buffer  byte buffer to use for the string data
+ * @param size    size of buffer
+ */
+void av_bprint_init_for_buffer(AVBPrint *buf, char *buffer, unsigned size);
+
+/**
+ * Append a formatted string to a print buffer.
+ */
+void av_bprintf(AVBPrint *buf, const char *fmt, ...) av_printf_format(2, 3);
+
+/**
+ * Append a formatted string to a print buffer.
+ */
+void av_vbprintf(AVBPrint *buf, const char *fmt, va_list vl_arg);
+
+/**
+ * Append char c n times to a print buffer.
+ */
+void av_bprint_chars(AVBPrint *buf, char c, unsigned n);
+
+/**
+ * Append data to a print buffer.
+ *
+ * @param buf  bprint buffer to use
+ * @param data pointer to data
+ * @param size size of data
+ */
+void av_bprint_append_data(AVBPrint *buf, const char *data, unsigned size);
+
+struct tm;
+/**
+ * Append a formatted date and time to a print buffer.
+ *
+ * @param buf  bprint buffer to use
+ * @param fmt  date and time format string, see strftime()
+ * @param tm   broken-down time structure to translate
+ *
+ * @note due to poor design of the standard strftime function, it may
+ * produce poor results if the format string expands to a very long text and
+ * the bprint buffer is near the limit stated by the size_max option.
+ */
+void av_bprint_strftime(AVBPrint *buf, const char *fmt, const struct tm *tm);
+
+/**
+ * Allocate bytes in the buffer for external use.
+ *
+ * @param[in]  buf          buffer structure
+ * @param[in]  size         required size
+ * @param[out] mem          pointer to the memory area
+ * @param[out] actual_size  size of the memory area after allocation;
+ *                          can be larger or smaller than size
+ */
+void av_bprint_get_buffer(AVBPrint *buf, unsigned size,
+                          unsigned char **mem, unsigned *actual_size);
+
+/**
+ * Reset the string to "" but keep internal allocated data.
+ */
+void av_bprint_clear(AVBPrint *buf);
+
+/**
+ * Tell whether the print buffer holds the whole string (not truncated).
+ * Truncation can come from a memory allocation failure or from the
+ * size_max limit (compare size and size_max if necessary).
+ * @param buf print buffer to inspect
+ * @return non-zero if buf->len is below buf->size, 0 otherwise
+ */
+static inline int av_bprint_is_complete(const AVBPrint *buf)
+{
+    return buf->size > buf->len;
+}
+
+/**
+ * Finalize a print buffer.
+ *
+ * The print buffer can no longer be used afterwards,
+ * but the len and size fields are still valid.
+ *
+ * @param[out] ret_str  if not NULL, used to return a permanent copy of the
+ *                      buffer contents, or NULL if memory allocation fails;
+ *                      if NULL, the buffer is discarded and freed
+ * @return  0 for success or error code (probably AVERROR(ENOMEM))
+ */
+int av_bprint_finalize(AVBPrint *buf, char **ret_str);
+
+/**
+ * Escape the content in src and append it to dstbuf.
+ *
+ * @param dstbuf        already inited destination bprint buffer
+ * @param src           string containing the text to escape
+ * @param special_chars string containing the special characters which
+ *                      need to be escaped, can be NULL
+ * @param mode          escape mode to employ, see AV_ESCAPE_MODE_* macros.
+ *                      Any unknown value for mode will be considered equivalent to
+ *                      AV_ESCAPE_MODE_BACKSLASH, but this behaviour can change without
+ *                      notice.
+ * @param flags         flags which control how to escape, see AV_ESCAPE_FLAG_* macros
+ */
+void av_bprint_escape(AVBPrint *dstbuf, const char *src, const char *special_chars,
+                      enum AVEscapeMode mode, int flags);
+
+#endif /* AVUTIL_BPRINT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bswap.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bswap.h
new file mode 100644
index 0000000..91cb795
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/bswap.h
@@ -0,0 +1,109 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * byte swapping routines
+ */
+
+#ifndef AVUTIL_BSWAP_H
+#define AVUTIL_BSWAP_H
+
+#include <stdint.h>
+#include "libavutil/avconfig.h"
+#include "attributes.h"
+
+#ifdef HAVE_AV_CONFIG_H
+
+#include "config.h"
+
+#if   ARCH_AARCH64
+#   include "aarch64/bswap.h"
+#elif ARCH_ARM
+#   include "arm/bswap.h"
+#elif ARCH_AVR32
+#   include "avr32/bswap.h"
+#elif ARCH_SH4
+#   include "sh4/bswap.h"
+#elif ARCH_X86
+#   include "x86/bswap.h"
+#endif
+
+#endif /* HAVE_AV_CONFIG_H */
+
+#define AV_BSWAP16C(x) (((x) << 8 & 0xff00)  | ((x) >> 8 & 0x00ff)) /* swap the two low bytes; x is expanded twice */
+#define AV_BSWAP32C(x) (AV_BSWAP16C(x) << 16 | AV_BSWAP16C((x) >> 16)) /* two 16-bit swaps, halves exchanged */
+#define AV_BSWAP64C(x) (AV_BSWAP32C(x) << 32 | AV_BSWAP32C((x) >> 32)) /* two 32-bit swaps, halves exchanged */
+
+#define AV_BSWAPC(s, x) AV_BSWAP##s##C(x) /* token-pastes the width s to select AV_BSWAP<s>C */
+
+#ifndef av_bswap16
+static av_always_inline av_const uint16_t av_bswap16(uint16_t x)
+{
+    /* Exchange the two bytes; converting to uint16_t discards bits above 15. */
+    return (x << 8) | (x >> 8);
+}
+#endif
+
+#ifndef av_bswap32
+static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
+{
+    return (x << 24) | (x << 8 & 0x00ff0000) | (x >> 8 & 0x0000ff00) | (x >> 24);
+}
+#endif
+
+#ifndef av_bswap64
+static inline uint64_t av_const av_bswap64(uint64_t x)
+{
+    return av_bswap32((uint32_t)(x >> 32)) | ((uint64_t)av_bswap32((uint32_t)x) << 32);
+}
+#endif
+
+// be2ne ... big-endian to native-endian
+// le2ne ... little-endian to native-endian
+
+#if AV_HAVE_BIGENDIAN
+#define av_be2ne16(x) (x)
+#define av_be2ne32(x) (x)
+#define av_be2ne64(x) (x)
+#define av_le2ne16(x) av_bswap16(x)
+#define av_le2ne32(x) av_bswap32(x)
+#define av_le2ne64(x) av_bswap64(x)
+#define AV_BE2NEC(s, x) (x)
+#define AV_LE2NEC(s, x) AV_BSWAPC(s, x)
+#else
+#define av_be2ne16(x) av_bswap16(x)
+#define av_be2ne32(x) av_bswap32(x)
+#define av_be2ne64(x) av_bswap64(x)
+#define av_le2ne16(x) (x)
+#define av_le2ne32(x) (x)
+#define av_le2ne64(x) (x)
+#define AV_BE2NEC(s, x) AV_BSWAPC(s, x)
+#define AV_LE2NEC(s, x) (x)
+#endif
+
+#define AV_BE2NE16C(x) AV_BE2NEC(16, x)
+#define AV_BE2NE32C(x) AV_BE2NEC(32, x)
+#define AV_BE2NE64C(x) AV_BE2NEC(64, x)
+#define AV_LE2NE16C(x) AV_LE2NEC(16, x)
+#define AV_LE2NE32C(x) AV_LE2NEC(32, x)
+#define AV_LE2NE64C(x) AV_LE2NEC(64, x)
+
+#endif /* AVUTIL_BSWAP_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/buffer.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/buffer.h
new file mode 100644
index 0000000..73b6bd0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/buffer.h
@@ -0,0 +1,291 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_buffer
+ * refcounted data buffer API
+ */
+
+#ifndef AVUTIL_BUFFER_H
+#define AVUTIL_BUFFER_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_buffer AVBuffer
+ * @ingroup lavu_data
+ *
+ * @{
+ * AVBuffer is an API for reference-counted data buffers.
+ *
+ * There are two core objects in this API -- AVBuffer and AVBufferRef. AVBuffer
+ * represents the data buffer itself; it is opaque and not meant to be accessed
+ * by the caller directly, but only through AVBufferRef. However, the caller may
+ * e.g. compare two AVBuffer pointers to check whether two different references
+ * are describing the same data buffer. AVBufferRef represents a single
+ * reference to an AVBuffer and it is the object that may be manipulated by the
+ * caller directly.
+ *
+ * There are two functions provided for creating a new AVBuffer with a single
+ * reference -- av_buffer_alloc() to just allocate a new buffer, and
+ * av_buffer_create() to wrap an existing array in an AVBuffer. From an existing
+ * reference, additional references may be created with av_buffer_ref().
+ * Use av_buffer_unref() to free a reference (this will automatically free the
+ * data once all the references are freed).
+ *
+ * The convention throughout this API and the rest of FFmpeg is such that the
+ * buffer is considered writable if there exists only one reference to it (and
+ * it has not been marked as read-only). The av_buffer_is_writable() function is
+ * provided to check whether this is true and av_buffer_make_writable() will
+ * automatically create a new writable buffer when necessary.
+ * Of course nothing prevents the calling code from violating this convention,
+ * however that is safe only when all the existing references are under its
+ * control.
+ *
+ * @note Referencing and unreferencing the buffers is thread-safe and thus
+ * may be done from multiple threads simultaneously without any need for
+ * additional locking.
+ *
+ * @note Two different references to the same buffer can point to different
+ * parts of the buffer (i.e. their AVBufferRef.data will not be equal).
+ */
+
+/**
+ * A reference counted buffer type. It is opaque and is meant to be used through
+ * references (AVBufferRef).
+ */
+typedef struct AVBuffer AVBuffer;
+
+/**
+ * A reference to a data buffer.
+ *
+ * The size of this struct is not a part of the public ABI and it is not meant
+ * to be allocated directly.
+ */
+typedef struct AVBufferRef {
+    AVBuffer *buffer; ///< the opaque AVBuffer this reference refers to
+
+    /**
+     * The data buffer. It is considered writable if and only if
+     * this is the only reference to the buffer, in which case
+     * av_buffer_is_writable() returns 1.
+     */
+    uint8_t *data;
+    /**
+     * Size of data in bytes.
+     */
+    int      size;
+} AVBufferRef;
+
+/**
+ * Allocate an AVBuffer of the given size using av_malloc().
+ *
+ * @return an AVBufferRef of given size or NULL when out of memory
+ */
+AVBufferRef *av_buffer_alloc(int size);
+
+/**
+ * Same as av_buffer_alloc(), except the returned buffer will be initialized
+ * to zero.
+ */
+AVBufferRef *av_buffer_allocz(int size);
+
+/**
+ * Always treat the buffer as read-only, even when it has only one
+ * reference.
+ */
+#define AV_BUFFER_FLAG_READONLY (1 << 0)
+
+/**
+ * Create an AVBuffer from an existing array.
+ *
+ * If this function is successful, data is owned by the AVBuffer. The caller may
+ * only access data through the returned AVBufferRef and references derived from
+ * it.
+ * If this function fails, data is left untouched.
+ * @param data   data array
+ * @param size   size of data in bytes
+ * @param free   a callback for freeing this buffer's data
+ * @param opaque parameter retrievable with av_buffer_get_opaque() and passed to free
+ * @param flags  a combination of AV_BUFFER_FLAG_*
+ *
+ * @return an AVBufferRef referring to data on success, NULL on failure.
+ */
+AVBufferRef *av_buffer_create(uint8_t *data, int size,
+                              void (*free)(void *opaque, uint8_t *data),
+                              void *opaque, int flags);
+
+/**
+ * Default free callback, which calls av_free() on the buffer data.
+ * This function is meant to be passed to av_buffer_create(), not called
+ * directly.
+ */
+void av_buffer_default_free(void *opaque, uint8_t *data);
+
+/**
+ * Create a new reference to an AVBuffer.
+ *
+ * @return a new AVBufferRef referring to the same AVBuffer as buf or NULL on
+ * failure.
+ */
+AVBufferRef *av_buffer_ref(AVBufferRef *buf);
+
+/**
+ * Free a given reference and automatically free the buffer if there are no more
+ * references to it.
+ *
+ * @param buf the reference to be freed. The pointer is set to NULL on return.
+ */
+void av_buffer_unref(AVBufferRef **buf);
+
+/**
+ * @return 1 if the caller may write to the data referred to by buf (which is
+ * true if and only if buf is the only reference to the underlying AVBuffer).
+ * Return 0 otherwise.
+ * A positive answer is valid until av_buffer_ref() is called on buf.
+ */
+int av_buffer_is_writable(const AVBufferRef *buf);
+
+/**
+ * @return the opaque parameter set by av_buffer_create.
+ */
+void *av_buffer_get_opaque(const AVBufferRef *buf);
+
+int av_buffer_get_ref_count(const AVBufferRef *buf);
+
+/**
+ * Create a writable reference from a given buffer reference, avoiding data copy
+ * if possible.
+ *
+ * @param buf buffer reference to make writable. On success, buf is either left
+ *            untouched, or it is unreferenced and a new writable AVBufferRef is
+ *            written in its place. On failure, buf is left untouched.
+ * @return 0 on success, a negative AVERROR on failure.
+ */
+int av_buffer_make_writable(AVBufferRef **buf);
+
+/**
+ * Reallocate a given buffer.
+ *
+ * @param buf  a buffer reference to reallocate. On success, buf will be
+ *             unreferenced and a new reference with the required size will be
+ *             written in its place. On failure buf will be left untouched. *buf
+ *             may be NULL, then a new buffer is allocated.
+ * @param size required new buffer size.
+ * @return 0 on success, a negative AVERROR on failure.
+ *
+ * @note the buffer is actually reallocated with av_realloc() only if it was
+ * initially allocated through av_buffer_realloc(NULL) and there is only one
+ * reference to it (i.e. the one passed to this function). In all other cases
+ * a new buffer is allocated and the data is copied.
+ */
+int av_buffer_realloc(AVBufferRef **buf, int size);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_bufferpool AVBufferPool
+ * @ingroup lavu_data
+ *
+ * @{
+ * AVBufferPool is an API for a lock-free thread-safe pool of AVBuffers.
+ *
+ * Frequently allocating and freeing large buffers may be slow. AVBufferPool is
+ * meant to solve this in cases when the caller needs a set of buffers of the
+ * same size (the most obvious use case being buffers for raw video or audio
+ * frames).
+ *
+ * At the beginning, the user must call av_buffer_pool_init() to create the
+ * buffer pool. Then whenever a buffer is needed, call av_buffer_pool_get() to
+ * get a reference to a new buffer, similar to av_buffer_alloc(). This new
+ * reference works in all aspects the same way as the one created by
+ * av_buffer_alloc(). However, when the last reference to this buffer is
+ * unreferenced, it is returned to the pool instead of being freed and will be
+ * reused for subsequent av_buffer_pool_get() calls.
+ *
+ * When the caller is done with the pool and no longer needs to allocate any new
+ * buffers, av_buffer_pool_uninit() must be called to mark the pool as freeable.
+ * Once all the buffers are released, it will automatically be freed.
+ *
+ * Allocating and releasing buffers with this API is thread-safe as long as
+ * either the default alloc callback is used, or the user-supplied one is
+ * thread-safe.
+ */
+
+/**
+ * The buffer pool. This structure is opaque and not meant to be accessed
+ * directly. It is allocated with av_buffer_pool_init() and freed with
+ * av_buffer_pool_uninit().
+ */
+typedef struct AVBufferPool AVBufferPool;
+
+/**
+ * Allocate and initialize a buffer pool.
+ *
+ * @param size size of each buffer in this pool
+ * @param alloc a function that will be used to allocate new buffers when the
+ * pool is empty. May be NULL, then the default allocator will be used
+ * (av_buffer_alloc()).
+ * @return newly created buffer pool on success, NULL on error.
+ */
+AVBufferPool *av_buffer_pool_init(int size, AVBufferRef* (*alloc)(int size));
+
+/**
+ * Allocate and initialize a buffer pool with a more complex allocator.
+ *
+ * @param size size of each buffer in this pool
+ * @param opaque arbitrary user data used by the allocator
+ * @param alloc a function that will be used to allocate new buffers when the
+ *              pool is empty.
+ * @param pool_free a function that will be called immediately before the pool
+ *                  is freed. I.e. after av_buffer_pool_uninit() is called
+ *                  by the caller and all the frames are returned to the pool
+ *                  and freed. It is intended to uninitialize the user opaque
+ *                  data.
+ * @return newly created buffer pool on success, NULL on error.
+ */
+AVBufferPool *av_buffer_pool_init2(int size, void *opaque,
+                                   AVBufferRef* (*alloc)(void *opaque, int size),
+                                   void (*pool_free)(void *opaque));
+
+/**
+ * Mark the pool as being available for freeing. It will actually be freed only
+ * once all the allocated buffers associated with the pool are released. Thus it
+ * is safe to call this function while some of the allocated buffers are still
+ * in use.
+ *
+ * @param pool pointer to the pool to be freed. It will be set to NULL.
+ */
+void av_buffer_pool_uninit(AVBufferPool **pool);
+
+/**
+ * Allocate a new AVBuffer, reusing an old buffer from the pool when available.
+ * This function may be called simultaneously from multiple threads.
+ *
+ * @return a reference to the new buffer on success, NULL on error.
+ */
+AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_BUFFER_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/camellia.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/camellia.h
new file mode 100644
index 0000000..e674c9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/camellia.h
@@ -0,0 +1,70 @@
+/*
+ * An implementation of the CAMELLIA algorithm as mentioned in RFC3713
+ * Copyright (c) 2014 Supraja Meedinti
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_CAMELLIA_H
+#define AVUTIL_CAMELLIA_H
+
+#include <stdint.h>
+
+
+/**
+  * @file
+  * @brief Public header for libavutil CAMELLIA algorithm
+  * @defgroup lavu_camellia CAMELLIA
+  * @ingroup lavu_crypto
+  * @{
+  */
+
+extern const int av_camellia_size;
+
+struct AVCAMELLIA;
+
+/**
+  * Allocate an AVCAMELLIA context
+  * To free the struct: av_free(ptr)
+  */
+struct AVCAMELLIA *av_camellia_alloc(void);
+
+/**
+  * Initialize an AVCAMELLIA context.
+  *
+  * @param ctx an AVCAMELLIA context
+  * @param key a key of 16, 24, 32 bytes used for encryption/decryption
+  * @param key_bits number of keybits: possible are 128, 192, 256
+ */
+int av_camellia_init(struct AVCAMELLIA *ctx, const uint8_t *key, int key_bits);
+
+/**
+  * Encrypt or decrypt a buffer using a previously initialized context
+  *
+  * @param ctx an AVCAMELLIA context
+  * @param dst destination array, can be equal to src
+  * @param src source array, can be equal to dst
+  * @param count number of 16 byte blocks
+  * @param iv initialization vector for CBC mode, NULL for ECB mode
+  * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_camellia_crypt(struct AVCAMELLIA *ctx, uint8_t *dst, const uint8_t *src, int count, uint8_t* iv, int decrypt);
+
+/**
+ * @}
+ */
+#endif /* AVUTIL_CAMELLIA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cast5.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cast5.h
new file mode 100644
index 0000000..ad5b347
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cast5.h
@@ -0,0 +1,80 @@
+/*
+ * An implementation of the CAST128 algorithm as mentioned in RFC2144
+ * Copyright (c) 2014 Supraja Meedinti
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_CAST5_H
+#define AVUTIL_CAST5_H
+
+#include <stdint.h>
+
+
+/**
+  * @file
+  * @brief Public header for libavutil CAST5 algorithm
+  * @defgroup lavu_cast5 CAST5
+  * @ingroup lavu_crypto
+  * @{
+  */
+
+extern const int av_cast5_size;
+
+struct AVCAST5;
+
+/**
+  * Allocate an AVCAST5 context
+  * To free the struct: av_free(ptr)
+  */
+struct AVCAST5 *av_cast5_alloc(void);
+/**
+  * Initialize an AVCAST5 context.
+  *
+  * @param ctx an AVCAST5 context
+  * @param key a key of 5,6,...16 bytes used for encryption/decryption
+  * @param key_bits number of keybits: possible are 40,48,...,128
+  * @return 0 on success, less than 0 on failure
+ */
+int av_cast5_init(struct AVCAST5 *ctx, const uint8_t *key, int key_bits);
+
+/**
+  * Encrypt or decrypt a buffer using a previously initialized context, ECB mode only
+  *
+  * @param ctx an AVCAST5 context
+  * @param dst destination array, can be equal to src
+  * @param src source array, can be equal to dst
+  * @param count number of 8 byte blocks
+  * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_cast5_crypt(struct AVCAST5 *ctx, uint8_t *dst, const uint8_t *src, int count, int decrypt);
+
+/**
+  * Encrypt or decrypt a buffer using a previously initialized context
+  *
+  * @param ctx an AVCAST5 context
+  * @param dst destination array, can be equal to src
+  * @param src source array, can be equal to dst
+  * @param count number of 8 byte blocks
+  * @param iv initialization vector for CBC mode, NULL for ECB mode
+  * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_cast5_crypt2(struct AVCAST5 *ctx, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
+/**
+ * @}
+ */
+#endif /* AVUTIL_CAST5_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/channel_layout.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/channel_layout.h
new file mode 100644
index 0000000..50bb8f0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/channel_layout.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2008 Peter Ross
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_CHANNEL_LAYOUT_H
+#define AVUTIL_CHANNEL_LAYOUT_H
+
+#include <stdint.h>
+
+/**
+ * @file
+ * audio channel layout utility functions
+ */
+
+/**
+ * @addtogroup lavu_audio
+ * @{
+ */
+
+/**
+ * @defgroup channel_masks Audio channel masks
+ *
+ * A channel layout is a 64-bit integer with a bit set for every channel.
+ * The number of bits set must be equal to the number of channels.
+ * The value 0 means that the channel layout is not known.
+ * @note this data structure is not powerful enough to handle channels
+ * combinations that have the same channel multiple times, such as
+ * dual-mono.
+ *
+ * @{
+ */
+#define AV_CH_FRONT_LEFT             0x00000001
+#define AV_CH_FRONT_RIGHT            0x00000002
+#define AV_CH_FRONT_CENTER           0x00000004
+#define AV_CH_LOW_FREQUENCY          0x00000008
+#define AV_CH_BACK_LEFT              0x00000010
+#define AV_CH_BACK_RIGHT             0x00000020
+#define AV_CH_FRONT_LEFT_OF_CENTER   0x00000040
+#define AV_CH_FRONT_RIGHT_OF_CENTER  0x00000080
+#define AV_CH_BACK_CENTER            0x00000100
+#define AV_CH_SIDE_LEFT              0x00000200
+#define AV_CH_SIDE_RIGHT             0x00000400
+#define AV_CH_TOP_CENTER             0x00000800
+#define AV_CH_TOP_FRONT_LEFT         0x00001000
+#define AV_CH_TOP_FRONT_CENTER       0x00002000
+#define AV_CH_TOP_FRONT_RIGHT        0x00004000
+#define AV_CH_TOP_BACK_LEFT          0x00008000
+#define AV_CH_TOP_BACK_CENTER        0x00010000
+#define AV_CH_TOP_BACK_RIGHT         0x00020000
+#define AV_CH_STEREO_LEFT            0x20000000  ///< Stereo downmix.
+#define AV_CH_STEREO_RIGHT           0x40000000  ///< See AV_CH_STEREO_LEFT.
+#define AV_CH_WIDE_LEFT              0x0000000080000000ULL
+#define AV_CH_WIDE_RIGHT             0x0000000100000000ULL
+#define AV_CH_SURROUND_DIRECT_LEFT   0x0000000200000000ULL
+#define AV_CH_SURROUND_DIRECT_RIGHT  0x0000000400000000ULL
+#define AV_CH_LOW_FREQUENCY_2        0x0000000800000000ULL
+
+/** Channel mask value used for AVCodecContext.request_channel_layout
+    to indicate that the user requests the channel order of the decoder output
+    to be the native codec channel order. */
+#define AV_CH_LAYOUT_NATIVE          0x8000000000000000ULL
+
+/**
+ * @}
+ * @defgroup channel_mask_c Audio channel layouts
+ * @{
+ * */
+#define AV_CH_LAYOUT_MONO              (AV_CH_FRONT_CENTER)
+#define AV_CH_LAYOUT_STEREO            (AV_CH_FRONT_LEFT|AV_CH_FRONT_RIGHT)
+#define AV_CH_LAYOUT_2POINT1           (AV_CH_LAYOUT_STEREO|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_2_1               (AV_CH_LAYOUT_STEREO|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_SURROUND          (AV_CH_LAYOUT_STEREO|AV_CH_FRONT_CENTER)
+#define AV_CH_LAYOUT_3POINT1           (AV_CH_LAYOUT_SURROUND|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_4POINT0           (AV_CH_LAYOUT_SURROUND|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_4POINT1           (AV_CH_LAYOUT_4POINT0|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_2_2               (AV_CH_LAYOUT_STEREO|AV_CH_SIDE_LEFT|AV_CH_SIDE_RIGHT)
+#define AV_CH_LAYOUT_QUAD              (AV_CH_LAYOUT_STEREO|AV_CH_BACK_LEFT|AV_CH_BACK_RIGHT)
+#define AV_CH_LAYOUT_5POINT0           (AV_CH_LAYOUT_SURROUND|AV_CH_SIDE_LEFT|AV_CH_SIDE_RIGHT)
+#define AV_CH_LAYOUT_5POINT1           (AV_CH_LAYOUT_5POINT0|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_5POINT0_BACK      (AV_CH_LAYOUT_SURROUND|AV_CH_BACK_LEFT|AV_CH_BACK_RIGHT)
+#define AV_CH_LAYOUT_5POINT1_BACK      (AV_CH_LAYOUT_5POINT0_BACK|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_6POINT0           (AV_CH_LAYOUT_5POINT0|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_6POINT0_FRONT     (AV_CH_LAYOUT_2_2|AV_CH_FRONT_LEFT_OF_CENTER|AV_CH_FRONT_RIGHT_OF_CENTER)
+#define AV_CH_LAYOUT_HEXAGONAL         (AV_CH_LAYOUT_5POINT0_BACK|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_6POINT1           (AV_CH_LAYOUT_5POINT1|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_6POINT1_BACK      (AV_CH_LAYOUT_5POINT1_BACK|AV_CH_BACK_CENTER)
+#define AV_CH_LAYOUT_6POINT1_FRONT     (AV_CH_LAYOUT_6POINT0_FRONT|AV_CH_LOW_FREQUENCY)
+#define AV_CH_LAYOUT_7POINT0           (AV_CH_LAYOUT_5POINT0|AV_CH_BACK_LEFT|AV_CH_BACK_RIGHT)
+#define AV_CH_LAYOUT_7POINT0_FRONT     (AV_CH_LAYOUT_5POINT0|AV_CH_FRONT_LEFT_OF_CENTER|AV_CH_FRONT_RIGHT_OF_CENTER)
+#define AV_CH_LAYOUT_7POINT1           (AV_CH_LAYOUT_5POINT1|AV_CH_BACK_LEFT|AV_CH_BACK_RIGHT)
+#define AV_CH_LAYOUT_7POINT1_WIDE      (AV_CH_LAYOUT_5POINT1|AV_CH_FRONT_LEFT_OF_CENTER|AV_CH_FRONT_RIGHT_OF_CENTER)
+#define AV_CH_LAYOUT_7POINT1_WIDE_BACK (AV_CH_LAYOUT_5POINT1_BACK|AV_CH_FRONT_LEFT_OF_CENTER|AV_CH_FRONT_RIGHT_OF_CENTER)
+#define AV_CH_LAYOUT_OCTAGONAL         (AV_CH_LAYOUT_5POINT0|AV_CH_BACK_LEFT|AV_CH_BACK_CENTER|AV_CH_BACK_RIGHT)
+#define AV_CH_LAYOUT_HEXADECAGONAL     (AV_CH_LAYOUT_OCTAGONAL|AV_CH_WIDE_LEFT|AV_CH_WIDE_RIGHT|AV_CH_TOP_BACK_LEFT|AV_CH_TOP_BACK_RIGHT|AV_CH_TOP_BACK_CENTER|AV_CH_TOP_FRONT_CENTER|AV_CH_TOP_FRONT_LEFT|AV_CH_TOP_FRONT_RIGHT)
+#define AV_CH_LAYOUT_STEREO_DOWNMIX    (AV_CH_STEREO_LEFT|AV_CH_STEREO_RIGHT)
+
+enum AVMatrixEncoding { /* no explicit values: enumerators count up from 0 in order */
+    AV_MATRIX_ENCODING_NONE,
+    AV_MATRIX_ENCODING_DOLBY,
+    AV_MATRIX_ENCODING_DPLII,
+    AV_MATRIX_ENCODING_DPLIIX,
+    AV_MATRIX_ENCODING_DPLIIZ,
+    AV_MATRIX_ENCODING_DOLBYEX,
+    AV_MATRIX_ENCODING_DOLBYHEADPHONE,
+    AV_MATRIX_ENCODING_NB /* NOTE(review): presumably the entry count, not an encoding -- confirm */
+};
+
+/**
+ * Return a channel layout id that matches name, or 0 if no match is found.
+ *
+ * name can be one or several of the following notations,
+ * separated by '+' or '|':
+ * - the name of a usual channel layout (mono, stereo, 4.0, quad, 5.0,
+ *   5.0(side), 5.1, 5.1(side), 7.1, 7.1(wide), downmix);
+ * - the name of a single channel (FL, FR, FC, LFE, BL, BR, FLC, FRC, BC,
+ *   SL, SR, TC, TFL, TFC, TFR, TBL, TBC, TBR, DL, DR);
+ * - a number of channels, in decimal, followed by 'c', yielding
+ *   the default channel layout for that number of channels (@see
+ *   av_get_default_channel_layout);
+ * - a channel layout mask, in hexadecimal starting with "0x" (see the
+ *   AV_CH_* macros).
+ *
+ * Example: "stereo+FC" = "2c+FC" = "2c+1c" = "0x7"
+ */
+uint64_t av_get_channel_layout(const char *name);
+
+/**
+ * Return a channel layout and the number of channels based on the specified name.
+ *
+ * This function is similar to (@see av_get_channel_layout), but can also parse
+ * unknown channel layout specifications.
+ *
+ * @param[in]  name             channel layout specification string
+ * @param[out] channel_layout   parsed channel layout (0 if unknown)
+ * @param[out] nb_channels      number of channels
+ *
+ * @return 0 on success, AVERROR(EINVAL) if the parsing fails.
+ */
+int av_get_extended_channel_layout(const char *name, uint64_t* channel_layout, int* nb_channels);
+
+/**
+ * Return a description of a channel layout.
+ * If nb_channels is <= 0, it is guessed from the channel_layout.
+ *
+ * @param buf put here the string containing the channel layout
+ * @param buf_size size in bytes of the buffer
+ */
+void av_get_channel_layout_string(char *buf, int buf_size, int nb_channels, uint64_t channel_layout);
+
+struct AVBPrint;
+/**
+ * Append a description of a channel layout to a bprint buffer.
+ */
+void av_bprint_channel_layout(struct AVBPrint *bp, int nb_channels, uint64_t channel_layout);
+
+/**
+ * Return the number of channels in the channel layout.
+ */
+int av_get_channel_layout_nb_channels(uint64_t channel_layout);
+
+/**
+ * Return default channel layout for a given number of channels.
+ */
+int64_t av_get_default_channel_layout(int nb_channels);
+
+/**
+ * Get the index of a channel in channel_layout.
+ *
+ * @param channel a channel layout describing exactly one channel which must be
+ *                present in channel_layout.
+ *
+ * @return index of channel in channel_layout on success, a negative AVERROR
+ *         on error.
+ */
+int av_get_channel_layout_channel_index(uint64_t channel_layout,
+                                        uint64_t channel);
+
+/**
+ * Get the channel with the given index in channel_layout.
+ */
+uint64_t av_channel_layout_extract_channel(uint64_t channel_layout, int index);
+
+/**
+ * Get the name of a given channel.
+ *
+ * @return channel name on success, NULL on error.
+ */
+const char *av_get_channel_name(uint64_t channel);
+
+/**
+ * Get the description of a given channel.
+ *
+ * @param channel  a channel layout with a single channel
+ * @return  channel description on success, NULL on error
+ */
+const char *av_get_channel_description(uint64_t channel);
+
+/**
+ * Get the value and name of a standard channel layout.
+ *
+ * @param[in]  index   index in an internal list, starting at 0
+ * @param[out] layout  channel layout mask
+ * @param[out] name    name of the layout
+ * @return  0  if the layout exists,
+ *          <0 if index is beyond the limits
+ */
+int av_get_standard_channel_layout(unsigned index, uint64_t *layout,
+                                   const char **name);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_CHANNEL_LAYOUT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/common.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/common.h
new file mode 100644
index 0000000..58ead80
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/common.h
@@ -0,0 +1,530 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * common internal and external API header
+ */
+
+#ifndef AVUTIL_COMMON_H
+#define AVUTIL_COMMON_H
+
+#if defined(__cplusplus) && !defined(__STDC_CONSTANT_MACROS) && !defined(UINT64_C)
+#error missing -D__STDC_CONSTANT_MACROS / #define __STDC_CONSTANT_MACROS
+#endif
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "attributes.h"
+#include "macros.h"
+#include "version.h"
+#include "libavutil/avconfig.h"
+
+#if AV_HAVE_BIGENDIAN
+#   define AV_NE(be, le) (be)
+#else
+#   define AV_NE(be, le) (le)
+#endif
+
+// Rounded division and shift helpers. These macros expand their arguments more than once, so pass expressions without side effects.
+#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
+/* Divide a by b after adding (a > 0) or subtracting (a <= 0) b/2; assume b>0 */
+#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
+/* Fast a/(1<<b) rounded toward +inf. Assume a>=0 and b>=0 */
+#define AV_CEIL_RSHIFT(a,b) (!av_builtin_constant_p(b) ? -((-(a)) >> (b)) \
+                                                       : ((a) + (1<<(b)) - 1) >> (b))
+/* Backwards-compatible alias for AV_CEIL_RSHIFT. */
+#define FF_CEIL_RSHIFT AV_CEIL_RSHIFT
+/* With b > 0, FFUDIV rounds the quotient toward -inf and FFUMOD yields the matching non-negative remainder. */
+#define FFUDIV(a,b) (((a)>0 ?(a):(a)-(b)+1) / (b))
+#define FFUMOD(a,b) ((a)-(b)*FFUDIV(a,b))
+
+/**
+ * Absolute value. Note that INT_MIN / INT64_MIN result in undefined behavior, as
+ * their absolute values are not representable in their type. This is the same
+ * as with the *abs() functions.
+ * @see FFNABS()
+ */
+#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
+#define FFSIGN(a) ((a) > 0 ? 1 : -1)
+
+/**
+ * Negative Absolute value.
+ * this works for all integers of all types.
+ * As with many macros, this evaluates its argument twice; it thus must not have
+ * a side effect, i.e. FFNABS(x++) has undefined behavior.
+ */
+#define FFNABS(a) ((a) <= 0 ? (a) : (-(a)))
+
+/**
+ * Comparator.
+ * For two numerical expressions x and y, gives 1 if x > y, -1 if x < y, and 0
+ * if x == y. This is useful for instance in a qsort comparator callback.
+ * Furthermore, compilers are able to optimize this to branchless code, and
+ * there is no risk of overflow with signed types.
+ * As with many macros, this evaluates its argument multiple times, it thus
+ * must not have a side-effect.
+ */
+#define FFDIFFSIGN(x,y) (((x)>(y)) - ((x)<(y)))
+
+#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
+#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
+#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
+#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
+
+#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
+
+/* misc math functions */
+
+#ifdef HAVE_AV_CONFIG_H
+#   include "config.h"
+#   include "intmath.h"
+#endif
+
+/* Pull in unguarded fallback defines at the end of this file. */
+#include "common.h"
+
+#ifndef av_log2
+av_const int av_log2(unsigned v);
+#endif
+
+#ifndef av_log2_16bit
+av_const int av_log2_16bit(unsigned v);
+#endif
+
+/**
+ * Clamp a signed integer to the amin-amax range.
+ * @param a    value to clamp
+ * @param amin lower bound of the range
+ * @param amax upper bound of the range
+ * @return amin if a < amin, otherwise amax if a > amax, otherwise a
+ */
+static av_always_inline av_const int av_clip_c(int a, int amin, int amax)
+{
+#if defined(HAVE_AV_CONFIG_H) && defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+    if (amin > amax) abort();
+#endif
+    if (a < amin)
+        return amin;
+    return a > amax ? amax : a;
+}
+
+/**
+ * Clamp a signed 64-bit integer to the amin-amax range.
+ * @param a    value to clamp
+ * @param amin lower bound of the range
+ * @param amax upper bound of the range
+ * @return amin if a < amin, otherwise amax if a > amax, otherwise a
+ */
+static av_always_inline av_const int64_t av_clip64_c(int64_t a, int64_t amin, int64_t amax)
+{
+#if defined(HAVE_AV_CONFIG_H) && defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+    if (amin > amax) abort();
+#endif
+    return a < amin ? amin
+         : a > amax ? amax
+         :            a;
+}
+
+/**
+ * Clamp a signed integer to the unsigned 8-bit range 0-255.
+ * @param a value to clamp
+ * @return a if it already fits, else the nearest bound (assumes arithmetic >> on negative ints)
+ */
+static av_always_inline av_const uint8_t av_clip_uint8_c(int a)
+{
+    const int overflow = a & ~0xFF; /* non-zero iff a lies outside 0..255 */
+    return overflow ? (~a) >> 31 : a;
+}
+
+/**
+ * Clamp a signed integer to the signed 8-bit range -128,127.
+ * @param a value to clamp
+ * @return a if it already fits, else the nearest bound (assumes arithmetic >> on negative ints)
+ */
+static av_always_inline av_const int8_t av_clip_int8_c(int a)
+{
+    const unsigned biased = a + 0x80U; /* maps -128..127 onto 0..255 */
+    return (biased & ~0xFF) ? (a >> 31) ^ 0x7F : a;
+}
+
+/**
+ * Clamp a signed integer to the unsigned 16-bit range 0-65535.
+ * @param a value to clamp
+ * @return a if it already fits, else the nearest bound (assumes arithmetic >> on negative ints)
+ */
+static av_always_inline av_const uint16_t av_clip_uint16_c(int a)
+{
+    const int overflow = a & ~0xFFFF; /* non-zero iff a lies outside 0..65535 */
+    return overflow ? (~a) >> 31 : a;
+}
+
+/**
+ * Clamp a signed integer to the signed 16-bit range -32768,32767.
+ * @param a value to clamp
+ * @return a if it already fits, else the nearest bound (assumes arithmetic >> on negative ints)
+ */
+static av_always_inline av_const int16_t av_clip_int16_c(int a)
+{
+    const unsigned biased = a + 0x8000U; /* maps -32768..32767 onto 0..65535 */
+    return (biased & ~0xFFFF) ? (a >> 31) ^ 0x7FFF : a;
+}
+
+/**
+ * Clip a signed 64-bit integer value into the -2147483648,2147483647 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static av_always_inline av_const int32_t av_clipl_int32_c(int64_t a)
+{
+    /* Add the 2^31 bias as uint64_t: a signed add would overflow for a near INT64_MAX. */
+    return ((a+UINT64_C(0x80000000)) & ~UINT64_C(0xFFFFFFFF)) ? (int32_t)((a>>63) ^ 0x7FFFFFFF) : (int32_t)a;
+}
+
+/**
+ * Clip a signed integer into the -(2^p),(2^p-1) range.
+ * @param  a value to clip
+ * @param  p bit position to clip at
+ * @return clipped value
+ */
+static av_always_inline av_const int av_clip_intp2_c(int a, int p)
+{
+    /* Unsigned shifts for the range test: 2 << p would overflow a signed int once p reaches 30. */
+    if (((unsigned)a + (1U << p)) & ~((2U << p) - 1))
+        return (a >> 31) ^ ((1 << p) - 1);
+    return a;
+}
+
+/**
+ * Clip a signed integer to an unsigned power of two range (0 to 2^p-1).
+ * @param  a value to clip
+ * @param  p bit position to clip at
+ * @return clipped value
+ */
+static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p)
+{
+    const unsigned mask = (1U << p) - 1; /* unsigned shift: a signed 1 << 31 would overflow int */
+    return (a & ~mask) ? (~a) >> 31 & mask : a;
+}
+
+/**
+ * Clear high bits from an unsigned integer starting with specific bit position
+ * @param  a value to clip
+ * @param  p bit position to clip at
+ * @return the low p bits of a
+ */
+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+{
+    return a & ((1U << p) - 1); /* 1U: a signed 1 << 31 would overflow int */
+}
+
+/**
+ * Add two signed 32-bit values with saturation.
+ * @param  a one value
+ * @param  b another value
+ * @return a + b computed in 64 bits, then clipped via av_clipl_int32()
+ */
+static av_always_inline int av_sat_add32_c(int a, int b)
+{
+    const int64_t wide_sum = (int64_t)a + b;
+    return av_clipl_int32(wide_sum);
+}
+
+/**
+ * Add a doubled value to another value with saturation at both stages.
+ * @param  a first value
+ * @param  b value doubled (with saturation) and then added to a
+ * @return saturated sum of a and the saturated double of b
+ */
+static av_always_inline int av_sat_dadd32_c(int a, int b)
+{
+    const int doubled_b = av_sat_add32(b, b);
+    return av_sat_add32(a, doubled_b);
+}
+
+/**
+ * Clamp a float value to the amin-amax range.
+ * @param a    value to clamp
+ * @param amin lower bound of the range
+ * @param amax upper bound of the range
+ * @return amin if a < amin, otherwise amax if a > amax, otherwise a
+ */
+static av_always_inline av_const float av_clipf_c(float a, float amin, float amax)
+{
+#if defined(HAVE_AV_CONFIG_H) && defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+    if (amin > amax) abort();
+#endif
+    if (a < amin)
+        return amin;
+    return a > amax ? amax : a;
+}
+
+/**
+ * Clamp a double value to the amin-amax range.
+ * @param a    value to clamp
+ * @param amin lower bound of the range
+ * @param amax upper bound of the range
+ * @return amin if a < amin, otherwise amax if a > amax, otherwise a
+ */
+static av_always_inline av_const double av_clipd_c(double a, double amin, double amax)
+{
+#if defined(HAVE_AV_CONFIG_H) && defined(ASSERT_LEVEL) && ASSERT_LEVEL >= 2
+    if (amin > amax) abort();
+#endif
+    return a < amin ? amin
+         : a > amax ? amax
+         :            a;
+}
+
+/** Compute ceil(log2(x)).
+ * @param x value used to compute ceil(log2(x))
+ * @return computed ceiling of log2(x)
+ */
+static av_always_inline av_const int av_ceil_log2_c(int x)
+{
+    return av_log2((x - 1U) << 1); /* unsigned math: no signed overflow or shift of a negative value */
+}
+
+/**
+ * Count number of bits set to one in x, using branch-free parallel summation.
+ * @param x value to count bits of
+ * @return the number of bits set to one in x
+ */
+static av_always_inline av_const int av_popcount_c(uint32_t x)
+{
+    x -= (x >> 1) & 0x55555555;                     /* each 2-bit field now holds the count of its own bits */
+    x = (x & 0x33333333) + ((x >> 2) & 0x33333333); /* add neighbouring fields: one count per 4-bit field */
+    x = (x + (x >> 4)) & 0x0F0F0F0F;                /* one count (0..8) per byte */
+    x += x >> 8;                                    /* fold adjacent bytes together */
+    return (x + (x >> 16)) & 0x3F;                  /* fold both halves; the total (0..32) fits in 6 bits */
+}
+
+/**
+ * Count the bits set to one in a 64-bit value.
+ * @return the number of bits set to one in x, summed over both 32-bit halves
+ */
+static av_always_inline av_const int av_popcount64_c(uint64_t x)
+{
+    const uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
+    return av_popcount(lo) + av_popcount(hi);
+}
+
+static av_always_inline av_const int av_parity_c(uint32_t v)
+{
+    return av_popcount(v) % 2; /* bit count is never negative: 1 when an odd number of bits is set */
+}
+
+#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24))
+#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((unsigned)(a) << 24))
+
+/**
+ * Convert a UTF-8 character (up to 4 bytes) to its 32-bit UCS-4 encoded form.
+ *
+ * @param val      Output value, must be an lvalue of type uint32_t.
+ * @param GET_BYTE Expression reading one byte from the input.
+ *                 Evaluated up to 7 times (4 for the currently
+ *                 assigned Unicode range).  With a memory buffer
+ *                 input, this could be *ptr++.
+ * @param ERROR    Expression to be evaluated on invalid input,
+ *                 typically a goto statement.
+ *
+ * @warning ERROR should not contain a loop control statement which
+ * could interact with the internal while loop, and should force an
+ * exit from the macro code (e.g. through a goto or a return) in order
+ * to prevent undefined results.
+ */
+#define GET_UTF8(val, GET_BYTE, ERROR)\
+    val= (GET_BYTE);\
+    {\
+        uint32_t top = (val & 128) >> 1;\
+        if ((val & 0xc0) == 0x80 || val >= 0xFE)\
+            ERROR\
+        while (val & top) {\
+            int tmp= (GET_BYTE) - 128;\
+            if(tmp>>6)\
+                ERROR\
+            val= (val<<6) + tmp;\
+            top <<= 5;\
+        }\
+        val &= (top << 1) - 1;\
+    }
+
+/**
+ * Convert a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form.
+ *
+ * @param val       Output value, must be an lvalue of type uint32_t.
+ * @param GET_16BIT Expression returning two bytes of UTF-16 data converted
+ *                  to native byte order.  Evaluated one or two times.
+ * @param ERROR     Expression to be evaluated on invalid input,
+ *                  typically a goto statement.
+ */
+#define GET_UTF16(val, GET_16BIT, ERROR)\
+    val = GET_16BIT;\
+    {\
+        unsigned int hi = val - 0xD800;\
+        if (hi < 0x800) {\
+            val = GET_16BIT - 0xDC00;\
+            if (val > 0x3FFU || hi > 0x3FFU)\
+                ERROR\
+            val += (hi<<10) + 0x10000;\
+        }\
+    }\
+
+/**
+ * @def PUT_UTF8(val, tmp, PUT_BYTE)
+ * Convert a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
+ * @param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
+ * val is given as a function it is executed only once.
+ * @param tmp is a temporary variable and should be of type uint8_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_BYTE.
+ * @param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
+ * It could be a function or a statement, and uses tmp as the input byte.
+ * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
+ * executed up to 4 times for values in the valid UTF-8 range and up to
+ * 7 times in the general case, depending on the length of the converted
+ * Unicode character.
+ */
+#define PUT_UTF8(val, tmp, PUT_BYTE)\
+    {\
+        int bytes, shift;\
+        uint32_t in = val;\
+        if (in < 0x80) {\
+            tmp = in;\
+            PUT_BYTE\
+        } else {\
+            bytes = (av_log2(in) + 4) / 5;\
+            shift = (bytes - 1) * 6;\
+            tmp = (256 - (256 >> bytes)) | (in >> shift);\
+            PUT_BYTE\
+            while (shift >= 6) {\
+                shift -= 6;\
+                tmp = 0x80 | ((in >> shift) & 0x3f);\
+                PUT_BYTE\
+            }\
+        }\
+    }
+
+/**
+ * @def PUT_UTF16(val, tmp, PUT_16BIT)
+ * Convert a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
+ * @param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
+ * val is given as a function it is executed only once.
+ * @param tmp is a temporary variable and should be of type uint16_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_16BIT.
+ * @param PUT_16BIT writes the converted UTF-16 data to any proper destination
+ * in desired endianness. It could be a function or a statement, and uses tmp
+ * as the input value.  For example, PUT_16BIT could be "*output++ = tmp;"
+ * PUT_16BIT will be executed 1 or 2 times depending on input character.
+ */
+#define PUT_UTF16(val, tmp, PUT_16BIT)\
+    {\
+        uint32_t in = val;\
+        if (in < 0x10000) {\
+            tmp = in;\
+            PUT_16BIT\
+        } else {\
+            tmp = 0xD800 | ((in - 0x10000) >> 10);\
+            PUT_16BIT\
+            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\
+            PUT_16BIT\
+        }\
+    }\
+
+
+
+#include "mem.h"
+
+#ifdef HAVE_AV_CONFIG_H
+#    include "internal.h"
+#endif /* HAVE_AV_CONFIG_H */
+
+#endif /* AVUTIL_COMMON_H */
+
+/*
+ * The following definitions are outside the multiple inclusion guard
+ * to ensure they are immediately available in intmath.h.
+ */
+
+#ifndef av_ceil_log2
+#   define av_ceil_log2     av_ceil_log2_c
+#endif
+#ifndef av_clip
+#   define av_clip          av_clip_c
+#endif
+#ifndef av_clip64
+#   define av_clip64        av_clip64_c
+#endif
+#ifndef av_clip_uint8
+#   define av_clip_uint8    av_clip_uint8_c
+#endif
+#ifndef av_clip_int8
+#   define av_clip_int8     av_clip_int8_c
+#endif
+#ifndef av_clip_uint16
+#   define av_clip_uint16   av_clip_uint16_c
+#endif
+#ifndef av_clip_int16
+#   define av_clip_int16    av_clip_int16_c
+#endif
+#ifndef av_clipl_int32
+#   define av_clipl_int32   av_clipl_int32_c
+#endif
+#ifndef av_clip_intp2
+#   define av_clip_intp2    av_clip_intp2_c
+#endif
+#ifndef av_clip_uintp2
+#   define av_clip_uintp2   av_clip_uintp2_c
+#endif
+#ifndef av_mod_uintp2
+#   define av_mod_uintp2    av_mod_uintp2_c
+#endif
+#ifndef av_sat_add32
+#   define av_sat_add32     av_sat_add32_c
+#endif
+#ifndef av_sat_dadd32
+#   define av_sat_dadd32    av_sat_dadd32_c
+#endif
+#ifndef av_clipf
+#   define av_clipf         av_clipf_c
+#endif
+#ifndef av_clipd
+#   define av_clipd         av_clipd_c
+#endif
+#ifndef av_popcount
+#   define av_popcount      av_popcount_c
+#endif
+#ifndef av_popcount64
+#   define av_popcount64    av_popcount64_c
+#endif
+#ifndef av_parity
+#   define av_parity        av_parity_c
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cpu.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cpu.h
new file mode 100644
index 0000000..de05593
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/cpu.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_CPU_H
+#define AVUTIL_CPU_H
+
+#include "attributes.h"
+
+#define AV_CPU_FLAG_FORCE    0x80000000 /* force usage of selected flags (OR) */
+
+    /* lower 16 bits - CPU features */
+#define AV_CPU_FLAG_MMX          0x0001 ///< standard MMX
+#define AV_CPU_FLAG_MMXEXT       0x0002 ///< SSE integer functions or AMD MMX ext
+#define AV_CPU_FLAG_MMX2         0x0002 ///< SSE integer functions or AMD MMX ext
+#define AV_CPU_FLAG_3DNOW        0x0004 ///< AMD 3DNOW
+#define AV_CPU_FLAG_SSE          0x0008 ///< SSE functions
+#define AV_CPU_FLAG_SSE2         0x0010 ///< PIV SSE2 functions
+#define AV_CPU_FLAG_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
+                                        ///< than regular MMX/SSE (e.g. Core1)
+#define AV_CPU_FLAG_3DNOWEXT     0x0020 ///< AMD 3DNowExt
+#define AV_CPU_FLAG_SSE3         0x0040 ///< Prescott SSE3 functions
+#define AV_CPU_FLAG_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
+                                        ///< than regular MMX/SSE (e.g. Core1)
+#define AV_CPU_FLAG_SSSE3        0x0080 ///< Conroe SSSE3 functions
+#define AV_CPU_FLAG_SSSE3SLOW 0x4000000 ///< SSSE3 supported, but usually not faster
+#define AV_CPU_FLAG_ATOM     0x10000000 ///< Atom processor, some SSSE3 instructions are slower
+#define AV_CPU_FLAG_SSE4         0x0100 ///< Penryn SSE4.1 functions
+#define AV_CPU_FLAG_SSE42        0x0200 ///< Nehalem SSE4.2 functions
+#define AV_CPU_FLAG_AESNI       0x80000 ///< Advanced Encryption Standard functions
+#define AV_CPU_FLAG_AVX          0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW   0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
+#define AV_CPU_FLAG_XOP          0x0400 ///< Bulldozer XOP functions
+#define AV_CPU_FLAG_FMA4         0x0800 ///< Bulldozer FMA4 functions
+#define AV_CPU_FLAG_CMOV         0x1000 ///< supports cmov instruction
+#define AV_CPU_FLAG_AVX2         0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used
+#define AV_CPU_FLAG_FMA3        0x10000 ///< Haswell FMA3 functions
+#define AV_CPU_FLAG_BMI1        0x20000 ///< Bit Manipulation Instruction Set 1
+#define AV_CPU_FLAG_BMI2        0x40000 ///< Bit Manipulation Instruction Set 2
+
+#define AV_CPU_FLAG_ALTIVEC      0x0001 ///< standard
+#define AV_CPU_FLAG_VSX          0x0002 ///< ISA 2.06
+#define AV_CPU_FLAG_POWER8       0x0004 ///< ISA 2.07
+
+#define AV_CPU_FLAG_ARMV5TE      (1 << 0)
+#define AV_CPU_FLAG_ARMV6        (1 << 1)
+#define AV_CPU_FLAG_ARMV6T2      (1 << 2)
+#define AV_CPU_FLAG_VFP          (1 << 3)
+#define AV_CPU_FLAG_VFPV3        (1 << 4)
+#define AV_CPU_FLAG_NEON         (1 << 5)
+#define AV_CPU_FLAG_ARMV8        (1 << 6)
+#define AV_CPU_FLAG_VFP_VM       (1 << 7) ///< VFPv2 vector mode, deprecated in ARMv7-A and unavailable in various CPUs implementations
+#define AV_CPU_FLAG_SETEND       (1 <<16)
+
+/**
+ * Return the flags which specify extensions supported by the CPU.
+ * The returned value is affected by av_force_cpu_flags() if that was used
+ * before. So av_get_cpu_flags() can easily be used in an application to
+ * detect the enabled cpu flags.
+ */
+int av_get_cpu_flags(void);
+
+/**
+ * Disables cpu detection and forces the specified flags.
+ * -1 is a special case that disables forcing of specific flags.
+ */
+void av_force_cpu_flags(int flags);
+
+/**
+ * Set a mask on flags returned by av_get_cpu_flags().
+ * This function is mainly useful for testing.
+ * Please use av_force_cpu_flags() and av_get_cpu_flags() instead which are more flexible
+ */
+attribute_deprecated void av_set_cpu_flags_mask(int mask);
+
+/**
+ * Parse CPU flags from a string.
+ *
+ * The returned flags contain the specified flags as well as related unspecified flags.
+ *
+ * This function exists only for compatibility with libav.
+ * Please use av_parse_cpu_caps() when possible.
+ * @return a combination of AV_CPU_* flags, negative on error.
+ */
+attribute_deprecated
+int av_parse_cpu_flags(const char *s);
+
+/**
+ * Parse CPU caps from a string and update the given AV_CPU_* flags based on that.
+ *
+ * @return negative on error.
+ */
+int av_parse_cpu_caps(unsigned *flags, const char *s);
+
+/**
+ * @return the number of logical CPU cores present.
+ */
+int av_cpu_count(void);
+
+#endif /* AVUTIL_CPU_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/crc.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/crc.h
new file mode 100644
index 0000000..2a1b0d7
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/crc.h
@@ -0,0 +1,103 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_crc32
+ * Public header for CRC hash function implementation.
+ */
+
+#ifndef AVUTIL_CRC_H
+#define AVUTIL_CRC_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_crc32 CRC
+ * @ingroup lavu_hash
+ * CRC (Cyclic Redundancy Check) hash function implementation.
+ *
+ * This module supports numerous CRC polynomials, in addition to the most
+ * widely used CRC-32-IEEE. See @ref AVCRCId for a list of available
+ * polynomials.
+ *
+ * @{
+ */
+
+typedef uint32_t AVCRC;
+
+typedef enum {
+    AV_CRC_8_ATM,
+    AV_CRC_16_ANSI,
+    AV_CRC_16_CCITT,
+    AV_CRC_32_IEEE,
+    AV_CRC_32_IEEE_LE,  /**< reversed bitorder version of AV_CRC_32_IEEE */
+    AV_CRC_16_ANSI_LE,  /**< reversed bitorder version of AV_CRC_16_ANSI */
+#if FF_API_CRC_BIG_TABLE
+    AV_CRC_24_IEEE = 12,
+#else
+    AV_CRC_24_IEEE,
+#endif /* FF_API_CRC_BIG_TABLE */
+    AV_CRC_MAX,         /**< Not part of public API! Do not use outside libavutil. */
+}AVCRCId;
+
+/**
+ * Initialize a CRC table.
+ * @param ctx must be an array of size sizeof(AVCRC)*257 or sizeof(AVCRC)*1024
+ * @param le If 1, the lowest bit represents the coefficient for the highest
+ *           exponent of the corresponding polynomial (both for poly and
+ *           actual CRC).
+ *           If 0, you must swap the CRC parameter and the result of av_crc
+ *           if you need the standard representation (can be simplified in
+ *           most cases to e.g. bswap16):
+ *           av_bswap32(crc << (32-bits))
+ * @param bits number of bits for the CRC
+ * @param poly generator polynomial without the x**bits coefficient, in the
+ *             representation as specified by le
+ * @param ctx_size size of ctx in bytes
+ * @return <0 on failure
+ */
+int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size);
+
+/**
+ * Get an initialized standard CRC table.
+ * @param crc_id ID of a standard CRC
+ * @return a pointer to the CRC table or NULL on failure
+ */
+const AVCRC *av_crc_get_table(AVCRCId crc_id);
+
+/**
+ * Calculate the CRC of a block.
+ * @param crc CRC of previous blocks if any or initial value for CRC
+ * @return CRC updated with the data from the given block
+ *
+ * @see av_crc_init() "le" parameter
+ */
+uint32_t av_crc(const AVCRC *ctx, uint32_t crc,
+                const uint8_t *buffer, size_t length) av_pure;
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_CRC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/des.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/des.h
new file mode 100644
index 0000000..4cf11f5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/des.h
@@ -0,0 +1,77 @@
+/*
+ * DES encryption/decryption
+ * Copyright (c) 2007 Reimar Doeffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_DES_H
+#define AVUTIL_DES_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_des DES
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+typedef struct AVDES {
+    uint64_t round_keys[3][16]; /* presumably 16 round keys for each of up to 3 DES passes -- TODO confirm */
+    int triple_des;             /* presumably non-zero when initialized with a 192-bit key (see av_des_init) -- TODO confirm */
+} AVDES;
+
+/**
+ * Allocate an AVDES context.
+ */
+AVDES *av_des_alloc(void);
+
+/**
+ * @brief Initializes an AVDES context.
+ *
+ * @param key_bits must be 64 or 192
+ * @param decrypt 0 for encryption/CBC-MAC, 1 for decryption
+ * @return zero on success, negative value otherwise
+ */
+int av_des_init(struct AVDES *d, const uint8_t *key, int key_bits, int decrypt);
+
+/**
+ * @brief Encrypts / decrypts using the DES algorithm.
+ *
+ * @param count number of 8 byte blocks
+ * @param dst destination array, can be equal to src, must be 8-byte aligned
+ * @param src source array, can be equal to dst, must be 8-byte aligned, may be NULL
+ * @param iv initialization vector for CBC mode, if NULL then ECB will be used,
+ *           must be 8-byte aligned
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_des_crypt(struct AVDES *d, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
+
+/**
+ * @brief Calculates CBC-MAC using the DES algorithm.
+ *
+ * @param count number of 8 byte blocks
+ * @param dst destination array, can be equal to src, must be 8-byte aligned
+ * @param src source array, can be equal to dst, must be 8-byte aligned, may be NULL
+ */
+void av_des_mac(struct AVDES *d, uint8_t *dst, const uint8_t *src, int count);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_DES_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/dict.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/dict.h
new file mode 100644
index 0000000..118f1f0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/dict.h
@@ -0,0 +1,200 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Public dictionary API.
+ * @deprecated
+ *  AVDictionary is provided for compatibility with libav. Both its
+ *  implementation and its API are inefficient. It does not scale and is
+ *  extremely slow with large dictionaries.
+ *  It is recommended that new code uses our tree container from tree.c/h
+ *  where applicable, which uses AVL trees to achieve O(log n) performance.
+ */
+
+#ifndef AVUTIL_DICT_H
+#define AVUTIL_DICT_H
+
+#include <stdint.h>
+
+#include "version.h"
+
+/**
+ * @addtogroup lavu_dict AVDictionary
+ * @ingroup lavu_data
+ *
+ * @brief Simple key:value store
+ *
+ * @{
+ * Dictionaries are used for storing key:value pairs. To create
+ * an AVDictionary, simply pass an address of a NULL pointer to
+ * av_dict_set(). NULL can be used as an empty dictionary wherever
+ * a pointer to an AVDictionary is required.
+ * Use av_dict_get() to retrieve an entry or iterate over all
+ * entries and finally av_dict_free() to free the dictionary
+ * and all its contents.
+ *
+ @code
+   AVDictionary *d = NULL;           // "create" an empty dictionary
+   AVDictionaryEntry *t = NULL;
+
+   av_dict_set(&d, "foo", "bar", 0); // add an entry
+
+   char *k = av_strdup("key");       // if your strings are already allocated,
+   char *v = av_strdup("value");     // you can avoid copying them like this
+   av_dict_set(&d, k, v, AV_DICT_DONT_STRDUP_KEY | AV_DICT_DONT_STRDUP_VAL);
+
+   while (t = av_dict_get(d, "", t, AV_DICT_IGNORE_SUFFIX)) {
+       <....>                             // iterate over all entries in d
+   }
+   av_dict_free(&d);
+ @endcode
+ */
+
+#define AV_DICT_MATCH_CASE      1   /**< Only get an entry with exact-case key match. Only relevant in av_dict_get(). */
+#define AV_DICT_IGNORE_SUFFIX   2   /**< Return first entry in a dictionary whose first part corresponds to the search key,
+                                         ignoring the suffix of the found key string. Only relevant in av_dict_get(). */
+#define AV_DICT_DONT_STRDUP_KEY 4   /**< Take ownership of a key that's been
+                                         allocated with av_malloc() or another memory allocation function. */
+#define AV_DICT_DONT_STRDUP_VAL 8   /**< Take ownership of a value that's been
+                                         allocated with av_malloc() or another memory allocation function. */
+#define AV_DICT_DONT_OVERWRITE 16   ///< Don't overwrite existing entries.
+#define AV_DICT_APPEND         32   /**< If the entry already exists, append to it.  Note that no
+                                      delimiter is added, the strings are simply concatenated. */
+#define AV_DICT_MULTIKEY       64   /**< Allow to store several equal keys in the dictionary */
+
+typedef struct AVDictionaryEntry { /* one key:value pair, as returned by av_dict_get() */
+    char *key; /* entry key; must not be changed through an entry returned by av_dict_get() */
+    char *value; /* entry value; same restriction as key */
+} AVDictionaryEntry;
+
+typedef struct AVDictionary AVDictionary;
+
+/**
+ * Get a dictionary entry with matching key.
+ *
+ * The returned entry key or value must not be changed, or it will
+ * cause undefined behavior.
+ *
+ * To iterate through all the dictionary entries, you can set the matching key
+ * to the null string "" and set the AV_DICT_IGNORE_SUFFIX flag.
+ *
+ * @param prev Set to the previous matching element to find the next.
+ *             If set to NULL the first matching element is returned.
+ * @param key matching key
+ * @param flags a collection of AV_DICT_* flags controlling how the entry is retrieved
+ * @return found entry or NULL in case no matching entry was found in the dictionary
+ */
+AVDictionaryEntry *av_dict_get(const AVDictionary *m, const char *key,
+                               const AVDictionaryEntry *prev, int flags);
+
+/**
+ * Get number of entries in dictionary.
+ *
+ * @param m dictionary
+ * @return  number of entries in dictionary
+ */
+int av_dict_count(const AVDictionary *m);
+
+/**
+ * Set the given entry in *pm, overwriting an existing entry.
+ *
+ * Note: If AV_DICT_DONT_STRDUP_KEY or AV_DICT_DONT_STRDUP_VAL is set,
+ * these arguments will be freed on error.
+ *
+ * Warning: Adding a new entry to a dictionary invalidates all existing entries
+ * previously returned with av_dict_get.
+ *
+ * @param pm pointer to a pointer to a dictionary struct. If *pm is NULL
+ * a dictionary struct is allocated and put in *pm.
+ * @param key entry key to add to *pm (will either be av_strduped or added as a new key depending on flags)
+ * @param value entry value to add to *pm (will be av_strduped or added as a new value depending on flags).
+ *        Passing a NULL value will cause an existing entry to be deleted.
+ * @return >= 0 on success otherwise an error code <0
+ */
+int av_dict_set(AVDictionary **pm, const char *key, const char *value, int flags);
+
+/**
+ * Convenience wrapper for av_dict_set that converts the value to a string
+ * and stores it.
+ *
+ * Note: If AV_DICT_DONT_STRDUP_KEY is set, key will be freed on error.
+ */
+int av_dict_set_int(AVDictionary **pm, const char *key, int64_t value, int flags);
+
+/**
+ * Parse the key/value pairs list and add the parsed entries to a dictionary.
+ *
+ * In case of failure, all the successfully set entries are stored in
+ * *pm. You may need to manually free the created dictionary.
+ *
+ * @param key_val_sep  a 0-terminated list of characters used to separate
+ *                     key from value
+ * @param pairs_sep    a 0-terminated list of characters used to separate
+ *                     two pairs from each other
+ * @param flags        flags to use when adding to dictionary.
+ *                     AV_DICT_DONT_STRDUP_KEY and AV_DICT_DONT_STRDUP_VAL
+ *                     are ignored since the key/value tokens will always
+ *                     be duplicated.
+ * @return             0 on success, negative AVERROR code on failure
+ */
+int av_dict_parse_string(AVDictionary **pm, const char *str,
+                         const char *key_val_sep, const char *pairs_sep,
+                         int flags);
+
+/**
+ * Copy entries from one AVDictionary struct into another.
+ * @param dst pointer to a pointer to a AVDictionary struct. If *dst is NULL,
+ *            this function will allocate a struct for you and put it in *dst
+ * @param src pointer to source AVDictionary struct
+ * @param flags flags to use when setting entries in *dst
+ * @note metadata is read using the AV_DICT_IGNORE_SUFFIX flag
+ * @return 0 on success, negative AVERROR code on failure. If dst was allocated
+ *           by this function, callers should free the associated memory.
+ */
+int av_dict_copy(AVDictionary **dst, const AVDictionary *src, int flags);
+
+/**
+ * Free all the memory allocated for an AVDictionary struct
+ * and all keys and values.
+ */
+void av_dict_free(AVDictionary **m);
+
+/**
+ * Get dictionary entries as a string.
+ *
+ * Create a string containing dictionary's entries.
+ * Such string may be passed back to av_dict_parse_string().
+ * @note String is escaped with backslashes ('\').
+ *
+ * @param[in]  m             dictionary
+ * @param[out] buffer        Pointer to buffer that will be allocated with string containing entries.
+ *                           Buffer must be freed by the caller when it is no longer needed.
+ * @param[in]  key_val_sep   character used to separate key from value
+ * @param[in]  pairs_sep     character used to separate two pairs from each other
+ * @return                   >= 0 on success, negative on error
+ * @warning Separators can be neither '\\' nor '\0'. They also cannot be the same.
+ */
+int av_dict_get_string(const AVDictionary *m, char **buffer,
+                       const char key_val_sep, const char pairs_sep);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_DICT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/display.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/display.h
new file mode 100644
index 0000000..39c15ee
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/display.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2014 Vittorio Giovara <vittorio.giovara@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_DISPLAY_H
+#define AVUTIL_DISPLAY_H
+
+#include <stdint.h>
+#include "common.h"
+
+/**
+ * The display transformation matrix specifies an affine transformation that
+ * should be applied to video frames for correct presentation. It is compatible
+ * with the matrices stored in the ISO/IEC 14496-12 container format.
+ *
+ * The data is a 3x3 matrix represented as a 9-element array:
+ *
+ *                                  | a b u |
+ *   (a, b, u, c, d, v, x, y, w) -> | c d v |
+ *                                  | x y w |
+ *
+ * All numbers are stored in native endianness, as 16.16 fixed-point values,
+ * except for u, v and w, which are stored as 2.30 fixed-point values.
+ *
+ * The transformation maps a point (p, q) in the source (pre-transformation)
+ * frame to the point (p', q') in the destination (post-transformation) frame as
+ * follows:
+ *               | a b u |
+ *   (p, q, 1) . | c d v | = z * (p', q', 1)
+ *               | x y w |
+ *
+ * The transformation can also be more explicitly written in components as
+ * follows:
+ *   p' = (a * p + c * q + x) / z;
+ *   q' = (b * p + d * q + y) / z;
+ *   z  =  u * p + v * q + w
+ */
+
+/**
+ * Extract the rotation component of the transformation matrix.
+ *
+ * @param matrix the transformation matrix
+ * @return the angle (in degrees) by which the transformation rotates the frame
+ *         counterclockwise. The angle will be in range [-180.0, 180.0],
+ *         or NaN if the matrix is singular.
+ *
+ * @note floating point numbers are inherently inexact, so callers are
+ *       recommended to round the return value to nearest integer before use.
+ */
+double av_display_rotation_get(const int32_t matrix[9]);
+
+/**
+ * Initialize a transformation matrix describing a pure counterclockwise
+ * rotation by the specified angle (in degrees).
+ *
+ * @param matrix an allocated transformation matrix (will be fully overwritten
+ *               by this function)
+ * @param angle rotation angle in degrees.
+ */
+void av_display_rotation_set(int32_t matrix[9], double angle);
+
+/**
+ * Flip the input matrix horizontally and/or vertically.
+ *
+ * @param matrix an allocated transformation matrix
+ * @param hflip whether the matrix should be flipped horizontally
+ * @param vflip whether the matrix should be flipped vertically
+ */
+void av_display_matrix_flip(int32_t matrix[9], int hflip, int vflip);
+
+#endif /* AVUTIL_DISPLAY_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/downmix_info.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/downmix_info.h
new file mode 100644
index 0000000..221cf5b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/downmix_info.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2014 Tim Walker <tdskywalker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_DOWNMIX_INFO_H
+#define AVUTIL_DOWNMIX_INFO_H
+
+#include "frame.h"
+
+/**
+ * @file
+ * audio downmix metadata
+ */
+
+/**
+ * @addtogroup lavu_audio
+ * @{
+ */
+
+/**
+ * @defgroup downmix_info Audio downmix metadata
+ * @{
+ */
+
+/**
+ * Possible downmix types.
+ */
+enum AVDownmixType {
+    AV_DOWNMIX_TYPE_UNKNOWN, /**< Not indicated. */
+    AV_DOWNMIX_TYPE_LORO,    /**< Lo/Ro 2-channel downmix (Stereo). */
+    AV_DOWNMIX_TYPE_LTRT,    /**< Lt/Rt 2-channel downmix, Dolby Surround compatible. */
+    AV_DOWNMIX_TYPE_DPLII,   /**< Lt/Rt 2-channel downmix, Dolby Pro Logic II compatible. */
+    AV_DOWNMIX_TYPE_NB       /**< Number of downmix types. Not part of ABI. */
+};
+
+/**
+ * This structure describes optional metadata relevant to a downmix procedure.
+ *
+ * All fields are set by the decoder to the value indicated in the audio
+ * bitstream (if present), or to a "sane" default otherwise.
+ */
+typedef struct AVDownmixInfo {
+    /**
+     * Type of downmix preferred by the mastering engineer.
+     */
+    enum AVDownmixType preferred_downmix_type;
+
+    /**
+     * Absolute scale factor representing the nominal level of the center
+     * channel during a regular downmix.
+     */
+    double center_mix_level;
+
+    /**
+     * Absolute scale factor representing the nominal level of the center
+     * channel during an Lt/Rt compatible downmix.
+     */
+    double center_mix_level_ltrt;
+
+    /**
+     * Absolute scale factor representing the nominal level of the surround
+     * channels during a regular downmix.
+     */
+    double surround_mix_level;
+
+    /**
+     * Absolute scale factor representing the nominal level of the surround
+     * channels during an Lt/Rt compatible downmix.
+     */
+    double surround_mix_level_ltrt;
+
+    /**
+     * Absolute scale factor representing the level at which the LFE data is
+     * mixed into L/R channels during downmixing.
+     */
+    double lfe_mix_level;
+} AVDownmixInfo;
+
+/**
+ * Get a frame's AV_FRAME_DATA_DOWNMIX_INFO side data for editing.
+ *
+ * If the side data is absent, it is created and added to the frame.
+ *
+ * @param frame the frame for which the side data is to be obtained or created
+ *
+ * @return the AVDownmixInfo structure to be edited by the caller, or NULL if
+ *         the structure cannot be allocated.
+ */
+AVDownmixInfo *av_downmix_info_update_side_data(AVFrame *frame);
+
+/**
+ * @}
+ */
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_DOWNMIX_INFO_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/error.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/error.h
new file mode 100644
index 0000000..71df4da
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/error.h
@@ -0,0 +1,126 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * error code definitions
+ */
+
+#ifndef AVUTIL_ERROR_H
+#define AVUTIL_ERROR_H
+
+#include <errno.h>
+#include <stddef.h>
+
+/**
+ * @addtogroup lavu_error
+ *
+ * @{
+ */
+
+
+/* error handling */
+#if EDOM > 0
+#define AVERROR(e) (-(e))   ///< Returns a negative error code from a POSIX error code, to return from library functions.
+#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value.
+#else
+/* Some platforms have E* and errno already negated. */
+#define AVERROR(e) (e)
+#define AVUNERROR(e) (e)
+#endif
+
+#define FFERRTAG(a, b, c, d) (-(int)MKTAG(a, b, c, d))
+
+#define AVERROR_BSF_NOT_FOUND      FFERRTAG(0xF8,'B','S','F') ///< Bitstream filter not found
+#define AVERROR_BUG                FFERRTAG( 'B','U','G','!') ///< Internal bug, also see AVERROR_BUG2
+#define AVERROR_BUFFER_TOO_SMALL   FFERRTAG( 'B','U','F','S') ///< Buffer too small
+#define AVERROR_DECODER_NOT_FOUND  FFERRTAG(0xF8,'D','E','C') ///< Decoder not found
+#define AVERROR_DEMUXER_NOT_FOUND  FFERRTAG(0xF8,'D','E','M') ///< Demuxer not found
+#define AVERROR_ENCODER_NOT_FOUND  FFERRTAG(0xF8,'E','N','C') ///< Encoder not found
+#define AVERROR_EOF                FFERRTAG( 'E','O','F',' ') ///< End of file
+#define AVERROR_EXIT               FFERRTAG( 'E','X','I','T') ///< Immediate exit was requested; the called function should not be restarted
+#define AVERROR_EXTERNAL           FFERRTAG( 'E','X','T',' ') ///< Generic error in an external library
+#define AVERROR_FILTER_NOT_FOUND   FFERRTAG(0xF8,'F','I','L') ///< Filter not found
+#define AVERROR_INVALIDDATA        FFERRTAG( 'I','N','D','A') ///< Invalid data found when processing input
+#define AVERROR_MUXER_NOT_FOUND    FFERRTAG(0xF8,'M','U','X') ///< Muxer not found
+#define AVERROR_OPTION_NOT_FOUND   FFERRTAG(0xF8,'O','P','T') ///< Option not found
+#define AVERROR_PATCHWELCOME       FFERRTAG( 'P','A','W','E') ///< Not yet implemented in FFmpeg, patches welcome
+#define AVERROR_PROTOCOL_NOT_FOUND FFERRTAG(0xF8,'P','R','O') ///< Protocol not found
+
+#define AVERROR_STREAM_NOT_FOUND   FFERRTAG(0xF8,'S','T','R') ///< Stream not found
+/**
+ * This is semantically identical to AVERROR_BUG
+ * it has been introduced in Libav after our AVERROR_BUG and with a modified value.
+ */
+#define AVERROR_BUG2               FFERRTAG( 'B','U','G',' ')
+#define AVERROR_UNKNOWN            FFERRTAG( 'U','N','K','N') ///< Unknown error, typically from an external library
+#define AVERROR_EXPERIMENTAL       (-0x2bb2afa8) ///< Requested feature is flagged experimental. Set strict_std_compliance if you really want to use it.
+#define AVERROR_INPUT_CHANGED      (-0x636e6701) ///< Input changed between calls. Reconfiguration is required. (can be OR-ed with AVERROR_OUTPUT_CHANGED)
+#define AVERROR_OUTPUT_CHANGED     (-0x636e6702) ///< Output changed between calls. Reconfiguration is required. (can be OR-ed with AVERROR_INPUT_CHANGED)
+/* HTTP & RTSP errors */
+#define AVERROR_HTTP_BAD_REQUEST   FFERRTAG(0xF8,'4','0','0')
+#define AVERROR_HTTP_UNAUTHORIZED  FFERRTAG(0xF8,'4','0','1')
+#define AVERROR_HTTP_FORBIDDEN     FFERRTAG(0xF8,'4','0','3')
+#define AVERROR_HTTP_NOT_FOUND     FFERRTAG(0xF8,'4','0','4')
+#define AVERROR_HTTP_OTHER_4XX     FFERRTAG(0xF8,'4','X','X')
+#define AVERROR_HTTP_SERVER_ERROR  FFERRTAG(0xF8,'5','X','X')
+
+#define AV_ERROR_MAX_STRING_SIZE 64
+
+/**
+ * Put a description of the AVERROR code errnum in errbuf.
+ * In case of failure the global variable errno is set to indicate the
+ * error. Even in case of failure av_strerror() will print a generic
+ * error message indicating the errnum provided to errbuf.
+ *
+ * @param errnum      error code to describe
+ * @param errbuf      buffer to which description is written
+ * @param errbuf_size the size in bytes of errbuf
+ * @return 0 on success, a negative value if a description for errnum
+ * cannot be found
+ */
+int av_strerror(int errnum, char *errbuf, size_t errbuf_size);
+
+/**
+ * Fill the provided buffer with a string containing an error string
+ * corresponding to the AVERROR code errnum.
+ *
+ * @param errbuf         a buffer
+ * @param errbuf_size    size in bytes of errbuf
+ * @param errnum         error code to describe
+ * @return the buffer in input, filled with the error description
+ * @see av_strerror()
+ */
+static inline char *av_make_error_string(char *errbuf, size_t errbuf_size, int errnum)
+{
+    av_strerror(errnum, errbuf, errbuf_size); /* result ignored: av_strerror() is documented to write a generic message to errbuf even on failure */
+    return errbuf; /* hand back the caller's buffer so the call can be used directly as a function argument (see av_err2str) */
+}
+
+/**
+ * Convenience macro; use the result only directly as a function argument,
+ * never stand-alone. It expands to a C99 compound literal (not standard C++).
+ */
+#define av_err2str(errnum) \
+    av_make_error_string((char[AV_ERROR_MAX_STRING_SIZE]){0}, AV_ERROR_MAX_STRING_SIZE, errnum)
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_ERROR_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/eval.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/eval.h
new file mode 100644
index 0000000..dacd22b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/eval.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * simple arithmetic expression evaluator
+ */
+
+#ifndef AVUTIL_EVAL_H
+#define AVUTIL_EVAL_H
+
+#include "avutil.h"
+
+typedef struct AVExpr AVExpr;
+
+/**
+ * Parse and evaluate an expression.
+ * Note, this is significantly slower than av_expr_eval().
+ *
+ * @param res a pointer to a double where the result value of the
+ * expression is stored, or NAN in case of error
+ * @param s expression as a zero terminated string, for example "1+2^3+5*5+sin(2/3)"
+ * @param const_names NULL terminated array of zero terminated strings of constant identifiers, for example {"PI", "E", 0}
+ * @param const_values a zero terminated array of values for the identifiers from const_names
+ * @param func1_names NULL terminated array of zero terminated strings of funcs1 identifiers
+ * @param funcs1 NULL terminated array of function pointers for functions which take 1 argument
+ * @param func2_names NULL terminated array of zero terminated strings of funcs2 identifiers
+ * @param funcs2 NULL terminated array of function pointers for functions which take 2 arguments
+ * @param opaque a pointer which will be passed to all functions from funcs1 and funcs2
+ * @param log_ctx parent logging context
+ * @return >= 0 in case of success, a negative value corresponding to an
+ * AVERROR code otherwise
+ */
+int av_expr_parse_and_eval(double *res, const char *s,
+                           const char * const *const_names, const double *const_values,
+                           const char * const *func1_names, double (* const *funcs1)(void *, double),
+                           const char * const *func2_names, double (* const *funcs2)(void *, double, double),
+                           void *opaque, int log_offset, void *log_ctx);
+
+/**
+ * Parse an expression.
+ *
+ * @param expr a pointer where an AVExpr containing the parsed value is
+ * stored in case of successful parsing, or NULL otherwise.
+ * The pointed-to AVExpr must be freed with av_expr_free() by the user
+ * when it is not needed anymore.
+ * @param s expression as a zero terminated string, for example "1+2^3+5*5+sin(2/3)"
+ * @param const_names NULL terminated array of zero terminated strings of constant identifiers, for example {"PI", "E", 0}
+ * @param func1_names NULL terminated array of zero terminated strings of funcs1 identifiers
+ * @param funcs1 NULL terminated array of function pointers for functions which take 1 argument
+ * @param func2_names NULL terminated array of zero terminated strings of funcs2 identifiers
+ * @param funcs2 NULL terminated array of function pointers for functions which take 2 arguments
+ * @param log_ctx parent logging context
+ * @return >= 0 in case of success, a negative value corresponding to an
+ * AVERROR code otherwise
+ */
+int av_expr_parse(AVExpr **expr, const char *s,
+                  const char * const *const_names,
+                  const char * const *func1_names, double (* const *funcs1)(void *, double),
+                  const char * const *func2_names, double (* const *funcs2)(void *, double, double),
+                  int log_offset, void *log_ctx);
+
+/**
+ * Evaluate a previously parsed expression.
+ *
+ * @param const_values a zero terminated array of values for the identifiers from av_expr_parse() const_names
+ * @param opaque a pointer which will be passed to all functions from funcs1 and funcs2
+ * @return the value of the expression
+ */
+double av_expr_eval(AVExpr *e, const double *const_values, void *opaque);
+
+/**
+ * Free a parsed expression previously created with av_expr_parse().
+ */
+void av_expr_free(AVExpr *e);
+
+/**
+ * Parse the string in numstr and return its value as a double. If
+ * the string is empty, contains only whitespaces, or does not contain
+ * an initial substring that has the expected syntax for a
+ * floating-point number, no conversion is performed. In this case,
+ * returns a value of zero and the value returned in tail is the value
+ * of numstr.
+ *
+ * @param numstr a string representing a number, may contain one of
+ * the International System number postfixes, for example 'K', 'M',
+ * 'G'. If 'i' is appended after the postfix, powers of 2 are used
+ * instead of powers of 10. The 'B' postfix multiplies the value by
+ * 8, and can be appended after another postfix or used alone. This
+ * allows using for example 'KB', 'MiB', 'G' and 'B' as postfix.
+ * @param tail if non-NULL, receives the pointer to the char immediately
+ * after the last parsed character
+ */
+double av_strtod(const char *numstr, char **tail);
+
+#endif /* AVUTIL_EVAL_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ffversion.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ffversion.h
new file mode 100644
index 0000000..16c192a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ffversion.h
@@ -0,0 +1,5 @@
+/* Automatically generated by version.sh, do not manually edit! */
+#ifndef AVUTIL_FFVERSION_H
+#define AVUTIL_FFVERSION_H
+#define FFMPEG_VERSION "3.3.9"
+#endif /* AVUTIL_FFVERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/fifo.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/fifo.h
new file mode 100644
index 0000000..dc7bc6f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/fifo.h
@@ -0,0 +1,179 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * a very simple circular buffer FIFO implementation
+ */
+
+#ifndef AVUTIL_FIFO_H
+#define AVUTIL_FIFO_H
+
+#include <stdint.h>
+#include "avutil.h"
+#include "attributes.h"
+
+typedef struct AVFifoBuffer { /* state of the circular byte FIFO returned by av_fifo_alloc() / av_fifo_alloc_array() */
+    uint8_t *buffer; /* start of the storage; av_fifo_peek2() wraps back to here */
+    uint8_t *rptr, *wptr, *end; /* rptr: read position (base for av_fifo_peek2() offsets); wptr: presumably the write position; end: wrap boundary, treated as one past the last byte */
+    uint32_t rndx, wndx; /* presumably running read/write counters - confirm in fifo.c */
+} AVFifoBuffer;
+
+/**
+ * Initialize an AVFifoBuffer.
+ * @param size of FIFO
+ * @return AVFifoBuffer or NULL in case of memory allocation failure
+ */
+AVFifoBuffer *av_fifo_alloc(unsigned int size);
+
+/**
+ * Initialize an AVFifoBuffer.
+ * @param nmemb number of elements
+ * @param size  size of the single element
+ * @return AVFifoBuffer or NULL in case of memory allocation failure
+ */
+AVFifoBuffer *av_fifo_alloc_array(size_t nmemb, size_t size);
+
+/**
+ * Free an AVFifoBuffer.
+ * @param f AVFifoBuffer to free
+ */
+void av_fifo_free(AVFifoBuffer *f);
+
+/**
+ * Free an AVFifoBuffer and reset pointer to NULL.
+ * @param f AVFifoBuffer to free
+ */
+void av_fifo_freep(AVFifoBuffer **f);
+
+/**
+ * Reset the AVFifoBuffer to the state right after av_fifo_alloc, in particular it is emptied.
+ * @param f AVFifoBuffer to reset
+ */
+void av_fifo_reset(AVFifoBuffer *f);
+
+/**
+ * Return the amount of data in bytes in the AVFifoBuffer, that is the
+ * amount of data you can read from it.
+ * @param f AVFifoBuffer to read from
+ * @return size
+ */
+int av_fifo_size(const AVFifoBuffer *f);
+
+/**
+ * Return the amount of space in bytes in the AVFifoBuffer, that is the
+ * amount of data you can write into it.
+ * @param f AVFifoBuffer to write into
+ * @return size
+ */
+int av_fifo_space(const AVFifoBuffer *f);
+
+/**
+ * Feed data at specific position from an AVFifoBuffer to a user-supplied callback.
+ * Similar to av_fifo_generic_read() but without discarding data.
+ * @param f AVFifoBuffer to read from
+ * @param offset offset from current read position
+ * @param buf_size number of bytes to read
+ * @param func generic read function
+ * @param dest data destination
+ */
+int av_fifo_generic_peek_at(AVFifoBuffer *f, void *dest, int offset, int buf_size, void (*func)(void*, void*, int));
+
+/**
+ * Feed data from an AVFifoBuffer to a user-supplied callback.
+ * Similar to av_fifo_generic_read() but without discarding data.
+ * @param f AVFifoBuffer to read from
+ * @param buf_size number of bytes to read
+ * @param func generic read function
+ * @param dest data destination
+ */
+int av_fifo_generic_peek(AVFifoBuffer *f, void *dest, int buf_size, void (*func)(void*, void*, int));
+
+/**
+ * Feed data from an AVFifoBuffer to a user-supplied callback.
+ * @param f AVFifoBuffer to read from
+ * @param buf_size number of bytes to read
+ * @param func generic read function
+ * @param dest data destination
+ */
+int av_fifo_generic_read(AVFifoBuffer *f, void *dest, int buf_size, void (*func)(void*, void*, int));
+
+/**
+ * Feed data from a user-supplied callback to an AVFifoBuffer.
+ * @param f AVFifoBuffer to write to
+ * @param src data source; non-const since it may be used as a
+ * modifiable context by the function defined in func
+ * @param size number of bytes to write
+ * @param func generic write function; the first parameter is src,
+ * the second is dest_buf, the third is dest_buf_size.
+ * func must return the number of bytes written to dest_buf, or <= 0 to
+ * indicate no more data available to write.
+ * If func is NULL, src is interpreted as a simple byte array for source data.
+ * @return the number of bytes written to the FIFO
+ */
+int av_fifo_generic_write(AVFifoBuffer *f, void *src, int size, int (*func)(void*, void*, int));
+
+/**
+ * Resize an AVFifoBuffer.
+ * In case of reallocation failure, the old FIFO is kept unchanged.
+ *
+ * @param f AVFifoBuffer to resize
+ * @param size new AVFifoBuffer size in bytes
+ * @return <0 for failure, >=0 otherwise
+ */
+int av_fifo_realloc2(AVFifoBuffer *f, unsigned int size);
+
+/**
+ * Enlarge an AVFifoBuffer.
+ * In case of reallocation failure, the old FIFO is kept unchanged.
+ * The new fifo size may be larger than the requested size.
+ *
+ * @param f AVFifoBuffer to resize
+ * @param additional_space the amount of space in bytes to allocate in addition to av_fifo_size()
+ * @return <0 for failure, >=0 otherwise
+ */
+int av_fifo_grow(AVFifoBuffer *f, unsigned int additional_space);
+
+/**
+ * Read and discard the specified amount of data from an AVFifoBuffer.
+ * @param f AVFifoBuffer to read from
+ * @param size amount of data to read in bytes
+ */
+void av_fifo_drain(AVFifoBuffer *f, int size);
+
+/**
+ * Return a pointer to the data stored in a FIFO buffer at a certain offset.
+ * The FIFO buffer is not modified.
+ *
+ * @param f    AVFifoBuffer to peek at, f must be non-NULL
+ * @param offs an offset in bytes, its absolute value must be less
+ *             than the used buffer size or the returned pointer will
+ *             point outside the buffer data.
+ *             The used buffer size can be checked with av_fifo_size().
+ */
+static inline uint8_t *av_fifo_peek2(const AVFifoBuffer *f, int offs)
+{
+    uint8_t *ptr = f->rptr + offs; /* candidate position relative to the current read pointer; offs may be negative */
+    if (ptr >= f->end) /* ran past the end of the storage: wrap around to the start */
+        ptr = f->buffer + (ptr - f->end);
+    else if (ptr < f->buffer) /* negative offs went before the start: wrap around from the end */
+        ptr = f->end - (f->buffer - ptr);
+    return ptr; /* only one wrap is applied, so a too-large |offs| still yields an out-of-range pointer */
+}
+
+#endif /* AVUTIL_FIFO_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/file.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/file.h
new file mode 100644
index 0000000..8666c7b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/file.h
@@ -0,0 +1,69 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_FILE_H
+#define AVUTIL_FILE_H
+
+#include <stdint.h>
+
+#include "avutil.h"
+
+/**
+ * @file
+ * Misc file utilities.
+ */
+
+/**
+ * Read the file with name filename, and put its content in a newly
+ * allocated buffer or map it with mmap() when available.
+ * In case of success set *bufptr to the read or mmapped buffer, and
+ * *size to the size in bytes of the buffer in *bufptr.
+ * The returned buffer must be released with av_file_unmap().
+ *
+ * @param log_offset loglevel offset used for logging
+ * @param log_ctx context used for logging
+ * @return a non negative number in case of success, a negative value
+ * corresponding to an AVERROR error code in case of failure
+ */
+av_warn_unused_result
+int av_file_map(const char *filename, uint8_t **bufptr, size_t *size,
+                int log_offset, void *log_ctx);
+
+/**
+ * Unmap or free the buffer bufptr created by av_file_map().
+ *
+ * @param size size in bytes of bufptr, must be the same as returned
+ * by av_file_map()
+ */
+void av_file_unmap(uint8_t *bufptr, size_t size);
+
+/**
+ * Wrapper to work around the lack of mkstemp() on mingw.
+ * Also, tries to create file in /tmp first, if possible.
+ * *prefix can be a character constant; *filename will be allocated internally.
+ * @return file descriptor of opened file (or negative value corresponding to an
+ * AVERROR code on error)
+ * and opened file name in **filename.
+ * @note On very old libcs it is necessary to set a secure umask before
+ *       calling this, av_tempfile() can't call umask itself as it is used in
+ *       libraries and could interfere with the calling application.
+ * @deprecated as fd numbers cannot be passed safely between libs on some platforms
+ */
+int av_tempfile(const char *prefix, char **filename, int log_offset, void *log_ctx);
+
+#endif /* AVUTIL_FILE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/frame.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/frame.h
new file mode 100644
index 0000000..7cb78a1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/frame.h
@@ -0,0 +1,746 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_frame
+ * reference-counted frame API
+ */
+
+#ifndef AVUTIL_FRAME_H
+#define AVUTIL_FRAME_H
+
+#include <stdint.h>
+
+#include "avutil.h"
+#include "buffer.h"
+#include "dict.h"
+#include "rational.h"
+#include "samplefmt.h"
+#include "pixfmt.h"
+#include "version.h"
+
+
+/**
+ * @defgroup lavu_frame AVFrame
+ * @ingroup lavu_data
+ *
+ * @{
+ * AVFrame is an abstraction for reference-counted raw multimedia data.
+ */
+
+enum AVFrameSideDataType {
+    /**
+     * The data is the AVPanScan struct defined in libavcodec.
+     */
+    AV_FRAME_DATA_PANSCAN,
+    /**
+     * ATSC A53 Part 4 Closed Captions.
+     * A53 CC bitstream is stored as uint8_t in AVFrameSideData.data.
+     * The number of bytes of CC data is AVFrameSideData.size.
+     */
+    AV_FRAME_DATA_A53_CC,
+    /**
+     * Stereoscopic 3d metadata.
+     * The data is the AVStereo3D struct defined in libavutil/stereo3d.h.
+     */
+    AV_FRAME_DATA_STEREO3D,
+    /**
+     * The data is the AVMatrixEncoding enum defined in libavutil/channel_layout.h.
+     */
+    AV_FRAME_DATA_MATRIXENCODING,
+    /**
+     * Metadata relevant to a downmix procedure.
+     * The data is the AVDownmixInfo struct defined in libavutil/downmix_info.h.
+     */
+    AV_FRAME_DATA_DOWNMIX_INFO,
+    /**
+     * ReplayGain information in the form of the AVReplayGain struct.
+     */
+    AV_FRAME_DATA_REPLAYGAIN,
+    /**
+     * This side data contains a 3x3 transformation matrix describing an affine
+     * transformation that needs to be applied to the frame for correct
+     * presentation.
+     *
+     * See libavutil/display.h for a detailed description of the data.
+     */
+    AV_FRAME_DATA_DISPLAYMATRIX,
+    /**
+     * Active Format Description data consisting of a single byte as specified
+     * in ETSI TS 101 154 using AVActiveFormatDescription enum.
+     */
+    AV_FRAME_DATA_AFD,
+    /**
+     * Motion vectors exported by some codecs (on demand through the export_mvs
+     * flag set in the libavcodec AVCodecContext flags2 option).
+     * The data is the AVMotionVector struct defined in
+     * libavutil/motion_vector.h.
+     */
+    AV_FRAME_DATA_MOTION_VECTORS,
+    /**
+     * Recommends skipping the specified number of samples. This is exported
+     * only if the "skip_manual" AVOption is set in libavcodec.
+     * This has the same format as AV_PKT_DATA_SKIP_SAMPLES.
+     * @code
+     * u32le number of samples to skip from start of this packet
+     * u32le number of samples to skip from end of this packet
+     * u8    reason for start skip
+     * u8    reason for end   skip (0=padding silence, 1=convergence)
+     * @endcode
+     */
+    AV_FRAME_DATA_SKIP_SAMPLES,
+    /**
+     * This side data must be associated with an audio frame and corresponds to
+     * enum AVAudioServiceType defined in avcodec.h.
+     */
+    AV_FRAME_DATA_AUDIO_SERVICE_TYPE,
+    /**
+     * Mastering display metadata associated with a video frame. The payload is
+     * an AVMasteringDisplayMetadata type and contains information about the
+     * mastering display color volume.
+     */
+    AV_FRAME_DATA_MASTERING_DISPLAY_METADATA,
+    /**
+     * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer.
+     * This is set on the first frame of a GOP that has a temporal reference of 0.
+     */
+    AV_FRAME_DATA_GOP_TIMECODE,
+
+    /**
+     * The data represents the AVSphericalMapping structure defined in
+     * libavutil/spherical.h.
+     */
+    AV_FRAME_DATA_SPHERICAL,
+};
+
+enum AVActiveFormatDescription {
+    AV_AFD_SAME         = 8,
+    AV_AFD_4_3          = 9,
+    AV_AFD_16_9         = 10,
+    AV_AFD_14_9         = 11,
+    AV_AFD_4_3_SP_14_9  = 13,
+    AV_AFD_16_9_SP_14_9 = 14,
+    AV_AFD_SP_4_3       = 15,
+};
+
+
+/**
+ * Structure to hold side data for an AVFrame.
+ *
+ * sizeof(AVFrameSideData) is not a part of the public ABI, so new fields may be added
+ * to the end with a minor bump.
+ */
+typedef struct AVFrameSideData {
+    enum AVFrameSideDataType type;
+    uint8_t *data;
+    int      size;
+    AVDictionary *metadata;
+    AVBufferRef *buf;
+} AVFrameSideData;
+
+/**
+ * This structure describes decoded (raw) audio or video data.
+ *
+ * AVFrame must be allocated using av_frame_alloc(). Note that this only
+ * allocates the AVFrame itself, the buffers for the data must be managed
+ * through other means (see below).
+ * AVFrame must be freed with av_frame_free().
+ *
+ * AVFrame is typically allocated once and then reused multiple times to hold
+ * different data (e.g. a single AVFrame to hold frames received from a
+ * decoder). In such a case, av_frame_unref() will free any references held by
+ * the frame and reset it to its original clean state before it
+ * is reused again.
+ *
+ * The data described by an AVFrame is usually reference counted through the
+ * AVBuffer API. The underlying buffer references are stored in AVFrame.buf /
+ * AVFrame.extended_buf. An AVFrame is considered to be reference counted if at
+ * least one reference is set, i.e. if AVFrame.buf[0] != NULL. In such a case,
+ * every single data plane must be contained in one of the buffers in
+ * AVFrame.buf or AVFrame.extended_buf.
+ * There may be a single buffer for all the data, or one separate buffer for
+ * each plane, or anything in between.
+ *
+ * sizeof(AVFrame) is not a part of the public ABI, so new fields may be added
+ * to the end with a minor bump.
+ *
+ * Fields can be accessed through AVOptions, the name string used, matches the
+ * C structure field name for fields accessible through AVOptions. The AVClass
+ * for AVFrame can be obtained from avcodec_get_frame_class()
+ */
+typedef struct AVFrame {
+#define AV_NUM_DATA_POINTERS 8
+    /**
+     * pointer to the picture/channel planes.
+     * This might be different from the first allocated byte
+     *
+     * Some decoders access areas outside 0,0 - width,height, please
+     * see avcodec_align_dimensions2(). Some filters and swscale can read
+     * up to 16 bytes beyond the planes, if these filters are to be used,
+     * then 16 extra bytes must be allocated.
+     *
+     * NOTE: Except for hwaccel formats, pointers not needed by the format
+     * MUST be set to NULL.
+     */
+    uint8_t *data[AV_NUM_DATA_POINTERS];
+
+    /**
+     * For video, size in bytes of each picture line.
+     * For audio, size in bytes of each plane.
+     *
+     * For audio, only linesize[0] may be set. For planar audio, each channel
+     * plane must be the same size.
+     *
+     * For video the linesizes should be multiples of the CPUs alignment
+     * preference, this is 16 or 32 for modern desktop CPUs.
+     * Some code requires such alignment, other code can be slower without
+     * correct alignment, for yet other it makes no difference.
+     *
+     * @note The linesize may be larger than the size of usable data -- there
+     * may be extra padding present for performance reasons.
+     */
+    int linesize[AV_NUM_DATA_POINTERS];
+
+    /**
+     * pointers to the data planes/channels.
+     *
+     * For video, this should simply point to data[].
+     *
+     * For planar audio, each channel has a separate data pointer, and
+     * linesize[0] contains the size of each channel buffer.
+     * For packed audio, there is just one data pointer, and linesize[0]
+     * contains the total size of the buffer for all channels.
+     *
+     * Note: Both data and extended_data should always be set in a valid frame,
+     * but for planar audio with more channels than can fit in data,
+     * extended_data must be used in order to access all channels.
+     */
+    uint8_t **extended_data;
+
+    /**
+     * width and height of the video frame
+     */
+    int width, height;
+
+    /**
+     * number of audio samples (per channel) described by this frame
+     */
+    int nb_samples;
+
+    /**
+     * format of the frame, -1 if unknown or unset
+     * Values correspond to enum AVPixelFormat for video frames,
+     * enum AVSampleFormat for audio.
+     */
+    int format;
+
+    /**
+     * 1 -> keyframe, 0 -> not
+     */
+    int key_frame;
+
+    /**
+     * Picture type of the frame.
+     */
+    enum AVPictureType pict_type;
+
+    /**
+     * Sample aspect ratio for the video frame, 0/1 if unknown/unspecified.
+     */
+    AVRational sample_aspect_ratio;
+
+    /**
+     * Presentation timestamp in time_base units (time when frame should be shown to user).
+     */
+    int64_t pts;
+
+#if FF_API_PKT_PTS
+    /**
+     * PTS copied from the AVPacket that was decoded to produce this frame.
+     * @deprecated use the pts field instead
+     */
+    attribute_deprecated
+    int64_t pkt_pts;
+#endif
+
+    /**
+     * DTS copied from the AVPacket that triggered returning this frame. (if frame threading isn't used)
+     * This is also the Presentation time of this AVFrame calculated from
+     * only AVPacket.dts values without pts values.
+     */
+    int64_t pkt_dts;
+
+    /**
+     * picture number in bitstream order
+     */
+    int coded_picture_number;
+    /**
+     * picture number in display order
+     */
+    int display_picture_number;
+
+    /**
+     * quality (between 1 (good) and FF_LAMBDA_MAX (bad))
+     */
+    int quality;
+
+    /**
+     * for some private data of the user
+     */
+    void *opaque;
+
+#if FF_API_ERROR_FRAME
+    /**
+     * @deprecated unused
+     */
+    attribute_deprecated
+    uint64_t error[AV_NUM_DATA_POINTERS];
+#endif
+
+    /**
+     * When decoding, this signals how much the picture must be delayed.
+     * extra_delay = repeat_pict / (2*fps)
+     */
+    int repeat_pict;
+
+    /**
+     * The content of the picture is interlaced.
+     */
+    int interlaced_frame;
+
+    /**
+     * If the content is interlaced, is top field displayed first.
+     */
+    int top_field_first;
+
+    /**
+     * Tell user application that palette has changed from previous frame.
+     */
+    int palette_has_changed;
+
+    /**
+     * reordered opaque 64 bits (generally an integer or a double precision float
+     * PTS but can be anything).
+     * The user sets AVCodecContext.reordered_opaque to represent the input at
+     * that time,
+     * the decoder reorders values as needed and sets AVFrame.reordered_opaque
+     * to exactly one of the values provided by the user through AVCodecContext.reordered_opaque
+     * @deprecated in favor of pkt_pts
+     */
+    int64_t reordered_opaque;
+
+    /**
+     * Sample rate of the audio data.
+     */
+    int sample_rate;
+
+    /**
+     * Channel layout of the audio data.
+     */
+    uint64_t channel_layout;
+
+    /**
+     * AVBuffer references backing the data for this frame. If all elements of
+     * this array are NULL, then this frame is not reference counted. This array
+     * must be filled contiguously -- if buf[i] is non-NULL then buf[j] must
+     * also be non-NULL for all j < i.
+     *
+     * There may be at most one AVBuffer per data plane, so for video this array
+     * always contains all the references. For planar audio with more than
+     * AV_NUM_DATA_POINTERS channels, there may be more buffers than can fit in
+     * this array. Then the extra AVBufferRef pointers are stored in the
+     * extended_buf array.
+     */
+    AVBufferRef *buf[AV_NUM_DATA_POINTERS];
+
+    /**
+     * For planar audio which requires more than AV_NUM_DATA_POINTERS
+     * AVBufferRef pointers, this array will hold all the references which
+     * cannot fit into AVFrame.buf.
+     *
+     * Note that this is different from AVFrame.extended_data, which always
+     * contains all the pointers. This array only contains the extra pointers,
+     * which cannot fit into AVFrame.buf.
+     *
+     * This array is always allocated using av_malloc() by whoever constructs
+     * the frame. It is freed in av_frame_unref().
+     */
+    AVBufferRef **extended_buf;
+    /**
+     * Number of elements in extended_buf.
+     */
+    int        nb_extended_buf;
+
+    AVFrameSideData **side_data;
+    int            nb_side_data;
+
+/**
+ * @defgroup lavu_frame_flags AV_FRAME_FLAGS
+ * @ingroup lavu_frame
+ * Flags describing additional frame properties.
+ *
+ * @{
+ */
+
+/**
+ * The frame data may be corrupted, e.g. due to decoding errors.
+ */
+#define AV_FRAME_FLAG_CORRUPT       (1 << 0)
+/**
+ * A flag to mark the frames which need to be decoded, but shouldn't be output.
+ */
+#define AV_FRAME_FLAG_DISCARD   (1 << 2)
+/**
+ * @}
+ */
+
+    /**
+     * Frame flags, a combination of @ref lavu_frame_flags
+     */
+    int flags;
+
+    /**
+     * MPEG vs JPEG YUV range.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorRange color_range;
+
+    enum AVColorPrimaries color_primaries;
+
+    enum AVColorTransferCharacteristic color_trc;
+
+    /**
+     * YUV colorspace type.
+     * - encoding: Set by user
+     * - decoding: Set by libavcodec
+     */
+    enum AVColorSpace colorspace;
+
+    enum AVChromaLocation chroma_location;
+
+    /**
+     * frame timestamp estimated using various heuristics, in stream time base
+     * - encoding: unused
+     * - decoding: set by libavcodec, read by user.
+     */
+    int64_t best_effort_timestamp;
+
+    /**
+     * reordered pos from the last AVPacket that has been input into the decoder
+     * - encoding: unused
+     * - decoding: Read by user.
+     */
+    int64_t pkt_pos;
+
+    /**
+     * duration of the corresponding packet, expressed in
+     * AVStream->time_base units, 0 if unknown.
+     * - encoding: unused
+     * - decoding: Read by user.
+     */
+    int64_t pkt_duration;
+
+    /**
+     * metadata.
+     * - encoding: Set by user.
+     * - decoding: Set by libavcodec.
+     */
+    AVDictionary *metadata;
+
+    /**
+     * decode error flags of the frame, set to a combination of
+     * FF_DECODE_ERROR_xxx flags if the decoder produced a frame, but there
+     * were errors during the decoding.
+     * - encoding: unused
+     * - decoding: set by libavcodec, read by user.
+     */
+    int decode_error_flags;
+#define FF_DECODE_ERROR_INVALID_BITSTREAM   1
+#define FF_DECODE_ERROR_MISSING_REFERENCE   2
+
+    /**
+     * number of audio channels, only used for audio.
+     * - encoding: unused
+     * - decoding: Read by user.
+     */
+    int channels;
+
+    /**
+     * size of the corresponding packet containing the compressed
+     * frame.
+     * It is set to a negative value if unknown.
+     * - encoding: unused
+     * - decoding: set by libavcodec, read by user.
+     */
+    int pkt_size;
+
+#if FF_API_FRAME_QP
+    /**
+     * QP table
+     */
+    attribute_deprecated
+    int8_t *qscale_table;
+    /**
+     * QP store stride
+     */
+    attribute_deprecated
+    int qstride;
+
+    attribute_deprecated
+    int qscale_type;
+
+    AVBufferRef *qp_table_buf;
+#endif
+    /**
+     * For hwaccel-format frames, this should be a reference to the
+     * AVHWFramesContext describing the frame.
+     */
+    AVBufferRef *hw_frames_ctx;
+
+    /**
+     * AVBufferRef for free use by the API user. FFmpeg will never check the
+     * contents of the buffer ref. FFmpeg calls av_buffer_unref() on it when
+     * the frame is unreferenced. av_frame_copy_props() calls create a new
+     * reference with av_buffer_ref() for the target frame's opaque_ref field.
+     *
+     * This is unrelated to the opaque field, although it serves a similar
+     * purpose.
+     */
+    AVBufferRef *opaque_ref;
+} AVFrame;
+
+/**
+ * Accessors for some AVFrame fields. These used to be provided for ABI
+ * compatibility, and do not need to be used anymore.
+ */
+int64_t av_frame_get_best_effort_timestamp(const AVFrame *frame);
+void    av_frame_set_best_effort_timestamp(AVFrame *frame, int64_t val);
+int64_t av_frame_get_pkt_duration         (const AVFrame *frame);
+void    av_frame_set_pkt_duration         (AVFrame *frame, int64_t val);
+int64_t av_frame_get_pkt_pos              (const AVFrame *frame);
+void    av_frame_set_pkt_pos              (AVFrame *frame, int64_t val);
+int64_t av_frame_get_channel_layout       (const AVFrame *frame);
+void    av_frame_set_channel_layout       (AVFrame *frame, int64_t val);
+int     av_frame_get_channels             (const AVFrame *frame);
+void    av_frame_set_channels             (AVFrame *frame, int     val);
+int     av_frame_get_sample_rate          (const AVFrame *frame);
+void    av_frame_set_sample_rate          (AVFrame *frame, int     val);
+AVDictionary *av_frame_get_metadata       (const AVFrame *frame);
+void          av_frame_set_metadata       (AVFrame *frame, AVDictionary *val);
+int     av_frame_get_decode_error_flags   (const AVFrame *frame);
+void    av_frame_set_decode_error_flags   (AVFrame *frame, int     val);
+int     av_frame_get_pkt_size(const AVFrame *frame);
+void    av_frame_set_pkt_size(AVFrame *frame, int val);
+AVDictionary **avpriv_frame_get_metadatap(AVFrame *frame);
+#if FF_API_FRAME_QP
+int8_t *av_frame_get_qp_table(AVFrame *f, int *stride, int *type);
+int av_frame_set_qp_table(AVFrame *f, AVBufferRef *buf, int stride, int type);
+#endif
+enum AVColorSpace av_frame_get_colorspace(const AVFrame *frame);
+void    av_frame_set_colorspace(AVFrame *frame, enum AVColorSpace val);
+enum AVColorRange av_frame_get_color_range(const AVFrame *frame);
+void    av_frame_set_color_range(AVFrame *frame, enum AVColorRange val);
+
+/**
+ * Get the name of a colorspace.
+ * @return a static string identifying the colorspace; can be NULL.
+ */
+const char *av_get_colorspace_name(enum AVColorSpace val);
+
+/**
+ * Allocate an AVFrame and set its fields to default values.  The resulting
+ * struct must be freed using av_frame_free().
+ *
+ * @return An AVFrame filled with default values or NULL on failure.
+ *
+ * @note this only allocates the AVFrame itself, not the data buffers. Those
+ * must be allocated through other means, e.g. with av_frame_get_buffer() or
+ * manually.
+ */
+AVFrame *av_frame_alloc(void);
+
+/**
+ * Free the frame and any dynamically allocated objects in it,
+ * e.g. extended_data. If the frame is reference counted, it will be
+ * unreferenced first.
+ *
+ * @param frame frame to be freed. The pointer will be set to NULL.
+ */
+void av_frame_free(AVFrame **frame);
+
+/**
+ * Set up a new reference to the data described by the source frame.
+ *
+ * Copy frame properties from src to dst and create a new reference for each
+ * AVBufferRef from src.
+ *
+ * If src is not reference counted, new buffers are allocated and the data is
+ * copied.
+ *
+ * @warning: dst MUST have been either unreferenced with av_frame_unref(dst),
+ *           or newly allocated with av_frame_alloc() before calling this
+ *           function, or undefined behavior will occur.
+ *
+ * @return 0 on success, a negative AVERROR on error
+ */
+int av_frame_ref(AVFrame *dst, const AVFrame *src);
+
+/**
+ * Create a new frame that references the same data as src.
+ *
+ * This is a shortcut for av_frame_alloc()+av_frame_ref().
+ *
+ * @return newly created AVFrame on success, NULL on error.
+ */
+AVFrame *av_frame_clone(const AVFrame *src);
+
+/**
+ * Unreference all the buffers referenced by frame and reset the frame fields.
+ */
+void av_frame_unref(AVFrame *frame);
+
+/**
+ * Move everything contained in src to dst and reset src.
+ *
+ * @warning: dst is not unreferenced, but directly overwritten without reading
+ *           or deallocating its contents. Call av_frame_unref(dst) manually
+ *           before calling this function to ensure that no memory is leaked.
+ */
+void av_frame_move_ref(AVFrame *dst, AVFrame *src);
+
+/**
+ * Allocate new buffer(s) for audio or video data.
+ *
+ * The following fields must be set on frame before calling this function:
+ * - format (pixel format for video, sample format for audio)
+ * - width and height for video
+ * - nb_samples and channel_layout for audio
+ *
+ * This function will fill AVFrame.data and AVFrame.buf arrays and, if
+ * necessary, allocate and fill AVFrame.extended_data and AVFrame.extended_buf.
+ * For planar formats, one buffer will be allocated for each plane.
+ *
+ * @warning: if frame already has been allocated, calling this function will
+ *           leak memory. In addition, undefined behavior can occur in certain
+ *           cases.
+ *
+ * @param frame frame in which to store the new buffers.
+ * @param align required buffer size alignment
+ *
+ * @return 0 on success, a negative AVERROR on error.
+ */
+int av_frame_get_buffer(AVFrame *frame, int align);
+
+/**
+ * Check if the frame data is writable.
+ *
+ * @return A positive value if the frame data is writable (which is true if and
+ * only if each of the underlying buffers has only one reference, namely the one
+ * stored in this frame). Return 0 otherwise.
+ *
+ * If 1 is returned the answer is valid until av_buffer_ref() is called on any
+ * of the underlying AVBufferRefs (e.g. through av_frame_ref() or directly).
+ *
+ * @see av_frame_make_writable(), av_buffer_is_writable()
+ */
+int av_frame_is_writable(AVFrame *frame);
+
+/**
+ * Ensure that the frame data is writable, avoiding data copy if possible.
+ *
+ * Do nothing if the frame is writable, allocate new buffers and copy the data
+ * if it is not.
+ *
+ * @return 0 on success, a negative AVERROR on error.
+ *
+ * @see av_frame_is_writable(), av_buffer_is_writable(),
+ * av_buffer_make_writable()
+ */
+int av_frame_make_writable(AVFrame *frame);
+
+/**
+ * Copy the frame data from src to dst.
+ *
+ * This function does not allocate anything, dst must be already initialized and
+ * allocated with the same parameters as src.
+ *
+ * This function only copies the frame data (i.e. the contents of the data /
+ * extended data arrays), not any other properties.
+ *
+ * @return >= 0 on success, a negative AVERROR on error.
+ */
+int av_frame_copy(AVFrame *dst, const AVFrame *src);
+
+/**
+ * Copy only "metadata" fields from src to dst.
+ *
+ * Metadata for the purpose of this function are those fields that do not affect
+ * the data layout in the buffers.  E.g. pts, sample rate (for audio) or sample
+ * aspect ratio (for video), but not width/height or channel layout.
+ * Side data is also copied.
+ */
+int av_frame_copy_props(AVFrame *dst, const AVFrame *src);
+
+/**
+ * Get the buffer reference a given data plane is stored in.
+ *
+ * @param plane index of the data plane of interest in frame->extended_data.
+ *
+ * @return the buffer reference that contains the plane or NULL if the input
+ * frame is not valid.
+ */
+AVBufferRef *av_frame_get_plane_buffer(AVFrame *frame, int plane);
+
+/**
+ * Add a new side data to a frame.
+ *
+ * @param frame a frame to which the side data should be added
+ * @param type type of the added side data
+ * @param size size of the side data
+ *
+ * @return newly added side data on success, NULL on error
+ */
+AVFrameSideData *av_frame_new_side_data(AVFrame *frame,
+                                        enum AVFrameSideDataType type,
+                                        int size);
+
+/**
+ * @return a pointer to the side data of a given type on success, NULL if there
+ * is no side data with such type in this frame.
+ */
+AVFrameSideData *av_frame_get_side_data(const AVFrame *frame,
+                                        enum AVFrameSideDataType type);
+
+/**
+ * If side data of the supplied type exists in the frame, free it and remove it
+ * from the frame.
+ */
+void av_frame_remove_side_data(AVFrame *frame, enum AVFrameSideDataType type);
+
+/**
+ * @return a string identifying the side data type
+ */
+const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_FRAME_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hash.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hash.h
new file mode 100644
index 0000000..a20b893
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hash.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2013 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_hash_generic
+ * Generic hashing API
+ */
+
+#ifndef AVUTIL_HASH_H
+#define AVUTIL_HASH_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_hash Hash Functions
+ * @ingroup lavu_crypto
+ * Hash functions useful in multimedia.
+ *
+ * Hash functions are widely used in multimedia, from error checking and
+ * concealment to internal regression testing. libavutil has efficient
+ * implementations of a variety of hash functions that may be useful for
+ * FFmpeg and other multimedia applications.
+ *
+ * @{
+ *
+ * @defgroup lavu_hash_generic Generic Hashing API
+ * An abstraction layer for all hash functions supported by libavutil.
+ *
+ * If your application needs to support a wide range of different hash
+ * functions, then the Generic Hashing API is for you. It provides a generic,
+ * reusable API for @ref lavu_hash "all hash functions" implemented in libavutil.
+ * If you just need to use one particular hash function, use the @ref lavu_hash
+ * "individual hash" directly.
+ *
+ * @section Sample Code
+ *
+ * A basic template for using the Generic Hashing API follows:
+ *
+ * @code
+ * struct AVHashContext *ctx = NULL;
+ * const char *hash_name = NULL;
+ * uint8_t *output_buf = NULL;
+ *
+ * // Select from a string returned by av_hash_names()
+ * hash_name = ...;
+ *
+ * // Allocate a hash context
+ * ret = av_hash_alloc(&ctx, hash_name);
+ * if (ret < 0)
+ *     return ret;
+ *
+ * // Initialize the hash context
+ * av_hash_init(ctx);
+ *
+ * // Update the hash context with data
+ * while (data_left) {
+ *     av_hash_update(ctx, data, size);
+ * }
+ *
+ * // Now we have no more data, so it is time to finalize the hash and get the
+ * // output. But we need to first allocate an output buffer. Note that you can
+ * // use any memory allocation function, including malloc(), not just
+ * // av_malloc().
+ * output_buf = av_malloc(av_hash_get_size(ctx));
+ * if (!output_buf)
+ *     return AVERROR(ENOMEM);
+ *
+ * // Finalize the hash context.
+ * // You can use any of the av_hash_final*() functions provided, for other
+ * // output formats. If you do so, be sure to adjust the memory allocation
+ * // above. See the function documentation below for the exact amount of extra
+ * // memory needed.
+ * av_hash_final(ctx, output_buf);
+ *
+ * // Free the context
+ * av_hash_freep(&ctx);
+ * @endcode
+ *
+ * @section Hash Function-Specific Information
+ * If the CRC32 hash is selected, the #AV_CRC_32_IEEE polynomial will be
+ * used.
+ *
+ * If the Murmur3 hash is selected, the default seed will be used. See @ref
+ * lavu_murmur3_seedinfo "Murmur3" for more information.
+ *
+ * @{
+ */
+
+/**
+ * @example ffhash.c
+ * This example is a simple command line application that takes one or more
+ * arguments. It demonstrates a typical use of the hashing API with allocation,
+ * initialization, updating, and finalizing.
+ */
+
+struct AVHashContext;
+
+/**
+ * Allocate a hash context for the algorithm specified by name.
+ *
+ * @return  >= 0 for success, a negative error code for failure
+ *
+ * @note The context is not initialized after a call to this function; you must
+ * call av_hash_init() to do so.
+ */
+int av_hash_alloc(struct AVHashContext **ctx, const char *name);
+
+/**
+ * Get the names of available hash algorithms.
+ *
+ * This function can be used to enumerate the algorithms.
+ *
+ * @param[in] i  Index of the hash algorithm, starting from 0
+ * @return       Pointer to a static string or `NULL` if `i` is out of range
+ */
+const char *av_hash_names(int i);
+
+/**
+ * Get the name of the algorithm corresponding to the given hash context.
+ */
+const char *av_hash_get_name(const struct AVHashContext *ctx);
+
+/**
+ * Maximum value that av_hash_get_size() will currently return.
+ *
+ * You can use this if you absolutely want or need to use static allocation for
+ * the output buffer and are fine with not supporting hashes newly added to
+ * libavutil without recompilation.
+ *
+ * @warning
+ * Adding new hashes with larger sizes, and increasing the macro while doing
+ * so, will not be considered an ABI change. To prevent your code from
+ * overflowing a buffer, either dynamically allocate the output buffer with
+ * av_hash_get_size(), or limit your use of the Hashing API to hashes that are
+ * already in FFmpeg during the time of compilation.
+ */
+#define AV_HASH_MAX_SIZE 64
+
+/**
+ * Get the size of the resulting hash value in bytes.
+ *
+ * The maximum value this function will currently return is available as macro
+ * #AV_HASH_MAX_SIZE.
+ *
+ * @param[in]     ctx Hash context
+ * @return            Size of the hash value in bytes
+ */
+int av_hash_get_size(const struct AVHashContext *ctx);
+
+/**
+ * Initialize or reset a hash context.
+ *
+ * @param[in,out] ctx Hash context
+ */
+void av_hash_init(struct AVHashContext *ctx);
+
+/**
+ * Update a hash context with additional data.
+ *
+ * @param[in,out] ctx Hash context
+ * @param[in]     src Data to be added to the hash context
+ * @param[in]     len Size of the additional data
+ */
+void av_hash_update(struct AVHashContext *ctx, const uint8_t *src, int len);
+
+/**
+ * Finalize a hash context and compute the actual hash value.
+ *
+ * The minimum size of `dst` buffer is given by av_hash_get_size() or
+ * #AV_HASH_MAX_SIZE. The use of the latter macro is discouraged.
+ *
+ * It is not safe to update or finalize a hash context again, if it has already
+ * been finalized.
+ *
+ * @param[in,out] ctx Hash context
+ * @param[out]    dst Where the final hash value will be stored
+ *
+ * @see av_hash_final_bin() provides an alternative API
+ */
+void av_hash_final(struct AVHashContext *ctx, uint8_t *dst);
+
+/**
+ * Finalize a hash context and store the actual hash value in a buffer.
+ *
+ * It is not safe to update or finalize a hash context again, if it has already
+ * been finalized.
+ *
+ * If `size` is smaller than the hash size (given by av_hash_get_size()), the
+ * hash is truncated; if size is larger, the buffer is padded with 0.
+ *
+ * @param[in,out] ctx  Hash context
+ * @param[out]    dst  Where the final hash value will be stored
+ * @param[in]     size Number of bytes to write to `dst`
+ */
+void av_hash_final_bin(struct AVHashContext *ctx, uint8_t *dst, int size);
+
+/**
+ * Finalize a hash context and store the hexadecimal representation of the
+ * actual hash value as a string.
+ *
+ * It is not safe to update or finalize a hash context again, if it has already
+ * been finalized.
+ *
+ * The string is always 0-terminated.
+ *
+ * If `size` is smaller than `2 * hash_size + 1`, where `hash_size` is the
+ * value returned by av_hash_get_size(), the string will be truncated.
+ *
+ * @param[in,out] ctx  Hash context
+ * @param[out]    dst  Where the string will be stored
+ * @param[in]     size Maximum number of bytes to write to `dst`
+ */
+void av_hash_final_hex(struct AVHashContext *ctx, uint8_t *dst, int size);
+
+/**
+ * Finalize a hash context and store the Base64 representation of the
+ * actual hash value as a string.
+ *
+ * It is not safe to update or finalize a hash context again, if it has already
+ * been finalized.
+ *
+ * The string is always 0-terminated.
+ *
+ * If `size` is smaller than AV_BASE64_SIZE(hash_size), where `hash_size` is
+ * the value returned by av_hash_get_size(), the string will be truncated.
+ *
+ * @param[in,out] ctx  Hash context
+ * @param[out]    dst  Where the final hash value will be stored
+ * @param[in]     size Maximum number of bytes to write to `dst`
+ */
+void av_hash_final_b64(struct AVHashContext *ctx, uint8_t *dst, int size);
+
+/**
+ * Free hash context and set hash context pointer to `NULL`.
+ *
+ * @param[in,out] ctx  Pointer to hash context
+ */
+void av_hash_freep(struct AVHashContext **ctx);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_HASH_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hmac.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hmac.h
new file mode 100644
index 0000000..576a0a4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hmac.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2012 Martin Storsjo
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HMAC_H
+#define AVUTIL_HMAC_H
+
+#include <stdint.h>
+
+#include "version.h"
+/**
+ * @defgroup lavu_hmac HMAC
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+enum AVHMACType {
+    AV_HMAC_MD5,
+    AV_HMAC_SHA1,
+    AV_HMAC_SHA224,
+    AV_HMAC_SHA256,
+    AV_HMAC_SHA384 = 12,
+    AV_HMAC_SHA512,
+};
+
+typedef struct AVHMAC AVHMAC;
+
+/**
+ * Allocate an AVHMAC context.
+ * @param type The hash function used for the HMAC.
+ */
+AVHMAC *av_hmac_alloc(enum AVHMACType type);
+
+/**
+ * Free an AVHMAC context.
+ * @param ctx The context to free, may be NULL
+ */
+void av_hmac_free(AVHMAC *ctx);
+
+/**
+ * Initialize an AVHMAC context with an authentication key.
+ * @param ctx    The HMAC context
+ * @param key    The authentication key
+ * @param keylen The length of the key, in bytes
+ */
+void av_hmac_init(AVHMAC *ctx, const uint8_t *key, unsigned int keylen);
+
+/**
+ * Hash data with the HMAC.
+ * @param ctx  The HMAC context
+ * @param data The data to hash
+ * @param len  The length of the data, in bytes
+ */
+void av_hmac_update(AVHMAC *ctx, const uint8_t *data, unsigned int len);
+
+/**
+ * Finish hashing and output the HMAC digest.
+ * @param ctx    The HMAC context
+ * @param out    The output buffer to write the digest into
+ * @param outlen The length of the out buffer, in bytes
+ * @return       The number of bytes written to out, or a negative error code.
+ */
+int av_hmac_final(AVHMAC *ctx, uint8_t *out, unsigned int outlen);
+
+/**
+ * Hash an array of data with a key.
+ * @param ctx    The HMAC context
+ * @param data   The data to hash
+ * @param len    The length of the data, in bytes
+ * @param key    The authentication key
+ * @param keylen The length of the key, in bytes
+ * @param out    The output buffer to write the digest into
+ * @param outlen The length of the out buffer, in bytes
+ * @return       The number of bytes written to out, or a negative error code.
+ */
+int av_hmac_calc(AVHMAC *ctx, const uint8_t *data, unsigned int len,
+                 const uint8_t *key, unsigned int keylen,
+                 uint8_t *out, unsigned int outlen);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_HMAC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext.h
new file mode 100644
index 0000000..e35fb25
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext.h
@@ -0,0 +1,523 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HWCONTEXT_H
+#define AVUTIL_HWCONTEXT_H
+
+#include "buffer.h"
+#include "frame.h"
+#include "log.h"
+#include "pixfmt.h"
+
+enum AVHWDeviceType {
+    AV_HWDEVICE_TYPE_VDPAU,
+    AV_HWDEVICE_TYPE_CUDA,
+    AV_HWDEVICE_TYPE_VAAPI,
+    AV_HWDEVICE_TYPE_DXVA2,
+    AV_HWDEVICE_TYPE_QSV,
+};
+
+typedef struct AVHWDeviceInternal AVHWDeviceInternal;
+
+/**
+ * This struct aggregates all the (hardware/vendor-specific) "high-level" state,
+ * i.e. state that is not tied to a concrete processing configuration.
+ * E.g., in an API that supports hardware-accelerated encoding and decoding,
+ * this struct will (if possible) wrap the state that is common to both encoding
+ * and decoding and from which specific instances of encoders or decoders can be
+ * derived.
+ *
+ * This struct is reference-counted with the AVBuffer mechanism. The
+ * av_hwdevice_ctx_alloc() constructor yields a reference, whose data field
+ * points to the actual AVHWDeviceContext. Further objects derived from
+ * AVHWDeviceContext (such as AVHWFramesContext, describing a frame pool with
+ * specific properties) will hold an internal reference to it. After all the
+ * references are released, the AVHWDeviceContext itself will be freed,
+ * optionally invoking a user-specified callback for uninitializing the hardware
+ * state.
+ */
+typedef struct AVHWDeviceContext {
+    /**
+     * A class for logging. Set by av_hwdevice_ctx_alloc().
+     */
+    const AVClass *av_class;
+
+    /**
+     * Private data used internally by libavutil. Must not be accessed in any
+     * way by the caller.
+     */
+    AVHWDeviceInternal *internal;
+
+    /**
+     * This field identifies the underlying API used for hardware access.
+     *
+     * This field is set when this struct is allocated and never changed
+     * afterwards.
+     */
+    enum AVHWDeviceType type;
+
+    /**
+     * The format-specific data, allocated and freed by libavutil along with
+     * this context.
+     *
+     * Should be cast by the user to the format-specific context defined in the
+     * corresponding header (hwcontext_*.h) and filled as described in the
+     * documentation before calling av_hwdevice_ctx_init().
+     *
+     * After calling av_hwdevice_ctx_init() this struct should not be modified
+     * by the caller.
+     */
+    void *hwctx;
+
+    /**
+     * This field may be set by the caller before calling av_hwdevice_ctx_init().
+     *
+     * If non-NULL, this callback will be called when the last reference to
+     * this context is unreferenced, immediately before it is freed.
+     *
+     * @note when other objects (e.g. an AVHWFramesContext) are derived from this
+     *       struct, this callback will be invoked after all such child objects
+     *       are fully uninitialized and their respective destructors invoked.
+     */
+    void (*free)(struct AVHWDeviceContext *ctx);
+
+    /**
+     * Arbitrary user data, to be used e.g. by the free() callback.
+     */
+    void *user_opaque;
+} AVHWDeviceContext;
+
+typedef struct AVHWFramesInternal AVHWFramesInternal;
+
+/**
+ * This struct describes a set or pool of "hardware" frames (i.e. those with
+ * data not located in normal system memory). All the frames in the pool are
+ * assumed to be allocated in the same way and interchangeable.
+ *
+ * This struct is reference-counted with the AVBuffer mechanism and tied to a
+ * given AVHWDeviceContext instance. The av_hwframe_ctx_alloc() constructor
+ * yields a reference, whose data field points to the actual AVHWFramesContext
+ * struct.
+ */
+typedef struct AVHWFramesContext {
+    /**
+     * A class for logging.
+     */
+    const AVClass *av_class;
+
+    /**
+     * Private data used internally by libavutil. Must not be accessed in any
+     * way by the caller.
+     */
+    AVHWFramesInternal *internal;
+
+    /**
+     * A reference to the parent AVHWDeviceContext. This reference is owned and
+     * managed by the enclosing AVHWFramesContext, but the caller may derive
+     * additional references from it.
+     */
+    AVBufferRef *device_ref;
+
+    /**
+     * The parent AVHWDeviceContext. This is simply a pointer to
+     * device_ref->data provided for convenience.
+     *
+     * Set by libavutil in av_hwframe_ctx_init().
+     */
+    AVHWDeviceContext *device_ctx;
+
+    /**
+     * The format-specific data, allocated and freed automatically along with
+     * this context.
+     *
+     * Should be cast by the user to the format-specific context defined in the
+     * corresponding header (hwcontext_*.h) and filled as described in the
+     * documentation before calling av_hwframe_ctx_init().
+     *
+     * After any frames using this context are created, the contents of this
+     * struct should not be modified by the caller.
+     */
+    void *hwctx;
+
+    /**
+     * This field may be set by the caller before calling av_hwframe_ctx_init().
+     *
+     * If non-NULL, this callback will be called when the last reference to
+     * this context is unreferenced, immediately before it is freed.
+     */
+    void (*free)(struct AVHWFramesContext *ctx);
+
+    /**
+     * Arbitrary user data, to be used e.g. by the free() callback.
+     */
+    void *user_opaque;
+
+    /**
+     * A pool from which the frames are allocated by av_hwframe_get_buffer().
+     * This field may be set by the caller before calling av_hwframe_ctx_init().
+     * The buffers returned by calling av_buffer_pool_get() on this pool must
+     * have the properties described in the documentation in the corresponding hw
+     * type's header (hwcontext_*.h). The pool will be freed strictly before
+     * this struct's free() callback is invoked.
+     *
+     * This field may be NULL, then libavutil will attempt to allocate a pool
+     * internally. Note that certain device types enforce pools allocated at
+     * fixed size (frame count), which cannot be extended dynamically. In such a
+     * case, initial_pool_size must be set appropriately.
+     */
+    AVBufferPool *pool;
+
+    /**
+     * Initial size of the frame pool. If a device type does not support
+     * dynamically resizing the pool, then this is also the maximum pool size.
+     *
+     * May be set by the caller before calling av_hwframe_ctx_init(). Must be
+     * set if pool is NULL and the device type does not support dynamic pools.
+     */
+    int initial_pool_size;
+
+    /**
+     * The pixel format identifying the underlying HW surface type.
+     *
+     * Must be a hwaccel format, i.e. the corresponding descriptor must have the
+     * AV_PIX_FMT_FLAG_HWACCEL flag set.
+     *
+     * Must be set by the user before calling av_hwframe_ctx_init().
+     */
+    enum AVPixelFormat format;
+
+    /**
+     * The pixel format identifying the actual data layout of the hardware
+     * frames.
+     *
+     * Must be set by the caller before calling av_hwframe_ctx_init().
+     *
+     * @note when the underlying API does not provide the exact data layout, but
+     * only the colorspace/bit depth, this field should be set to the fully
+     * planar version of that format (e.g. for 8-bit 420 YUV it should be
+     * AV_PIX_FMT_YUV420P, not AV_PIX_FMT_NV12 or anything else).
+     */
+    enum AVPixelFormat sw_format;
+
+    /**
+     * The allocated dimensions of the frames in this pool.
+     *
+     * Must be set by the user before calling av_hwframe_ctx_init().
+     */
+    int width, height;
+} AVHWFramesContext;
+
+/**
+ * Allocate an AVHWDeviceContext for a given hardware type.
+ *
+ * @param type the type of the hardware device to allocate.
+ * @return a reference to the newly created AVHWDeviceContext on success or NULL
+ *         on failure.
+ */
+AVBufferRef *av_hwdevice_ctx_alloc(enum AVHWDeviceType type);
+
+/**
+ * Finalize the device context before use. This function must be called after
+ * the context is filled with all the required information and before it is
+ * used in any way.
+ *
+ * @param ref a reference to the AVHWDeviceContext
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int av_hwdevice_ctx_init(AVBufferRef *ref);
+
+/**
+ * Open a device of the specified type and create an AVHWDeviceContext for it.
+ *
+ * This is a convenience function intended to cover the simple cases. Callers
+ * who need to fine-tune device creation/management should open the device
+ * manually and then wrap it in an AVHWDeviceContext using
+ * av_hwdevice_ctx_alloc()/av_hwdevice_ctx_init().
+ *
+ * The returned context is already initialized and ready for use, the caller
+ * should not call av_hwdevice_ctx_init() on it. The user_opaque/free fields of
+ * the created AVHWDeviceContext are set by this function and should not be
+ * touched by the caller.
+ *
+ * @param device_ctx On success, a reference to the newly-created device context
+ *                   will be written here. The reference is owned by the caller
+ *                   and must be released with av_buffer_unref() when no longer
+ *                   needed. On failure, NULL will be written to this pointer.
+ * @param type The type of the device to create.
+ * @param device A type-specific string identifying the device to open.
+ * @param opts A dictionary of additional (type-specific) options to use in
+ *             opening the device. The dictionary remains owned by the caller.
+ * @param flags currently unused
+ *
+ * @return 0 on success, a negative AVERROR code on failure.
+ */
+int av_hwdevice_ctx_create(AVBufferRef **device_ctx, enum AVHWDeviceType type,
+                           const char *device, AVDictionary *opts, int flags);
+
+/**
+ * Allocate an AVHWFramesContext tied to a given device context.
+ *
+ * @param device_ctx a reference to a AVHWDeviceContext. This function will make
+ *                   a new reference for internal use, the one passed to the
+ *                   function remains owned by the caller.
+ * @return a reference to the newly created AVHWFramesContext on success or NULL
+ *         on failure.
+ */
+AVBufferRef *av_hwframe_ctx_alloc(AVBufferRef *device_ctx);
+
+/**
+ * Finalize the context before use. This function must be called after the
+ * context is filled with all the required information and before it is attached
+ * to any frames.
+ *
+ * @param ref a reference to the AVHWFramesContext
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int av_hwframe_ctx_init(AVBufferRef *ref);
+
+/**
+ * Allocate a new frame attached to the given AVHWFramesContext.
+ *
+ * @param hwframe_ctx a reference to an AVHWFramesContext
+ * @param frame an empty (freshly allocated or unreffed) frame to be filled with
+ *              newly allocated buffers.
+ * @param flags currently unused, should be set to zero
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+int av_hwframe_get_buffer(AVBufferRef *hwframe_ctx, AVFrame *frame, int flags);
+
+/**
+ * Copy data to or from a hw surface. At least one of dst/src must have an
+ * AVHWFramesContext attached.
+ *
+ * If src has an AVHWFramesContext attached, then the format of dst (if set)
+ * must use one of the formats returned by av_hwframe_transfer_get_formats(src,
+ * AV_HWFRAME_TRANSFER_DIRECTION_FROM).
+ * If dst has an AVHWFramesContext attached, then the format of src must use one
+ * of the formats returned by av_hwframe_transfer_get_formats(dst,
+ * AV_HWFRAME_TRANSFER_DIRECTION_TO).
+ *
+ * dst may be "clean" (i.e. with data/buf pointers unset), in which case the
+ * data buffers will be allocated by this function using av_frame_get_buffer().
+ * If dst->format is set, then this format will be used, otherwise (when
+ * dst->format is AV_PIX_FMT_NONE) the first acceptable format will be chosen.
+ *
+ * The two frames must have matching allocated dimensions (i.e. equal to
+ * AVHWFramesContext.width/height), since not all device types support
+ * transferring a sub-rectangle of the whole surface. The display dimensions
+ * (i.e. AVFrame.width/height) may be smaller than the allocated dimensions, but
+ * also have to be equal for both frames. When the display dimensions are
+ * smaller than the allocated dimensions, the content of the padding in the
+ * destination frame is unspecified.
+ *
+ * @param dst the destination frame. dst is not touched on failure.
+ * @param src the source frame.
+ * @param flags currently unused, should be set to zero
+ * @return 0 on success, a negative AVERROR error code on failure.
+ */
+int av_hwframe_transfer_data(AVFrame *dst, const AVFrame *src, int flags);
+
+enum AVHWFrameTransferDirection {
+    /**
+     * Transfer the data from the queried hw frame.
+     */
+    AV_HWFRAME_TRANSFER_DIRECTION_FROM,
+
+    /**
+     * Transfer the data to the queried hw frame.
+     */
+    AV_HWFRAME_TRANSFER_DIRECTION_TO,
+};
+
+/**
+ * Get a list of possible source or target formats usable in
+ * av_hwframe_transfer_data().
+ *
+ * @param hwframe_ctx the frame context to obtain the information for
+ * @param dir the direction of the transfer
+ * @param formats the pointer to the output format list will be written here.
+ *                The list is terminated with AV_PIX_FMT_NONE and must be freed
+ *                by the caller when no longer needed using av_free().
+ *                If this function returns successfully, the format list will
+ *                have at least one item (not counting the terminator).
+ *                On failure, the contents of this pointer are unspecified.
+ * @param flags currently unused, should be set to zero
+ * @return 0 on success, a negative AVERROR code on failure.
+ */
+int av_hwframe_transfer_get_formats(AVBufferRef *hwframe_ctx,
+                                    enum AVHWFrameTransferDirection dir,
+                                    enum AVPixelFormat **formats, int flags);
+
+
+/**
+ * This struct describes the constraints on hardware frames attached to
+ * a given device with a hardware-specific configuration.  This is returned
+ * by av_hwdevice_get_hwframe_constraints() and must be freed by
+ * av_hwframe_constraints_free() after use.
+ */
+typedef struct AVHWFramesConstraints {
+    /**
+     * A list of possible values for format in the hw_frames_ctx,
+     * terminated by AV_PIX_FMT_NONE.  This member will always be filled.
+     */
+    enum AVPixelFormat *valid_hw_formats;
+
+    /**
+     * A list of possible values for sw_format in the hw_frames_ctx,
+     * terminated by AV_PIX_FMT_NONE.  Can be NULL if this information is
+     * not known.
+     */
+    enum AVPixelFormat *valid_sw_formats;
+
+    /**
+     * The minimum size of frames in this hw_frames_ctx.
+     * (Zero if not known.)
+     */
+    int min_width;
+    int min_height;
+
+    /**
+     * The maximum size of frames in this hw_frames_ctx.
+     * (INT_MAX if not known / no limit.)
+     */
+    int max_width;
+    int max_height;
+} AVHWFramesConstraints;
+
+/**
+ * Allocate a HW-specific configuration structure for a given HW device.
+ * After use, the user must free all members as required by the specific
+ * hardware structure being used, then free the structure itself with
+ * av_free().
+ *
+ * @param device_ctx a reference to the associated AVHWDeviceContext.
+ * @return The newly created HW-specific configuration structure on
+ *         success or NULL on failure.
+ */
+void *av_hwdevice_hwconfig_alloc(AVBufferRef *device_ctx);
+
+/**
+ * Get the constraints on HW frames given a device and the HW-specific
+ * configuration to be used with that device.  If no HW-specific
+ * configuration is provided, returns the maximum possible capabilities
+ * of the device.
+ *
+ * @param ref a reference to the associated AVHWDeviceContext.
+ * @param hwconfig a filled HW-specific configuration structure, or NULL
+ *        to return the maximum possible capabilities of the device.
+ * @return AVHWFramesConstraints structure describing the constraints
+ *         on the device, or NULL if not available.
+ */
+AVHWFramesConstraints *av_hwdevice_get_hwframe_constraints(AVBufferRef *ref,
+                                                           const void *hwconfig);
+
+/**
+ * Free an AVHWFramesConstraints structure.
+ *
+ * @param constraints The (filled or unfilled) AVHWFramesConstraints structure.
+ */
+void av_hwframe_constraints_free(AVHWFramesConstraints **constraints);
+
+
+/**
+ * Flags to apply to frame mappings.
+ */
+enum {
+    /**
+     * The mapping must be readable.
+     */
+    AV_HWFRAME_MAP_READ      = 1 << 0,
+    /**
+     * The mapping must be writeable.
+     */
+    AV_HWFRAME_MAP_WRITE     = 1 << 1,
+    /**
+     * The mapped frame will be overwritten completely in subsequent
+     * operations, so the current frame data need not be loaded.  Any values
+     * which are not overwritten are unspecified.
+     */
+    AV_HWFRAME_MAP_OVERWRITE = 1 << 2,
+    /**
+     * The mapping must be direct.  That is, there must not be any copying in
+     * the map or unmap steps.  Note that performance of direct mappings may
+     * be much lower than normal memory.
+     */
+    AV_HWFRAME_MAP_DIRECT    = 1 << 3,
+};
+
+/**
+ * Map a hardware frame.
+ *
+ * This has a number of different possible effects, depending on the format
+ * and origin of the src and dst frames.  On input, src should be a usable
+ * frame with valid buffers and dst should be blank (typically as just created
+ * by av_frame_alloc()).  src should have an associated hwframe context, and
+ * dst may optionally have a format and associated hwframe context.
+ *
+ * If src was created by mapping a frame from the hwframe context of dst,
+ * then this function undoes the mapping - dst is replaced by a reference to
+ * the frame that src was originally mapped from.
+ *
+ * If both src and dst have an associated hwframe context, then this function
+ * attempts to map the src frame from its hardware context to that of dst and
+ * then fill dst with appropriate data to be usable there.  This will only be
+ * possible if the hwframe contexts and associated devices are compatible -
+ * given compatible devices, av_hwframe_ctx_create_derived() can be used to
+ * create a hwframe context for dst in which mapping should be possible.
+ *
+ * If src has a hwframe context but dst does not, then the src frame is
+ * mapped to normal memory and should thereafter be usable as a normal frame.
+ * If the format is set on dst, then the mapping will attempt to create dst
+ * with that format and fail if it is not possible.  If format is unset (is
+ * AV_PIX_FMT_NONE) then dst will be mapped with whatever the most appropriate
+ * format to use is (probably the sw_format of the src hwframe context).
+ *
+ * A return value of AVERROR(ENOSYS) indicates that the mapping is not
+ * possible with the given arguments and hwframe setup, while other return
+ * values indicate that it failed somehow.
+ *
+ * @param dst Destination frame, to contain the mapping.
+ * @param src Source frame, to be mapped.
+ * @param flags Some combination of AV_HWFRAME_MAP_* flags.
+ * @return Zero on success, negative AVERROR code on failure.
+ */
+int av_hwframe_map(AVFrame *dst, const AVFrame *src, int flags);
+
+
+/**
+ * Create and initialise an AVHWFramesContext as a mapping of another existing
+ * AVHWFramesContext on a different device.
+ *
+ * av_hwframe_ctx_init() should not be called after this.
+ *
+ * @param derived_frame_ctx  On success, a reference to the newly created
+ *                           AVHWFramesContext.
+ * @param derived_device_ctx A reference to the device to create the new
+ *                           AVHWFramesContext on.
+ * @param source_frame_ctx   A reference to an existing AVHWFramesContext
+ *                           which will be mapped to the derived context.
+ * @param flags  Currently unused; should be set to zero.
+ * @return       Zero on success, negative AVERROR code on failure.
+ */
+int av_hwframe_ctx_create_derived(AVBufferRef **derived_frame_ctx,
+                                  enum AVPixelFormat format,
+                                  AVBufferRef *derived_device_ctx,
+                                  AVBufferRef *source_frame_ctx,
+                                  int flags);
+
+#endif /* AVUTIL_HWCONTEXT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_cuda.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_cuda.h
new file mode 100644
index 0000000..12dae84
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_cuda.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVUTIL_HWCONTEXT_CUDA_H
+#define AVUTIL_HWCONTEXT_CUDA_H
+
+#ifndef CUDA_VERSION
+#include <cuda.h>
+#endif
+
+#include "pixfmt.h"
+
+/**
+ * @file
+ * An API-specific header for AV_HWDEVICE_TYPE_CUDA.
+ *
+ * This API supports dynamic frame pools. AVHWFramesContext.pool must return
+ * AVBufferRefs whose data pointer is a CUdeviceptr.
+ */
+
+typedef struct AVCUDADeviceContextInternal AVCUDADeviceContextInternal;
+
+/**
+ * This struct is allocated as AVHWDeviceContext.hwctx
+ */
+typedef struct AVCUDADeviceContext {
+    CUcontext cuda_ctx;
+    AVCUDADeviceContextInternal *internal;
+} AVCUDADeviceContext;
+
+/**
+ * AVHWFramesContext.hwctx is currently not used
+ */
+
+#endif /* AVUTIL_HWCONTEXT_CUDA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_dxva2.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_dxva2.h
new file mode 100644
index 0000000..6c36cb4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_dxva2.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#ifndef AVUTIL_HWCONTEXT_DXVA2_H
+#define AVUTIL_HWCONTEXT_DXVA2_H
+
+/**
+ * @file
+ * An API-specific header for AV_HWDEVICE_TYPE_DXVA2.
+ *
+ * Only fixed-size pools are supported.
+ *
+ * For user-allocated pools, AVHWFramesContext.pool must return AVBufferRefs
+ * with the data pointer set to a pointer to IDirect3DSurface9.
+ */
+
+#include <d3d9.h>
+#include <dxva2api.h>
+
+/**
+ * This struct is allocated as AVHWDeviceContext.hwctx
+ */
+typedef struct AVDXVA2DeviceContext {
+    IDirect3DDeviceManager9 *devmgr;
+} AVDXVA2DeviceContext;
+
+/**
+ * This struct is allocated as AVHWFramesContext.hwctx
+ */
+typedef struct AVDXVA2FramesContext {
+    /**
+     * The surface type (e.g. DXVA2_VideoProcessorRenderTarget or
+     * DXVA2_VideoDecoderRenderTarget). Must be set by the caller.
+     */
+    DWORD               surface_type;
+
+    /**
+     * The surface pool. When an external pool is not provided by the caller,
+     * this will be managed (allocated and filled on init, freed on uninit) by
+     * libavutil.
+     */
+    IDirect3DSurface9 **surfaces;
+    int              nb_surfaces;
+
+    /**
+     * Certain drivers require the decoder to be destroyed before the surfaces.
+     * To allow internally managed pools to work properly in such cases, this
+     * field is provided.
+     *
+     * If it is non-NULL, libavutil will call IDirectXVideoDecoder_Release() on
+     * it just before the internal surface pool is freed.
+     */
+    IDirectXVideoDecoder *decoder_to_release;
+} AVDXVA2FramesContext;
+
+#endif /* AVUTIL_HWCONTEXT_DXVA2_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_qsv.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_qsv.h
new file mode 100644
index 0000000..b98d611
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_qsv.h
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HWCONTEXT_QSV_H
+#define AVUTIL_HWCONTEXT_QSV_H
+
+#include <mfx/mfxvideo.h>
+
+/**
+ * @file
+ * An API-specific header for AV_HWDEVICE_TYPE_QSV.
+ *
+ * This API does not support dynamic frame pools. AVHWFramesContext.pool must
+ * contain AVBufferRefs whose data pointer points to an mfxFrameSurface1 struct.
+ */
+
+/**
+ * This struct is allocated as AVHWDeviceContext.hwctx
+ */
+typedef struct AVQSVDeviceContext {
+    mfxSession session;
+} AVQSVDeviceContext;
+
+/**
+ * This struct is allocated as AVHWFramesContext.hwctx
+ */
+typedef struct AVQSVFramesContext {
+    mfxFrameSurface1 *surfaces;
+    int            nb_surfaces;
+
+    /**
+     * A combination of MFX_MEMTYPE_* describing the frame pool.
+     */
+    int frame_type;
+} AVQSVFramesContext;
+
+#endif /* AVUTIL_HWCONTEXT_QSV_H */
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vaapi.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vaapi.h
new file mode 100644
index 0000000..da1d4fe
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vaapi.h
@@ -0,0 +1,110 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HWCONTEXT_VAAPI_H
+#define AVUTIL_HWCONTEXT_VAAPI_H
+
+#include <va/va.h>
+
+/**
+ * @file
+ * API-specific header for AV_HWDEVICE_TYPE_VAAPI.
+ *
+ * Dynamic frame pools are supported, but note that any pool used as a render
+ * target is required to be of fixed size in order to be usable as an
+ * argument to vaCreateContext().
+ *
+ * For user-allocated pools, AVHWFramesContext.pool must return AVBufferRefs
+ * with the data pointer set to a VASurfaceID.
+ */
+
+enum {
+    /**
+     * The quirks field has been set by the user and should not be detected
+     * automatically by av_hwdevice_ctx_init().
+     */
+    AV_VAAPI_DRIVER_QUIRK_USER_SET = (1 << 0),
+    /**
+     * The driver does not destroy parameter buffers when they are used by
+     * vaRenderPicture().  Additional code will be required to destroy them
+     * separately afterwards.
+     */
+    AV_VAAPI_DRIVER_QUIRK_RENDER_PARAM_BUFFERS = (1 << 1),
+
+    /**
+     * The driver does not support the VASurfaceAttribMemoryType attribute,
+     * so the surface allocation code will not try to use it.
+     */
+    AV_VAAPI_DRIVER_QUIRK_ATTRIB_MEMTYPE = (1 << 2),
+};
+
+/**
+ * VAAPI connection details.
+ *
+ * Allocated as AVHWDeviceContext.hwctx
+ */
+typedef struct AVVAAPIDeviceContext {
+    /**
+     * The VADisplay handle, to be filled by the user.
+     */
+    VADisplay display;
+    /**
+     * Driver quirks to apply - this is filled by av_hwdevice_ctx_init(),
+     * with reference to a table of known drivers, unless the
+     * AV_VAAPI_DRIVER_QUIRK_USER_SET bit is already present.  The user
+     * may need to refer to this field when performing any later
+     * operations using VAAPI with the same VADisplay.
+     */
+    unsigned int driver_quirks;
+} AVVAAPIDeviceContext;
+
+/**
+ * VAAPI-specific data associated with a frame pool.
+ *
+ * Allocated as AVHWFramesContext.hwctx.
+ */
+typedef struct AVVAAPIFramesContext {
+    /**
+     * Set by the user to apply surface attributes to all surfaces in
+     * the frame pool.  If null, default settings are used.
+     */
+    VASurfaceAttrib *attributes;
+    int           nb_attributes;
+    /**
+     * The surface IDs of all surfaces in the pool after creation.
+     * Only valid if AVHWFramesContext.initial_pool_size was positive.
+     * These are intended to be used as the render_targets arguments to
+     * vaCreateContext().
+     */
+    VASurfaceID     *surface_ids;
+    int           nb_surfaces;
+} AVVAAPIFramesContext;
+
+/**
+ * VAAPI hardware pipeline configuration details.
+ *
+ * Allocated with av_hwdevice_hwconfig_alloc().
+ */
+typedef struct AVVAAPIHWConfig {
+    /**
+     * ID of a VAAPI pipeline configuration.
+     */
+    VAConfigID config_id;
+} AVVAAPIHWConfig;
+
+#endif /* AVUTIL_HWCONTEXT_VAAPI_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vdpau.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vdpau.h
new file mode 100644
index 0000000..1b7ea1e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/hwcontext_vdpau.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HWCONTEXT_VDPAU_H
+#define AVUTIL_HWCONTEXT_VDPAU_H
+
+#include <vdpau/vdpau.h>
+
+/**
+ * @file
+ * An API-specific header for AV_HWDEVICE_TYPE_VDPAU.
+ *
+ * This API supports dynamic frame pools. AVHWFramesContext.pool must return
+ * AVBufferRefs whose data pointer is a VdpVideoSurface.
+ */
+
+/**
+ * This struct is allocated as AVHWDeviceContext.hwctx
+ */
+typedef struct AVVDPAUDeviceContext {
+    VdpDevice          device;
+    VdpGetProcAddress *get_proc_address;
+} AVVDPAUDeviceContext;
+
+/**
+ * AVHWFramesContext.hwctx is currently not used
+ */
+
+#endif /* AVUTIL_HWCONTEXT_VDPAU_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/imgutils.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/imgutils.h
new file mode 100644
index 0000000..a4a5efc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/imgutils.h
@@ -0,0 +1,246 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_IMGUTILS_H
+#define AVUTIL_IMGUTILS_H
+
+/**
+ * @file
+ * misc image utilities
+ *
+ * @addtogroup lavu_picture
+ * @{
+ */
+
+#include "avutil.h"
+#include "pixdesc.h"
+#include "rational.h"
+
+/**
+ * Compute the max pixel step for each plane of an image with a
+ * format described by pixdesc.
+ *
+ * The pixel step is the distance in bytes between the first byte of
+ * the group of bytes which describe a pixel component and the first
+ * byte of the successive group in the same plane for the same
+ * component.
+ *
+ * @param max_pixsteps an array which is filled with the max pixel step
+ * for each plane. Since a plane may contain different pixel
+ * components, the computed max_pixsteps[plane] is relative to the
+ * component in the plane with the max pixel step.
+ * @param max_pixstep_comps an array which is filled with the component
+ * for each plane which has the max pixel step. May be NULL.
+ */
+void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4],
+                                const AVPixFmtDescriptor *pixdesc);
+
+/**
+ * Compute the size of an image line with format pix_fmt and width
+ * width for the plane plane.
+ *
+ * @return the computed size in bytes
+ */
+int av_image_get_linesize(enum AVPixelFormat pix_fmt, int width, int plane);
+
+/**
+ * Fill plane linesizes for an image with pixel format pix_fmt and
+ * width width.
+ *
+ * @param linesizes array to be filled with the linesize for each plane
+ * @return >= 0 in case of success, a negative error code otherwise
+ */
+int av_image_fill_linesizes(int linesizes[4], enum AVPixelFormat pix_fmt, int width);
+
+/**
+ * Fill plane data pointers for an image with pixel format pix_fmt and
+ * height height.
+ *
+ * @param data pointers array to be filled with the pointer for each image plane
+ * @param ptr the pointer to a buffer which will contain the image
+ * @param linesizes the array containing the linesize for each
+ * plane, should be filled by av_image_fill_linesizes()
+ * @return the size in bytes required for the image buffer, a negative
+ * error code in case of failure
+ */
+int av_image_fill_pointers(uint8_t *data[4], enum AVPixelFormat pix_fmt, int height,
+                           uint8_t *ptr, const int linesizes[4]);
+
+/**
+ * Allocate an image with size w and h and pixel format pix_fmt, and
+ * fill pointers and linesizes accordingly.
+ * The allocated image buffer has to be freed by using
+ * av_freep(&pointers[0]).
+ *
+ * @param align the value to use for buffer size alignment
+ * @return the size in bytes required for the image buffer, a negative
+ * error code in case of failure
+ */
+int av_image_alloc(uint8_t *pointers[4], int linesizes[4],
+                   int w, int h, enum AVPixelFormat pix_fmt, int align);
+
+/**
+ * Copy image plane from src to dst.
+ * That is, copy "height" number of lines of "bytewidth" bytes each.
+ * The first byte of each successive line is separated by *_linesize
+ * bytes.
+ *
+ * bytewidth must be contained by both absolute values of dst_linesize
+ * and src_linesize, otherwise the function behavior is undefined.
+ *
+ * @param dst_linesize linesize for the image plane in dst
+ * @param src_linesize linesize for the image plane in src
+ */
+void av_image_copy_plane(uint8_t       *dst, int dst_linesize,
+                         const uint8_t *src, int src_linesize,
+                         int bytewidth, int height);
+
+/**
+ * Copy image in src_data to dst_data.
+ *
+ * @param dst_linesizes linesizes for the image in dst_data
+ * @param src_linesizes linesizes for the image in src_data
+ */
+void av_image_copy(uint8_t *dst_data[4], int dst_linesizes[4],
+                   const uint8_t *src_data[4], const int src_linesizes[4],
+                   enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * Copy image data located in uncacheable (e.g. GPU mapped) memory. Where
+ * available, this function will use special functionality for reading from such
+ * memory, which may result in greatly improved performance compared to plain
+ * av_image_copy().
+ *
+ * The data pointers and the linesizes must be aligned to the maximum required
+ * by the CPU architecture.
+ *
+ * @note The linesize parameters have the type ptrdiff_t here, while they are
+ *       int for av_image_copy().
+ * @note On x86, the linesizes currently need to be aligned to the cacheline
+ *       size (i.e. 64) to get improved performance.
+ */
+void av_image_copy_uc_from(uint8_t *dst_data[4],       const ptrdiff_t dst_linesizes[4],
+                           const uint8_t *src_data[4], const ptrdiff_t src_linesizes[4],
+                           enum AVPixelFormat pix_fmt, int width, int height);
+
+/**
+ * Set up the data pointers and linesizes based on the specified image
+ * parameters and the provided array.
+ *
+ * The fields of the given image are filled in by using the src
+ * address which points to the image data buffer. Depending on the
+ * specified pixel format, one or multiple image data pointers and
+ * line sizes will be set.  If a planar format is specified, several
+ * pointers will be set pointing to the different picture planes and
+ * the line sizes of the different planes will be stored in the
+ * dst_linesize array. Call with src == NULL to get the required
+ * size for the src buffer.
+ *
+ * To allocate the buffer and fill in the dst_data and dst_linesize in
+ * one call, use av_image_alloc().
+ *
+ * @param dst_data      data pointers to be filled in
+ * @param dst_linesize  linesizes for the image in dst_data to be filled in
+ * @param src           buffer which will contain or contains the actual image data, can be NULL
+ * @param pix_fmt       the pixel format of the image
+ * @param width         the width of the image in pixels
+ * @param height        the height of the image in pixels
+ * @param align         the value used in src for linesize alignment
+ * @return the size in bytes required for src, a negative error code
+ * in case of failure
+ */
+int av_image_fill_arrays(uint8_t *dst_data[4], int dst_linesize[4],
+                         const uint8_t *src,
+                         enum AVPixelFormat pix_fmt, int width, int height, int align);
+
+/**
+ * Return the size in bytes of the amount of data required to store an
+ * image with the given parameters.
+ *
+ * @param[in] align the assumed linesize alignment
+ */
+int av_image_get_buffer_size(enum AVPixelFormat pix_fmt, int width, int height, int align);
+
+/**
+ * Copy image data from an image into a buffer.
+ *
+ * av_image_get_buffer_size() can be used to compute the required size
+ * for the buffer to fill.
+ *
+ * @param dst           a buffer into which picture data will be copied
+ * @param dst_size      the size in bytes of dst
+ * @param src_data      pointers containing the source image data
+ * @param src_linesize  linesizes for the image in src_data
+ * @param pix_fmt       the pixel format of the source image
+ * @param width         the width of the source image in pixels
+ * @param height        the height of the source image in pixels
+ * @param align         the assumed linesize alignment for dst
+ * @return the number of bytes written to dst, or a negative value
+ * (error code) on error
+ */
+int av_image_copy_to_buffer(uint8_t *dst, int dst_size,
+                            const uint8_t * const src_data[4], const int src_linesize[4],
+                            enum AVPixelFormat pix_fmt, int width, int height, int align);
+
+/**
+ * Check if the given dimension of an image is valid, meaning that all
+ * bytes of the image can be addressed with a signed int.
+ *
+ * @param w the width of the picture
+ * @param h the height of the picture
+ * @param log_offset the offset to sum to the log level for logging with log_ctx
+ * @param log_ctx the parent logging context, it may be NULL
+ * @return >= 0 if valid, a negative error code otherwise
+ */
+int av_image_check_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx);
+
+/**
+ * Check if the given dimension of an image is valid, meaning that all
+ * bytes of a plane of an image with the specified pix_fmt can be addressed
+ * with a signed int.
+ *
+ * @param w the width of the picture
+ * @param h the height of the picture
+ * @param max_pixels the maximum number of pixels the user wants to accept
+ * @param pix_fmt the pixel format, can be AV_PIX_FMT_NONE if unknown.
+ * @param log_offset the offset to sum to the log level for logging with log_ctx
+ * @param log_ctx the parent logging context, it may be NULL
+ * @return >= 0 if valid, a negative error code otherwise
+ */
+int av_image_check_size2(unsigned int w, unsigned int h, int64_t max_pixels, enum AVPixelFormat pix_fmt, int log_offset, void *log_ctx);
+
+/**
+ * Check if the given sample aspect ratio of an image is valid.
+ *
+ * It is considered invalid if the denominator is 0 or if applying the ratio
+ * to the image size would make the smaller dimension less than 1. If the
+ * sar numerator is 0, it is considered unknown and will return as valid.
+ *
+ * @param w width of the image
+ * @param h height of the image
+ * @param sar sample aspect ratio of the image
+ * @return 0 if valid, a negative AVERROR code otherwise
+ */
+int av_image_check_sar(unsigned int w, unsigned int h, AVRational sar);
+
+/**
+ * @}
+ */
+
+
+#endif /* AVUTIL_IMGUTILS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intfloat.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intfloat.h
new file mode 100644
index 0000000..fe3d7ec
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intfloat.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_INTFLOAT_H
+#define AVUTIL_INTFLOAT_H
+
+#include <stdint.h>
+#include "attributes.h"
+
+union av_intfloat32 {
+    uint32_t i;
+    float    f;
+};
+
+union av_intfloat64 {
+    uint64_t i;
+    double   f;
+};
+
+/**
+ * Reinterpret the bits of a 32-bit integer as a float (no numeric conversion).
+ */
+static av_always_inline float av_int2float(uint32_t i)
+{
+    union av_intfloat32 v;
+    v.i = i;    /* store the raw 32-bit pattern */
+    return v.f; /* read the same union storage back through the float member */
+}
+
+/**
+ * Reinterpret the bits of a float as a 32-bit integer (no numeric conversion).
+ */
+static av_always_inline uint32_t av_float2int(float f)
+{
+    union av_intfloat32 v;
+    v.f = f;    /* store the float value */
+    return v.i; /* read the same union storage back through the integer member */
+}
+
+/**
+ * Reinterpret the bits of a 64-bit integer as a double (no numeric conversion).
+ */
+static av_always_inline double av_int2double(uint64_t i)
+{
+    union av_intfloat64 v;
+    v.i = i;    /* store the raw 64-bit pattern */
+    return v.f; /* read the same union storage back through the double member */
+}
+
+/**
+ * Reinterpret the bits of a double as a 64-bit integer (no numeric conversion).
+ */
+static av_always_inline uint64_t av_double2int(double f)
+{
+    union av_intfloat64 v;
+    v.f = f;    /* store the double value */
+    return v.i; /* read the same union storage back through the integer member */
+}
+
+#endif /* AVUTIL_INTFLOAT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intreadwrite.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intreadwrite.h
new file mode 100644
index 0000000..d54d4b9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/intreadwrite.h
@@ -0,0 +1,634 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_INTREADWRITE_H
+#define AVUTIL_INTREADWRITE_H
+
+#include <stdint.h>
+#include "libavutil/avconfig.h"
+#include "attributes.h"
+#include "bswap.h"
+
+typedef union {
+    uint64_t u64;
+    uint32_t u32[2];
+    uint16_t u16[4];
+    uint8_t  u8 [8];
+    double   f64;
+    float    f32[2];
+} av_alias av_alias64;
+
+typedef union {
+    uint32_t u32;
+    uint16_t u16[2];
+    uint8_t  u8 [4];
+    float    f32;
+} av_alias av_alias32;
+
+typedef union {
+    uint16_t u16;
+    uint8_t  u8 [2];
+} av_alias av_alias16;
+
+/*
+ * Arch-specific headers can provide any combination of
+ * AV_[RW][BLN](16|24|32|48|64) and AV_(COPY|SWAP|ZERO)(64|128) macros.
+ * Preprocessor symbols must be defined, even if these are implemented
+ * as inline functions.
+ *
+ * R/W means read/write, B/L/N means big/little/native endianness.
+ * The following macros require aligned access, compared to their
+ * unaligned variants: AV_(COPY|SWAP|ZERO)(64|128), AV_[RW]N[8-64]A.
+ * Incorrect usage may range from abysmal performance to crash
+ * depending on the platform.
+ *
+ * The unaligned variants are AV_[RW][BLN][8-64] and AV_COPY*U.
+ */
+
+#ifdef HAVE_AV_CONFIG_H
+
+#include "config.h"
+
+#if   ARCH_ARM
+#   include "arm/intreadwrite.h"
+#elif ARCH_AVR32
+#   include "avr32/intreadwrite.h"
+#elif ARCH_MIPS
+#   include "mips/intreadwrite.h"
+#elif ARCH_PPC
+#   include "ppc/intreadwrite.h"
+#elif ARCH_TOMI
+#   include "tomi/intreadwrite.h"
+#elif ARCH_X86
+#   include "x86/intreadwrite.h"
+#endif
+
+#endif /* HAVE_AV_CONFIG_H */
+
+/*
+ * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers.
+ */
+
+#if AV_HAVE_BIGENDIAN
+
+#   if    defined(AV_RN16) && !defined(AV_RB16)
+#       define AV_RB16(p) AV_RN16(p)
+#   elif !defined(AV_RN16) &&  defined(AV_RB16)
+#       define AV_RN16(p) AV_RB16(p)
+#   endif
+
+#   if    defined(AV_WN16) && !defined(AV_WB16)
+#       define AV_WB16(p, v) AV_WN16(p, v)
+#   elif !defined(AV_WN16) &&  defined(AV_WB16)
+#       define AV_WN16(p, v) AV_WB16(p, v)
+#   endif
+
+#   if    defined(AV_RN24) && !defined(AV_RB24)
+#       define AV_RB24(p) AV_RN24(p)
+#   elif !defined(AV_RN24) &&  defined(AV_RB24)
+#       define AV_RN24(p) AV_RB24(p)
+#   endif
+
+#   if    defined(AV_WN24) && !defined(AV_WB24)
+#       define AV_WB24(p, v) AV_WN24(p, v)
+#   elif !defined(AV_WN24) &&  defined(AV_WB24)
+#       define AV_WN24(p, v) AV_WB24(p, v)
+#   endif
+
+#   if    defined(AV_RN32) && !defined(AV_RB32)
+#       define AV_RB32(p) AV_RN32(p)
+#   elif !defined(AV_RN32) &&  defined(AV_RB32)
+#       define AV_RN32(p) AV_RB32(p)
+#   endif
+
+#   if    defined(AV_WN32) && !defined(AV_WB32)
+#       define AV_WB32(p, v) AV_WN32(p, v)
+#   elif !defined(AV_WN32) &&  defined(AV_WB32)
+#       define AV_WN32(p, v) AV_WB32(p, v)
+#   endif
+
+#   if    defined(AV_RN48) && !defined(AV_RB48)
+#       define AV_RB48(p) AV_RN48(p)
+#   elif !defined(AV_RN48) &&  defined(AV_RB48)
+#       define AV_RN48(p) AV_RB48(p)
+#   endif
+
+#   if    defined(AV_WN48) && !defined(AV_WB48)
+#       define AV_WB48(p, v) AV_WN48(p, v)
+#   elif !defined(AV_WN48) &&  defined(AV_WB48)
+#       define AV_WN48(p, v) AV_WB48(p, v)
+#   endif
+
+#   if    defined(AV_RN64) && !defined(AV_RB64)
+#       define AV_RB64(p) AV_RN64(p)
+#   elif !defined(AV_RN64) &&  defined(AV_RB64)
+#       define AV_RN64(p) AV_RB64(p)
+#   endif
+
+#   if    defined(AV_WN64) && !defined(AV_WB64)
+#       define AV_WB64(p, v) AV_WN64(p, v)
+#   elif !defined(AV_WN64) &&  defined(AV_WB64)
+#       define AV_WN64(p, v) AV_WB64(p, v)
+#   endif
+
+#else /* AV_HAVE_BIGENDIAN */
+
+#   if    defined(AV_RN16) && !defined(AV_RL16)
+#       define AV_RL16(p) AV_RN16(p)
+#   elif !defined(AV_RN16) &&  defined(AV_RL16)
+#       define AV_RN16(p) AV_RL16(p)
+#   endif
+
+#   if    defined(AV_WN16) && !defined(AV_WL16)
+#       define AV_WL16(p, v) AV_WN16(p, v)
+#   elif !defined(AV_WN16) &&  defined(AV_WL16)
+#       define AV_WN16(p, v) AV_WL16(p, v)
+#   endif
+
+#   if    defined(AV_RN24) && !defined(AV_RL24)
+#       define AV_RL24(p) AV_RN24(p)
+#   elif !defined(AV_RN24) &&  defined(AV_RL24)
+#       define AV_RN24(p) AV_RL24(p)
+#   endif
+
+#   if    defined(AV_WN24) && !defined(AV_WL24)
+#       define AV_WL24(p, v) AV_WN24(p, v)
+#   elif !defined(AV_WN24) &&  defined(AV_WL24)
+#       define AV_WN24(p, v) AV_WL24(p, v)
+#   endif
+
+#   if    defined(AV_RN32) && !defined(AV_RL32)
+#       define AV_RL32(p) AV_RN32(p)
+#   elif !defined(AV_RN32) &&  defined(AV_RL32)
+#       define AV_RN32(p) AV_RL32(p)
+#   endif
+
+#   if    defined(AV_WN32) && !defined(AV_WL32)
+#       define AV_WL32(p, v) AV_WN32(p, v)
+#   elif !defined(AV_WN32) &&  defined(AV_WL32)
+#       define AV_WN32(p, v) AV_WL32(p, v)
+#   endif
+
+#   if    defined(AV_RN48) && !defined(AV_RL48)
+#       define AV_RL48(p) AV_RN48(p)
+#   elif !defined(AV_RN48) &&  defined(AV_RL48)
+#       define AV_RN48(p) AV_RL48(p)
+#   endif
+
+#   if    defined(AV_WN48) && !defined(AV_WL48)
+#       define AV_WL48(p, v) AV_WN48(p, v)
+#   elif !defined(AV_WN48) &&  defined(AV_WL48)
+#       define AV_WN48(p, v) AV_WL48(p, v)
+#   endif
+
+#   if    defined(AV_RN64) && !defined(AV_RL64)
+#       define AV_RL64(p) AV_RN64(p)
+#   elif !defined(AV_RN64) &&  defined(AV_RL64)
+#       define AV_RN64(p) AV_RL64(p)
+#   endif
+
+#   if    defined(AV_WN64) && !defined(AV_WL64)
+#       define AV_WL64(p, v) AV_WN64(p, v)
+#   elif !defined(AV_WN64) &&  defined(AV_WL64)
+#       define AV_WN64(p, v) AV_WL64(p, v)
+#   endif
+
+#endif /* !AV_HAVE_BIGENDIAN */
+
+/*
+ * Define AV_[RW]N helper macros to simplify definitions not provided
+ * by per-arch headers.
+ */
+
+#if defined(__GNUC__) && !defined(__TI_COMPILER_VERSION__)
+
+union unaligned_64 { uint64_t l; } __attribute__((packed)) av_alias;
+union unaligned_32 { uint32_t l; } __attribute__((packed)) av_alias;
+union unaligned_16 { uint16_t l; } __attribute__((packed)) av_alias;
+
+#   define AV_RN(s, p) (((const union unaligned_##s *) (p))->l)
+#   define AV_WN(s, p, v) ((((union unaligned_##s *) (p))->l) = (v))
+
+#elif defined(__DECC)
+
+#   define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
+#   define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
+
+#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_X64)) && AV_HAVE_FAST_UNALIGNED
+
+#   define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
+#   define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
+
+#elif AV_HAVE_FAST_UNALIGNED
+
+#   define AV_RN(s, p) (((const av_alias##s*)(p))->u##s) /* AV_HAVE_FAST_UNALIGNED branch: plain access via the u##s member of av_alias##s */
+#   define AV_WN(s, p, v) (((av_alias##s*)(p))->u##s = (v)) /* store v via the same union member */
+
+#else
+
+#ifndef AV_RB16 /* fallback: build a 16-bit big-endian value from x[0] (high byte) and x[1] (low byte) */
+#   define AV_RB16(x)                           \
+    ((((const uint8_t*)(x))[0] << 8) |          \
+      ((const uint8_t*)(x))[1])
+#endif /* AV_RB16 */
+#ifndef AV_WB16 /* fallback: store val big-endian at p[0..1]; val is copied into d once, p is expanded per byte */
+#   define AV_WB16(p, val) do {                 \
+        uint16_t d = (val);                     \
+        ((uint8_t*)(p))[1] = (d);               \
+        ((uint8_t*)(p))[0] = (d)>>8;            \
+    } while(0)
+#endif /* AV_WB16 */
+
+#ifndef AV_RL16 /* fallback: build a 16-bit little-endian value from x[0] (low byte) and x[1] (high byte) */
+#   define AV_RL16(x)                           \
+    ((((const uint8_t*)(x))[1] << 8) |          \
+      ((const uint8_t*)(x))[0])
+#endif /* AV_RL16 */
+#ifndef AV_WL16 /* fallback: store val little-endian at p[0..1]; val is copied into d once, p is expanded per byte */
+#   define AV_WL16(p, val) do {                 \
+        uint16_t d = (val);                     \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+    } while(0)
+#endif /* AV_WL16 */
+
+#ifndef AV_RB32 /* fallback: build a 32-bit big-endian value from x[0..3]; the top byte is widened to uint32_t before << 24 */
+#   define AV_RB32(x)                                \
+    (((uint32_t)((const uint8_t*)(x))[0] << 24) |    \
+               (((const uint8_t*)(x))[1] << 16) |    \
+               (((const uint8_t*)(x))[2] <<  8) |    \
+                ((const uint8_t*)(x))[3])
+#endif /* AV_RB32 */
+#ifndef AV_WB32 /* fallback: store val big-endian at p[0..3]; val is copied into d once, p is expanded per byte */
+#   define AV_WB32(p, val) do {                 \
+        uint32_t d = (val);                     \
+        ((uint8_t*)(p))[3] = (d);               \
+        ((uint8_t*)(p))[2] = (d)>>8;            \
+        ((uint8_t*)(p))[1] = (d)>>16;           \
+        ((uint8_t*)(p))[0] = (d)>>24;           \
+    } while(0)
+#endif /* AV_WB32 */
+
+#ifndef AV_RL32 /* fallback: build a 32-bit little-endian value from x[0..3]; the top byte is widened to uint32_t before << 24 */
+#   define AV_RL32(x)                                \
+    (((uint32_t)((const uint8_t*)(x))[3] << 24) |    \
+               (((const uint8_t*)(x))[2] << 16) |    \
+               (((const uint8_t*)(x))[1] <<  8) |    \
+                ((const uint8_t*)(x))[0])
+#endif /* AV_RL32 */
+#ifndef AV_WL32 /* fallback: store val little-endian at p[0..3]; val is copied into d once, p is expanded per byte */
+#   define AV_WL32(p, val) do {                 \
+        uint32_t d = (val);                     \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+        ((uint8_t*)(p))[3] = (d)>>24;           \
+    } while(0)
+#endif /* AV_WL32 */
+
+#ifndef AV_RB64 /* fallback: build a 64-bit big-endian value from x[0..7]; every byte is widened to uint64_t before shifting */
+#   define AV_RB64(x)                                   \
+    (((uint64_t)((const uint8_t*)(x))[0] << 56) |       \
+     ((uint64_t)((const uint8_t*)(x))[1] << 48) |       \
+     ((uint64_t)((const uint8_t*)(x))[2] << 40) |       \
+     ((uint64_t)((const uint8_t*)(x))[3] << 32) |       \
+     ((uint64_t)((const uint8_t*)(x))[4] << 24) |       \
+     ((uint64_t)((const uint8_t*)(x))[5] << 16) |       \
+     ((uint64_t)((const uint8_t*)(x))[6] <<  8) |       \
+      (uint64_t)((const uint8_t*)(x))[7])
+#endif /* AV_RB64 */
+#ifndef AV_WB64 /* fallback: store val big-endian at p[0..7]; val is copied into d once, p is expanded per byte */
+#   define AV_WB64(p, val) do {                 \
+        uint64_t d = (val);                     \
+        ((uint8_t*)(p))[7] = (d);               \
+        ((uint8_t*)(p))[6] = (d)>>8;            \
+        ((uint8_t*)(p))[5] = (d)>>16;           \
+        ((uint8_t*)(p))[4] = (d)>>24;           \
+        ((uint8_t*)(p))[3] = (d)>>32;           \
+        ((uint8_t*)(p))[2] = (d)>>40;           \
+        ((uint8_t*)(p))[1] = (d)>>48;           \
+        ((uint8_t*)(p))[0] = (d)>>56;           \
+    } while(0)
+#endif /* AV_WB64 */
+
+#ifndef AV_RL64 /* fallback: build a 64-bit little-endian value from x[0..7]; every byte is widened to uint64_t before shifting */
+#   define AV_RL64(x)                                   \
+    (((uint64_t)((const uint8_t*)(x))[7] << 56) |       \
+     ((uint64_t)((const uint8_t*)(x))[6] << 48) |       \
+     ((uint64_t)((const uint8_t*)(x))[5] << 40) |       \
+     ((uint64_t)((const uint8_t*)(x))[4] << 32) |       \
+     ((uint64_t)((const uint8_t*)(x))[3] << 24) |       \
+     ((uint64_t)((const uint8_t*)(x))[2] << 16) |       \
+     ((uint64_t)((const uint8_t*)(x))[1] <<  8) |       \
+      (uint64_t)((const uint8_t*)(x))[0])
+#endif /* AV_RL64 */
+#ifndef AV_WL64 /* fallback: store val little-endian at p[0..7]; val is copied into d once, p is expanded per byte */
+#   define AV_WL64(p, val) do {                 \
+        uint64_t d = (val);                     \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+        ((uint8_t*)(p))[3] = (d)>>24;           \
+        ((uint8_t*)(p))[4] = (d)>>32;           \
+        ((uint8_t*)(p))[5] = (d)>>40;           \
+        ((uint8_t*)(p))[6] = (d)>>48;           \
+        ((uint8_t*)(p))[7] = (d)>>56;           \
+    } while(0)
+#endif /* AV_WL64 */
+
+#if AV_HAVE_BIGENDIAN /* no unaligned access helper: native-order access maps to the byte-wise accessor of the host byte order */
+#   define AV_RN(s, p)    AV_RB##s(p)
+#   define AV_WN(s, p, v) AV_WB##s(p, v)
+#else /* little-endian host */
+#   define AV_RN(s, p)    AV_RL##s(p)
+#   define AV_WN(s, p, v) AV_WL##s(p, v)
+#endif /* AV_HAVE_BIGENDIAN */
+
+#endif /* HAVE_FAST_UNALIGNED */
+
+#ifndef AV_RN16 /* native-order accessors not already defined fall back to the generic AV_RN/AV_WN helper */
+#   define AV_RN16(p) AV_RN(16, p)
+#endif /* AV_RN16 */
+
+#ifndef AV_RN32
+#   define AV_RN32(p) AV_RN(32, p)
+#endif /* AV_RN32 */
+
+#ifndef AV_RN64
+#   define AV_RN64(p) AV_RN(64, p)
+#endif /* AV_RN64 */
+
+#ifndef AV_WN16
+#   define AV_WN16(p, v) AV_WN(16, p, v)
+#endif /* AV_WN16 */
+
+#ifndef AV_WN32
+#   define AV_WN32(p, v) AV_WN(32, p, v)
+#endif /* AV_WN32 */
+
+#ifndef AV_WN64
+#   define AV_WN64(p, v) AV_WN(64, p, v)
+#endif /* AV_WN64 */
+
+#if AV_HAVE_BIGENDIAN /* big-endian host: BE access is the native access, LE access byte-swaps with av_bswap##s */
+#   define AV_RB(s, p)    AV_RN##s(p)
+#   define AV_WB(s, p, v) AV_WN##s(p, v)
+#   define AV_RL(s, p)    av_bswap##s(AV_RN##s(p))
+#   define AV_WL(s, p, v) AV_WN##s(p, av_bswap##s(v))
+#else /* little-endian host: LE access is the native access, BE access byte-swaps */
+#   define AV_RB(s, p)    av_bswap##s(AV_RN##s(p))
+#   define AV_WB(s, p, v) AV_WN##s(p, av_bswap##s(v))
+#   define AV_RL(s, p)    AV_RN##s(p)
+#   define AV_WL(s, p, v) AV_WN##s(p, v)
+#endif /* AV_HAVE_BIGENDIAN */
+
+#define AV_RB8(x)     (((const uint8_t*)(x))[0]) /* read the single byte at x */
+#define AV_WB8(p, d)  do { ((uint8_t*)(p))[0] = (d); } while(0) /* store d into the single byte at p */
+/* The little-endian 8-bit forms simply forward to the big-endian ones. */
+#define AV_RL8(x)     AV_RB8(x)
+#define AV_WL8(p, d)  AV_WB8(p, d)
+
+#ifndef AV_RB16 /* explicit-endian accessors not defined earlier fall back to the generic AV_RB/AV_WB/AV_RL/AV_WL helpers */
+#   define AV_RB16(p)    AV_RB(16, p)
+#endif /* AV_RB16 */
+#ifndef AV_WB16
+#   define AV_WB16(p, v) AV_WB(16, p, v)
+#endif /* AV_WB16 */
+
+#ifndef AV_RL16
+#   define AV_RL16(p)    AV_RL(16, p)
+#endif /* AV_RL16 */
+#ifndef AV_WL16
+#   define AV_WL16(p, v) AV_WL(16, p, v)
+#endif /* AV_WL16 */
+
+#ifndef AV_RB32
+#   define AV_RB32(p)    AV_RB(32, p)
+#endif /* AV_RB32 */
+#ifndef AV_WB32
+#   define AV_WB32(p, v) AV_WB(32, p, v)
+#endif /* AV_WB32 */
+
+#ifndef AV_RL32
+#   define AV_RL32(p)    AV_RL(32, p)
+#endif /* AV_RL32 */
+#ifndef AV_WL32
+#   define AV_WL32(p, v) AV_WL(32, p, v)
+#endif /* AV_WL32 */
+
+#ifndef AV_RB64
+#   define AV_RB64(p)    AV_RB(64, p)
+#endif /* AV_RB64 */
+#ifndef AV_WB64
+#   define AV_WB64(p, v) AV_WB(64, p, v)
+#endif /* AV_WB64 */
+
+#ifndef AV_RL64
+#   define AV_RL64(p)    AV_RL(64, p)
+#endif /* AV_RL64 */
+#ifndef AV_WL64
+#   define AV_WL64(p, v) AV_WL(64, p, v)
+#endif /* AV_WL64 */
+
+#ifndef AV_RB24 /* build a 24-bit big-endian value from x[0] (high byte), x[1] and x[2] (low byte) */
+#   define AV_RB24(x)                           \
+    ((((const uint8_t*)(x))[0] << 16) |         \
+     (((const uint8_t*)(x))[1] <<  8) |         \
+      ((const uint8_t*)(x))[2])
+#endif /* AV_RB24 */
+#ifndef AV_WB24 /* store the low 24 bits of darg big-endian at p[0..2]; darg is evaluated exactly once, p is expanded per byte */
+#   define AV_WB24(p, darg) do { uint32_t d = (darg); \
+        ((uint8_t*)(p))[2] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[0] = (d)>>16;           \
+    } while(0)
+#endif /* AV_WB24 */
+
+#ifndef AV_RL24 /* build a 24-bit little-endian value from x[0] (low byte), x[1] and x[2] (high byte) */
+#   define AV_RL24(x)                           \
+    ((((const uint8_t*)(x))[2] << 16) |         \
+     (((const uint8_t*)(x))[1] <<  8) |         \
+      ((const uint8_t*)(x))[0])
+#endif /* AV_RL24 */
+#ifndef AV_WL24 /* store the low 24 bits of darg little-endian at p[0..2]; darg is evaluated exactly once, p is expanded per byte */
+#   define AV_WL24(p, darg) do { uint32_t d = (darg); \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+    } while(0)
+#endif /* AV_WL24 */
+
+#ifndef AV_RB48 /* build a 48-bit big-endian value (returned as uint64_t) from x[0..5] */
+#   define AV_RB48(x)                                     \
+    (((uint64_t)((const uint8_t*)(x))[0] << 40) |         \
+     ((uint64_t)((const uint8_t*)(x))[1] << 32) |         \
+     ((uint64_t)((const uint8_t*)(x))[2] << 24) |         \
+     ((uint64_t)((const uint8_t*)(x))[3] << 16) |         \
+     ((uint64_t)((const uint8_t*)(x))[4] <<  8) |         \
+      (uint64_t)((const uint8_t*)(x))[5])
+#endif /* AV_RB48 */
+#ifndef AV_WB48 /* store the low 48 bits of darg big-endian at p[0..5]; darg is copied into d once */
+#   define AV_WB48(p, darg) do {                \
+        uint64_t d = (darg);                    \
+        ((uint8_t*)(p))[5] = (d);               \
+        ((uint8_t*)(p))[4] = (d)>>8;            \
+        ((uint8_t*)(p))[3] = (d)>>16;           \
+        ((uint8_t*)(p))[2] = (d)>>24;           \
+        ((uint8_t*)(p))[1] = (d)>>32;           \
+        ((uint8_t*)(p))[0] = (d)>>40;           \
+    } while(0)
+#endif /* AV_WB48 */
+
+#ifndef AV_RL48 /* build a 48-bit little-endian value (returned as uint64_t) from x[0..5] */
+#   define AV_RL48(x)                                     \
+    (((uint64_t)((const uint8_t*)(x))[5] << 40) |         \
+     ((uint64_t)((const uint8_t*)(x))[4] << 32) |         \
+     ((uint64_t)((const uint8_t*)(x))[3] << 24) |         \
+     ((uint64_t)((const uint8_t*)(x))[2] << 16) |         \
+     ((uint64_t)((const uint8_t*)(x))[1] <<  8) |         \
+      (uint64_t)((const uint8_t*)(x))[0])
+#endif /* AV_RL48 */
+#ifndef AV_WL48 /* store the low 48 bits of darg little-endian at p[0..5]; darg is copied into d once */
+#   define AV_WL48(p, darg) do {                \
+        uint64_t d = (darg);                    \
+        ((uint8_t*)(p))[0] = (d);               \
+        ((uint8_t*)(p))[1] = (d)>>8;            \
+        ((uint8_t*)(p))[2] = (d)>>16;           \
+        ((uint8_t*)(p))[3] = (d)>>24;           \
+        ((uint8_t*)(p))[4] = (d)>>32;           \
+        ((uint8_t*)(p))[5] = (d)>>40;           \
+    } while(0)
+#endif /* AV_WL48 */
+
+/*
+ * The AV_[RW]NA macros access naturally aligned data
+ * in a type-safe way.
+ */
+
+#define AV_RNA(s, p)    (((const av_alias##s*)(p))->u##s) /* read s bits at p via the u##s member of av_alias##s */
+#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v)) /* store v at p via the same union member */
+
+#ifndef AV_RN16A /* sized wrappers; each can be pre-defined elsewhere to override the generic form */
+#   define AV_RN16A(p) AV_RNA(16, p)
+#endif /* AV_RN16A */
+
+#ifndef AV_RN32A
+#   define AV_RN32A(p) AV_RNA(32, p)
+#endif /* AV_RN32A */
+
+#ifndef AV_RN64A
+#   define AV_RN64A(p) AV_RNA(64, p)
+#endif /* AV_RN64A */
+
+#ifndef AV_WN16A
+#   define AV_WN16A(p, v) AV_WNA(16, p, v)
+#endif /* AV_WN16A */
+
+#ifndef AV_WN32A
+#   define AV_WN32A(p, v) AV_WNA(32, p, v)
+#endif /* AV_WN32A */
+
+#ifndef AV_WN64A
+#   define AV_WN64A(p, v) AV_WNA(64, p, v)
+#endif /* AV_WN64A */
+
+/*
+ * The AV_COPYxxU macros are suitable for copying data to/from unaligned
+ * memory locations: AV_COPYU(n, d, s) reads n bits at s with AV_RN##n and
+ * stores them at d with AV_WN##n. The expansion carries no trailing
+ * semicolon, so it is safe as the body of an if/else; callers supply the ';'. */
+#define AV_COPYU(n, d, s) AV_WN##n(d, AV_RN##n(s))
+
+#ifndef AV_COPY16U /* sized wrappers around AV_COPYU; each can be pre-defined elsewhere to override it */
+#   define AV_COPY16U(d, s) AV_COPYU(16, d, s)
+#endif /* AV_COPY16U */
+
+#ifndef AV_COPY32U
+#   define AV_COPY32U(d, s) AV_COPYU(32, d, s)
+#endif /* AV_COPY32U */
+
+#ifndef AV_COPY64U
+#   define AV_COPY64U(d, s) AV_COPYU(64, d, s)
+#endif /* AV_COPY64U */
+
+#ifndef AV_COPY128U /* 128 bits are copied as two 64-bit halves at byte offsets 0 and 8 */
+#   define AV_COPY128U(d, s)                                    \
+    do {                                                        \
+        AV_COPY64U(d, s);                                       \
+        AV_COPY64U((char *)(d) + 8, (const char *)(s) + 8);     \
+    } while(0)
+#endif /* AV_COPY128U */
+
+/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
+ * naturally aligned. They may be implemented using MMX,
+ * so emms_c() must be called before using any float code
+ * afterwards.
+ */
+
+#define AV_COPY(n, d, s) /* copy n bits from s to d through the u##n member of av_alias##n */ \
+    (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
+
+#ifndef AV_COPY16 /* sized wrappers; each can be pre-defined elsewhere to override the generic form */
+#   define AV_COPY16(d, s) AV_COPY(16, d, s)
+#endif /* AV_COPY16 */
+
+#ifndef AV_COPY32
+#   define AV_COPY32(d, s) AV_COPY(32, d, s)
+#endif /* AV_COPY32 */
+
+#ifndef AV_COPY64
+#   define AV_COPY64(d, s) AV_COPY(64, d, s)
+#endif /* AV_COPY64 */
+
+#ifndef AV_COPY128 /* copy 128 bits as two 64-bit halves at byte offsets 0 and 8; the source stays const-qualified */
+#   define AV_COPY128(d, s)                    \
+    do {                                       \
+        AV_COPY64(d, s);                       \
+        AV_COPY64((char*)(d)+8, (const char*)(s)+8); \
+    } while(0)
+#endif /* AV_COPY128 */
+
+#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b)) /* exchange the n-bit objects at a and b; FFSWAP is defined outside this section */
+
+#ifndef AV_SWAP64 /* only a 64-bit wrapper is provided here */
+#   define AV_SWAP64(a, b) AV_SWAP(64, a, b)
+#endif /* AV_SWAP64 */
+
+#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0) /* clear n bits at d through the u##n member of av_alias##n */
+
+#ifndef AV_ZERO16 /* sized wrappers; each can be pre-defined elsewhere to override the generic form */
+#   define AV_ZERO16(d) AV_ZERO(16, d)
+#endif /* AV_ZERO16 */
+
+#ifndef AV_ZERO32
+#   define AV_ZERO32(d) AV_ZERO(32, d)
+#endif /* AV_ZERO32 */
+
+#ifndef AV_ZERO64
+#   define AV_ZERO64(d) AV_ZERO(64, d)
+#endif /* AV_ZERO64 */
+
+#ifndef AV_ZERO128 /* 128 bits are cleared as two 64-bit halves at byte offsets 0 and 8 */
+#   define AV_ZERO128(d)         \
+    do {                         \
+        AV_ZERO64(d);            \
+        AV_ZERO64((char*)(d)+8); \
+    } while(0)
+#endif /* AV_ZERO128 */
+
+#endif /* AVUTIL_INTREADWRITE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lfg.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lfg.h
new file mode 100644
index 0000000..03f779a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lfg.h
@@ -0,0 +1,71 @@
+/*
+ * Lagged Fibonacci PRNG
+ * Copyright (c) 2008 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LFG_H
+#define AVUTIL_LFG_H
+
+#include <stdint.h>
+
+typedef struct AVLFG { /* generator state consumed by av_lfg_get() and av_mlfg_get() */
+    unsigned int state[64]; /* history of generated values; always indexed modulo 64 (index & 63) */
+    int index;              /* running position; post-incremented by every av_lfg_get()/av_mlfg_get() call */
+} AVLFG;
+/* Initialize c from an integer seed (implemented outside this header). */
+void av_lfg_init(AVLFG *c, unsigned int seed);
+
+/**
+ * Seed the state of the ALFG using binary data.
+ *
+ * Return value: 0 on success, negative value (AVERROR) on failure.
+ */
+int av_lfg_init_from_data(AVLFG *c, const uint8_t *data, unsigned int length);
+
+/**
+ * Return the next unsigned 32-bit pseudo-random number from the ALFG.
+ *
+ * The new value is the sum of the state entries 24 and 55 steps back.
+ * A simple LCG such as state = state*1664525+1013904223 may be good enough
+ * and faster for your specific use case. */
+static inline unsigned int av_lfg_get(AVLFG *c){
+    const unsigned int next = c->state[(c->index-24) & 63] + c->state[(c->index-55) & 63];
+    return c->state[c->index++ & 63] = next;
+}
+
+/**
+ * Return the next unsigned 32-bit pseudo-random number from the MLFG.
+ *
+ * Consider av_lfg_get() above as well; it is faster.
+ */
+static inline unsigned int av_mlfg_get(AVLFG *c){
+    const unsigned int lag55 = c->state[(c->index-55) & 63];
+    const unsigned int lag24 = c->state[(c->index-24) & 63];
+    return c->state[c->index++ & 63] = 2*lag55*lag24 + lag55 + lag24;
+}
+
+/**
+ * Get the next two numbers generated by a Box-Muller Gaussian
+ * generator using the random numbers issued by lfg.
+ *
+ * @param out array where the two generated numbers are placed
+ */
+void av_bmg_get(AVLFG *lfg, double out[2]);
+
+#endif /* AVUTIL_LFG_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/log.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/log.h
new file mode 100644
index 0000000..f0a5738
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/log.h
@@ -0,0 +1,376 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LOG_H
+#define AVUTIL_LOG_H
+
+#include <stdarg.h>
+#include "avutil.h"
+#include "attributes.h"
+#include "version.h"
+
+typedef enum { /* category of an AVClass; used by AVClass.category and AVClass.get_category */
+    AV_CLASS_CATEGORY_NA = 0,
+    AV_CLASS_CATEGORY_INPUT,
+    AV_CLASS_CATEGORY_OUTPUT,
+    AV_CLASS_CATEGORY_MUXER,
+    AV_CLASS_CATEGORY_DEMUXER,
+    AV_CLASS_CATEGORY_ENCODER,
+    AV_CLASS_CATEGORY_DECODER,
+    AV_CLASS_CATEGORY_FILTER,
+    AV_CLASS_CATEGORY_BITSTREAM_FILTER,
+    AV_CLASS_CATEGORY_SWSCALER,
+    AV_CLASS_CATEGORY_SWRESAMPLER,
+    AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT = 40, /* device categories are numbered from 40 */
+    AV_CLASS_CATEGORY_DEVICE_VIDEO_INPUT,
+    AV_CLASS_CATEGORY_DEVICE_AUDIO_OUTPUT,
+    AV_CLASS_CATEGORY_DEVICE_AUDIO_INPUT,
+    AV_CLASS_CATEGORY_DEVICE_OUTPUT,
+    AV_CLASS_CATEGORY_DEVICE_INPUT,
+    AV_CLASS_CATEGORY_NB  ///< not part of ABI/API
+}AVClassCategory;
+
+#define AV_IS_INPUT_DEVICE(category) /* non-zero for the video, audio and generic INPUT device categories; category may be expanded up to three times */ \
+    (((category) == AV_CLASS_CATEGORY_DEVICE_VIDEO_INPUT) || \
+     ((category) == AV_CLASS_CATEGORY_DEVICE_AUDIO_INPUT) || \
+     ((category) == AV_CLASS_CATEGORY_DEVICE_INPUT))
+
+#define AV_IS_OUTPUT_DEVICE(category) /* non-zero for the video, audio and generic OUTPUT device categories; category may be expanded up to three times */ \
+    (((category) == AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT) || \
+     ((category) == AV_CLASS_CATEGORY_DEVICE_AUDIO_OUTPUT) || \
+     ((category) == AV_CLASS_CATEGORY_DEVICE_OUTPUT))
+
+struct AVOptionRanges;
+
+/**
+ * Describe the class of an AVClass context structure. That is an
+ * arbitrary struct of which the first field is a pointer to an
+ * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.).
+ */
+typedef struct AVClass {
+    /**
+     * The name of the class; usually it is the same name as the
+     * context structure type to which the AVClass is associated.
+     */
+    const char* class_name;
+
+    /**
+     * A pointer to a function which returns the name of a context
+     * instance ctx associated with the class.
+     */
+    const char* (*item_name)(void* ctx);
+
+    /**
+     * a pointer to the first option specified in the class if any or NULL
+     *
+     * @see av_set_default_options()
+     */
+    const struct AVOption *option;
+
+    /**
+     * LIBAVUTIL_VERSION with which this structure was created.
+     * This is used to allow fields to be added without requiring major
+     * version bumps everywhere.
+     */
+
+    int version;
+
+    /**
+     * Offset in the structure where log_level_offset is stored.
+     * 0 means there is no such variable
+     */
+    int log_level_offset_offset;
+
+    /**
+     * Offset in the structure where a pointer to the parent context for
+     * logging is stored. For example a decoder could pass its AVCodecContext
+     * to eval as such a parent context, which an av_log() implementation
+     * could then leverage to display the parent context.
+     * The offset can be 0 (this text originally said NULL, but the field is an int; presumably 0 means no parent - confirm).
+     */
+    int parent_log_context_offset;
+
+    /**
+     * Return next AVOptions-enabled child or NULL
+     */
+    void* (*child_next)(void *obj, void *prev);
+
+    /**
+     * Return an AVClass corresponding to the next potential
+     * AVOptions-enabled child.
+     *
+     * The difference between child_next and this is that
+     * child_next iterates over _already existing_ objects, while
+     * child_class_next iterates over _all possible_ children.
+     */
+    const struct AVClass* (*child_class_next)(const struct AVClass *prev);
+
+    /**
+     * Category used for visualization (like color)
+     * This is only set if the category is equal for all objects using this class.
+     * available since version (51 << 16 | 56 << 8 | 100)
+     */
+    AVClassCategory category;
+
+    /**
+     * Callback to return the category.
+     * available since version (51 << 16 | 59 << 8 | 100)
+     */
+    AVClassCategory (*get_category)(void* ctx);
+
+    /**
+     * Callback to return the supported/allowed ranges.
+     * available since version (52.12)
+     */
+    int (*query_ranges)(struct AVOptionRanges **, void *obj, const char *key, int flags);
+} AVClass;
+
+/**
+ * @addtogroup lavu_log
+ *
+ * @{
+ *
+ * @defgroup lavu_log_constants Logging Constants
+ *
+ * @{
+ */
+
+/**
+ * Print no output.
+ */
+#define AV_LOG_QUIET    -8
+
+/**
+ * Something went really wrong and we will crash now.
+ */
+#define AV_LOG_PANIC     0
+
+/**
+ * Something went wrong and recovery is not possible.
+ * For example, no header was found for a format which depends
+ * on headers or an illegal combination of parameters is used.
+ */
+#define AV_LOG_FATAL     8
+
+/**
+ * Something went wrong and cannot losslessly be recovered.
+ * However, not all future data is affected.
+ */
+#define AV_LOG_ERROR    16
+
+/**
+ * Something somehow does not look correct. This may or may not
+ * lead to problems. An example would be the use of '-vstrict -2'.
+ */
+#define AV_LOG_WARNING  24
+
+/**
+ * Standard information.
+ */
+#define AV_LOG_INFO     32
+
+/**
+ * Detailed information.
+ */
+#define AV_LOG_VERBOSE  40
+
+/**
+ * Stuff which is only useful for libav* developers.
+ */
+#define AV_LOG_DEBUG    48
+
+/**
+ * Extremely verbose debugging, useful for libav* development.
+ */
+#define AV_LOG_TRACE    56
+
+#define AV_LOG_MAX_OFFSET (AV_LOG_TRACE - AV_LOG_QUIET) /* distance between the two extreme levels: 56 - (-8) = 64 */
+
+/**
+ * @}
+ */
+
+/**
+ * Sets additional colors for extended debugging sessions.
+ * @code
+   av_log(ctx, AV_LOG_DEBUG|AV_LOG_C(134), "Message in purple\n");
+   @endcode
+ * Requires 256-color terminal support. Use outside of debugging is not
+ * recommended.
+ */
+#define AV_LOG_C(x) ((x) << 8)
+
+/**
+ * Send the specified message to the log if the level is less than or equal
+ * to the current av_log_level. By default, all logging messages are sent to
+ * stderr. This behavior can be altered by setting a different logging callback
+ * function.
+ * @see av_log_set_callback
+ *
+ * @param avcl A pointer to an arbitrary struct of which the first field is a
+ *        pointer to an AVClass struct or NULL if general log.
+ * @param level The importance level of the message expressed using a @ref
+ *        lavu_log_constants "Logging Constant".
+ * @param fmt The format string (printf-compatible) that specifies how
+ *        subsequent arguments are converted to output.
+ */
+void av_log(void *avcl, int level, const char *fmt, ...) av_printf_format(3, 4);
+
+
+/**
+ * Send the specified message to the log if the level is less than or equal
+ * to the current av_log_level. By default, all logging messages are sent to
+ * stderr. This behavior can be altered by setting a different logging callback
+ * function.
+ * @see av_log_set_callback
+ *
+ * @param avcl A pointer to an arbitrary struct of which the first field is a
+ *        pointer to an AVClass struct.
+ * @param level The importance level of the message expressed using a @ref
+ *        lavu_log_constants "Logging Constant".
+ * @param fmt The format string (printf-compatible) that specifies how
+ *        subsequent arguments are converted to output.
+ * @param vl The arguments referenced by the format string.
+ */
+void av_vlog(void *avcl, int level, const char *fmt, va_list vl);
+
+/**
+ * Get the current log level
+ *
+ * @see lavu_log_constants
+ *
+ * @return Current log level
+ */
+int av_log_get_level(void);
+
+/**
+ * Set the log level
+ *
+ * @see lavu_log_constants
+ *
+ * @param level Logging level
+ */
+void av_log_set_level(int level);
+
+/**
+ * Set the logging callback
+ *
+ * @note The callback must be thread safe, even if the application does not use
+ *       threads itself as some codecs are multithreaded.
+ *
+ * @see av_log_default_callback
+ *
+ * @param callback A logging function with a compatible signature.
+ */
+void av_log_set_callback(void (*callback)(void*, int, const char*, va_list));
+
+/**
+ * Default logging callback
+ *
+ * It prints the message to stderr, optionally colorizing it.
+ *
+ * @param avcl A pointer to an arbitrary struct of which the first field is a
+ *        pointer to an AVClass struct.
+ * @param level The importance level of the message expressed using a @ref
+ *        lavu_log_constants "Logging Constant".
+ * @param fmt The format string (printf-compatible) that specifies how
+ *        subsequent arguments are converted to output.
+ * @param vl The arguments referenced by the format string.
+ */
+void av_log_default_callback(void *avcl, int level, const char *fmt,
+                             va_list vl);
+
+/**
+ * Return the context name
+ *
+ * @param  ctx The AVClass context
+ *
+ * @return The AVClass class_name
+ */
+const char* av_default_item_name(void* ctx);
+AVClassCategory av_default_get_category(void *ptr); /* NOTE(review): undocumented here; signature matches AVClass.get_category, presumably its default implementation - confirm */
+
+/**
+ * Format a line of log the same way as the default callback.
+ * @param line          buffer to receive the formatted line
+ * @param line_size     size of the buffer
+ * @param print_prefix  used to store whether the prefix must be printed;
+ *                      must point to a persistent integer initially set to 1
+ */
+void av_log_format_line(void *ptr, int level, const char *fmt, va_list vl,
+                        char *line, int line_size, int *print_prefix);
+
+/**
+ * Format a line of log the same way as the default callback.
+ * @param line          buffer to receive the formatted line;
+ *                      may be NULL if line_size is 0
+ * @param line_size     size of the buffer; at most line_size-1 characters will
+ *                      be written to the buffer, plus one null terminator
+ * @param print_prefix  used to store whether the prefix must be printed;
+ *                      must point to a persistent integer initially set to 1
+ * @return Returns a negative value if an error occurred, otherwise returns
+ *         the number of characters that would have been written for a
+ *         sufficiently large buffer, not including the terminating null
+ *         character. If the return value is not less than line_size, it means
+ *         that the log message was truncated to fit the buffer.
+ */
+int av_log_format_line2(void *ptr, int level, const char *fmt, va_list vl,
+                        char *line, int line_size, int *print_prefix);
+
+#if FF_API_DLOG
+/**
+ * av_dlog macros
+ * @deprecated unused
+ * Useful to print debug messages that shouldn't get compiled in normally.
+ */
+
+#ifdef DEBUG
+#    define av_dlog(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__)
+#else
+#    define av_dlog(pctx, ...) do { if (0) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
+#endif
+#endif /* FF_API_DLOG */
+
+/**
+ * Skip repeated messages, this requires the user app to use av_log() instead of
+ * (f)printf as the 2 would otherwise interfere and lead to
+ * "Last message repeated x times" messages below (f)printf messages with some
+ * bad luck.
+ * Also to receive the last, "last repeated" line if any, the user app must
+ * call av_log(NULL, AV_LOG_QUIET, "%s", ""); at the end
+ */
+#define AV_LOG_SKIP_REPEATED 1
+
+/**
+ * Include the log severity in messages originating from codecs.
+ *
+ * Results in messages such as:
+ * [rawvideo @ 0xDEADBEEF] [error] encode did not produce valid pts
+ */
+#define AV_LOG_PRINT_LEVEL 2
+
+void av_log_set_flags(int arg); /* NOTE(review): presumably takes a bitmask of AV_LOG_SKIP_REPEATED / AV_LOG_PRINT_LEVEL - confirm */
+int av_log_get_flags(void);     /* NOTE(review): presumably returns the flags currently in effect - confirm */
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_LOG_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lzo.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lzo.h
new file mode 100644
index 0000000..c034039
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/lzo.h
@@ -0,0 +1,66 @@
+/*
+ * LZO 1x decompression
+ * copyright (c) 2006 Reimar Doeffinger
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_LZO_H
+#define AVUTIL_LZO_H
+
+/**
+ * @defgroup lavu_lzo LZO
+ * @ingroup lavu_crypto
+ *
+ * @{
+ */
+
+#include <stdint.h>
+
+/** @name Error flags returned by av_lzo1x_decode
+ * @{ */
+/// end of the input buffer reached before decoding finished
+#define AV_LZO_INPUT_DEPLETED  1
+/// decoded data did not fit into output buffer
+#define AV_LZO_OUTPUT_FULL     2
+/// a reference to previously decoded data was wrong
+#define AV_LZO_INVALID_BACKPTR 4
+/// a non-specific error in the compressed bitstream
+#define AV_LZO_ERROR           8
+/** @} */
+
+#define AV_LZO_INPUT_PADDING   8 /* additional bytes the input buffer given to av_lzo1x_decode() must provide */
+#define AV_LZO_OUTPUT_PADDING 12 /* additional bytes the output buffer given to av_lzo1x_decode() must provide */
+
+/**
+ * @brief Decodes LZO 1x compressed data.
+ * @param out output buffer
+ * @param outlen size of output buffer; the number of bytes left is returned here
+ * @param in input buffer
+ * @param inlen size of input buffer; the number of bytes left is returned here
+ * @return 0 on success, otherwise a combination of the error flags above
+ *
+ * Make sure all buffers are appropriately padded: "in" must provide
+ * AV_LZO_INPUT_PADDING and "out" must provide AV_LZO_OUTPUT_PADDING additional bytes.
+ */
+int av_lzo1x_decode(void *out, int *outlen, const void *in, int *inlen);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_LZO_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/macros.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/macros.h
new file mode 100644
index 0000000..2007ee5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/macros.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu
+ * Utility Preprocessor macros
+ */
+
+#ifndef AVUTIL_MACROS_H
+#define AVUTIL_MACROS_H
+
+/**
+ * @addtogroup preproc_misc Preprocessor String Macros
+ *
+ * String manipulation macros
+ *
+ * @{
+ */
+
+#define AV_STRINGIFY(s)         AV_TOSTRING(s) /* macro-expands s first, then stringifies the result */
+#define AV_TOSTRING(s) #s /* stringifies s exactly as written, without expanding it */
+/* Same two-level scheme for token pasting: AV_JOIN expands its arguments before AV_GLUE pastes them. */
+#define AV_GLUE(a, b) a ## b
+#define AV_JOIN(a, b) AV_GLUE(a, b)
+
+/**
+ * @}
+ */
+
+#define AV_PRAGMA(s) _Pragma(#s) /* emit the pragma s from inside a macro by stringifying it for the _Pragma operator */
+/* Round x up to the next multiple of a; the mask arithmetic is only correct when a is a power of two. */
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+
+#endif /* AVUTIL_MACROS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mastering_display_metadata.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mastering_display_metadata.h
new file mode 100644
index 0000000..936533f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mastering_display_metadata.h
@@ -0,0 +1,89 @@
+/**
+ * Copyright (c) 2016 Neil Birkbeck <neil.birkbeck@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_MASTERING_DISPLAY_METADATA_H
+#define AVUTIL_MASTERING_DISPLAY_METADATA_H
+
+#include "frame.h"
+#include "rational.h"
+
+
+/**
+ * Mastering display metadata capable of representing the color volume of
+ * the display used to master the content (SMPTE 2086:2014).
+ *
+ * To be used as payload of an AVFrameSideData or AVPacketSideData with the
+ * appropriate type.
+ *
+ * @note The struct should be allocated with av_mastering_display_metadata_alloc()
+ *       and its size is not a part of the public ABI.
+ */
+typedef struct AVMasteringDisplayMetadata {
+    /**
+     * CIE 1931 xy chromaticity coords of color primaries, as [primary][coord] (r, g, b order).
+     */
+    AVRational display_primaries[3][2];
+
+    /**
+     * CIE 1931 xy chromaticity coords of white point.
+     */
+    AVRational white_point[2];
+
+    /**
+     * Min luminance of mastering display (cd/m^2).
+     */
+    AVRational min_luminance;
+
+    /**
+     * Max luminance of mastering display (cd/m^2).
+     */
+    AVRational max_luminance;
+
+    /**
+     * Flag indicating whether display_primaries and white_point are set.
+     */
+    int has_primaries;
+
+    /**
+     * Flag indicating whether min_luminance and max_luminance have been set.
+     */
+    int has_luminance;
+
+} AVMasteringDisplayMetadata;
+
+/**
+ * Allocate an AVMasteringDisplayMetadata structure and set its fields to
+ * default values. The resulting struct can be freed using av_freep().
+ *
+ * @return An AVMasteringDisplayMetadata filled with default values or NULL
+ *         on failure.
+ */
+AVMasteringDisplayMetadata *av_mastering_display_metadata_alloc(void);
+
+/**
+ * Allocate a complete AVMasteringDisplayMetadata and add it to the frame.
+ *
+ * @param frame The frame to which the side data is added.
+ *
+ * @return The AVMasteringDisplayMetadata structure to be filled by caller.
+ */
+AVMasteringDisplayMetadata *av_mastering_display_metadata_create_side_data(AVFrame *frame);
+
+#endif /* AVUTIL_MASTERING_DISPLAY_METADATA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mathematics.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mathematics.h
new file mode 100644
index 0000000..5490180
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mathematics.h
@@ -0,0 +1,242 @@
+/*
+ * copyright (c) 2005-2012 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @addtogroup lavu_math
+ * Mathematical utilities for working with timestamp and time base.
+ */
+
+#ifndef AVUTIL_MATHEMATICS_H
+#define AVUTIL_MATHEMATICS_H
+
+#include <stdint.h>
+#include <math.h>
+#include "attributes.h"
+#include "rational.h"
+#include "intfloat.h"
+
+#ifndef M_E
+#define M_E            2.7182818284590452354   /* e */
+#endif
+#ifndef M_LN2
+#define M_LN2          0.69314718055994530942  /* log_e 2 */
+#endif
+#ifndef M_LN10
+#define M_LN10         2.30258509299404568402  /* log_e 10 */
+#endif
+#ifndef M_LOG2_10
+#define M_LOG2_10      3.32192809488736234787  /* log_2 10 */
+#endif
+#ifndef M_PHI
+#define M_PHI          1.61803398874989484820   /* phi / golden ratio */
+#endif
+#ifndef M_PI
+#define M_PI           3.14159265358979323846  /* pi */
+#endif
+#ifndef M_PI_2
+#define M_PI_2         1.57079632679489661923  /* pi/2 */
+#endif
+#ifndef M_SQRT1_2
+#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */
+#endif
+#ifndef M_SQRT2
+#define M_SQRT2        1.41421356237309504880  /* sqrt(2) */
+#endif
+#ifndef NAN
+#define NAN            av_int2float(0x7fc00000) /* IEEE 754 binary32 quiet-NaN bit pattern; assumes av_int2float() reinterprets the bits - confirm in intfloat.h */
+#endif
+#ifndef INFINITY
+#define INFINITY       av_int2float(0x7f800000) /* IEEE 754 binary32 +infinity bit pattern; same assumption about av_int2float() */
+#endif
+
+/**
+ * @addtogroup lavu_math
+ *
+ * @{
+ */
+
+/**
+ * Rounding methods accepted by av_rescale_rnd() and av_rescale_q_rnd().
+ */
+enum AVRounding {
+    AV_ROUND_ZERO     = 0, ///< Round toward zero.
+    AV_ROUND_INF      = 1, ///< Round away from zero.
+    AV_ROUND_DOWN     = 2, ///< Round toward -infinity.
+    AV_ROUND_UP       = 3, ///< Round toward +infinity.
+    AV_ROUND_NEAR_INF = 5, ///< Round to nearest and halfway cases away from zero.
+    /**
+     * Flag telling rescaling functions to pass `INT64_MIN`/`MAX` through
+     * unchanged, avoiding special cases for #AV_NOPTS_VALUE.
+     *
+     * Unlike other values of the enumeration AVRounding, this value is a
+     * bitmask that must be used in conjunction with another value of the
+     * enumeration through a bitwise OR, in order to set behavior for normal
+     * cases.
+     *
+     * @code{.c}
+     * av_rescale_rnd(3, 1, 2, AV_ROUND_UP | AV_ROUND_PASS_MINMAX);
+     * // Rescaling 3:
+     * //     Calculating 3 * 1 / 2
+     * //     3 / 2 is rounded up to 2
+     * //     => 2
+     *
+     * av_rescale_rnd(AV_NOPTS_VALUE, 1, 2, AV_ROUND_UP | AV_ROUND_PASS_MINMAX);
+     * // Rescaling AV_NOPTS_VALUE:
+     * //     AV_NOPTS_VALUE == INT64_MIN
+     * //     AV_NOPTS_VALUE is passed through
+     * //     => AV_NOPTS_VALUE
+     * @endcode
+     */
+    AV_ROUND_PASS_MINMAX = 8192,
+};
+
+/**
+ * Compute the greatest common divisor of two integer operands.
+ *
+ * @param a,b Operands
+ * @return GCD of a and b up to sign; if a >= 0 and b >= 0, return value is >= 0;
+ * if a == 0 and b == 0, returns 0.
+ */
+int64_t av_const av_gcd(int64_t a, int64_t b);
+
+/**
+ * Rescale a 64-bit integer with rounding to nearest.
+ *
+ * The operation is mathematically equivalent to `a * b / c`, but writing that
+ * directly can overflow.
+ *
+ * This function is equivalent to av_rescale_rnd() with #AV_ROUND_NEAR_INF.
+ *
+ * @see av_rescale_rnd(), av_rescale_q(), av_rescale_q_rnd()
+ */
+int64_t av_rescale(int64_t a, int64_t b, int64_t c) av_const;
+
+/**
+ * Rescale a 64-bit integer with specified rounding.
+ *
+ * The operation is mathematically equivalent to `a * b / c`, but writing that
+ * directly can overflow, and does not support different rounding methods.
+ *
+ * @see av_rescale(), av_rescale_q(), av_rescale_q_rnd()
+ */
+int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding rnd) av_const;
+
+/**
+ * Rescale a 64-bit integer by 2 rational numbers.
+ *
+ * The operation is mathematically equivalent to `a * bq / cq`.
+ *
+ * This function is equivalent to av_rescale_q_rnd() with #AV_ROUND_NEAR_INF.
+ *
+ * @see av_rescale(), av_rescale_rnd(), av_rescale_q_rnd()
+ */
+int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq) av_const;
+
+/**
+ * Rescale a 64-bit integer by 2 rational numbers with specified rounding.
+ *
+ * The operation is mathematically equivalent to `a * bq / cq`.
+ *
+ * @see av_rescale(), av_rescale_rnd(), av_rescale_q()
+ */
+int64_t av_rescale_q_rnd(int64_t a, AVRational bq, AVRational cq,
+                         enum AVRounding rnd) av_const;
+
+/**
+ * Compare two timestamps each in its own time base.
+ *
+ * @return One of the following values:
+ *         - -1 if `ts_a` is before `ts_b`
+ *         - 1 if `ts_a` is after `ts_b`
+ *         - 0 if they represent the same position
+ *
+ * @warning
+ * The result of the function is undefined if one of the timestamps is outside
+ * the `int64_t` range when represented in the other's timebase.
+ */
+int av_compare_ts(int64_t ts_a, AVRational tb_a, int64_t ts_b, AVRational tb_b);
+
+/**
+ * Compare the remainders of two integer operands divided by a common divisor.
+ *
+ * In other words, compare the least significant `log2(mod)` bits of integers
+ * `a` and `b`.
+ *
+ * @code{.c}
+ * av_compare_mod(0x11, 0x02, 0x10) < 0 // since 0x11 % 0x10  (0x1) < 0x02 % 0x10  (0x2)
+ * av_compare_mod(0x11, 0x02, 0x20) > 0 // since 0x11 % 0x20 (0x11) > 0x02 % 0x20 (0x02)
+ * @endcode
+ *
+ * @param a,b Operands
+ * @param mod Divisor; must be a power of 2
+ * @return
+ *         - a negative value if `a % mod < b % mod`
+ *         - a positive value if `a % mod > b % mod`
+ *         - zero             if `a % mod == b % mod`
+ */
+int64_t av_compare_mod(uint64_t a, uint64_t b, uint64_t mod);
+
+/**
+ * Rescale a timestamp while preserving known durations.
+ *
+ * This function is designed to be called per audio packet to scale the input
+ * timestamp to a different time base. Compared to a simple av_rescale_q()
+ * call, this function is robust against possible inconsistent frame durations.
+ *
+ * The `last` parameter is a state variable that must be preserved for all
+ * subsequent calls for the same stream. For the first call, `*last` should be
+ * initialized to #AV_NOPTS_VALUE.
+ *
+ * @param[in]     in_tb    Input time base
+ * @param[in]     in_ts    Input timestamp
+ * @param[in]     fs_tb    Duration time base; typically this is finer-grained
+ *                         (greater) than `in_tb` and `out_tb`
+ * @param[in]     duration Duration till the next call to this function (i.e.
+ *                         duration of the current packet/frame)
+ * @param[in,out] last     Pointer to a timestamp expressed in terms of
+ *                         `fs_tb`, acting as a state variable
+ * @param[in]     out_tb   Output timebase
+ * @return        Timestamp expressed in terms of `out_tb`
+ *
+ * @note In the context of this function, "duration" is in terms of samples, not
+ *       seconds.
+ */
+int64_t av_rescale_delta(AVRational in_tb, int64_t in_ts,  AVRational fs_tb, int duration, int64_t *last, AVRational out_tb);
+
+/**
+ * Add a value to a timestamp.
+ *
+ * This function guarantees that when the same value is added repeatedly,
+ * no accumulation of rounding errors occurs.
+ *
+ * @param[in] ts     Input timestamp
+ * @param[in] ts_tb  Input timestamp time base
+ * @param[in] inc    Value to be added
+ * @param[in] inc_tb Time base of `inc`
+ */
+int64_t av_add_stable(AVRational ts_tb, int64_t ts, AVRational inc_tb, int64_t inc);
+
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_MATHEMATICS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/md5.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/md5.h
new file mode 100644
index 0000000..9571c1f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/md5.h
@@ -0,0 +1,89 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_md5
+ * Public header for MD5 hash function implementation.
+ */
+
+#ifndef AVUTIL_MD5_H
+#define AVUTIL_MD5_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_md5 MD5
+ * @ingroup lavu_hash
+ * MD5 hash function implementation.
+ *
+ * @{
+ */
+
+extern const int av_md5_size;
+
+struct AVMD5;
+
+/**
+ * Allocate an AVMD5 context.
+ */
+struct AVMD5 *av_md5_alloc(void);
+
+/**
+ * Initialize MD5 hashing.
+ *
+ * @param ctx pointer to the function context (of size av_md5_size)
+ */
+void av_md5_init(struct AVMD5 *ctx);
+
+/**
+ * Update hash value.
+ *
+ * @param ctx hash function context
+ * @param src input data to update hash with
+ * @param len input data length
+ */
+void av_md5_update(struct AVMD5 *ctx, const uint8_t *src, int len);
+
+/**
+ * Finish hashing and output digest value.
+ *
+ * @param ctx hash function context
+ * @param dst buffer where output digest value is stored
+ */
+void av_md5_final(struct AVMD5 *ctx, uint8_t *dst);
+
+/**
+ * Hash an array of data.
+ *
+ * @param dst The output buffer to write the digest into
+ * @param src The data to hash
+ * @param len The length of the data, in bytes
+ */
+void av_md5_sum(uint8_t *dst, const uint8_t *src, const int len);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_MD5_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mem.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mem.h
new file mode 100644
index 0000000..527cd03
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/mem.h
@@ -0,0 +1,699 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_mem
+ * Memory handling functions
+ */
+
+#ifndef AVUTIL_MEM_H
+#define AVUTIL_MEM_H
+
+#include <limits.h>
+#include <stdint.h>
+
+#include "attributes.h"
+#include "error.h"
+#include "avutil.h"
+
+/**
+ * @addtogroup lavu_mem
+ * Utilities for manipulating memory.
+ *
+ * FFmpeg has several applications of memory that are not required of a typical
+ * program. For example, the computing-heavy components like video decoding and
+ * encoding can be sped up significantly through the use of aligned memory.
+ *
+ * However, for each of FFmpeg's applications of memory, there might not be a
+ * recognized or standardized API for that specific use. Memory alignment, for
+ * instance, varies wildly depending on operating systems, architectures, and
+ * compilers. Hence, this component of @ref libavutil is created to make
+ * dealing with memory consistently possible on all platforms.
+ *
+ * @{
+ *
+ * @defgroup lavu_mem_macros Alignment Macros
+ * Helper macros for declaring aligned variables.
+ * @{
+ */
+
+/**
+ * @def DECLARE_ALIGNED(n,t,v)
+ * Declare a variable that is aligned in memory.
+ *
+ * @code{.c}
+ * DECLARE_ALIGNED(16, uint16_t, aligned_int) = 42;
+ * DECLARE_ALIGNED(32, uint8_t, aligned_array)[128];
+ *
+ * // The default-alignment equivalent would be
+ * uint16_t aligned_int = 42;
+ * uint8_t aligned_array[128];
+ * @endcode
+ *
+ * @param n Minimum alignment in bytes
+ * @param t Type of the variable (or array element)
+ * @param v Name of the variable
+ */
+
+/**
+ * @def DECLARE_ASM_CONST(n,t,v)
+ * Declare a static constant aligned variable appropriate for use in inline
+ * assembly code.
+ *
+ * @code{.c}
+ * DECLARE_ASM_CONST(16, uint64_t, pw_08) = UINT64_C(0x0008000800080008);
+ * @endcode
+ *
+ * @param n Minimum alignment in bytes
+ * @param t Type of the variable (or array element)
+ * @param v Name of the variable
+ */
+
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1110 || defined(__SUNPRO_C)
+    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
+    #define DECLARE_ASM_CONST(n,t,v)    const t __attribute__ ((aligned (n))) v /* note: no `static` in this branch, unlike the others */
+#elif defined(__TI_COMPILER_VERSION__) /* additionally emits a DATA_ALIGN pragma through AV_PRAGMA */
+    #define DECLARE_ALIGNED(n,t,v)                      \
+        AV_PRAGMA(DATA_ALIGN(v,n))                      \
+        t __attribute__((aligned(n))) v
+    #define DECLARE_ASM_CONST(n,t,v)                    \
+        AV_PRAGMA(DATA_ALIGN(v,n))                      \
+        static const t __attribute__((aligned(n))) v
+#elif defined(__DJGPP__) /* presumably caps the alignment at 16; FFMIN is defined elsewhere - confirm */
+    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (FFMIN(n, 16)))) v
+    #define DECLARE_ASM_CONST(n,t,v)    static const t av_used __attribute__ ((aligned (FFMIN(n, 16)))) v
+#elif defined(__GNUC__) || defined(__clang__)
+    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
+    #define DECLARE_ASM_CONST(n,t,v)    static const t av_used __attribute__ ((aligned (n))) v
+#elif defined(_MSC_VER)
+    #define DECLARE_ALIGNED(n,t,v)      __declspec(align(n)) t v
+    #define DECLARE_ASM_CONST(n,t,v)    __declspec(align(n)) static const t v
+#else /* fallback: the declaration carries no alignment request, so n is ignored */
+    #define DECLARE_ALIGNED(n,t,v)      t v
+    #define DECLARE_ASM_CONST(n,t,v)    static const t v
+#endif /* DECLARE_ALIGNED, DECLARE_ASM_CONST */
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_mem_attrs Function Attributes
+ * Function attributes applicable to memory handling functions.
+ *
+ * These function attributes can help compilers emit more useful warnings, or
+ * generate better code.
+ * @{
+ */
+
+/**
+ * @def av_malloc_attrib
+ * Function attribute denoting a malloc-like function.
+ *
+ * @see <a href="https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-g_t_0040code_007bmalloc_007d-function-attribute-3251">Function attribute `malloc` in GCC's documentation</a>
+ */
+
+#if AV_GCC_VERSION_AT_LEAST(3,1) /* version macro is not defined in this part of the header; presumably from attributes.h - confirm */
+    #define av_malloc_attrib __attribute__((__malloc__))
+#else /* otherwise av_malloc_attrib expands to nothing */
+    #define av_malloc_attrib
+#endif
+
+/**
+ * @def av_alloc_size(...)
+ * Function attribute used on a function that allocates memory, whose size is
+ * given by the specified parameter(s).
+ *
+ * @code{.c}
+ * void *av_malloc(size_t size) av_alloc_size(1);
+ * void *av_calloc(size_t nmemb, size_t size) av_alloc_size(1, 2);
+ * @endcode
+ *
+ * @param ... One or two parameter indexes, separated by a comma
+ *
+ * @see <a href="https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-g_t_0040code_007balloc_005fsize_007d-function-attribute-3220">Function attribute `alloc_size` in GCC's documentation</a>
+ */
+
+#if AV_GCC_VERSION_AT_LEAST(4,3) /* version macro is not defined in this part of the header; presumably from attributes.h - confirm */
+    #define av_alloc_size(...) __attribute__((alloc_size(__VA_ARGS__)))
+#else /* otherwise av_alloc_size(...) expands to nothing */
+    #define av_alloc_size(...)
+#endif
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_mem_funcs Heap Management
+ * Functions responsible for allocating, freeing, and copying memory.
+ *
+ * All memory allocation functions have a built-in upper limit of `INT_MAX`
+ * bytes. This may be changed with av_max_alloc(), although exercise extreme
+ * caution when doing so.
+ *
+ * @{
+ */
+
+/**
+ * Allocate a memory block with alignment suitable for all memory accesses
+ * (including vectors if available on the CPU).
+ *
+ * @param size Size in bytes for the memory block to be allocated
+ * @return Pointer to the allocated block, or `NULL` if the block cannot
+ *         be allocated
+ * @see av_mallocz()
+ */
+void *av_malloc(size_t size) av_malloc_attrib av_alloc_size(1);
+
+/**
+ * Allocate a memory block with alignment suitable for all memory accesses
+ * (including vectors if available on the CPU) and zero all the bytes of the
+ * block.
+ *
+ * @param size Size in bytes for the memory block to be allocated
+ * @return Pointer to the allocated block, or `NULL` if it cannot be allocated
+ * @see av_malloc()
+ */
+void *av_mallocz(size_t size) av_malloc_attrib av_alloc_size(1);
+
+/**
+ * Allocate an array of `nmemb` elements of `size` bytes each via av_malloc().
+ *
+ * The resulting block is `size * nmemb` bytes long.
+ *
+ * @param nmemb Number of elements
+ * @param size  Size of a single element
+ * @return Pointer to the allocated block, or `NULL` if `size` is zero, if
+ *         `nmemb >= INT_MAX / size`, or if av_malloc() returns `NULL`
+ * @see av_malloc()
+ */
+av_alloc_size(1, 2) static inline void *av_malloc_array(size_t nmemb, size_t size)
+{
+    if (size == 0 || INT_MAX / size <= nmemb)
+        return NULL;
+    return av_malloc(size * nmemb);
+}
+
+/**
+ * Allocate a zeroed array of `nmemb` elements of `size` bytes each via
+ * av_mallocz().
+ *
+ * The resulting block is `size * nmemb` bytes long.
+ *
+ * @param nmemb Number of elements
+ * @param size  Size of a single element
+ * @return Pointer to the allocated block, or `NULL` if `size` is zero, if
+ *         `nmemb >= INT_MAX / size`, or if av_mallocz() returns `NULL`
+ * @see av_mallocz()
+ * @see av_malloc_array()
+ */
+av_alloc_size(1, 2) static inline void *av_mallocz_array(size_t nmemb, size_t size)
+{
+    if (size == 0 || INT_MAX / size <= nmemb)
+        return NULL;
+    return av_mallocz(size * nmemb);
+}
+
+/**
+ * Non-inlined equivalent of av_mallocz_array().
+ *
+ * Created for symmetry with the calloc() C function.
+ */
+void *av_calloc(size_t nmemb, size_t size) av_malloc_attrib;
+
+/**
+ * Allocate, reallocate, or free a block of memory.
+ *
+ * If `ptr` is `NULL` and `size` > 0, allocate a new block. If `size` is
+ * zero, free the memory block pointed to by `ptr`. Otherwise, expand or
+ * shrink that block of memory according to `size`.
+ *
+ * @param ptr  Pointer to a memory block already allocated with
+ *             av_realloc() or `NULL`
+ * @param size Size in bytes of the memory block to be allocated or
+ *             reallocated
+ *
+ * @return Pointer to a newly-reallocated block or `NULL` if the block
+ *         cannot be reallocated or the function is used to free the memory block
+ *
+ * @warning Unlike av_malloc(), the returned pointer is not guaranteed to be
+ *          correctly aligned.
+ * @see av_fast_realloc()
+ * @see av_reallocp()
+ */
+void *av_realloc(void *ptr, size_t size) av_alloc_size(2);
+
+/**
+ * Allocate, reallocate, or free a block of memory through a pointer to a
+ * pointer.
+ *
+ * If `*ptr` is `NULL` and `size` > 0, allocate a new block. If `size` is
+ * zero, free the memory block pointed to by `*ptr`. Otherwise, expand or
+ * shrink that block of memory according to `size`.
+ *
+ * @param[in,out] ptr  Pointer to a pointer to a memory block already allocated
+ *                     with av_realloc(), or a pointer to `NULL`. The pointer
+ *                     is updated on success, or freed on failure.
+ * @param[in]     size Size in bytes for the memory block to be allocated or
+ *                     reallocated
+ *
+ * @return Zero on success, an AVERROR error code on failure
+ *
+ * @warning Unlike av_malloc(), the allocated memory is not guaranteed to be
+ *          correctly aligned.
+ */
+av_warn_unused_result
+int av_reallocp(void *ptr, size_t size);
+
+/**
+ * Allocate, reallocate, or free a block of memory.
+ *
+ * This function does the same thing as av_realloc(), except:
+ * - It takes two size arguments and allocates `nelem * elsize` bytes,
+ *   after checking the result of the multiplication for integer overflow.
+ * - It frees the input block in case of failure, thus avoiding the memory
+ *   leak with the classic
+ *   @code{.c}
+ *   buf = realloc(buf, size);
+ *   if (!buf)
+ *       return -1;
+ *   @endcode
+ *   pattern.
+ */
+void *av_realloc_f(void *ptr, size_t nelem, size_t elsize);
+
+/**
+ * Allocate, reallocate, or free an array.
+ *
+ * If `ptr` is `NULL` and `nmemb` > 0, allocate a new block. If
+ * `nmemb` is zero, free the memory block pointed to by `ptr`.
+ *
+ * @param ptr   Pointer to a memory block already allocated with
+ *              av_realloc() or `NULL`
+ * @param nmemb Number of elements in the array
+ * @param size  Size of the single element of the array
+ *
+ * @return Pointer to a newly-reallocated block or NULL if the block
+ *         cannot be reallocated or the function is used to free the memory block
+ *
+ * @warning Unlike av_malloc(), the allocated memory is not guaranteed to be
+ *          correctly aligned.
+ * @see av_reallocp_array()
+ */
+av_alloc_size(2, 3) void *av_realloc_array(void *ptr, size_t nmemb, size_t size);
+
+/**
+ * Allocate, reallocate, or free an array through a pointer to a pointer.
+ *
+ * If `*ptr` is `NULL` and `nmemb` > 0, allocate a new block. If `nmemb` is
+ * zero, free the memory block pointed to by `*ptr`.
+ *
+ * @param[in,out] ptr   Pointer to a pointer to a memory block already
+ *                      allocated with av_realloc(), or a pointer to `NULL`.
+ *                      The pointer is updated on success, or freed on failure.
+ * @param[in]     nmemb Number of elements
+ * @param[in]     size  Size of the single element
+ *
+ * @return Zero on success, an AVERROR error code on failure
+ *
+ * @warning Unlike av_malloc(), the allocated memory is not guaranteed to be
+ *          correctly aligned.
+ */
+av_alloc_size(2, 3) int av_reallocp_array(void *ptr, size_t nmemb, size_t size);
+
+/**
+ * Reallocate the given buffer if it is not large enough, otherwise do nothing.
+ *
+ * If the given buffer is `NULL`, then a new uninitialized buffer is allocated.
+ *
+ * If the given buffer is not large enough, and reallocation fails, `NULL` is
+ * returned and `*size` is set to 0, but the original buffer is not changed or
+ * freed.
+ *
+ * A typical use pattern follows:
+ *
+ * @code{.c}
+ * uint8_t *buf = ...;
+ * uint8_t *new_buf = av_fast_realloc(buf, &current_size, size_needed);
+ * if (!new_buf) {
+ *     // Allocation failed; clean up original buffer
+ *     av_freep(&buf);
+ *     return AVERROR(ENOMEM);
+ * }
+ * @endcode
+ *
+ * @param[in,out] ptr      Already allocated buffer, or `NULL`
+ * @param[in,out] size     Pointer to current size of buffer `ptr`. `*size` is
+ *                         changed to `min_size` in case of success or 0 in
+ *                         case of failure
+ * @param[in]     min_size New size of buffer `ptr`
+ * @return `ptr` if the buffer is large enough, a pointer to newly reallocated
+ *         buffer if the buffer was not large enough, or `NULL` in case of
+ *         error
+ * @see av_realloc()
+ * @see av_fast_malloc()
+ */
+void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size);
+
+/**
+ * Allocate a buffer, reusing the given one if large enough.
+ *
+ * Contrary to av_fast_realloc(), the current buffer contents might not be
+ * preserved and on error the old buffer is freed, thus no special handling to
+ * avoid memleaks is necessary.
+ *
+ * `*ptr` is allowed to be `NULL`, in which case allocation always happens if
+ * `min_size` is greater than 0.
+ *
+ * @code{.c}
+ * uint8_t *buf = ...;
+ * av_fast_malloc(&buf, &current_size, size_needed);
+ * if (!buf) {
+ *     // Allocation failed; buf already freed
+ *     return AVERROR(ENOMEM);
+ * }
+ * @endcode
+ *
+ * @param[in,out] ptr      Pointer to pointer to an already allocated buffer.
+ *                         `*ptr` will be overwritten with pointer to new
+ *                         buffer on success or `NULL` on failure
+ * @param[in,out] size     Pointer to current size of buffer `*ptr`. `*size` is
+ *                         changed to `min_size` in case of success or 0 in
+ *                         case of failure
+ * @param[in]     min_size New size of buffer `*ptr`
+ * @see av_realloc()
+ * @see av_fast_mallocz()
+ */
+void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size);
+
+/**
+ * Allocate and clear a buffer, reusing the given one if large enough.
+ *
+ * Like av_fast_malloc(), but all newly allocated space is initially cleared.
+ * Reused buffer is not cleared.
+ *
+ * `*ptr` is allowed to be `NULL`, in which case allocation always happens if
+ * `min_size` is greater than 0.
+ *
+ * @param[in,out] ptr      Pointer to pointer to an already allocated buffer.
+ *                         `*ptr` will be overwritten with pointer to new
+ *                         buffer on success or `NULL` on failure
+ * @param[in,out] size     Pointer to current size of buffer `*ptr`. `*size` is
+ *                         changed to `min_size` in case of success or 0 in
+ *                         case of failure
+ * @param[in]     min_size New size of buffer `*ptr`
+ * @see av_fast_malloc()
+ */
+void av_fast_mallocz(void *ptr, unsigned int *size, size_t min_size);
+
+/**
+ * Free a memory block which has been allocated with a function of av_malloc()
+ * or av_realloc() family.
+ *
+ * @param ptr Pointer to the memory block which should be freed.
+ *
+ * @note `ptr = NULL` is explicitly allowed.
+ * @note It is recommended that you use av_freep() instead, to prevent leaving
+ *       behind dangling pointers.
+ * @see av_freep()
+ */
+void av_free(void *ptr);
+
+/**
+ * Free a memory block which has been allocated with a function of av_malloc()
+ * or av_realloc() family, and set the pointer pointing to it to `NULL`.
+ *
+ * @code{.c}
+ * uint8_t *buf = av_malloc(16);
+ * av_free(buf);
+ * // buf now contains a dangling pointer to freed memory, and accidental
+ * // dereference of buf will result in a use-after-free, which may be a
+ * // security risk.
+ *
+ * uint8_t *buf = av_malloc(16);
+ * av_freep(&buf);
+ * // buf is now NULL, and accidental dereference will only result in a
+ * // NULL-pointer dereference.
+ * @endcode
+ *
+ * @param ptr Pointer to the pointer to the memory block which should be freed
+ * @note `*ptr = NULL` is safe and leads to no action.
+ * @see av_free()
+ */
+void av_freep(void *ptr);
+
+/**
+ * Duplicate a string.
+ *
+ * @param s String to be duplicated
+ * @return Pointer to a newly-allocated string containing a
+ *         copy of `s` or `NULL` if the string cannot be allocated
+ * @see av_strndup()
+ */
+char *av_strdup(const char *s) av_malloc_attrib;
+
+/**
+ * Duplicate a substring of a string.
+ *
+ * @param s   String to be duplicated
+ * @param len Maximum length of the resulting string (not counting the
+ *            terminating byte)
+ * @return Pointer to a newly-allocated string containing a
+ *         substring of `s` or `NULL` if the string cannot be allocated
+ */
+char *av_strndup(const char *s, size_t len) av_malloc_attrib;
+
+/**
+ * Duplicate a buffer with av_malloc().
+ *
+ * @param p    Buffer to be duplicated
+ * @param size Size in bytes of the buffer copied
+ * @return Pointer to a newly allocated buffer containing a
+ *         copy of `p` or `NULL` if the buffer cannot be allocated
+ */
+void *av_memdup(const void *p, size_t size);
+
+/**
+ * Overlapping memcpy() implementation.
+ *
+ * @param dst  Destination buffer
+ * @param back Number of bytes back to start copying (i.e. the initial size of
+ *             the overlapping window); must be > 0
+ * @param cnt  Number of bytes to copy; must be >= 0
+ *
+ * @note `cnt > back` is valid, this will copy the bytes we just copied,
+ *       thus creating a repeating pattern with a period length of `back`.
+ */
+void av_memcpy_backptr(uint8_t *dst, int back, int cnt);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_mem_dynarray Dynamic Array
+ *
+ * Utilities to make an array grow when needed.
+ *
+ * Sometimes, the programmer would want to have an array that can grow when
+ * needed. The libavutil dynamic array utilities fill that need.
+ *
+ * libavutil supports two systems of appending elements onto a dynamically
+ * allocated array, the first one storing the pointer to the value in the
+ * array, and the second storing the value directly. In both systems, the
+ * caller is responsible for maintaining a variable containing the length of
+ * the array, as well as freeing of the array after use.
+ *
+ * The first system stores pointers to values in a block of dynamically
+ * allocated memory. Since only pointers are stored, the function does not need
+ * to know the size of the type. Both av_dynarray_add() and
+ * av_dynarray_add_nofree() implement this system.
+ *
+ * @code
+ * type **array = NULL; //< an array of pointers to values
+ * int    nb    = 0;    //< a variable to keep track of the length of the array
+ *
+ * type to_be_added  = ...;
+ * type to_be_added2 = ...;
+ *
+ * av_dynarray_add(&array, &nb, &to_be_added);
+ * if (nb == 0)
+ *     return AVERROR(ENOMEM);
+ *
+ * av_dynarray_add(&array, &nb, &to_be_added2);
+ * if (nb == 0)
+ *     return AVERROR(ENOMEM);
+ *
+ * // Now:
+ * //  nb           == 2
+ * // &to_be_added  == array[0]
+ * // &to_be_added2 == array[1]
+ *
+ * av_freep(&array);
+ * @endcode
+ *
+ * The second system stores the value directly in a block of memory. As a
+ * result, the function has to know the size of the type. av_dynarray2_add()
+ * implements this mechanism.
+ *
+ * @code
+ * type *array = NULL; //< an array of values
+ * int   nb    = 0;    //< a variable to keep track of the length of the array
+ *
+ * type to_be_added  = ...;
+ * type to_be_added2 = ...;
+ *
+ * type *addr = av_dynarray2_add((void **)&array, &nb, sizeof(*array), NULL);
+ * if (!addr)
+ *     return AVERROR(ENOMEM);
+ * memcpy(addr, &to_be_added, sizeof(to_be_added));
+ *
+ * // Shortcut of the above.
+ * type *addr = av_dynarray2_add((void **)&array, &nb, sizeof(*array),
+ *                               (const void *)&to_be_added2);
+ * if (!addr)
+ *     return AVERROR(ENOMEM);
+ *
+ * // Now:
+ * //  nb           == 2
+ * //  to_be_added  == array[0]
+ * //  to_be_added2 == array[1]
+ *
+ * av_freep(&array);
+ * @endcode
+ *
+ * @{
+ */
+
+/**
+ * Add the pointer to an element to a dynamic array.
+ *
+ * The array to grow is supposed to be an array of pointers to
+ * structures, and the element to add must be a pointer to an already
+ * allocated structure.
+ *
+ * The array is reallocated when its size reaches powers of 2.
+ * Therefore, the amortized cost of adding an element is constant.
+ *
+ * In case of success, the pointer to the array is updated in order to
+ * point to the new grown array, and the number pointed to by `nb_ptr`
+ * is incremented.
+ * In case of failure, the array is freed, `*tab_ptr` is set to `NULL` and
+ * `*nb_ptr` is set to 0.
+ *
+ * @param[in,out] tab_ptr Pointer to the array to grow
+ * @param[in,out] nb_ptr  Pointer to the number of elements in the array
+ * @param[in]     elem    Element to add
+ * @see av_dynarray_add_nofree(), av_dynarray2_add()
+ */
+void av_dynarray_add(void *tab_ptr, int *nb_ptr, void *elem);
+
+/**
+ * Add an element to a dynamic array.
+ *
+ * This function has the same functionality as av_dynarray_add(),
+ * but it doesn't free memory on failure. It returns an error code
+ * instead and leaves the current buffer untouched.
+ *
+ * @return >=0 on success, negative otherwise
+ * @see av_dynarray_add(), av_dynarray2_add()
+ */
+av_warn_unused_result
+int av_dynarray_add_nofree(void *tab_ptr, int *nb_ptr, void *elem);
+
+/**
+ * Add an element of size `elem_size` to a dynamic array.
+ *
+ * The array is reallocated when its number of elements reaches powers of 2.
+ * Therefore, the amortized cost of adding an element is constant.
+ *
+ * In case of success, the pointer to the array is updated in order to
+ * point to the new grown array, and the number pointed to by `nb_ptr`
+ * is incremented.
+ * In case of failure, the array is freed, `*tab_ptr` is set to `NULL` and
+ * `*nb_ptr` is set to 0.
+ *
+ * @param[in,out] tab_ptr   Pointer to the array to grow
+ * @param[in,out] nb_ptr    Pointer to the number of elements in the array
+ * @param[in]     elem_size Size in bytes of an element in the array
+ * @param[in]     elem_data Pointer to the data of the element to add. If
+ *                          `NULL`, the space of the newly added element is
+ *                          allocated but left uninitialized.
+ *
+ * @return Pointer to the space of the newly added element inside the array
+ *         (where `elem_data` was copied, if given), or `NULL` on failure
+ * @see av_dynarray_add(), av_dynarray_add_nofree()
+ */
+void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
+                       const uint8_t *elem_data);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_mem_misc Miscellaneous Functions
+ *
+ * Other functions related to memory allocation.
+ *
+ * @{
+ */
+
+/**
+ * Multiply two `size_t` values checking for overflow.
+ *
+ * @param[in]  a,b Operands of multiplication
+ * @param[out] r   Pointer to the result of the operation
+ * @return 0 on success, AVERROR(EINVAL) on overflow
+ */
+static inline int av_size_mult(size_t a, size_t b, size_t *r)
+{
+    size_t t = a * b;
+    /* Hack inspired by glibc: skip the division when `a` and `b` (glibc's
+     * nelem and elsize) are both less than sqrt(SIZE_MAX). */
+    if ((a | b) >= ((size_t)1 << (sizeof(size_t) * 4)) && a && t / a != b)
+        return AVERROR(EINVAL);
+    *r = t;
+    return 0;
+}
+
+/**
+ * Set the maximum size that may be allocated in one block.
+ *
+ * The value specified with this function is effective for all libavutil's @ref
+ * lavu_mem_funcs "heap management functions."
+ *
+ * By default, the max value is defined as `INT_MAX`.
+ *
+ * @param max Value to be set as the new maximum size
+ *
+ * @warning Exercise extreme caution when using this function. Don't touch
+ *          this if you do not understand the full consequence of doing so.
+ */
+void av_max_alloc(size_t max);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_MEM_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/motion_vector.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/motion_vector.h
new file mode 100644
index 0000000..ec29556
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/motion_vector.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_MOTION_VECTOR_H
+#define AVUTIL_MOTION_VECTOR_H
+
+#include <stdint.h>
+
+typedef struct AVMotionVector {
+    /**
+     * Where the current macroblock comes from; negative value when it comes
+     * from the past, positive value when it comes from the future.
+     * XXX: set exact relative ref frame reference instead of a +/- 1 "direction".
+     */
+    int32_t source;
+    /**
+     * Width and height of the block.
+     */
+    uint8_t w, h;
+    /**
+     * Absolute source position. Can be outside the frame area.
+     */
+    int16_t src_x, src_y;
+    /**
+     * Absolute destination position. Can be outside the frame area.
+     */
+    int16_t dst_x, dst_y;
+    /**
+     * Extra flag information.
+     * Currently unused.
+     */
+    uint64_t flags;
+    /**
+     * Motion vector
+     * src_x = dst_x + motion_x / motion_scale
+     * src_y = dst_y + motion_y / motion_scale
+     */
+    int32_t motion_x, motion_y;
+    uint16_t motion_scale;
+} AVMotionVector;
+
+#endif /* AVUTIL_MOTION_VECTOR_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/murmur3.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/murmur3.h
new file mode 100644
index 0000000..6a1694c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/murmur3.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2013 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_murmur3
+ * Public header for MurmurHash3 hash function implementation.
+ */
+
+#ifndef AVUTIL_MURMUR3_H
+#define AVUTIL_MURMUR3_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_murmur3 Murmur3
+ * @ingroup lavu_hash
+ * MurmurHash3 hash function implementation.
+ *
+ * MurmurHash3 is a non-cryptographic hash function, of which three
+ * incompatible versions were created by its inventor Austin Appleby:
+ *
+ * - 32-bit output
+ * - 128-bit output for 32-bit platforms
+ * - 128-bit output for 64-bit platforms
+ *
+ * FFmpeg only implements the last variant: 128-bit output designed for 64-bit
+ * platforms. Even though the hash function was designed for 64-bit platforms,
+ * the function in reality works on 32-bit systems too, only with reduced
+ * performance.
+ *
+ * @anchor lavu_murmur3_seedinfo
+ * By design, MurmurHash3 requires a seed to operate. In response to this,
+ * libavutil provides two functions for hash initiation, one that requires a
+ * seed (av_murmur3_init_seeded()) and one that uses a fixed arbitrary integer
+ * as the seed, and therefore does not (av_murmur3_init()).
+ *
+ * To make hashes comparable, you should provide the same seed for all calls to
+ * this hash function -- if you are supplying one yourself, that is.
+ *
+ * @{
+ */
+
+/**
+ * Allocate an AVMurMur3 hash context.
+ *
+ * @return Uninitialized hash context or `NULL` in case of error
+ */
+struct AVMurMur3 *av_murmur3_alloc(void);
+
+/**
+ * Initialize or reinitialize an AVMurMur3 hash context with a seed.
+ *
+ * @param[out] c    Hash context
+ * @param[in]  seed Random seed
+ *
+ * @see av_murmur3_init()
+ * @see @ref lavu_murmur3_seedinfo "Detailed description" on a discussion of
+ * seeds for MurmurHash3.
+ */
+void av_murmur3_init_seeded(struct AVMurMur3 *c, uint64_t seed);
+
+/**
+ * Initialize or reinitialize an AVMurMur3 hash context.
+ *
+ * Equivalent to av_murmur3_init_seeded() with a built-in seed.
+ *
+ * @param[out] c    Hash context
+ *
+ * @see av_murmur3_init_seeded()
+ * @see @ref lavu_murmur3_seedinfo "Detailed description" on a discussion of
+ * seeds for MurmurHash3.
+ */
+void av_murmur3_init(struct AVMurMur3 *c);
+
+/**
+ * Update hash context with new data.
+ *
+ * @param[out] c    Hash context
+ * @param[in]  src  Input data to update hash with
+ * @param[in]  len  Number of bytes to read from `src`
+ */
+void av_murmur3_update(struct AVMurMur3 *c, const uint8_t *src, int len);
+
+/**
+ * Finish hashing and output digest value.
+ *
+ * @param[in,out] c    Hash context
+ * @param[out]    dst  Buffer where output digest value is stored
+ */
+void av_murmur3_final(struct AVMurMur3 *c, uint8_t dst[16]);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_MURMUR3_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/opt.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/opt.h
new file mode 100644
index 0000000..0d89379
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/opt.h
@@ -0,0 +1,866 @@
+/*
+ * AVOptions
+ * copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_OPT_H
+#define AVUTIL_OPT_H
+
+/**
+ * @file
+ * AVOptions
+ */
+
+#include "rational.h"
+#include "avutil.h"
+#include "dict.h"
+#include "log.h"
+#include "pixfmt.h"
+#include "samplefmt.h"
+#include "version.h"
+
+/**
+ * @defgroup avoptions AVOptions
+ * @ingroup lavu_data
+ * @{
+ * AVOptions provide a generic system to declare options on arbitrary structs
+ * ("objects"). An option can have a help text, a type and a range of possible
+ * values. Options may then be enumerated, read and written to.
+ *
+ * @section avoptions_implement Implementing AVOptions
+ * This section describes how to add AVOptions capabilities to a struct.
+ *
+ * All AVOptions-related information is stored in an AVClass. Therefore
+ * the first member of the struct should be a pointer to an AVClass describing it.
+ * The option field of the AVClass must be set to a NULL-terminated static array
+ * of AVOptions. Each AVOption must have a non-empty name, a type, a default
+ * value and for number-type AVOptions also a range of allowed values. It must
+ * also declare an offset in bytes from the start of the struct, where the field
+ * associated with this AVOption is located. Other fields in the AVOption struct
+ * should also be set when applicable, but are not required.
+ *
+ * The following example illustrates an AVOptions-enabled struct:
+ * @code
+ * typedef struct test_struct {
+ *     const AVClass *class;
+ *     int      int_opt;
+ *     char    *str_opt;
+ *     uint8_t *bin_opt;
+ *     int      bin_len;
+ * } test_struct;
+ *
+ * static const AVOption test_options[] = {
+ *   { "test_int", "This is a test option of int type.", offsetof(test_struct, int_opt),
+ *     AV_OPT_TYPE_INT, { .i64 = -1 }, INT_MIN, INT_MAX },
+ *   { "test_str", "This is a test option of string type.", offsetof(test_struct, str_opt),
+ *     AV_OPT_TYPE_STRING },
+ *   { "test_bin", "This is a test option of binary type.", offsetof(test_struct, bin_opt),
+ *     AV_OPT_TYPE_BINARY },
+ *   { NULL },
+ * };
+ *
+ * static const AVClass test_class = {
+ *     .class_name = "test class",
+ *     .item_name  = av_default_item_name,
+ *     .option     = test_options,
+ *     .version    = LIBAVUTIL_VERSION_INT,
+ * };
+ * @endcode
+ *
+ * Next, when allocating your struct, you must ensure that the AVClass pointer
+ * is set to the correct value. Then, av_opt_set_defaults() can be called to
+ * initialize defaults. After that the struct is ready to be used with the
+ * AVOptions API.
+ *
+ * When cleaning up, you may use the av_opt_free() function to automatically
+ * free all the allocated string and binary options.
+ *
+ * Continuing with the above example:
+ *
+ * @code
+ * test_struct *alloc_test_struct(void)
+ * {
+ *     test_struct *ret = av_mallocz(sizeof(*ret));
+ *     ret->class = &test_class;
+ *     av_opt_set_defaults(ret);
+ *     return ret;
+ * }
+ * void free_test_struct(test_struct **foo)
+ * {
+ *     av_opt_free(*foo);
+ *     av_freep(foo);
+ * }
+ * @endcode
+ *
+ * @subsection avoptions_implement_nesting Nesting
+ *      It may happen that an AVOptions-enabled struct contains another
+ *      AVOptions-enabled struct as a member (e.g. AVCodecContext in
+ *      libavcodec exports generic options, while its priv_data field exports
+ *      codec-specific options). In such a case, it is possible to set up the
+ *      parent struct to export a child's options. To do that, simply
+ *      implement AVClass.child_next() and AVClass.child_class_next() in the
+ *      parent struct's AVClass.
+ *      Assuming that the test_struct from above now also contains a
+ *      child_struct field:
+ *
+ *      @code
+ *      typedef struct child_struct {
+ *          AVClass *class;
+ *          int flags_opt;
+ *      } child_struct;
+ *      static const AVOption child_opts[] = {
+ *          { "test_flags", "This is a test option of flags type.",
+ *            offsetof(child_struct, flags_opt), AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT_MIN, INT_MAX },
+ *          { NULL },
+ *      };
+ *      static const AVClass child_class = {
+ *          .class_name = "child class",
+ *          .item_name  = av_default_item_name,
+ *          .option     = child_opts,
+ *          .version    = LIBAVUTIL_VERSION_INT,
+ *      };
+ *
+ *      void *child_next(void *obj, void *prev)
+ *      {
+ *          test_struct *t = obj;
+ *          if (!prev && t->child_struct)
+ *              return t->child_struct;
+ *          return NULL;
+ *      }
+ *      const AVClass *child_class_next(const AVClass *prev)
+ *      {
+ *          return prev ? NULL : &child_class;
+ *      }
+ *      @endcode
+ *      Putting child_next() and child_class_next() as defined above into
+ *      test_class will now make child_struct's options accessible through
+ *      test_struct (again, proper setup as described above needs to be done on
+ *      child_struct right after it is created).
+ *
+ *      From the above example it might not be clear why both child_next()
+ *      and child_class_next() are needed. The distinction is that child_next()
+ *      iterates over actually existing objects, while child_class_next()
+ *      iterates over all possible child classes. E.g. if an AVCodecContext
+ *      was initialized to use a codec which has private options, then its
+ *      child_next() will return AVCodecContext.priv_data and finish
+ *      iterating. OTOH child_class_next() on AVCodecContext.av_class will
+ *      iterate over all available codecs with private options.
+ *
+ * @subsection avoptions_implement_named_constants Named constants
+ *      It is possible to create named constants for options. Simply set the unit
+ *      field of the option the constants should apply to a string and
+ *      create the constants themselves as options of type AV_OPT_TYPE_CONST
+ *      with their unit field set to the same string.
+ *      Their default_val field should contain the value of the named
+ *      constant.
+ *      For example, to add some named constants for the test_flags option
+ *      above, put the following into the child_opts array:
+ *      @code
+ *      { "test_flags", "This is a test option of flags type.",
+ *        offsetof(child_struct, flags_opt), AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT_MIN, INT_MAX, "test_unit" },
+ *      { "flag1", "This is a flag with value 16", 0, AV_OPT_TYPE_CONST, { .i64 = 16 }, 0, 0, "test_unit" },
+ *      @endcode
+ *
+ * @section avoptions_use Using AVOptions
+ * This section deals with accessing options in an AVOptions-enabled struct.
+ * Such structs in FFmpeg are e.g. AVCodecContext in libavcodec or
+ * AVFormatContext in libavformat.
+ *
+ * @subsection avoptions_use_examine Examining AVOptions
+ * The basic functions for examining options are av_opt_next(), which iterates
+ * over all options defined for one object, and av_opt_find(), which searches
+ * for an option with the given name.
+ *
+ * The situation is more complicated with nesting. An AVOptions-enabled struct
+ * may have AVOptions-enabled children. Passing the AV_OPT_SEARCH_CHILDREN flag
+ * to av_opt_find() will make the function search children recursively.
+ *
+ * For enumerating there are basically two cases. The first is when you want to
+ * get all options that may potentially exist on the struct and its children
+ * (e.g.  when constructing documentation). In that case you should call
+ * av_opt_child_class_next() recursively on the parent struct's AVClass.  The
+ * second case is when you have an already initialized struct with all its
+ * children and you want to get all options that can be actually written or read
+ * from it. In that case you should call av_opt_child_next() recursively (and
+ * av_opt_next() on each result).
+ *
+ * @subsection avoptions_use_get_set Reading and writing AVOptions
+ * When setting options, you often have a string read directly from the
+ * user. In such a case, simply passing it to av_opt_set() is enough. For
+ * non-string type options, av_opt_set() will parse the string according to the
+ * option type.
+ *
+ * Similarly av_opt_get() will read any option type and convert it to a string
+ * which will be returned. Do not forget that the string is allocated, so you
+ * have to free it with av_free().
+ *
+ * In some cases it may be more convenient to put all options into an
+ * AVDictionary and call av_opt_set_dict() on it. A specific case of this
+ * are the format/codec open functions in lavf/lavc which take a dictionary
+ * filled with option as a parameter. This makes it possible to set some options
+ * that cannot be set otherwise, since e.g. the input file format is not known
+ * before the file is actually opened.
+ */
+
+enum AVOptionType{
+    AV_OPT_TYPE_FLAGS,
+    AV_OPT_TYPE_INT,
+    AV_OPT_TYPE_INT64,
+    AV_OPT_TYPE_DOUBLE,
+    AV_OPT_TYPE_FLOAT,
+    AV_OPT_TYPE_STRING,
+    AV_OPT_TYPE_RATIONAL,
+    AV_OPT_TYPE_BINARY,  ///< offset must point to a pointer immediately followed by an int for the length
+    AV_OPT_TYPE_DICT,
+    AV_OPT_TYPE_UINT64,
+    AV_OPT_TYPE_CONST = 128,
+    AV_OPT_TYPE_IMAGE_SIZE = MKBETAG('S','I','Z','E'), ///< offset must point to two consecutive integers
+    AV_OPT_TYPE_PIXEL_FMT  = MKBETAG('P','F','M','T'),
+    AV_OPT_TYPE_SAMPLE_FMT = MKBETAG('S','F','M','T'),
+    AV_OPT_TYPE_VIDEO_RATE = MKBETAG('V','R','A','T'), ///< offset must point to AVRational
+    AV_OPT_TYPE_DURATION   = MKBETAG('D','U','R',' '),
+    AV_OPT_TYPE_COLOR      = MKBETAG('C','O','L','R'),
+    AV_OPT_TYPE_CHANNEL_LAYOUT = MKBETAG('C','H','L','A'),
+    AV_OPT_TYPE_BOOL           = MKBETAG('B','O','O','L'),
+};
+
+/**
+ * AVOption
+ */
+typedef struct AVOption {
+    const char *name;
+
+    /**
+     * short English help text
+     * @todo What about other languages?
+     */
+    const char *help;
+
+    /**
+     * The offset relative to the context structure where the option
+     * value is stored. It should be 0 for named constants.
+     */
+    int offset;
+    enum AVOptionType type;
+
+    /**
+     * the default value for scalar options
+     */
+    union {
+        int64_t i64;
+        double dbl;
+        const char *str;
+        /* TODO those are unused now */
+        AVRational q;
+    } default_val;
+    double min;                 ///< minimum valid value for the option
+    double max;                 ///< maximum valid value for the option
+
+    int flags;
+#define AV_OPT_FLAG_ENCODING_PARAM  1   ///< a generic parameter which can be set by the user for muxing or encoding
+#define AV_OPT_FLAG_DECODING_PARAM  2   ///< a generic parameter which can be set by the user for demuxing or decoding
+#if FF_API_OPT_TYPE_METADATA
+#define AV_OPT_FLAG_METADATA        4   ///< some data extracted or inserted into the file like title, comment, ...
+#endif
+#define AV_OPT_FLAG_AUDIO_PARAM     8
+#define AV_OPT_FLAG_VIDEO_PARAM     16
+#define AV_OPT_FLAG_SUBTITLE_PARAM  32
+/**
+ * The option is intended for exporting values to the caller.
+ */
+#define AV_OPT_FLAG_EXPORT          64
+/**
+ * The option may not be set through the AVOptions API, only read.
+ * This flag only makes sense when AV_OPT_FLAG_EXPORT is also set.
+ */
+#define AV_OPT_FLAG_READONLY        128
+#define AV_OPT_FLAG_FILTERING_PARAM (1<<16) ///< a generic parameter which can be set by the user for filtering
+//FIXME think about enc-audio, ... style flags
+
+    /**
+     * The logical unit to which the option belongs. Non-constant
+     * options and corresponding named constants share the same
+     * unit. May be NULL.
+     */
+    const char *unit;
+} AVOption;
+
+/**
+ * A single allowed range of values, or a single allowed value.
+ */
+typedef struct AVOptionRange {
+    const char *str;
+    /**
+     * Value range.
+     * For string ranges this represents the min/max length.
+     * For dimensions this represents the min/max pixel count or width/height in multi-component case.
+     */
+    double value_min, value_max;
+    /**
+     * Value's component range.
+     * For string this represents the unicode range for chars, 0-127 limits to ASCII.
+     */
+    double component_min, component_max;
+    /**
+     * Range flag.
+     * If set to 1 the struct encodes a range, if set to 0 a single value.
+     */
+    int is_range;
+} AVOptionRange;
+
+/**
+ * List of AVOptionRange structs.
+ */
+typedef struct AVOptionRanges {
+    /**
+     * Array of option ranges.
+     *
+     * Most of option types use just one component.
+     * Following describes multi-component option types:
+     *
+     * AV_OPT_TYPE_IMAGE_SIZE:
+     * component index 0: range of pixel count (width * height).
+     * component index 1: range of width.
+     * component index 2: range of height.
+     *
+     * @note To obtain multi-component version of this structure, user must
+     *       provide AV_OPT_MULTI_COMPONENT_RANGE to av_opt_query_ranges or
+     *       av_opt_query_ranges_default function.
+     *
+     * Multi-component range can be read as in following example:
+     *
+     * @code
+     * int range_index, component_index;
+     * AVOptionRanges *ranges;
+     * AVOptionRange *range[3]; //may require more than 3 in the future.
+     * av_opt_query_ranges(&ranges, obj, key, AV_OPT_MULTI_COMPONENT_RANGE);
+     * for (range_index = 0; range_index < ranges->nb_ranges; range_index++) {
+     *     for (component_index = 0; component_index < ranges->nb_components; component_index++)
+     *         range[component_index] = ranges->range[ranges->nb_ranges * component_index + range_index];
+     *     //do something with range here.
+     * }
+     * av_opt_freep_ranges(&ranges);
+     * @endcode
+     */
+    AVOptionRange **range;
+    /**
+     * Number of ranges per component.
+     */
+    int nb_ranges;
+    /**
+     * Number of components.
+     */
+    int nb_components;
+} AVOptionRanges;
+
+/**
+ * Show the obj options.
+ *
+ * @param req_flags requested flags for the options to show. Show only the
+ * options for which it is opt->flags & req_flags.
+ * @param rej_flags rejected flags for the options to show. Show only the
+ * options for which it is !(opt->flags & rej_flags).
+ * @param av_log_obj log context to use for showing the options
+ */
+int av_opt_show2(void *obj, void *av_log_obj, int req_flags, int rej_flags);
+
+/**
+ * Set the values of all AVOption fields to their default values.
+ *
+ * @param s an AVOption-enabled struct (its first member must be a pointer to AVClass)
+ */
+void av_opt_set_defaults(void *s);
+
+/**
+ * Set the values of all AVOption fields to their default values. Only these
+ * AVOption fields for which (opt->flags & mask) == flags will have their
+ * default applied to s.
+ *
+ * @param s an AVOption-enabled struct (its first member must be a pointer to AVClass)
+ * @param mask combination of AV_OPT_FLAG_*
+ * @param flags combination of AV_OPT_FLAG_*
+ */
+void av_opt_set_defaults2(void *s, int mask, int flags);
+
+/**
+ * Parse the key/value pairs list in opts. For each key/value pair
+ * found, stores the value in the field in ctx that is named like the
+ * key. ctx must be an AVClass context, storing is done using
+ * AVOptions.
+ *
+ * @param opts options string to parse, may be NULL
+ * @param key_val_sep a 0-terminated list of characters used to
+ * separate key from value
+ * @param pairs_sep a 0-terminated list of characters used to separate
+ * two pairs from each other
+ * @return the number of successfully set key/value pairs, or a negative
+ * value corresponding to an AVERROR code in case of error:
+ * AVERROR(EINVAL) if opts cannot be parsed,
+ * the error code issued by av_opt_set() if a key/value pair
+ * cannot be set
+ */
+int av_set_options_string(void *ctx, const char *opts,
+                          const char *key_val_sep, const char *pairs_sep);
+
+/**
+ * Parse the key-value pairs list in opts. For each key=value pair found,
+ * set the value of the corresponding option in ctx.
+ *
+ * @param ctx          the AVClass object to set options on
+ * @param opts         the options string, key-value pairs separated by a
+ *                     delimiter
+ * @param shorthand    a NULL-terminated array of options names for shorthand
+ *                     notation: if the first field in opts has no key part,
+ *                     the key is taken from the first element of shorthand;
+ *                     then again for the second, etc., until either opts is
+ *                     finished, shorthand is finished or a named option is
+ *                     found; after that, all options must be named
+ * @param key_val_sep  a 0-terminated list of characters used to separate
+ *                     key from value, for example '='
+ * @param pairs_sep    a 0-terminated list of characters used to separate
+ *                     two pairs from each other, for example ':' or ','
+ * @return  the number of successfully set key=value pairs, or a negative
+ *          value corresponding to an AVERROR code in case of error:
+ *          AVERROR(EINVAL) if opts cannot be parsed,
+ *          the error code issued by av_set_string3() if a key/value pair
+ *          cannot be set
+ *
+ * Options names must use only the following characters: a-z A-Z 0-9 - . / _
+ * Separators must use characters distinct from option names and from each
+ * other.
+ */
+int av_opt_set_from_string(void *ctx, const char *opts,
+                           const char *const *shorthand,
+                           const char *key_val_sep, const char *pairs_sep);
+/**
+ * Free all allocated objects in obj.
+ */
+void av_opt_free(void *obj);
+
+/**
+ * Check whether a particular flag is set in a flags field.
+ *
+ * @param field_name the name of the flag field option
+ * @param flag_name the name of the flag to check
+ * @return non-zero if the flag is set, zero if the flag isn't set,
+ *         isn't of the right type, or the flags field doesn't exist.
+ */
+int av_opt_flag_is_set(void *obj, const char *field_name, const char *flag_name);
+
+/**
+ * Set all the options from a given dictionary on an object.
+ *
+ * @param obj a struct whose first element is a pointer to AVClass
+ * @param options options to process. This dictionary will be freed and replaced
+ *                by a new one containing all options not found in obj.
+ *                Of course this new dictionary needs to be freed by caller
+ *                with av_dict_free().
+ *
+ * @return 0 on success, a negative AVERROR if some option was found in obj,
+ *         but could not be set.
+ *
+ * @see av_dict_copy()
+ */
+int av_opt_set_dict(void *obj, struct AVDictionary **options);
+
+
+/**
+ * Set all the options from a given dictionary on an object.
+ *
+ * @param obj a struct whose first element is a pointer to AVClass
+ * @param options options to process. This dictionary will be freed and replaced
+ *                by a new one containing all options not found in obj.
+ *                Of course this new dictionary needs to be freed by caller
+ *                with av_dict_free().
+ * @param search_flags A combination of AV_OPT_SEARCH_*.
+ *
+ * @return 0 on success, a negative AVERROR if some option was found in obj,
+ *         but could not be set.
+ *
+ * @see av_dict_copy()
+ */
+int av_opt_set_dict2(void *obj, struct AVDictionary **options, int search_flags);
+
+/**
+ * Extract a key-value pair from the beginning of a string.
+ *
+ * @param ropts        pointer to the options string, will be updated to
+ *                     point to the rest of the string (one of the pairs_sep
+ *                     or the final NUL)
+ * @param key_val_sep  a 0-terminated list of characters used to separate
+ *                     key from value, for example '='
+ * @param pairs_sep    a 0-terminated list of characters used to separate
+ *                     two pairs from each other, for example ':' or ','
+ * @param flags        flags; see the AV_OPT_FLAG_* values below
+ * @param rkey         parsed key; must be freed using av_free()
+ * @param rval         parsed value; must be freed using av_free()
+ *
+ * @return  >=0 for success, or a negative value corresponding to an
+ *          AVERROR code in case of error; in particular:
+ *          AVERROR(EINVAL) if no key is present
+ *
+ */
+int av_opt_get_key_value(const char **ropts,
+                         const char *key_val_sep, const char *pairs_sep,
+                         unsigned flags,
+                         char **rkey, char **rval);
+
+enum {
+    /* Values for the flags argument of av_opt_get_key_value(). */
+    /**
+     * Allow parsing a value that has no key; the key is then returned
+     * as NULL.
+     */
+    AV_OPT_FLAG_IMPLICIT_KEY = 1,
+};
+
+/**
+ * @defgroup opt_eval_funcs Evaluating option strings
+ * @{
+ * This group of functions can be used to evaluate option strings
+ * and get numbers out of them. They do the same thing as av_opt_set(),
+ * except the result is written into the caller-supplied pointer.
+ *
+ * @param obj a struct whose first element is a pointer to AVClass.
+ * @param o an option for which the string is to be evaluated.
+ * @param val string to be evaluated.
+ * @param *_out value of the string will be written here.
+ *
+ * @return 0 on success, a negative number on failure.
+ */
+int av_opt_eval_flags (void *obj, const AVOption *o, const char *val, int        *flags_out);
+int av_opt_eval_int   (void *obj, const AVOption *o, const char *val, int        *int_out);
+int av_opt_eval_int64 (void *obj, const AVOption *o, const char *val, int64_t    *int64_out);
+int av_opt_eval_float (void *obj, const AVOption *o, const char *val, float      *float_out);
+int av_opt_eval_double(void *obj, const AVOption *o, const char *val, double     *double_out);
+int av_opt_eval_q     (void *obj, const AVOption *o, const char *val, AVRational *q_out);
+/**
+ * @}
+ */
+
+#define AV_OPT_SEARCH_CHILDREN   (1 << 0) /**< Search in possible children of the
+                                               given object first. */
+/**
+ *  The obj passed to av_opt_find() is fake -- only a double pointer to AVClass
+ *  instead of a required pointer to a struct containing AVClass. This is
+ *  useful for searching for options without needing to allocate the corresponding
+ *  object.
+ */
+#define AV_OPT_SEARCH_FAKE_OBJ   (1 << 1)
+
+/**
+ *  In av_opt_get, return NULL if the option has a pointer type and is set to NULL,
+ *  rather than returning an empty string.
+ */
+#define AV_OPT_ALLOW_NULL (1 << 2)
+
+/**
+ *  Allows av_opt_query_ranges and av_opt_query_ranges_default to return more than
+ *  one component for certain option types.
+ *  @see AVOptionRanges for details.
+ */
+#define AV_OPT_MULTI_COMPONENT_RANGE (1 << 12)
+
+/**
+ * Look for an option in an object. Consider only options which
+ * have all the specified flags set.
+ *
+ * @param[in] obj A pointer to a struct whose first element is a
+ *                pointer to an AVClass.
+ *                Alternatively a double pointer to an AVClass, if
+ *                AV_OPT_SEARCH_FAKE_OBJ search flag is set.
+ * @param[in] name The name of the option to look for.
+ * @param[in] unit When searching for named constants, name of the unit
+ *                 it belongs to.
+ * @param opt_flags Find only options with all the specified flags set (AV_OPT_FLAG).
+ * @param search_flags A combination of AV_OPT_SEARCH_*.
+ *
+ * @return A pointer to the option found, or NULL if no option
+ *         was found.
+ *
+ * @note Options found with AV_OPT_SEARCH_CHILDREN flag may not be settable
+ * directly with av_opt_set(). Use special calls which take an options
+ * AVDictionary (e.g. avformat_open_input()) to set options found with this
+ * flag.
+ */
+const AVOption *av_opt_find(void *obj, const char *name, const char *unit,
+                            int opt_flags, int search_flags);
+
+/**
+ * Look for an option in an object. Consider only options which
+ * have all the specified flags set.
+ *
+ * @param[in] obj A pointer to a struct whose first element is a
+ *                pointer to an AVClass.
+ *                Alternatively a double pointer to an AVClass, if
+ *                AV_OPT_SEARCH_FAKE_OBJ search flag is set.
+ * @param[in] name The name of the option to look for.
+ * @param[in] unit When searching for named constants, name of the unit
+ *                 it belongs to.
+ * @param opt_flags Find only options with all the specified flags set (AV_OPT_FLAG).
+ * @param search_flags A combination of AV_OPT_SEARCH_*.
+ * @param[out] target_obj if non-NULL, an object to which the option belongs will be
+ * written here. It may be different from obj if AV_OPT_SEARCH_CHILDREN is present
+ * in search_flags. This parameter is ignored if search_flags contain
+ * AV_OPT_SEARCH_FAKE_OBJ.
+ *
+ * @return A pointer to the option found, or NULL if no option
+ *         was found.
+ */
+const AVOption *av_opt_find2(void *obj, const char *name, const char *unit,
+                             int opt_flags, int search_flags, void **target_obj);
+
+/**
+ * Iterate over all AVOptions belonging to obj.
+ *
+ * @param obj an AVOptions-enabled struct or a double pointer to an
+ *            AVClass describing it.
+ * @param prev result of the previous call to av_opt_next() on this object
+ *             or NULL
+ * @return next AVOption or NULL
+ */
+const AVOption *av_opt_next(const void *obj, const AVOption *prev);
+
+/**
+ * Iterate over AVOptions-enabled children of obj.
+ *
+ * @param prev result of a previous call to this function or NULL
+ * @return next AVOptions-enabled child or NULL
+ */
+void *av_opt_child_next(void *obj, void *prev);
+
+/**
+ * Iterate over potential AVOptions-enabled children of parent.
+ *
+ * @param prev result of a previous call to this function or NULL
+ * @return AVClass corresponding to next potential child or NULL
+ */
+const AVClass *av_opt_child_class_next(const AVClass *parent, const AVClass *prev);
+
+/**
+ * @defgroup opt_set_funcs Option setting functions
+ * @{
+ * Those functions set the field of obj with the given name to value.
+ *
+ * @param[in] obj A struct whose first element is a pointer to an AVClass.
+ * @param[in] name the name of the field to set
+ * @param[in] val The value to set. In case of av_opt_set() if the field is not
+ * of a string type, then the given string is parsed.
+ * SI postfixes and some named scalars are supported.
+ * If the field is of a numeric type, it has to be a numeric or named
+ * scalar. Behavior with more than one scalar and +- infix operators
+ * is undefined.
+ * If the field is of a flags type, it has to be a sequence of numeric
+ * scalars or named flags separated by '+' or '-'. Prefixing a flag
+ * with '+' causes it to be set without affecting the other flags;
+ * similarly, '-' unsets a flag.
+ * @param search_flags flags passed to av_opt_find2. I.e. if AV_OPT_SEARCH_CHILDREN
+ * is passed here, then the option may be set on a child of obj.
+ *
+ * @return 0 if the value has been set, or an AVERROR code in case of
+ * error:
+ * AVERROR_OPTION_NOT_FOUND if no matching option exists
+ * AVERROR(ERANGE) if the value is out of range
+ * AVERROR(EINVAL) if the value is not valid
+ */
+int av_opt_set         (void *obj, const char *name, const char *val, int search_flags);
+int av_opt_set_int     (void *obj, const char *name, int64_t     val, int search_flags);
+int av_opt_set_double  (void *obj, const char *name, double      val, int search_flags);
+int av_opt_set_q       (void *obj, const char *name, AVRational  val, int search_flags);
+int av_opt_set_bin     (void *obj, const char *name, const uint8_t *val, int size, int search_flags);
+int av_opt_set_image_size(void *obj, const char *name, int w, int h, int search_flags);
+int av_opt_set_pixel_fmt (void *obj, const char *name, enum AVPixelFormat fmt, int search_flags);
+int av_opt_set_sample_fmt(void *obj, const char *name, enum AVSampleFormat fmt, int search_flags);
+int av_opt_set_video_rate(void *obj, const char *name, AVRational val, int search_flags);
+int av_opt_set_channel_layout(void *obj, const char *name, int64_t ch_layout, int search_flags);
+/**
+ * @note Any old dictionary present is discarded and replaced with a copy of the new one. The
+ * caller still owns val and is responsible for freeing it.
+ */
+int av_opt_set_dict_val(void *obj, const char *name, const AVDictionary *val, int search_flags);
+
+/**
+ * Set a binary option to an integer list.
+ * Evaluates to AVERROR(EINVAL) when the list length exceeds INT_MAX / sizeof(*val).
+ * @param obj    AVClass object to set options on
+ * @param name   name of the binary option
+ * @param val    pointer to an integer list (must have the correct type with
+ *               regard to the contents of the list); evaluated more than once
+ * @param term   list terminator (usually 0 or -1); evaluated more than once
+ * @param flags  search flags, passed through to av_opt_set_bin()
+ */
+#define av_opt_set_int_list(obj, name, val, term, flags) \
+    (av_int_list_length(val, term) > INT_MAX / sizeof(*(val)) ? \
+     AVERROR(EINVAL) : \
+     av_opt_set_bin(obj, name, (const uint8_t *)(val), \
+                    av_int_list_length(val, term) * sizeof(*(val)), flags))
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup opt_get_funcs Option getting functions
+ * @{
+ * Those functions get a value of the option with the given name from an object.
+ *
+ * @param[in] obj a struct whose first element is a pointer to an AVClass.
+ * @param[in] name name of the option to get.
+ * @param[in] search_flags flags passed to av_opt_find2. I.e. if AV_OPT_SEARCH_CHILDREN
+ * is passed here, then the option may be found in a child of obj.
+ * @param[out] out_val value of the option will be written here
+ * @return >=0 on success, a negative error code otherwise
+ */
+/**
+ * @note the returned string will be av_malloc()ed and must be av_free()ed by the caller
+ *
+ * @note if AV_OPT_ALLOW_NULL is set in search_flags in av_opt_get, and the option has
+ * AV_OPT_TYPE_STRING or AV_OPT_TYPE_BINARY and is set to NULL, *out_val will be set
+ * to NULL instead of an allocated empty string.
+ */
+int av_opt_get         (void *obj, const char *name, int search_flags, uint8_t   **out_val);
+int av_opt_get_int     (void *obj, const char *name, int search_flags, int64_t    *out_val);
+int av_opt_get_double  (void *obj, const char *name, int search_flags, double     *out_val);
+int av_opt_get_q       (void *obj, const char *name, int search_flags, AVRational *out_val);
+int av_opt_get_image_size(void *obj, const char *name, int search_flags, int *w_out, int *h_out);
+int av_opt_get_pixel_fmt (void *obj, const char *name, int search_flags, enum AVPixelFormat *out_fmt);
+int av_opt_get_sample_fmt(void *obj, const char *name, int search_flags, enum AVSampleFormat *out_fmt);
+int av_opt_get_video_rate(void *obj, const char *name, int search_flags, AVRational *out_val);
+int av_opt_get_channel_layout(void *obj, const char *name, int search_flags, int64_t *ch_layout);
+/**
+ * @param[out] out_val The returned dictionary is a copy of the actual value and must
+ * be freed with av_dict_free() by the caller
+ */
+int av_opt_get_dict_val(void *obj, const char *name, int search_flags, AVDictionary **out_val);
+/**
+ * @}
+ */
+/**
+ * Gets a pointer to the requested field in a struct.
+ * This function allows accessing a struct even when its fields are moved or
+ * renamed since the application making the access has been compiled.
+ *
+ * @returns a pointer to the field, it can be cast to the correct type and read
+ *          or written to.
+ */
+void *av_opt_ptr(const AVClass *avclass, void *obj, const char *name);
+
+/**
+ * Free an AVOptionRanges struct and set it to NULL.
+ */
+void av_opt_freep_ranges(AVOptionRanges **ranges);
+
+/**
+ * Get a list of allowed ranges for the given option.
+ *
+ * The returned list may depend on other fields in obj like for example profile.
+ *
+ * @param flags is a bitmask of flags, undefined flags should not be set and should be ignored
+ *              AV_OPT_SEARCH_FAKE_OBJ indicates that the obj is a double pointer to a AVClass instead of a full instance
+ *              AV_OPT_MULTI_COMPONENT_RANGE indicates that function may return more than one component, @see AVOptionRanges
+ *
+ * The result must be freed with av_opt_freep_ranges.
+ *
+ * @return number of components returned on success, a negative error code otherwise
+ */
+int av_opt_query_ranges(AVOptionRanges **, void *obj, const char *key, int flags);
+
+/**
+ * Copy options from src object into dest object.
+ *
+ * Options that require memory allocation (e.g. string or binary) are malloc'ed in dest object.
+ * Original memory allocated for such options is freed unless both src and dest options point to the same memory.
+ *
+ * @param dest Object to copy into
+ * @param src  Object to copy from
+ * @return 0 on success, negative on error
+ */
+int av_opt_copy(void *dest, const void *src);
+
+/**
+ * Get a default list of allowed ranges for the given option.
+ *
+ * This list is constructed without using the AVClass.query_ranges() callback
+ * and can be used as fallback from within the callback.
+ *
+ * @param flags is a bitmask of flags, undefined flags should not be set and should be ignored
+ *              AV_OPT_SEARCH_FAKE_OBJ indicates that the obj is a double pointer to a AVClass instead of a full instance
+ *              AV_OPT_MULTI_COMPONENT_RANGE indicates that function may return more than one component, @see AVOptionRanges
+ *
+ * The result must be freed with av_opt_freep_ranges.
+ *
+ * @return number of components returned on success, a negative error code otherwise
+ */
+int av_opt_query_ranges_default(AVOptionRanges **, void *obj, const char *key, int flags);
+
+/**
+ * Check if given option is set to its default value.
+ *
+ * Option o must belong to obj. This function must not be called to check child's options state.
+ * @see av_opt_is_set_to_default_by_name().
+ *
+ * @param obj  AVClass object to check option on
+ * @param o    option to be checked
+ * @return     >0 when option is set to its default,
+ *              0 when option is not set to its default,
+ *             <0 on error
+ */
+int av_opt_is_set_to_default(void *obj, const AVOption *o);
+
+/**
+ * Check if given option is set to its default value.
+ *
+ * @param obj          AVClass object to check option on
+ * @param name         option name
+ * @param search_flags combination of AV_OPT_SEARCH_*
+ * @return             >0 when option is set to its default,
+ *                     0 when option is not set to its default,
+ *                     <0 on error
+ */
+int av_opt_is_set_to_default_by_name(void *obj, const char *name, int search_flags);
+
+
+#define AV_OPT_SERIALIZE_SKIP_DEFAULTS              0x00000001  ///< Serialize options that are not set to default values only.
+#define AV_OPT_SERIALIZE_OPT_FLAGS_EXACT            0x00000002  ///< Serialize options that exactly match opt_flags only.
+
+/**
+ * Serialize object's options.
+ *
+ * Create a string containing object's serialized options.
+ * Such string may be passed back to av_opt_set_from_string() in order to restore option values.
+ * A key/value or pairs separator occurring in the serialized value or
+ * name string is escaped through the av_escape() function.
+ *
+ * @param[in]  obj           AVClass object to serialize
+ * @param[in]  opt_flags     serialize options with all the specified flags set (AV_OPT_FLAG)
+ * @param[in]  flags         combination of AV_OPT_SERIALIZE_* flags
+ * @param[out] buffer        Pointer to buffer that will be allocated with string containing serialized options.
+ *                           Buffer must be freed by the caller when is no longer needed.
+ * @param[in]  key_val_sep   character used to separate key from value
+ * @param[in]  pairs_sep     character used to separate two pairs from each other
+ * @return                   >= 0 on success, negative on error
+ * @warning Separators can be neither '\\' nor '\0'. They also cannot be the same.
+ */
+int av_opt_serialize(void *obj, int opt_flags, int flags, char **buffer,
+                     const char key_val_sep, const char pairs_sep);
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_OPT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/parseutils.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/parseutils.h
new file mode 100644
index 0000000..e66d24b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/parseutils.h
@@ -0,0 +1,193 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PARSEUTILS_H
+#define AVUTIL_PARSEUTILS_H
+
+#include <time.h>
+
+#include "rational.h"
+
+/**
+ * @file
+ * misc parsing utilities
+ */
+
+/**
+ * Parse str and store the parsed ratio in q.
+ *
+ * Note that a ratio with infinite (1/0) or negative value is
+ * considered valid, so you should check on the returned value if you
+ * want to exclude those values.
+ *
+ * The undefined value can be expressed using the "0:0" string.
+ *
+ * @param[in,out] q pointer to the AVRational which will contain the ratio
+ * @param[in] str the string to parse: it has to be a string in the format
+ * num:den, a float number or an expression
+ * @param[in] max the maximum allowed numerator and denominator
+ * @param[in] log_offset log level offset which is applied to the log
+ * level of log_ctx
+ * @param[in] log_ctx parent logging context
+ * @return >= 0 on success, a negative error code otherwise
+ */
+int av_parse_ratio(AVRational *q, const char *str, int max,
+                   int log_offset, void *log_ctx);
+
+#define av_parse_ratio_quiet(rate, str, max) \
+    av_parse_ratio(rate, str, max, AV_LOG_MAX_OFFSET, NULL)
+
+/**
+ * Parse str and put in width_ptr and height_ptr the detected values.
+ *
+ * @param[in,out] width_ptr pointer to the variable which will contain the detected
+ * width value
+ * @param[in,out] height_ptr pointer to the variable which will contain the detected
+ * height value
+ * @param[in] str the string to parse: it has to be a string in the format
+ * width x height or a valid video size abbreviation.
+ * @return >= 0 on success, a negative error code otherwise
+ */
+int av_parse_video_size(int *width_ptr, int *height_ptr, const char *str);
+
+/**
+ * Parse str and store the detected values in *rate.
+ *
+ * @param[in,out] rate pointer to the AVRational which will contain the detected
+ * frame rate
+ * @param[in] str the string to parse: it has to be a string in the format
+ * rate_num / rate_den, a float number or a valid video rate abbreviation
+ * @return >= 0 on success, a negative error code otherwise
+ */
+int av_parse_video_rate(AVRational *rate, const char *str);
+
+/**
+ * Put the RGBA values that correspond to color_string in rgba_color.
+ *
+ * @param color_string a string specifying a color. It can be the name of
+ * a color (case insensitive match) or a [0x|#]RRGGBB[AA] sequence,
+ * possibly followed by "@" and a string representing the alpha
+ * component.
+ * The alpha component may be a string composed by "0x" followed by an
+ * hexadecimal number or a decimal number between 0.0 and 1.0, which
+ * represents the opacity value (0x00/0.0 means completely transparent,
+ * 0xff/1.0 completely opaque).
+ * If the alpha component is not specified then 0xff is assumed.
+ * The string "random" will result in a random color.
+ * @param slen length of the initial part of color_string containing the
+ * color. It can be set to -1 if color_string is a null terminated string
+ * containing nothing else than the color.
+ * @return >= 0 in case of success, a negative value in case of
+ * failure (for example if color_string cannot be parsed).
+ */
+int av_parse_color(uint8_t *rgba_color, const char *color_string, int slen,
+                   void *log_ctx);
+
+/**
+ * Get the name of a color from the internal table of hard-coded named
+ * colors.
+ *
+ * This function is meant to enumerate the color names recognized by
+ * av_parse_color().
+ *
+ * @param color_idx index of the requested color, starting from 0
+ * @param rgb       if not NULL, will point to a 3-elements array with the color value in RGB
+ * @return the color name string or NULL if color_idx is not in the array
+ */
+const char *av_get_known_color_name(int color_idx, const uint8_t **rgb);
+
+/**
+ * Parse timestr and return in *timeval a corresponding number of
+ * microseconds.
+ *
+ * @param timeval puts here the number of microseconds corresponding
+ * to the string in timestr. If the string represents a duration, it
+ * is the number of microseconds contained in the time interval.  If
+ * the string is a date, is the number of microseconds since 1st of
+ * January, 1970 up to the time of the parsed date.  If timestr cannot
+ * be successfully parsed, set *timeval to INT64_MIN.
+ *
+ * @param timestr a string representing a date or a duration.
+ * - If a date the syntax is:
+ * @code
+ * [{YYYY-MM-DD|YYYYMMDD}[T|t| ]]{{HH:MM:SS[.m...]]]}|{HHMMSS[.m...]]]}}[Z]
+ * now
+ * @endcode
+ * If the value is "now" it takes the current time.
+ * Time is local time unless Z is appended, in which case it is
+ * interpreted as UTC.
+ * If the year-month-day part is not specified it takes the current
+ * year-month-day.
+ * - If a duration the syntax is:
+ * @code
+ * [-][HH:]MM:SS[.m...]
+ * [-]S+[.m...]
+ * @endcode
+ * @param duration flag which tells how to interpret timestr, if not
+ * zero timestr is interpreted as a duration, otherwise as a date
+ * @return >= 0 in case of success, a negative value corresponding to an
+ * AVERROR code otherwise
+ */
+int av_parse_time(int64_t *timeval, const char *timestr, int duration);
+
+/**
+ * Attempt to find a specific tag in a URL.
+ *
+ * syntax: '?tag1=val1&tag2=val2...'. Little URL decoding is done.
+ * Return 1 if found.
+ */
+int av_find_info_tag(char *arg, int arg_size, const char *tag1, const char *info);
+
+/**
+ * Simplified version of strptime
+ *
+ * Parse the input string p according to the format string fmt and
+ * store its results in the structure dt.
+ * This implementation supports only a subset of the formats supported
+ * by the standard strptime().
+ *
+ * The supported input field descriptors are listed below.
+ * - %H: the hour as a decimal number, using a 24-hour clock, in the
+ *   range '00' through '23'
+ * - %J: hours as a decimal number, in the range '0' through INT_MAX
+ * - %M: the minute as a decimal number, in the
+ *   range '00' through '59'
+ * - %S: the second as a decimal number, in the
+ *   range '00' through '59'
+ * - %Y: the year as a decimal number, using the Gregorian calendar
+ * - %m: the month as a decimal number, in the range '1' through '12'
+ * - %d: the day of the month as a decimal number, in the range '1'
+ *   through '31'
+ * - %T: alias for '%H:%M:%S'
+ * - %%: a literal '%'
+ *
+ * @return a pointer to the first character not processed in this function
+ *         call. In case the input string contains more characters than
+ *         required by the format string the return value points right after
+ *         the last consumed input character. In case the whole input string
+ *         is consumed the return value points to the null byte at the end of
+ *         the string. On failure NULL is returned.
+ */
+char *av_small_strptime(const char *p, const char *fmt, struct tm *dt);
+
+/**
+ * Convert the decomposed UTC time in tm to a time_t value.
+ */
+time_t av_timegm(struct tm *tm);
+
+#endif /* AVUTIL_PARSEUTILS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixdesc.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixdesc.h
new file mode 100644
index 0000000..c3a6f27
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixdesc.h
@@ -0,0 +1,399 @@
+/*
+ * pixel format descriptor
+ * Copyright (c) 2009 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PIXDESC_H
+#define AVUTIL_PIXDESC_H
+
+#include <inttypes.h>
+
+#include "attributes.h"
+#include "pixfmt.h"
+#include "version.h"
+
+typedef struct AVComponentDescriptor {
+    /* Describes how a single pixel component is stored. */
+
+    /** Which of the 4 planes contains the component. */
+    int plane;
+
+    /**
+     * Number of elements between 2 horizontally consecutive pixels.
+     * Elements are bits for bitstream formats, bytes otherwise.
+     */
+    int step;
+
+    /**
+     * Number of elements before the component of the first pixel.
+     * Elements are bits for bitstream formats, bytes otherwise.
+     */
+    int offset;
+
+    /**
+     * Number of least significant bits that must be shifted away
+     * to get the value.
+     */
+    int shift;
+
+    /**
+     * Number of bits in the component.
+     */
+    int depth;
+
+#if FF_API_PLUS1_MINUS1
+    /** @deprecated use step instead */
+    attribute_deprecated int step_minus1;
+
+    /** @deprecated use depth instead */
+    attribute_deprecated int depth_minus1;
+
+    /** @deprecated use offset instead */
+    attribute_deprecated int offset_plus1;
+#endif
+} AVComponentDescriptor;
+
+/**
+ * Descriptor that unambiguously describes how the bits of a pixel are
+ * stored in the up to 4 data planes of an image. It also stores the
+ * subsampling factors and number of components.
+ *
+ * @note This is separate from the colorspace (RGB, YCbCr, YPbPr, JPEG-style
+ *       YUV and all the YUV variants): AVPixFmtDescriptor just stores how
+ *       values are stored, not what these values represent.
+ */
+typedef struct AVPixFmtDescriptor {
+    const char *name;
+    uint8_t nb_components;  ///< The number of components each pixel has, (1-4)
+
+    /**
+     * Amount to shift the luma width right to find the chroma width.
+     * For YV12 this is 1 for example.
+     * chroma_width = AV_CEIL_RSHIFT(luma_width, log2_chroma_w)
+     * AV_CEIL_RSHIFT is used in the formula above to ensure rounding up.
+     * This value only refers to the chroma components.
+     */
+    uint8_t log2_chroma_w;
+
+    /**
+     * Amount to shift the luma height right to find the chroma height.
+     * For YV12 this is 1 for example.
+     * chroma_height = AV_CEIL_RSHIFT(luma_height, log2_chroma_h)
+     * AV_CEIL_RSHIFT is used in the formula above to ensure rounding up.
+     * This value only refers to the chroma components.
+     */
+    uint8_t log2_chroma_h;
+
+    /**
+     * Combination of AV_PIX_FMT_FLAG_... flags.
+     */
+    uint64_t flags;
+
+    /**
+     * Parameters that describe how pixels are packed.
+     * If the format has 1 or 2 components, then luma is 0.
+     * If the format has 3 or 4 components:
+     *   if the RGB flag is set then 0 is red, 1 is green and 2 is blue;
+     *   otherwise 0 is luma, 1 is chroma-U and 2 is chroma-V.
+     *
+     * If present, the Alpha channel is always the last component.
+     */
+    AVComponentDescriptor comp[4];
+
+    /**
+     * Alternative comma-separated names.
+     */
+    const char *alias;
+} AVPixFmtDescriptor;
+
+/**
+ * Pixel format is big-endian.
+ */
+#define AV_PIX_FMT_FLAG_BE           (1 << 0)
+/**
+ * Pixel format has a palette in data[1], values are indexes in this palette.
+ */
+#define AV_PIX_FMT_FLAG_PAL          (1 << 1)
+/**
+ * All values of a component are bit-wise packed end to end.
+ */
+#define AV_PIX_FMT_FLAG_BITSTREAM    (1 << 2)
+/**
+ * Pixel format is an HW accelerated format.
+ */
+#define AV_PIX_FMT_FLAG_HWACCEL      (1 << 3)
+/**
+ * At least one pixel component is not in the first data plane.
+ */
+#define AV_PIX_FMT_FLAG_PLANAR       (1 << 4)
+/**
+ * The pixel format contains RGB-like data (as opposed to YUV/grayscale).
+ */
+#define AV_PIX_FMT_FLAG_RGB          (1 << 5)
+
+/**
+ * The pixel format is "pseudo-paletted". This means that it contains a
+ * fixed palette in the 2nd plane but the palette is fixed/constant for each
+ * PIX_FMT. This allows interpreting the data as if it was PAL8, which can
+ * in some cases be simpler. Or the data can be interpreted purely based on
+ * the pixel format without using the palette.
+ * An example of a pseudo-paletted format is AV_PIX_FMT_GRAY8
+ */
+#define AV_PIX_FMT_FLAG_PSEUDOPAL    (1 << 6)
+
+/**
+ * The pixel format has an alpha channel. This is set on all formats that
+ * support alpha in some way. The exception is AV_PIX_FMT_PAL8, which can
+ * carry alpha as part of the palette. Details are explained in the
+ * AVPixelFormat enum, and are also encoded in the corresponding
+ * AVPixFmtDescriptor.
+ *
+ * The alpha is always straight, never pre-multiplied.
+ *
+ * If a codec or a filter does not support alpha, it should set all alpha to
+ * opaque, or use the equivalent pixel formats without alpha component, e.g.
+ * AV_PIX_FMT_RGB0 (or AV_PIX_FMT_RGB24 etc.) instead of AV_PIX_FMT_RGBA.
+ */
+#define AV_PIX_FMT_FLAG_ALPHA        (1 << 7)
+
+/**
+ * The pixel format is following a Bayer pattern
+ */
+#define AV_PIX_FMT_FLAG_BAYER        (1 << 8)
+
+/**
+ * Return the number of bits per pixel used by the pixel format
+ * described by pixdesc. Note that this is not the same as the number
+ * of bits per sample.
+ *
+ * The returned number of bits refers to the number of bits actually
+ * used for storing the pixel information, that is padding bits are
+ * not counted.
+ */
+int av_get_bits_per_pixel(const AVPixFmtDescriptor *pixdesc);
+
+/**
+ * Return the number of bits per pixel for the pixel format
+ * described by pixdesc, including any padding or unused bits.
+ */
+int av_get_padded_bits_per_pixel(const AVPixFmtDescriptor *pixdesc);
+
+/**
+ * @return a pixel format descriptor for provided pixel format or NULL if
+ * this pixel format is unknown.
+ */
+const AVPixFmtDescriptor *av_pix_fmt_desc_get(enum AVPixelFormat pix_fmt);
+
+/**
+ * Iterate over all pixel format descriptors known to libavutil.
+ *
+ * @param prev previous descriptor. NULL to get the first descriptor.
+ *
+ * @return next descriptor or NULL after the last descriptor
+ */
+const AVPixFmtDescriptor *av_pix_fmt_desc_next(const AVPixFmtDescriptor *prev);
+
+/**
+ * @return an AVPixelFormat id described by desc, or AV_PIX_FMT_NONE if desc
+ * is not a valid pointer to a pixel format descriptor.
+ */
+enum AVPixelFormat av_pix_fmt_desc_get_id(const AVPixFmtDescriptor *desc);
+
+/**
+ * Utility function to access log2_chroma_w and log2_chroma_h from
+ * the pixel format AVPixFmtDescriptor.
+ *
+ * See avcodec_get_chroma_sub_sample() for a function that asserts a
+ * valid pixel format instead of returning an error code.
+ * It is recommended that you use avcodec_get_chroma_sub_sample() unless
+ * you check the return code!
+ *
+ * @param[in]  pix_fmt the pixel format
+ * @param[out] h_shift store log2_chroma_w (horizontal/width shift)
+ * @param[out] v_shift store log2_chroma_h (vertical/height shift)
+ *
+ * @return 0 on success, AVERROR(ENOSYS) on invalid or unknown pixel format
+ */
+int av_pix_fmt_get_chroma_sub_sample(enum AVPixelFormat pix_fmt,
+                                     int *h_shift, int *v_shift);
+
+/**
+ * @return number of planes in pix_fmt, a negative AVERROR if pix_fmt is not a
+ * valid pixel format.
+ */
+int av_pix_fmt_count_planes(enum AVPixelFormat pix_fmt);
+
+/**
+ * @return the name for provided color range or NULL if unknown.
+ */
+const char *av_color_range_name(enum AVColorRange range);
+
+/**
+ * @return the name for provided color primaries or NULL if unknown.
+ */
+const char *av_color_primaries_name(enum AVColorPrimaries primaries);
+
+/**
+ * @return the name for provided color transfer or NULL if unknown.
+ */
+const char *av_color_transfer_name(enum AVColorTransferCharacteristic transfer);
+
+/**
+ * @return the name for provided color space or NULL if unknown.
+ */
+const char *av_color_space_name(enum AVColorSpace space);
+
+/**
+ * @return the name for provided chroma location or NULL if unknown.
+ */
+const char *av_chroma_location_name(enum AVChromaLocation location);
+
+/**
+ * Return the pixel format corresponding to name.
+ *
+ * If there is no pixel format with name name, then looks for a
+ * pixel format with the name corresponding to the native endian
+ * format of name.
+ * For example in a little-endian system, first looks for "gray16",
+ * then for "gray16le".
+ *
+ * Finally if no pixel format has been found, returns AV_PIX_FMT_NONE.
+ */
+enum AVPixelFormat av_get_pix_fmt(const char *name);
+
+/**
+ * Return the short name for a pixel format, NULL in case pix_fmt is
+ * unknown.
+ *
+ * @see av_get_pix_fmt(), av_get_pix_fmt_string()
+ */
+const char *av_get_pix_fmt_name(enum AVPixelFormat pix_fmt);
+
+/**
+ * Print in buf the string corresponding to the pixel format with
+ * number pix_fmt, or a header if pix_fmt is negative.
+ *
+ * @param buf the buffer where to write the string
+ * @param buf_size the size of buf
+ * @param pix_fmt the number of the pixel format to print the
+ * corresponding info string, or a negative value to print the
+ * corresponding header.
+ */
+char *av_get_pix_fmt_string(char *buf, int buf_size,
+                            enum AVPixelFormat pix_fmt);
+
+/**
+ * Read a line from an image, and write the values of the
+ * pixel format component c to dst.
+ *
+ * @param data the array containing the pointers to the planes of the image
+ * @param linesize the array containing the linesizes of the image
+ * @param desc the pixel format descriptor for the image
+ * @param x the horizontal coordinate of the first pixel to read
+ * @param y the vertical coordinate of the first pixel to read
+ * @param w the width of the line to read, that is the number of
+ * values to write to dst
+ * @param read_pal_component if not zero and the format is a paletted
+ * format writes the values corresponding to the palette
+ * component c in data[1] to dst, rather than the palette indexes in
+ * data[0]. The behavior is undefined if the format is not paletted.
+ */
+void av_read_image_line(uint16_t *dst, const uint8_t *data[4],
+                        const int linesize[4], const AVPixFmtDescriptor *desc,
+                        int x, int y, int c, int w, int read_pal_component);
+
+/**
+ * Write the values from src to the pixel format component c of an
+ * image line.
+ *
+ * @param src array containing the values to write
+ * @param data the array containing the pointers to the planes of the
+ * image to write into. It is supposed to be zeroed.
+ * @param linesize the array containing the linesizes of the image
+ * @param desc the pixel format descriptor for the image
+ * @param x the horizontal coordinate of the first pixel to write
+ * @param y the vertical coordinate of the first pixel to write
+ * @param w the width of the line to write, that is the number of
+ * values to write to the image line
+ */
+void av_write_image_line(const uint16_t *src, uint8_t *data[4],
+                         const int linesize[4], const AVPixFmtDescriptor *desc,
+                         int x, int y, int c, int w);
+
+/**
+ * Utility function to swap the endianness of a pixel format.
+ *
+ * @param[in]  pix_fmt the pixel format
+ *
+ * @return pixel format with swapped endianness if it exists,
+ * otherwise AV_PIX_FMT_NONE
+ */
+enum AVPixelFormat av_pix_fmt_swap_endianness(enum AVPixelFormat pix_fmt);
+
+#define FF_LOSS_RESOLUTION  0x0001 /**< loss due to resolution change */
+#define FF_LOSS_DEPTH       0x0002 /**< loss due to color depth change */
+#define FF_LOSS_COLORSPACE  0x0004 /**< loss due to color space conversion */
+#define FF_LOSS_ALPHA       0x0008 /**< loss of alpha bits */
+#define FF_LOSS_COLORQUANT  0x0010 /**< loss due to color quantization */
+#define FF_LOSS_CHROMA      0x0020 /**< loss of chroma (e.g. RGB to gray conversion) */
+
+/**
+ * Compute what kind of losses will occur when converting from one specific
+ * pixel format to another.
+ * When converting from one pixel format to another, information loss may occur.
+ * For example, when converting from RGB24 to GRAY, the color information will
+ * be lost. Similarly, other losses occur when converting from some formats to
+ * other formats. These losses can involve loss of chroma, but also loss of
+ * resolution, loss of color depth, loss due to the color space conversion, loss
+ * of the alpha bits or loss due to color quantization.
+ * av_get_pix_fmt_loss() informs you about the various types of losses
+ * which will occur when converting from one pixel format to another.
+ *
+ * @param[in] dst_pix_fmt destination pixel format
+ * @param[in] src_pix_fmt source pixel format
+ * @param[in] has_alpha Whether the source pixel format alpha channel is used.
+ * @return Combination of FF_LOSS_* flags informing you what kind of losses
+ * will occur (maximum loss for an invalid dst_pix_fmt).
+ */
+int av_get_pix_fmt_loss(enum AVPixelFormat dst_pix_fmt,
+                        enum AVPixelFormat src_pix_fmt,
+                        int has_alpha);
+
+/**
+ * Choose which of two candidate destination pixel formats is the better
+ * target when converting from a given source pixel format.
+ * When converting from one pixel format to another, information loss may occur
+ * (loss of chroma, resolution, color depth, alpha bits, ...); the kinds of
+ * loss are described by the FF_LOSS_* flags, see av_get_pix_fmt_loss().
+ *
+ * NOTE(review): parameter semantics below follow the signature and upstream
+ * FFmpeg behavior; confirm against the bundled libavutil implementation.
+ *
+ * @param[in] dst_pix_fmt1 first candidate destination pixel format
+ * @param[in] dst_pix_fmt2 second candidate destination pixel format
+ * @param[in] src_pix_fmt source pixel format
+ * @param[in] has_alpha Whether the source pixel format alpha channel is used.
+ * @param[in,out] loss_ptr presumably a mask of losses to ignore on input and
+ *                the FF_LOSS_* flags of the chosen format on output
+ * @return presumably the candidate with the least loss (TODO confirm)
+ */
+enum AVPixelFormat av_find_best_pix_fmt_of_2(enum AVPixelFormat dst_pix_fmt1, enum AVPixelFormat dst_pix_fmt2,
+                                             enum AVPixelFormat src_pix_fmt, int has_alpha, int *loss_ptr);
+
+#endif /* AVUTIL_PIXDESC_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixelutils.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixelutils.h
new file mode 100644
index 0000000..a8dbc15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixelutils.h
@@ -0,0 +1,52 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PIXELUTILS_H
+#define AVUTIL_PIXELUTILS_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include "common.h"
+
+/**
+ * Sum of abs(src1[x] - src2[x])
+ */
+typedef int (*av_pixelutils_sad_fn)(const uint8_t *src1, ptrdiff_t stride1,
+                                    const uint8_t *src2, ptrdiff_t stride2);
+
+/**
+ * Get a potentially optimized pointer to a Sum-of-absolute-differences
+ * function (see the av_pixelutils_sad_fn prototype).
+ *
+ * @param w_bits  1<<w_bits is the requested width of the block size
+ * @param h_bits  1<<h_bits is the requested height of the block size
+ * @param aligned If set to 2, the returned sad function will assume src1 and
+ *                src2 addresses are aligned on the block size.
+ *                If set to 1, the returned sad function will assume src1 is
+ *                aligned on the block size.
+ *                If set to 0, the returned sad function assumes no particular
+ *                alignment.
+ * @param log_ctx context used for logging, can be NULL
+ *
+ * @return a pointer to the SAD function or NULL in case of error (because of
+ *         invalid parameters)
+ */
+av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits,
+                                              int aligned, void *log_ctx);
+
+#endif /* AVUTIL_PIXELUTILS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixfmt.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixfmt.h
new file mode 100644
index 0000000..926bf5a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/pixfmt.h
@@ -0,0 +1,510 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_PIXFMT_H
+#define AVUTIL_PIXFMT_H
+
+/**
+ * @file
+ * pixel format definitions
+ */
+
+#include "libavutil/avconfig.h"
+#include "version.h"
+
+#define AVPALETTE_SIZE 1024
+#define AVPALETTE_COUNT 256
+
+/**
+ * Pixel format.
+ *
+ * @note
+ * AV_PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA
+ * color is put together as:
+ *  (A << 24) | (R << 16) | (G << 8) | B
+ * This is stored as BGRA on little-endian CPU architectures and ARGB on
+ * big-endian CPUs.
+ *
+ * @note
+ * If the resolution is not a multiple of the chroma subsampling factor
+ * then the chroma plane resolution must be rounded up.
+ *
+ * @par
+ * When the pixel format is palettized RGB32 (AV_PIX_FMT_PAL8), the palettized
+ * image data is stored in AVFrame.data[0]. The palette is transported in
+ * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is
+ * formatted the same as in AV_PIX_FMT_RGB32 described above (i.e., it is
+ * also endian-specific). Note also that the individual RGB32 palette
+ * components stored in AVFrame.data[1] should be in the range 0..255.
+ * This is important as many custom PAL8 video codecs that were designed
+ * to run on the IBM VGA graphics adapter use 6-bit palette components.
+ *
+ * @par
+ * For all the 8 bits per pixel formats, an RGB32 palette is in data[1] like
+ * for pal8. This palette is filled in automatically by the function
+ * allocating the picture.
+ */
+enum AVPixelFormat {
+    AV_PIX_FMT_NONE = -1,
+    AV_PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
+    AV_PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
+    AV_PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
+    AV_PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
+    AV_PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
+    AV_PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
+    AV_PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
+    AV_PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
+    AV_PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
+    AV_PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black, in each byte pixels are ordered from the msb to the lsb
+    AV_PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white, in each byte pixels are ordered from the msb to the lsb
+    AV_PIX_FMT_PAL8,      ///< 8 bits with AV_PIX_FMT_RGB32 palette
+    AV_PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV420P and setting color_range
+    AV_PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV422P and setting color_range
+    AV_PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV444P and setting color_range
+#if FF_API_XVMC
+    AV_PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
+    AV_PIX_FMT_XVMC_MPEG2_IDCT,
+    AV_PIX_FMT_XVMC = AV_PIX_FMT_XVMC_MPEG2_IDCT,
+#endif /* FF_API_XVMC */
+    AV_PIX_FMT_UYVY422,   ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
+    AV_PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
+    AV_PIX_FMT_BGR8,      ///< packed RGB 3:3:2,  8bpp, (msb)2B 3G 3R(lsb)
+    AV_PIX_FMT_BGR4,      ///< packed RGB 1:2:1 bitstream,  4bpp, (msb)1B 2G 1R(lsb), a byte contains two pixels, the first pixel in the byte is the one composed by the 4 msb bits
+    AV_PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1B 2G 1R(lsb)
+    AV_PIX_FMT_RGB8,      ///< packed RGB 3:3:2,  8bpp, (msb)2R 3G 3B(lsb)
+    AV_PIX_FMT_RGB4,      ///< packed RGB 1:2:1 bitstream,  4bpp, (msb)1R 2G 1B(lsb), a byte contains two pixels, the first pixel in the byte is the one composed by the 4 msb bits
+    AV_PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1R 2G 1B(lsb)
+    AV_PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V)
+    AV_PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped
+
+    AV_PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
+    AV_PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
+    AV_PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
+    AV_PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
+
+    AV_PIX_FMT_GRAY16BE,  ///<        Y        , 16bpp, big-endian
+    AV_PIX_FMT_GRAY16LE,  ///<        Y        , 16bpp, little-endian
+    AV_PIX_FMT_YUV440P,   ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
+    AV_PIX_FMT_YUVJ440P,  ///< planar YUV 4:4:0 full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV440P and setting color_range
+    AV_PIX_FMT_YUVA420P,  ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
+#if FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    AV_PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    AV_PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    AV_PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+    AV_PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+#endif
+    AV_PIX_FMT_RGB48BE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, the 2-byte value for each R/G/B component is stored as big-endian
+    AV_PIX_FMT_RGB48LE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, the 2-byte value for each R/G/B component is stored as little-endian
+
+    AV_PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), big-endian
+    AV_PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), little-endian
+    AV_PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), big-endian   , X=unused/undefined
+    AV_PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), little-endian, X=unused/undefined
+
+    AV_PIX_FMT_BGR565BE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), big-endian
+    AV_PIX_FMT_BGR565LE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), little-endian
+    AV_PIX_FMT_BGR555BE,  ///< packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), big-endian   , X=unused/undefined
+    AV_PIX_FMT_BGR555LE,  ///< packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), little-endian, X=unused/undefined
+
+#if FF_API_VAAPI
+    /** @name Deprecated pixel formats */
+    /**@{*/
+    AV_PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
+    AV_PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
+    AV_PIX_FMT_VAAPI_VLD,  ///< HW decoding through VA API, Picture.data[3] contains a VASurfaceID
+    /**@}*/
+    AV_PIX_FMT_VAAPI = AV_PIX_FMT_VAAPI_VLD,
+#else
+    /**
+     *  Hardware acceleration through VA-API, data[3] contains a
+     *  VASurfaceID.
+     */
+    AV_PIX_FMT_VAAPI,
+#endif
+
+    AV_PIX_FMT_YUV420P16LE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    AV_PIX_FMT_YUV420P16BE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    AV_PIX_FMT_YUV422P16LE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_YUV422P16BE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    AV_PIX_FMT_YUV444P16LE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    AV_PIX_FMT_YUV444P16BE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+#if FF_API_VDPAU
+    AV_PIX_FMT_VDPAU_MPEG4,  ///< MPEG-4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
+#endif
+    AV_PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer
+
+    AV_PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), little-endian, X=unused/undefined
+    AV_PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), big-endian,    X=unused/undefined
+    AV_PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), little-endian, X=unused/undefined
+    AV_PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), big-endian,    X=unused/undefined
+    AV_PIX_FMT_YA8,       ///< 8 bits gray, 8 bits alpha
+
+    AV_PIX_FMT_Y400A = AV_PIX_FMT_YA8, ///< alias for AV_PIX_FMT_YA8
+    AV_PIX_FMT_GRAY8A= AV_PIX_FMT_YA8, ///< alias for AV_PIX_FMT_YA8
+
+    AV_PIX_FMT_BGR48BE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as big-endian
+    AV_PIX_FMT_BGR48LE,   ///< packed RGB 16:16:16, 48bpp, 16B, 16G, 16R, the 2-byte value for each R/G/B component is stored as little-endian
+
+    /**
+     * The following 12 formats have the disadvantage of needing 1 format for each bit depth.
+     * Notice that each 9/10 bits sample is stored in 16 bits with extra padding.
+     * If you want to support multiple bit depths, then using AV_PIX_FMT_YUV420P16* with the bpp stored separately is better.
+     */
+    AV_PIX_FMT_YUV420P9BE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    AV_PIX_FMT_YUV420P9LE, ///< planar YUV 4:2:0, 13.5bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    AV_PIX_FMT_YUV420P10BE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    AV_PIX_FMT_YUV420P10LE,///< planar YUV 4:2:0, 15bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    AV_PIX_FMT_YUV422P10BE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    AV_PIX_FMT_YUV422P10LE,///< planar YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_YUV444P9BE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+    AV_PIX_FMT_YUV444P9LE, ///< planar YUV 4:4:4, 27bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    AV_PIX_FMT_YUV444P10BE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+    AV_PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    AV_PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    AV_PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_VDA_VLD,    ///< hardware decoding through VDA
+    AV_PIX_FMT_GBRP,      ///< planar GBR 4:4:4 24bpp
+    AV_PIX_FMT_GBR24P = AV_PIX_FMT_GBRP, // alias for #AV_PIX_FMT_GBRP
+    AV_PIX_FMT_GBRP9BE,   ///< planar GBR 4:4:4 27bpp, big-endian
+    AV_PIX_FMT_GBRP9LE,   ///< planar GBR 4:4:4 27bpp, little-endian
+    AV_PIX_FMT_GBRP10BE,  ///< planar GBR 4:4:4 30bpp, big-endian
+    AV_PIX_FMT_GBRP10LE,  ///< planar GBR 4:4:4 30bpp, little-endian
+    AV_PIX_FMT_GBRP16BE,  ///< planar GBR 4:4:4 48bpp, big-endian
+    AV_PIX_FMT_GBRP16LE,  ///< planar GBR 4:4:4 48bpp, little-endian
+    AV_PIX_FMT_YUVA422P,  ///< planar YUV 4:2:2 24bpp, (1 Cr & Cb sample per 2x1 Y & A samples)
+    AV_PIX_FMT_YUVA444P,  ///< planar YUV 4:4:4 32bpp, (1 Cr & Cb sample per 1x1 Y & A samples)
+    AV_PIX_FMT_YUVA420P9BE,  ///< planar YUV 4:2:0 22.5bpp, (1 Cr & Cb sample per 2x2 Y & A samples), big-endian
+    AV_PIX_FMT_YUVA420P9LE,  ///< planar YUV 4:2:0 22.5bpp, (1 Cr & Cb sample per 2x2 Y & A samples), little-endian
+    AV_PIX_FMT_YUVA422P9BE,  ///< planar YUV 4:2:2 27bpp, (1 Cr & Cb sample per 2x1 Y & A samples), big-endian
+    AV_PIX_FMT_YUVA422P9LE,  ///< planar YUV 4:2:2 27bpp, (1 Cr & Cb sample per 2x1 Y & A samples), little-endian
+    AV_PIX_FMT_YUVA444P9BE,  ///< planar YUV 4:4:4 36bpp, (1 Cr & Cb sample per 1x1 Y & A samples), big-endian
+    AV_PIX_FMT_YUVA444P9LE,  ///< planar YUV 4:4:4 36bpp, (1 Cr & Cb sample per 1x1 Y & A samples), little-endian
+    AV_PIX_FMT_YUVA420P10BE, ///< planar YUV 4:2:0 25bpp, (1 Cr & Cb sample per 2x2 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA420P10LE, ///< planar YUV 4:2:0 25bpp, (1 Cr & Cb sample per 2x2 Y & A samples, little-endian)
+    AV_PIX_FMT_YUVA422P10BE, ///< planar YUV 4:2:2 30bpp, (1 Cr & Cb sample per 2x1 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA422P10LE, ///< planar YUV 4:2:2 30bpp, (1 Cr & Cb sample per 2x1 Y & A samples, little-endian)
+    AV_PIX_FMT_YUVA444P10BE, ///< planar YUV 4:4:4 40bpp, (1 Cr & Cb sample per 1x1 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA444P10LE, ///< planar YUV 4:4:4 40bpp, (1 Cr & Cb sample per 1x1 Y & A samples, little-endian)
+    AV_PIX_FMT_YUVA420P16BE, ///< planar YUV 4:2:0 40bpp, (1 Cr & Cb sample per 2x2 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA420P16LE, ///< planar YUV 4:2:0 40bpp, (1 Cr & Cb sample per 2x2 Y & A samples, little-endian)
+    AV_PIX_FMT_YUVA422P16BE, ///< planar YUV 4:2:2 48bpp, (1 Cr & Cb sample per 2x1 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA422P16LE, ///< planar YUV 4:2:2 48bpp, (1 Cr & Cb sample per 2x1 Y & A samples, little-endian)
+    AV_PIX_FMT_YUVA444P16BE, ///< planar YUV 4:4:4 64bpp, (1 Cr & Cb sample per 1x1 Y & A samples, big-endian)
+    AV_PIX_FMT_YUVA444P16LE, ///< planar YUV 4:4:4 64bpp, (1 Cr & Cb sample per 1x1 Y & A samples, little-endian)
+
+    AV_PIX_FMT_VDPAU,     ///< HW acceleration through VDPAU, Picture.data[3] contains a VdpVideoSurface
+
+    AV_PIX_FMT_XYZ12LE,      ///< packed XYZ 4:4:4, 36 bpp, (msb) 12X, 12Y, 12Z (lsb), the 2-byte value for each X/Y/Z is stored as little-endian, the 4 lower bits are set to 0
+    AV_PIX_FMT_XYZ12BE,      ///< packed XYZ 4:4:4, 36 bpp, (msb) 12X, 12Y, 12Z (lsb), the 2-byte value for each X/Y/Z is stored as big-endian, the 4 lower bits are set to 0
+    AV_PIX_FMT_NV16,         ///< interleaved chroma YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
+    AV_PIX_FMT_NV20LE,       ///< interleaved chroma YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_NV20BE,       ///< interleaved chroma YUV 4:2:2, 20bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+
+    AV_PIX_FMT_RGBA64BE,     ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
+    AV_PIX_FMT_RGBA64LE,     ///< packed RGBA 16:16:16:16, 64bpp, 16R, 16G, 16B, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
+    AV_PIX_FMT_BGRA64BE,     ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as big-endian
+    AV_PIX_FMT_BGRA64LE,     ///< packed RGBA 16:16:16:16, 64bpp, 16B, 16G, 16R, 16A, the 2-byte value for each R/G/B/A component is stored as little-endian
+
+    AV_PIX_FMT_YVYU422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cr Y1 Cb
+
+    AV_PIX_FMT_VDA,          ///< HW acceleration through VDA, data[3] contains a CVPixelBufferRef
+
+    AV_PIX_FMT_YA16BE,       ///< 16 bits gray, 16 bits alpha (big-endian)
+    AV_PIX_FMT_YA16LE,       ///< 16 bits gray, 16 bits alpha (little-endian)
+
+    AV_PIX_FMT_GBRAP,        ///< planar GBRA 4:4:4:4 32bpp
+    AV_PIX_FMT_GBRAP16BE,    ///< planar GBRA 4:4:4:4 64bpp, big-endian
+    AV_PIX_FMT_GBRAP16LE,    ///< planar GBRA 4:4:4:4 64bpp, little-endian
+    /**
+     *  HW acceleration through QSV, data[3] contains a pointer to the
+     *  mfxFrameSurface1 structure.
+     */
+    AV_PIX_FMT_QSV,
+    /**
+     * HW acceleration through MMAL, data[3] contains a pointer to the
+     * MMAL_BUFFER_HEADER_T structure.
+     */
+    AV_PIX_FMT_MMAL,
+
+    AV_PIX_FMT_D3D11VA_VLD,  ///< HW decoding through Direct3D11, Picture.data[3] contains a ID3D11VideoDecoderOutputView pointer
+
+    /**
+     * HW acceleration through CUDA. data[i] contain CUdeviceptr pointers
+     * exactly as for system memory frames.
+     */
+    AV_PIX_FMT_CUDA,
+
+    AV_PIX_FMT_0RGB=0x123+4,///< packed RGB 8:8:8, 32bpp, XRGBXRGB...   X=unused/undefined
+    AV_PIX_FMT_RGB0,        ///< packed RGB 8:8:8, 32bpp, RGBXRGBX...   X=unused/undefined
+    AV_PIX_FMT_0BGR,        ///< packed BGR 8:8:8, 32bpp, XBGRXBGR...   X=unused/undefined
+    AV_PIX_FMT_BGR0,        ///< packed BGR 8:8:8, 32bpp, BGRXBGRX...   X=unused/undefined
+
+    AV_PIX_FMT_YUV420P12BE, ///< planar YUV 4:2:0,18bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    AV_PIX_FMT_YUV420P12LE, ///< planar YUV 4:2:0,18bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    AV_PIX_FMT_YUV420P14BE, ///< planar YUV 4:2:0,21bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
+    AV_PIX_FMT_YUV420P14LE, ///< planar YUV 4:2:0,21bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
+    AV_PIX_FMT_YUV422P12BE, ///< planar YUV 4:2:2,24bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    AV_PIX_FMT_YUV422P12LE, ///< planar YUV 4:2:2,24bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_YUV422P14BE, ///< planar YUV 4:2:2,28bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
+    AV_PIX_FMT_YUV422P14LE, ///< planar YUV 4:2:2,28bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
+    AV_PIX_FMT_YUV444P12BE, ///< planar YUV 4:4:4,36bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+    AV_PIX_FMT_YUV444P12LE, ///< planar YUV 4:4:4,36bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    AV_PIX_FMT_YUV444P14BE, ///< planar YUV 4:4:4,42bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
+    AV_PIX_FMT_YUV444P14LE, ///< planar YUV 4:4:4,42bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
+    AV_PIX_FMT_GBRP12BE,    ///< planar GBR 4:4:4 36bpp, big-endian
+    AV_PIX_FMT_GBRP12LE,    ///< planar GBR 4:4:4 36bpp, little-endian
+    AV_PIX_FMT_GBRP14BE,    ///< planar GBR 4:4:4 42bpp, big-endian
+    AV_PIX_FMT_GBRP14LE,    ///< planar GBR 4:4:4 42bpp, little-endian
+    AV_PIX_FMT_YUVJ411P,    ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples) full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV411P and setting color_range
+
+    AV_PIX_FMT_BAYER_BGGR8,    ///< bayer, BGBG..(odd line), GRGR..(even line), 8-bit samples
+    AV_PIX_FMT_BAYER_RGGB8,    ///< bayer, RGRG..(odd line), GBGB..(even line), 8-bit samples
+    AV_PIX_FMT_BAYER_GBRG8,    ///< bayer, GBGB..(odd line), RGRG..(even line), 8-bit samples
+    AV_PIX_FMT_BAYER_GRBG8,    ///< bayer, GRGR..(odd line), BGBG..(even line), 8-bit samples
+    AV_PIX_FMT_BAYER_BGGR16LE, ///< bayer, BGBG..(odd line), GRGR..(even line), 16-bit samples, little-endian
+    AV_PIX_FMT_BAYER_BGGR16BE, ///< bayer, BGBG..(odd line), GRGR..(even line), 16-bit samples, big-endian
+    AV_PIX_FMT_BAYER_RGGB16LE, ///< bayer, RGRG..(odd line), GBGB..(even line), 16-bit samples, little-endian
+    AV_PIX_FMT_BAYER_RGGB16BE, ///< bayer, RGRG..(odd line), GBGB..(even line), 16-bit samples, big-endian
+    AV_PIX_FMT_BAYER_GBRG16LE, ///< bayer, GBGB..(odd line), RGRG..(even line), 16-bit samples, little-endian
+    AV_PIX_FMT_BAYER_GBRG16BE, ///< bayer, GBGB..(odd line), RGRG..(even line), 16-bit samples, big-endian
+    AV_PIX_FMT_BAYER_GRBG16LE, ///< bayer, GRGR..(odd line), BGBG..(even line), 16-bit samples, little-endian
+    AV_PIX_FMT_BAYER_GRBG16BE, ///< bayer, GRGR..(odd line), BGBG..(even line), 16-bit samples, big-endian
+#if !FF_API_XVMC
+    AV_PIX_FMT_XVMC,///< XVideo Motion Acceleration via common packet passing
+#endif /* !FF_API_XVMC */
+    AV_PIX_FMT_YUV440P10LE, ///< planar YUV 4:4:0,20bpp, (1 Cr & Cb sample per 1x2 Y samples), little-endian
+    AV_PIX_FMT_YUV440P10BE, ///< planar YUV 4:4:0,20bpp, (1 Cr & Cb sample per 1x2 Y samples), big-endian
+    AV_PIX_FMT_YUV440P12LE, ///< planar YUV 4:4:0,24bpp, (1 Cr & Cb sample per 1x2 Y samples), little-endian
+    AV_PIX_FMT_YUV440P12BE, ///< planar YUV 4:4:0,24bpp, (1 Cr & Cb sample per 1x2 Y samples), big-endian
+    AV_PIX_FMT_AYUV64LE,    ///< packed AYUV 4:4:4,64bpp (1 Cr & Cb sample per 1x1 Y & A samples), little-endian
+    AV_PIX_FMT_AYUV64BE,    ///< packed AYUV 4:4:4,64bpp (1 Cr & Cb sample per 1x1 Y & A samples), big-endian
+
+    AV_PIX_FMT_VIDEOTOOLBOX, ///< hardware decoding through Videotoolbox
+
+    AV_PIX_FMT_P010LE, ///< like NV12, with 10bpp per component, data in the high bits, zeros in the low bits, little-endian
+    AV_PIX_FMT_P010BE, ///< like NV12, with 10bpp per component, data in the high bits, zeros in the low bits, big-endian
+
+    AV_PIX_FMT_GBRAP12BE,  ///< planar GBR 4:4:4:4 48bpp, big-endian
+    AV_PIX_FMT_GBRAP12LE,  ///< planar GBR 4:4:4:4 48bpp, little-endian
+
+    AV_PIX_FMT_GBRAP10BE,  ///< planar GBR 4:4:4:4 40bpp, big-endian
+    AV_PIX_FMT_GBRAP10LE,  ///< planar GBR 4:4:4:4 40bpp, little-endian
+
+    AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec
+
+    AV_PIX_FMT_GRAY12BE,   ///<        Y        , 12bpp, big-endian
+    AV_PIX_FMT_GRAY12LE,   ///<        Y        , 12bpp, little-endian
+    AV_PIX_FMT_GRAY10BE,   ///<        Y        , 10bpp, big-endian
+    AV_PIX_FMT_GRAY10LE,   ///<        Y        , 10bpp, little-endian
+
+    AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian
+    AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian
+
+    AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+};
+
+#if AV_HAVE_BIGENDIAN
+#   define AV_PIX_FMT_NE(be, le) AV_PIX_FMT_##be /* native-endian name resolves to the big-endian variant */
+#else
+#   define AV_PIX_FMT_NE(be, le) AV_PIX_FMT_##le /* native-endian name resolves to the little-endian variant */
+#endif
+
+#define AV_PIX_FMT_RGB32   AV_PIX_FMT_NE(ARGB, BGRA)
+#define AV_PIX_FMT_RGB32_1 AV_PIX_FMT_NE(RGBA, ABGR)
+#define AV_PIX_FMT_BGR32   AV_PIX_FMT_NE(ABGR, RGBA)
+#define AV_PIX_FMT_BGR32_1 AV_PIX_FMT_NE(BGRA, ARGB)
+#define AV_PIX_FMT_0RGB32  AV_PIX_FMT_NE(0RGB, BGR0)
+#define AV_PIX_FMT_0BGR32  AV_PIX_FMT_NE(0BGR, RGB0)
+
+#define AV_PIX_FMT_GRAY10 AV_PIX_FMT_NE(GRAY10BE, GRAY10LE)
+#define AV_PIX_FMT_GRAY12 AV_PIX_FMT_NE(GRAY12BE, GRAY12LE)
+#define AV_PIX_FMT_GRAY16 AV_PIX_FMT_NE(GRAY16BE, GRAY16LE)
+#define AV_PIX_FMT_YA16   AV_PIX_FMT_NE(YA16BE,   YA16LE)
+#define AV_PIX_FMT_RGB48  AV_PIX_FMT_NE(RGB48BE,  RGB48LE)
+#define AV_PIX_FMT_RGB565 AV_PIX_FMT_NE(RGB565BE, RGB565LE)
+#define AV_PIX_FMT_RGB555 AV_PIX_FMT_NE(RGB555BE, RGB555LE)
+#define AV_PIX_FMT_RGB444 AV_PIX_FMT_NE(RGB444BE, RGB444LE)
+#define AV_PIX_FMT_RGBA64 AV_PIX_FMT_NE(RGBA64BE, RGBA64LE)
+#define AV_PIX_FMT_BGR48  AV_PIX_FMT_NE(BGR48BE,  BGR48LE)
+#define AV_PIX_FMT_BGR565 AV_PIX_FMT_NE(BGR565BE, BGR565LE)
+#define AV_PIX_FMT_BGR555 AV_PIX_FMT_NE(BGR555BE, BGR555LE)
+#define AV_PIX_FMT_BGR444 AV_PIX_FMT_NE(BGR444BE, BGR444LE)
+#define AV_PIX_FMT_BGRA64 AV_PIX_FMT_NE(BGRA64BE, BGRA64LE)
+
+#define AV_PIX_FMT_YUV420P9  AV_PIX_FMT_NE(YUV420P9BE , YUV420P9LE)
+#define AV_PIX_FMT_YUV422P9  AV_PIX_FMT_NE(YUV422P9BE , YUV422P9LE)
+#define AV_PIX_FMT_YUV444P9  AV_PIX_FMT_NE(YUV444P9BE , YUV444P9LE)
+#define AV_PIX_FMT_YUV420P10 AV_PIX_FMT_NE(YUV420P10BE, YUV420P10LE)
+#define AV_PIX_FMT_YUV422P10 AV_PIX_FMT_NE(YUV422P10BE, YUV422P10LE)
+#define AV_PIX_FMT_YUV440P10 AV_PIX_FMT_NE(YUV440P10BE, YUV440P10LE)
+#define AV_PIX_FMT_YUV444P10 AV_PIX_FMT_NE(YUV444P10BE, YUV444P10LE)
+#define AV_PIX_FMT_YUV420P12 AV_PIX_FMT_NE(YUV420P12BE, YUV420P12LE)
+#define AV_PIX_FMT_YUV422P12 AV_PIX_FMT_NE(YUV422P12BE, YUV422P12LE)
+#define AV_PIX_FMT_YUV440P12 AV_PIX_FMT_NE(YUV440P12BE, YUV440P12LE)
+#define AV_PIX_FMT_YUV444P12 AV_PIX_FMT_NE(YUV444P12BE, YUV444P12LE)
+#define AV_PIX_FMT_YUV420P14 AV_PIX_FMT_NE(YUV420P14BE, YUV420P14LE)
+#define AV_PIX_FMT_YUV422P14 AV_PIX_FMT_NE(YUV422P14BE, YUV422P14LE)
+#define AV_PIX_FMT_YUV444P14 AV_PIX_FMT_NE(YUV444P14BE, YUV444P14LE)
+#define AV_PIX_FMT_YUV420P16 AV_PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
+#define AV_PIX_FMT_YUV422P16 AV_PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
+#define AV_PIX_FMT_YUV444P16 AV_PIX_FMT_NE(YUV444P16BE, YUV444P16LE)
+
+#define AV_PIX_FMT_GBRP9     AV_PIX_FMT_NE(GBRP9BE ,    GBRP9LE)
+#define AV_PIX_FMT_GBRP10    AV_PIX_FMT_NE(GBRP10BE,    GBRP10LE)
+#define AV_PIX_FMT_GBRP12    AV_PIX_FMT_NE(GBRP12BE,    GBRP12LE)
+#define AV_PIX_FMT_GBRP14    AV_PIX_FMT_NE(GBRP14BE,    GBRP14LE)
+#define AV_PIX_FMT_GBRP16    AV_PIX_FMT_NE(GBRP16BE,    GBRP16LE)
+#define AV_PIX_FMT_GBRAP10   AV_PIX_FMT_NE(GBRAP10BE,   GBRAP10LE)
+#define AV_PIX_FMT_GBRAP12   AV_PIX_FMT_NE(GBRAP12BE,   GBRAP12LE)
+#define AV_PIX_FMT_GBRAP16   AV_PIX_FMT_NE(GBRAP16BE,   GBRAP16LE)
+
+#define AV_PIX_FMT_BAYER_BGGR16 AV_PIX_FMT_NE(BAYER_BGGR16BE,    BAYER_BGGR16LE)
+#define AV_PIX_FMT_BAYER_RGGB16 AV_PIX_FMT_NE(BAYER_RGGB16BE,    BAYER_RGGB16LE)
+#define AV_PIX_FMT_BAYER_GBRG16 AV_PIX_FMT_NE(BAYER_GBRG16BE,    BAYER_GBRG16LE)
+#define AV_PIX_FMT_BAYER_GRBG16 AV_PIX_FMT_NE(BAYER_GRBG16BE,    BAYER_GRBG16LE)
+
+
+#define AV_PIX_FMT_YUVA420P9  AV_PIX_FMT_NE(YUVA420P9BE , YUVA420P9LE)
+#define AV_PIX_FMT_YUVA422P9  AV_PIX_FMT_NE(YUVA422P9BE , YUVA422P9LE)
+#define AV_PIX_FMT_YUVA444P9  AV_PIX_FMT_NE(YUVA444P9BE , YUVA444P9LE)
+#define AV_PIX_FMT_YUVA420P10 AV_PIX_FMT_NE(YUVA420P10BE, YUVA420P10LE)
+#define AV_PIX_FMT_YUVA422P10 AV_PIX_FMT_NE(YUVA422P10BE, YUVA422P10LE)
+#define AV_PIX_FMT_YUVA444P10 AV_PIX_FMT_NE(YUVA444P10BE, YUVA444P10LE)
+#define AV_PIX_FMT_YUVA420P16 AV_PIX_FMT_NE(YUVA420P16BE, YUVA420P16LE)
+#define AV_PIX_FMT_YUVA422P16 AV_PIX_FMT_NE(YUVA422P16BE, YUVA422P16LE)
+#define AV_PIX_FMT_YUVA444P16 AV_PIX_FMT_NE(YUVA444P16BE, YUVA444P16LE)
+
+#define AV_PIX_FMT_XYZ12      AV_PIX_FMT_NE(XYZ12BE, XYZ12LE)
+#define AV_PIX_FMT_NV20       AV_PIX_FMT_NE(NV20BE,  NV20LE)
+#define AV_PIX_FMT_AYUV64     AV_PIX_FMT_NE(AYUV64BE, AYUV64LE)
+#define AV_PIX_FMT_P010       AV_PIX_FMT_NE(P010BE,  P010LE)
+#define AV_PIX_FMT_P016       AV_PIX_FMT_NE(P016BE,  P016LE)
+
+/**
+  * Chromaticity coordinates of the source primaries.
+  */
+enum AVColorPrimaries {
+    AVCOL_PRI_RESERVED0   = 0,
+    AVCOL_PRI_BT709       = 1,  ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B
+    AVCOL_PRI_UNSPECIFIED = 2,
+    AVCOL_PRI_RESERVED    = 3,
+    AVCOL_PRI_BT470M      = 4,  ///< also FCC Title 47 Code of Federal Regulations 73.682 (a)(20)
+
+    AVCOL_PRI_BT470BG     = 5,  ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM
+    AVCOL_PRI_SMPTE170M   = 6,  ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC
+    AVCOL_PRI_SMPTE240M   = 7,  ///< functionally identical to AVCOL_PRI_SMPTE170M
+    AVCOL_PRI_FILM        = 8,  ///< colour filters using Illuminant C
+    AVCOL_PRI_BT2020      = 9,  ///< ITU-R BT2020
+    AVCOL_PRI_SMPTE428    = 10, ///< SMPTE ST 428-1 (CIE 1931 XYZ)
+    AVCOL_PRI_SMPTEST428_1 = AVCOL_PRI_SMPTE428, ///< alias with the same value as AVCOL_PRI_SMPTE428
+    AVCOL_PRI_SMPTE431    = 11, ///< SMPTE ST 431-2 (2011) / DCI P3
+    AVCOL_PRI_SMPTE432    = 12, ///< SMPTE ST 432-1 (2010) / P3 D65 / Display P3
+    AVCOL_PRI_JEDEC_P22   = 22, ///< JEDEC P22 phosphors
+    AVCOL_PRI_NB                ///< Not part of ABI; evaluates to 23 (follows JEDEC_P22 = 22), so it is not a count of the defined values
+};
+
+/**
+ * Color Transfer Characteristic.
+ */
+enum AVColorTransferCharacteristic {
+    AVCOL_TRC_RESERVED0    = 0,
+    AVCOL_TRC_BT709        = 1,  ///< also ITU-R BT1361
+    AVCOL_TRC_UNSPECIFIED  = 2,
+    AVCOL_TRC_RESERVED     = 3,
+    AVCOL_TRC_GAMMA22      = 4,  ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM
+    AVCOL_TRC_GAMMA28      = 5,  ///< also ITU-R BT470BG
+    AVCOL_TRC_SMPTE170M    = 6,  ///< also ITU-R BT601-6 525 or 625 / ITU-R BT1358 525 or 625 / ITU-R BT1700 NTSC
+    AVCOL_TRC_SMPTE240M    = 7,
+    AVCOL_TRC_LINEAR       = 8,  ///< "Linear transfer characteristics"
+    AVCOL_TRC_LOG          = 9,  ///< "Logarithmic transfer characteristic (100:1 range)"
+    AVCOL_TRC_LOG_SQRT     = 10, ///< "Logarithmic transfer characteristic (100 * Sqrt(10) : 1 range)"
+    AVCOL_TRC_IEC61966_2_4 = 11, ///< IEC 61966-2-4
+    AVCOL_TRC_BT1361_ECG   = 12, ///< ITU-R BT1361 Extended Colour Gamut
+    AVCOL_TRC_IEC61966_2_1 = 13, ///< IEC 61966-2-1 (sRGB or sYCC)
+    AVCOL_TRC_BT2020_10    = 14, ///< ITU-R BT2020 for 10-bit system
+    AVCOL_TRC_BT2020_12    = 15, ///< ITU-R BT2020 for 12-bit system
+    AVCOL_TRC_SMPTE2084    = 16, ///< SMPTE ST 2084 for 10-, 12-, 14- and 16-bit systems
+    AVCOL_TRC_SMPTEST2084  = AVCOL_TRC_SMPTE2084, ///< alias with the same value as AVCOL_TRC_SMPTE2084
+    AVCOL_TRC_SMPTE428     = 17, ///< SMPTE ST 428-1
+    AVCOL_TRC_SMPTEST428_1 = AVCOL_TRC_SMPTE428, ///< alias with the same value as AVCOL_TRC_SMPTE428
+    AVCOL_TRC_ARIB_STD_B67 = 18, ///< ARIB STD-B67, known as "Hybrid log-gamma"
+    AVCOL_TRC_NB                 ///< Not part of ABI; evaluates to 19 in this version (one past ARIB_STD_B67)
+};
+
+/**
+ * YUV colorspace type.
+ */
+enum AVColorSpace {
+    AVCOL_SPC_RGB         = 0,  ///< order of coefficients is actually GBR, also IEC 61966-2-1 (sRGB)
+    AVCOL_SPC_BT709       = 1,  ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B
+    AVCOL_SPC_UNSPECIFIED = 2,
+    AVCOL_SPC_RESERVED    = 3,
+    AVCOL_SPC_FCC         = 4,  ///< FCC Title 47 Code of Federal Regulations 73.682 (a)(20)
+    AVCOL_SPC_BT470BG     = 5,  ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601
+    AVCOL_SPC_SMPTE170M   = 6,  ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC
+    AVCOL_SPC_SMPTE240M   = 7,  ///< functionally identical to AVCOL_SPC_SMPTE170M
+    AVCOL_SPC_YCGCO       = 8,  ///< Used by Dirac / VC-2 and H.264 FRext, see ITU-T SG16
+    AVCOL_SPC_YCOCG       = AVCOL_SPC_YCGCO, ///< alias with the same value (8) as AVCOL_SPC_YCGCO
+    AVCOL_SPC_BT2020_NCL  = 9,  ///< ITU-R BT2020 non-constant luminance system
+    AVCOL_SPC_BT2020_CL   = 10, ///< ITU-R BT2020 constant luminance system
+    AVCOL_SPC_SMPTE2085   = 11, ///< SMPTE 2085, Y'D'zD'x
+    AVCOL_SPC_NB                ///< Not part of ABI; evaluates to 12 in this version (one past SMPTE2085)
+};
+#define AVCOL_SPC_YCGCO AVCOL_SPC_YCOCG /* from here on the name expands to the AVCOL_SPC_YCOCG enumerator, which has the same value (8) */
+
+
+/**
+ * MPEG vs JPEG YUV range.
+ */
+enum AVColorRange {
+    AVCOL_RANGE_UNSPECIFIED = 0,
+    AVCOL_RANGE_MPEG        = 1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges
+    AVCOL_RANGE_JPEG        = 2, ///< the normal     2^n-1   "JPEG" YUV ranges
+    AVCOL_RANGE_NB               ///< Not part of ABI; evaluates to 3 in this version (one past AVCOL_RANGE_JPEG)
+};
+
+/**
+ * Location of chroma samples.
+ *
+ * Illustration showing the location of the first (top left) chroma sample of the
+ * image, the left shows only luma, the right
+ * shows the location of the chroma sample, the 2 could be imagined to overlay
+ * each other but are drawn separately due to limitations of ASCII
+ *
+ *                1st 2nd       1st 2nd horizontal luma sample positions
+ *                 v   v         v   v
+ *                 ______        ______
+ *1st luma line > |X   X ...    |3 4 X ...     X are luma samples,
+ *                |             |1 2           1-6 are possible chroma positions
+ *2nd luma line > |X   X ...    |5 6 X ...     0 is undefined/unknown position
+ */
+enum AVChromaLocation {
+    AVCHROMA_LOC_UNSPECIFIED = 0,
+    AVCHROMA_LOC_LEFT        = 1, ///< MPEG-2/4 4:2:0, H.264 default for 4:2:0
+    AVCHROMA_LOC_CENTER      = 2, ///< MPEG-1 4:2:0, JPEG 4:2:0, H.263 4:2:0
+    AVCHROMA_LOC_TOPLEFT     = 3, ///< ITU-R 601, SMPTE 274M 296M S314M(DV 4:1:1), mpeg2 4:2:2
+    AVCHROMA_LOC_TOP         = 4, ///< position 4 in the illustration above
+    AVCHROMA_LOC_BOTTOMLEFT  = 5, ///< position 5 in the illustration above
+    AVCHROMA_LOC_BOTTOM      = 6, ///< position 6 in the illustration above
+    AVCHROMA_LOC_NB               ///< Not part of ABI; evaluates to 7 in this version (one past AVCHROMA_LOC_BOTTOM)
+};
+
+#endif /* AVUTIL_PIXFMT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/random_seed.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/random_seed.h
new file mode 100644
index 0000000..0462a04
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/random_seed.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2009 Baptiste Coudurier <baptiste.coudurier@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RANDOM_SEED_H
+#define AVUTIL_RANDOM_SEED_H
+
+#include <stdint.h>
+/**
+ * @addtogroup lavu_crypto
+ * @{
+ */
+
+/**
+ * Get a seed to use in conjunction with random functions.
+ * This function tries to provide a good seed on a best-effort basis.
+ * It is possible to call this function multiple times if more bits are needed.
+ * It can be quite slow, which is why it should only be used as seed for a faster
+ * PRNG. The quality of the seed depends on the platform.
+ */
+uint32_t av_get_random_seed(void);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_RANDOM_SEED_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rational.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rational.h
new file mode 100644
index 0000000..5c6b67b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rational.h
@@ -0,0 +1,214 @@
+/*
+ * rational numbers
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_math_rational
+ * Utilities for rational number calculation.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVUTIL_RATIONAL_H
+#define AVUTIL_RATIONAL_H
+
+#include <stdint.h>
+#include <limits.h>
+#include "attributes.h"
+
+/**
+ * @defgroup lavu_math_rational AVRational
+ * @ingroup lavu_math
+ * Rational number calculation.
+ *
+ * While rational numbers can be expressed as floating-point numbers, the
+ * conversion process is a lossy one, so are floating-point operations. On the
+ * other hand, the nature of FFmpeg demands highly accurate calculation of
+ * timestamps. This set of rational number utilities serves as a generic
+ * interface for manipulating rational numbers as pairs of numerators and
+ * denominators.
+ *
+ * Many of the functions that operate on AVRational's have the suffix `_q`, in
+ * reference to the mathematical symbol "ℚ" (Q) which denotes the set of all
+ * rational numbers.
+ *
+ * @{
+ */
+
+/**
+ * Rational number (pair of numerator and denominator).
+ */
+typedef struct AVRational{
+    int num; ///< Numerator
+    int den; ///< Denominator; 0 occurs in the special forms used in this header (`0 / 0`, and `{1, 0}` / `{-1, 0}` for infinities)
+} AVRational;
+
+/**
+ * Create an AVRational.
+ *
+ * Useful for compilers that do not support compound literals.
+ *
+ * @note The return value is not reduced.
+ * @see av_reduce()
+ */
+static inline AVRational av_make_q(int num, int den)
+{
+    AVRational r = { num, den }; /* plain aggregate initializer (num first, then den); nothing is reduced or validated */
+    return r;
+}
+
+/**
+ * Compare two rationals.
+ *
+ * @param a First rational
+ * @param b Second rational
+ *
+ * @return One of the following values:
+ *         - 0 if `a == b`
+ *         - 1 if `a > b`
+ *         - -1 if `a < b`
+ *         - `INT_MIN` if one of the values is of the form `0 / 0`
+ */
+static inline int av_cmp_q(AVRational a, AVRational b){
+    const int64_t tmp= a.num * (int64_t)b.den - b.num * (int64_t)a.den; /* cross-multiplied difference; operands are widened to 64 bits before multiplying */
+
+    if(tmp) return (int)((tmp ^ a.den ^ b.den)>>63)|1; /* XOR the sign bits of tmp and both denominators, then map the result to 1 or -1 */
+    else if(b.den && a.den) return 0; /* zero difference with both denominators non-zero: the values are equal */
+    else if(a.num && b.num) return (a.num>>31) - (b.num>>31); /* a zero denominator is involved: order by numerator sign (assumes 32-bit int and arithmetic right shift -- TODO confirm for the target) */
+    else                    return INT_MIN; /* remaining case: one of the operands is of the form `0 / 0` */
+}
+
+/**
+ * Convert an AVRational to a `double`.
+ * @param a AVRational to convert
+ * @return `a` in floating-point form
+ * @see av_d2q()
+ */
+static inline double av_q2d(AVRational a){
+    return a.num / (double) a.den; /* the cast makes this a floating-point division; a.den is not checked for zero here */
+}
+
+/**
+ * Reduce a fraction.
+ *
+ * This is useful for framerate calculations.
+ *
+ * @param[out] dst_num Destination numerator
+ * @param[out] dst_den Destination denominator
+ * @param[in]      num Source numerator
+ * @param[in]      den Source denominator
+ * @param[in]      max Maximum allowed values for `dst_num` & `dst_den`
+ * @return 1 if the operation is exact, 0 otherwise
+ */
+int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max);
+
+/**
+ * Multiply two rationals.
+ * @param b First rational
+ * @param c Second rational
+ * @return b*c
+ */
+AVRational av_mul_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Divide one rational by another.
+ * @param b First rational
+ * @param c Second rational
+ * @return b/c
+ */
+AVRational av_div_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Add two rationals.
+ * @param b First rational
+ * @param c Second rational
+ * @return b+c
+ */
+AVRational av_add_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Subtract one rational from another.
+ * @param b First rational
+ * @param c Second rational
+ * @return b-c
+ */
+AVRational av_sub_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Invert a rational.
+ * @param q value
+ * @return 1 / q
+ */
+static av_always_inline AVRational av_inv_q(AVRational q)
+{
+    AVRational r = { q.den, q.num }; /* swap the fields: the old denominator becomes the numerator and vice versa; no reduction or sign normalization */
+    return r;
+}
+
+/**
+ * Convert a double precision floating point number to a rational.
+ *
+ * In case of infinity, the returned value is expressed as `{1, 0}` or
+ * `{-1, 0}` depending on the sign.
+ *
+ * @param d   `double` to convert
+ * @param max Maximum allowed numerator and denominator
+ * @return `d` in AVRational form
+ * @see av_q2d()
+ */
+AVRational av_d2q(double d, int max) av_const;
+
+/**
+ * Find which of the two rationals is closer to another rational.
+ *
+ * @param q     Rational to be compared against
+ * @param q1,q2 Rationals to be tested
+ * @return One of the following values:
+ *         - 1 if `q1` is nearer to `q` than `q2`
+ *         - -1 if `q2` is nearer to `q` than `q1`
+ *         - 0 if they have the same distance
+ */
+int av_nearer_q(AVRational q, AVRational q1, AVRational q2);
+
+/**
+ * Find the value in a list of rationals nearest a given reference rational.
+ *
+ * @param q      Reference rational
+ * @param q_list Array of rationals terminated by `{0, 0}`
+ * @return Index of the nearest value found in the array
+ */
+int av_find_nearest_q_idx(AVRational q, const AVRational* q_list);
+
+/**
+ * Convert an AVRational to an IEEE 32-bit `float` expressed in fixed-point
+ * format.
+ *
+ * @param q Rational to be converted
+ * @return Equivalent floating-point value, expressed as an unsigned 32-bit
+ *         integer.
+ * @note The returned value is platform-independent.
+ */
+uint32_t av_q2intfloat(AVRational q);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_RATIONAL_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rc4.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rc4.h
new file mode 100644
index 0000000..029cd2a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/rc4.h
@@ -0,0 +1,66 @@
+/*
+ * RC4 encryption/decryption/pseudo-random number generator
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RC4_H
+#define AVUTIL_RC4_H
+
+#include <stdint.h>
+
+/**
+ * @defgroup lavu_rc4 RC4
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+typedef struct AVRC4 {
+    uint8_t state[256]; ///< 256-byte cipher state; NOTE(review): presumably the RC4 permutation table -- confirm against rc4.c
+    int x, y; ///< NOTE(review): presumably the two running RC4 indices into state -- confirm against rc4.c
+} AVRC4;
+
+/**
+ * Allocate an AVRC4 context.
+ */
+AVRC4 *av_rc4_alloc(void);
+
+/**
+ * @brief Initializes an AVRC4 context.
+ *
+ * @param key_bits must be a multiple of 8
+ * @param decrypt 0 for encryption, 1 for decryption, currently has no effect
+ * @return zero on success, negative value otherwise
+ */
+int av_rc4_init(struct AVRC4 *d, const uint8_t *key, int key_bits, int decrypt);
+
+/**
+ * @brief Encrypts / decrypts using the RC4 algorithm.
+ *
+ * @param count number of bytes
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst, may be NULL
+ * @param iv not (yet) used for RC4, should be NULL
+ * @param decrypt 0 for encryption, 1 for decryption, not (yet) used
+ */
+void av_rc4_crypt(struct AVRC4 *d, uint8_t *dst, const uint8_t *src, int count, uint8_t *iv, int decrypt);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_RC4_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/replaygain.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/replaygain.h
new file mode 100644
index 0000000..b49bf1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/replaygain.h
@@ -0,0 +1,50 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_REPLAYGAIN_H
+#define AVUTIL_REPLAYGAIN_H
+
+#include <stdint.h>
+
+/**
+ * ReplayGain information (see
+ * http://wiki.hydrogenaudio.org/index.php?title=ReplayGain_1.0_specification).
+ * The size of this struct is a part of the public ABI.
+ */
+typedef struct AVReplayGain {
+    /**
+     * Track replay gain in microbels (divide by 100000 to get the value in dB).
+     * Should be set to INT32_MIN when unknown.
+     */
+    int32_t track_gain;
+    /**
+     * Peak track amplitude, with 100000 representing full scale (but values
+     * may overflow). 0 when unknown.
+     */
+    uint32_t track_peak;
+    /**
+     * Same as track_gain (microbels, INT32_MIN when unknown), but for the whole album.
+     */
+    int32_t album_gain;
+    /**
+     * Same as track_peak (100000 is full scale, 0 when unknown), but for the whole album.
+     */
+    uint32_t album_peak;
+} AVReplayGain;
+
+#endif /* AVUTIL_REPLAYGAIN_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ripemd.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ripemd.h
new file mode 100644
index 0000000..6d6bb32
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/ripemd.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2007 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2013 James Almer <jamrial@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_ripemd
+ * Public header for RIPEMD hash function implementation.
+ */
+
+#ifndef AVUTIL_RIPEMD_H
+#define AVUTIL_RIPEMD_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_ripemd RIPEMD
+ * @ingroup lavu_hash
+ * RIPEMD hash function implementation.
+ *
+ * @{
+ */
+
+extern const int av_ripemd_size;
+
+struct AVRIPEMD;
+
+/**
+ * Allocate an AVRIPEMD context.
+ */
+struct AVRIPEMD *av_ripemd_alloc(void);
+
+/**
+ * Initialize RIPEMD hashing.
+ *
+ * @param context pointer to the function context (of size av_ripemd_size)
+ * @param bits    number of bits in digest (128, 160, 256 or 320 bits)
+ * @return        zero if initialization succeeded, -1 otherwise
+ */
+int av_ripemd_init(struct AVRIPEMD* context, int bits);
+
+/**
+ * Update hash value.
+ *
+ * @param context hash function context
+ * @param data    input data to update hash with
+ * @param len     input data length
+ */
+void av_ripemd_update(struct AVRIPEMD* context, const uint8_t* data, unsigned int len);
+
+/**
+ * Finish hashing and output digest value.
+ *
+ * @param context hash function context
+ * @param digest  buffer where output digest value is stored
+ */
+void av_ripemd_final(struct AVRIPEMD* context, uint8_t *digest);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_RIPEMD_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/samplefmt.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/samplefmt.h
new file mode 100644
index 0000000..8cd43ae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/samplefmt.h
@@ -0,0 +1,272 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_SAMPLEFMT_H
+#define AVUTIL_SAMPLEFMT_H
+
+#include <stdint.h>
+
+#include "avutil.h"
+#include "attributes.h"
+
+/**
+ * @addtogroup lavu_audio
+ * @{
+ *
+ * @defgroup lavu_sampfmts Audio sample formats
+ *
+ * Audio sample format enumeration and related convenience functions.
+ * @{
+ */
+
+/**
+ * Audio sample formats
+ *
+ * - The data described by the sample format is always in native-endian order.
+ *   Sample values can be expressed by native C types, hence the lack of a signed
+ *   24-bit sample format even though it is a common raw audio data format.
+ *
+ * - The floating-point formats are based on full volume being in the range
+ *   [-1.0, 1.0]. Any values outside this range are beyond full volume level.
+ *
+ * - The data layout as used in av_samples_fill_arrays() and elsewhere in FFmpeg
+ *   (such as AVFrame in libavcodec) is as follows:
+ *
+ * @par
+ * For planar sample formats, each audio channel is in a separate data plane,
+ * and linesize is the buffer size, in bytes, for a single plane. All data
+ * planes must be the same size. For packed sample formats, only the first data
+ * plane is used, and samples for each channel are interleaved. In this case,
+ * linesize is the buffer size, in bytes, for the 1 plane.
+ *
+ */
+enum AVSampleFormat {
+    AV_SAMPLE_FMT_NONE = -1,
+    AV_SAMPLE_FMT_U8,          ///< unsigned 8 bits
+    AV_SAMPLE_FMT_S16,         ///< signed 16 bits
+    AV_SAMPLE_FMT_S32,         ///< signed 32 bits
+    AV_SAMPLE_FMT_FLT,         ///< float
+    AV_SAMPLE_FMT_DBL,         ///< double
+
+    AV_SAMPLE_FMT_U8P,         ///< unsigned 8 bits, planar
+    AV_SAMPLE_FMT_S16P,        ///< signed 16 bits, planar
+    AV_SAMPLE_FMT_S32P,        ///< signed 32 bits, planar
+    AV_SAMPLE_FMT_FLTP,        ///< float, planar
+    AV_SAMPLE_FMT_DBLP,        ///< double, planar
+    AV_SAMPLE_FMT_S64,         ///< signed 64 bits
+    AV_SAMPLE_FMT_S64P,        ///< signed 64 bits, planar
+
+    AV_SAMPLE_FMT_NB           ///< Number of sample formats. DO NOT USE if linking dynamically
+};
+
+/**
+ * Return the name of sample_fmt, or NULL if sample_fmt is not
+ * recognized.
+ */
+const char *av_get_sample_fmt_name(enum AVSampleFormat sample_fmt);
+
+/**
+ * Return a sample format corresponding to name, or AV_SAMPLE_FMT_NONE
+ * on error.
+ */
+enum AVSampleFormat av_get_sample_fmt(const char *name);
+
+/**
+ * Return the planar<->packed alternative form of the given sample format, or
+ * AV_SAMPLE_FMT_NONE on error. If the passed sample_fmt is already in the
+ * requested planar/packed format, the format returned is the same as the
+ * input.
+ */
+enum AVSampleFormat av_get_alt_sample_fmt(enum AVSampleFormat sample_fmt, int planar);
+
+/**
+ * Get the packed alternative form of the given sample format.
+ *
+ * If the passed sample_fmt is already in packed format, the format returned is
+ * the same as the input.
+ *
+ * @return  the packed alternative form of the given sample format or
+ *          AV_SAMPLE_FMT_NONE on error.
+ */
+enum AVSampleFormat av_get_packed_sample_fmt(enum AVSampleFormat sample_fmt);
+
+/**
+ * Get the planar alternative form of the given sample format.
+ *
+ * If the passed sample_fmt is already in planar format, the format returned is
+ * the same as the input.
+ *
+ * @return  the planar alternative form of the given sample format or
+ *          AV_SAMPLE_FMT_NONE on error.
+ */
+enum AVSampleFormat av_get_planar_sample_fmt(enum AVSampleFormat sample_fmt);
+
+/**
+ * Generate a string corresponding to the sample format with
+ * sample_fmt, or a header if sample_fmt is negative.
+ *
+ * @param buf the buffer where to write the string
+ * @param buf_size the size of buf
+ * @param sample_fmt the number of the sample format to print the
+ * corresponding info string, or a negative value to print the
+ * corresponding header.
+ * @return the pointer to the filled buffer or NULL if sample_fmt is
+ * unknown or in case of other errors
+ */
+char *av_get_sample_fmt_string(char *buf, int buf_size, enum AVSampleFormat sample_fmt);
+
+/**
+ * Return number of bytes per sample.
+ *
+ * @param sample_fmt the sample format
+ * @return number of bytes per sample or zero if unknown for the given
+ * sample format
+ */
+int av_get_bytes_per_sample(enum AVSampleFormat sample_fmt);
+
+/**
+ * Check if the sample format is planar.
+ *
+ * @param sample_fmt the sample format to inspect
+ * @return 1 if the sample format is planar, 0 if it is interleaved
+ */
+int av_sample_fmt_is_planar(enum AVSampleFormat sample_fmt);
+
+/**
+ * Get the required buffer size for the given audio parameters.
+ *
+ * @param[out] linesize calculated linesize, may be NULL
+ * @param nb_channels   the number of channels
+ * @param nb_samples    the number of samples in a single channel
+ * @param sample_fmt    the sample format
+ * @param align         buffer size alignment (0 = default, 1 = no alignment)
+ * @return              required buffer size, or negative error code on failure
+ */
+int av_samples_get_buffer_size(int *linesize, int nb_channels, int nb_samples,
+                               enum AVSampleFormat sample_fmt, int align);
+
+/**
+ * @}
+ *
+ * @defgroup lavu_sampmanip Samples manipulation
+ *
+ * Functions that manipulate audio samples
+ * @{
+ */
+
+/**
+ * Fill plane data pointers and linesize for samples with sample
+ * format sample_fmt.
+ *
+ * The audio_data array is filled with the pointers to the samples data planes:
+ * for planar, set the start point of each channel's data within the buffer,
+ * for packed, set the start point of the entire buffer only.
+ *
+ * The value pointed to by linesize is set to the aligned size of each
+ * channel's data buffer for planar layout, or to the aligned size of the
+ * buffer for all channels for packed layout.
+ *
+ * The buffer in buf must be big enough to contain all the samples
+ * (use av_samples_get_buffer_size() to compute its minimum size),
+ * otherwise the audio_data pointers will point to invalid data.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param[out] audio_data  array to be filled with the pointer for each channel
+ * @param[out] linesize    calculated linesize, may be NULL
+ * @param buf              the pointer to a buffer containing the samples
+ * @param nb_channels      the number of channels
+ * @param nb_samples       the number of samples in a single channel
+ * @param sample_fmt       the sample format
+ * @param align            buffer size alignment (0 = default, 1 = no alignment)
+ * @return                 >=0 on success or a negative error code on failure
+ * @todo return minimum size in bytes required for the buffer in case
+ * of success at the next bump
+ */
+int av_samples_fill_arrays(uint8_t **audio_data, int *linesize,
+                           const uint8_t *buf,
+                           int nb_channels, int nb_samples,
+                           enum AVSampleFormat sample_fmt, int align);
+
+/**
+ * Allocate a samples buffer for nb_samples samples, and fill data pointers and
+ * linesize accordingly.
+ * The allocated samples buffer can be freed by using av_freep(&audio_data[0]).
+ * Allocated data will be initialized to silence.
+ *
+ * @see enum AVSampleFormat
+ * The documentation for AVSampleFormat describes the data layout.
+ *
+ * @param[out] audio_data  array to be filled with the pointer for each channel
+ * @param[out] linesize    aligned size for audio buffer(s), may be NULL
+ * @param nb_channels      number of audio channels
+ * @param nb_samples       number of samples per channel
+ * @param sample_fmt       the sample format
+ * @param align            buffer size alignment (0 = default, 1 = no alignment)
+ * @return                 >=0 on success or a negative error code on failure
+ * @todo return the size of the allocated buffer in case of success at the next bump
+ * @see av_samples_fill_arrays(), av_samples_alloc_array_and_samples()
+ */
+int av_samples_alloc(uint8_t **audio_data, int *linesize, int nb_channels,
+                     int nb_samples, enum AVSampleFormat sample_fmt, int align);
+
+/**
+ * Allocate a data pointers array, samples buffer for nb_samples
+ * samples, and fill data pointers and linesize accordingly.
+ *
+ * This is the same as av_samples_alloc(), but also allocates the data
+ * pointers array.
+ *
+ * @see av_samples_alloc()
+ */
+int av_samples_alloc_array_and_samples(uint8_t ***audio_data, int *linesize, int nb_channels,
+                                       int nb_samples, enum AVSampleFormat sample_fmt, int align);
+
+/**
+ * Copy samples from src to dst.
+ *
+ * @param dst destination array of pointers to data planes
+ * @param src source array of pointers to data planes
+ * @param dst_offset offset in samples at which the data will be written to dst
+ * @param src_offset offset in samples at which the data will be read from src
+ * @param nb_samples number of samples to be copied
+ * @param nb_channels number of audio channels
+ * @param sample_fmt audio sample format
+ */
+int av_samples_copy(uint8_t **dst, uint8_t * const *src, int dst_offset,
+                    int src_offset, int nb_samples, int nb_channels,
+                    enum AVSampleFormat sample_fmt);
+
+/**
+ * Fill an audio buffer with silence.
+ *
+ * @param audio_data  array of pointers to data planes
+ * @param offset      offset in samples at which to start filling
+ * @param nb_samples  number of samples to fill
+ * @param nb_channels number of audio channels
+ * @param sample_fmt  audio sample format
+ */
+int av_samples_set_silence(uint8_t **audio_data, int offset, int nb_samples,
+                           int nb_channels, enum AVSampleFormat sample_fmt);
+
+/**
+ * @}
+ * @}
+ */
+#endif /* AVUTIL_SAMPLEFMT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha.h
new file mode 100644
index 0000000..c7558a8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2007 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_sha
+ * Public header for SHA-1 & SHA-256 hash function implementations.
+ */
+
+#ifndef AVUTIL_SHA_H
+#define AVUTIL_SHA_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_sha SHA
+ * @ingroup lavu_hash
+ * SHA-1 and SHA-256 (Secure Hash Algorithm) hash function implementations.
+ *
+ * This module supports the following SHA hash functions:
+ *
+ * - SHA-1: 160 bits
+ * - SHA-224: 224 bits, as a variant of SHA-2
+ * - SHA-256: 256 bits, as a variant of SHA-2
+ *
+ * @see For SHA-384, SHA-512, and variants thereof, see @ref lavu_sha512.
+ *
+ * @{
+ */
+
+extern const int av_sha_size;
+
+struct AVSHA;
+
+/**
+ * Allocate an AVSHA context.
+ */
+struct AVSHA *av_sha_alloc(void);
+
+/**
+ * Initialize SHA-1 or SHA-2 hashing.
+ *
+ * @param context pointer to the function context (of size av_sha_size)
+ * @param bits    number of bits in digest (SHA-1 - 160 bits, SHA-2 224 or 256 bits)
+ * @return        zero if initialization succeeded, -1 otherwise
+ */
+int av_sha_init(struct AVSHA* context, int bits);
+
+/**
+ * Update hash value.
+ *
+ * @param context hash function context
+ * @param data    input data to update hash with
+ * @param len     input data length
+ */
+void av_sha_update(struct AVSHA* context, const uint8_t* data, unsigned int len);
+
+/**
+ * Finish hashing and output digest value.
+ *
+ * @param context hash function context
+ * @param digest  buffer where output digest value is stored
+ */
+void av_sha_final(struct AVSHA* context, uint8_t *digest);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_SHA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha512.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha512.h
new file mode 100644
index 0000000..5bac184
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/sha512.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2007 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2013 James Almer <jamrial@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu_sha512
+ * Public header for SHA-512 implementation.
+ */
+
+#ifndef AVUTIL_SHA512_H
+#define AVUTIL_SHA512_H
+
+#include <stdint.h>
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @defgroup lavu_sha512 SHA-512
+ * @ingroup lavu_hash
+ * SHA-512 (Secure Hash Algorithm) hash function implementations.
+ *
+ * This module supports the following SHA-2 hash functions:
+ *
+ * - SHA-512/224: 224 bits
+ * - SHA-512/256: 256 bits
+ * - SHA-384: 384 bits
+ * - SHA-512: 512 bits
+ *
+ * @see For SHA-1, SHA-256, and variants thereof, see @ref lavu_sha.
+ *
+ * @{
+ */
+
+extern const int av_sha512_size;
+
+struct AVSHA512;
+
+/**
+ * Allocate an AVSHA512 context.
+ */
+struct AVSHA512 *av_sha512_alloc(void);
+
+/**
+ * Initialize SHA-2 512 hashing.
+ *
+ * @param context pointer to the function context (of size av_sha512_size)
+ * @param bits    number of bits in digest (224, 256, 384 or 512 bits)
+ * @return        zero if initialization succeeded, -1 otherwise
+ */
+int av_sha512_init(struct AVSHA512* context, int bits);
+
+/**
+ * Update hash value.
+ *
+ * @param context hash function context
+ * @param data    input data to update hash with
+ * @param len     input data length
+ */
+void av_sha512_update(struct AVSHA512* context, const uint8_t* data, unsigned int len);
+
+/**
+ * Finish hashing and output digest value.
+ *
+ * @param context hash function context
+ * @param digest  buffer where output digest value is stored
+ */
+void av_sha512_final(struct AVSHA512* context, uint8_t *digest);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_SHA512_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/spherical.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/spherical.h
new file mode 100644
index 0000000..cef759c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/spherical.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016 Vittorio Giovara <vittorio.giovara@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Spherical video
+ */
+
+#ifndef AVUTIL_SPHERICAL_H
+#define AVUTIL_SPHERICAL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * @addtogroup lavu_video
+ * @{
+ *
+ * @defgroup lavu_video_spherical Spherical video mapping
+ * @{
+ */
+
+/**
+ * @addtogroup lavu_video_spherical
+ * A spherical video file contains surfaces that need to be mapped onto a
+ * sphere. Depending on how the frame was converted, a different distortion
+ * transformation or surface recomposition function needs to be applied before
+ * the video should be mapped and displayed.
+ */
+
+/**
+ * Projection of the video surface(s) on a sphere.
+ */
+enum AVSphericalProjection {
+    /**
+     * Video represents a sphere mapped on a flat surface using
+     * equirectangular projection.
+     */
+    AV_SPHERICAL_EQUIRECTANGULAR,
+
+    /**
+     * Video frame is split into 6 faces of a cube, and arranged on a
+     * 3x2 layout. Faces are oriented upwards for the front, left, right,
+     * and back faces. The up face is oriented so the top of the face is
+     * forwards and the down face is oriented so the top of the face is
+     * to the back.
+     */
+    AV_SPHERICAL_CUBEMAP,
+
+    /**
+     * Video represents a portion of a sphere mapped on a flat surface
+     * using equirectangular projection. The @ref bounding fields indicate
+     * the position of the current video in a larger surface.
+     */
+    AV_SPHERICAL_EQUIRECTANGULAR_TILE,
+};
+
+/**
+ * This structure describes how to handle spherical videos, outlining
+ * information about projection, initial layout, and any other view modifier.
+ *
+ * @note The struct must be allocated with av_spherical_alloc() and
+ *       its size is not a part of the public ABI.
+ */
+typedef struct AVSphericalMapping {
+    /**
+     * Projection type.
+     */
+    enum AVSphericalProjection projection;
+
+    /**
+     * @name Initial orientation
+     * @{
+     * These fields describe additional rotations applied to the sphere after
+     * the video frame is mapped onto it. The sphere is rotated around the
+     * viewer, who remains stationary. The order of transformation is always
+     * yaw, followed by pitch, and finally by roll.
+     *
+     * The coordinate system matches the one defined in OpenGL, where the
+     * forward vector (z) is coming out of screen, and it is equivalent to
+     * a rotation matrix of R = r_y(yaw) * r_x(pitch) * r_z(roll).
+     *
+     * A positive yaw rotates the portion of the sphere in front of the viewer
+     * toward their right. A positive pitch rotates the portion of the sphere
+     * in front of the viewer upwards. A positive roll tilts the portion of
+     * the sphere in front of the viewer to the viewer's right.
+     *
+     * These values are exported as 16.16 fixed point.
+     *
+     * See this equirectangular projection as example:
+     *
+     * @code{.unparsed}
+     *                   Yaw
+     *     -180           0           180
+     *   90 +-------------+-------------+  180
+     *      |             |             |                  up
+     * P    |             |             |                 y|    forward
+     * i    |             ^             |                  |   /z
+     * t  0 +-------------X-------------+    0 Roll        |  /
+     * c    |             |             |                  | /
+     * h    |             |             |                 0|/_____right
+     *      |             |             |                        x
+     *  -90 +-------------+-------------+ -180
+     *
+     * X - the default camera center
+     * ^ - the default up vector
+     * @endcode
+     */
+    int32_t yaw;   ///< Rotation around the up vector [-180, 180].
+    int32_t pitch; ///< Rotation around the right vector [-90, 90].
+    int32_t roll;  ///< Rotation around the forward vector [-180, 180].
+    /**
+     * @}
+     */
+
+    /**
+     * @name Bounding rectangle
+     * @anchor bounding
+     * @{
+     * These fields indicate the location of the current tile, and where
+     * it should be mapped relative to the original surface. They are
+     * exported as 0.32 fixed point, and can be converted to classic
+     * pixel values with av_spherical_tile_bounds().
+     *
+     * @code{.unparsed}
+     *      +----------------+----------+
+     *      |                |bound_top |
+     *      |            +--------+     |
+     *      | bound_left |tile    |     |
+     *      +<---------->|        |<--->+bound_right
+     *      |            +--------+     |
+     *      |                |          |
+     *      |    bound_bottom|          |
+     *      +----------------+----------+
+     * @endcode
+     *
+     * If needed, the original video surface dimensions can be derived
+     * by adding the current stream or frame size to the related bounds,
+     * like in the following example:
+     *
+     * @code{c}
+     *     original_width  = tile->width  + bound_left + bound_right;
+     *     original_height = tile->height + bound_top  + bound_bottom;
+     * @endcode
+     *
+     * @note These values are valid only for the tiled equirectangular
+     *       projection type (@ref AV_SPHERICAL_EQUIRECTANGULAR_TILE),
+     *       and should be ignored in all other cases.
+     */
+    uint32_t bound_left;   ///< Distance from the left edge
+    uint32_t bound_top;    ///< Distance from the top edge
+    uint32_t bound_right;  ///< Distance from the right edge
+    uint32_t bound_bottom; ///< Distance from the bottom edge
+    /**
+     * @}
+     */
+
+    /**
+     * Number of pixels to pad from the edge of each cube face.
+     *
+     * @note This value is valid only for the cubemap projection type
+     *       (@ref AV_SPHERICAL_CUBEMAP), and should be ignored in all other
+     *       cases.
+     */
+    uint32_t padding;
+} AVSphericalMapping;
+
+/**
+ * Allocate an AVSphericalMapping structure and initialize its fields to
+ * default values.
+ *
+ * @return the newly allocated struct or NULL on failure
+ */
+AVSphericalMapping *av_spherical_alloc(size_t *size);
+
+/**
+ * Convert the @ref bounding fields from an AVSphericalMapping
+ * from 0.32 fixed point to pixels.
+ *
+ * @param map    The AVSphericalMapping to read bound values from.
+ * @param width  Width of the current frame or stream.
+ * @param height Height of the current frame or stream.
+ * @param left   Pixels from the left edge.
+ * @param top    Pixels from the top edge.
+ * @param right  Pixels from the right edge.
+ * @param bottom Pixels from the bottom edge.
+ */
+void av_spherical_tile_bounds(const AVSphericalMapping *map,
+                              size_t width, size_t height,
+                              size_t *left, size_t *top,
+                              size_t *right, size_t *bottom);
+
+/**
+ * Provide a human-readable name of a given AVSphericalProjection.
+ *
+ * @param projection The input AVSphericalProjection.
+ *
+ * @return The name of the AVSphericalProjection, or "unknown".
+ */
+const char *av_spherical_projection_name(enum AVSphericalProjection projection);
+
+/**
+ * Get the AVSphericalProjection from a human-readable name.
+ *
+ * @param name The input string.
+ *
+ * @return The AVSphericalProjection value, or -1 if not found.
+ */
+int av_spherical_from_name(const char *name);
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_SPHERICAL_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/stereo3d.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/stereo3d.h
new file mode 100644
index 0000000..19c5416
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/stereo3d.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2013 Vittorio Giovara <vittorio.giovara@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_STEREO3D_H
+#define AVUTIL_STEREO3D_H
+
+#include <stdint.h>
+
+#include "frame.h"
+
+/**
+ * List of possible 3D Types
+ */
+enum AVStereo3DType {
+    /**
+     * Video is not stereoscopic (and metadata has to be there).
+     */
+    AV_STEREO3D_2D,
+
+    /**
+     * Views are next to each other.
+     *
+     *    LLLLRRRR
+     *    LLLLRRRR
+     *    LLLLRRRR
+     *    ...
+     */
+    AV_STEREO3D_SIDEBYSIDE,
+
+    /**
+     * Views are on top of each other.
+     *
+     *    LLLLLLLL
+     *    LLLLLLLL
+     *    RRRRRRRR
+     *    RRRRRRRR
+     */
+    AV_STEREO3D_TOPBOTTOM,
+
+    /**
+     * Views are alternated temporally.
+     *
+     *     frame0   frame1   frame2   ...
+     *    LLLLLLLL RRRRRRRR LLLLLLLL
+     *    LLLLLLLL RRRRRRRR LLLLLLLL
+     *    LLLLLLLL RRRRRRRR LLLLLLLL
+     *    ...      ...      ...
+     */
+    AV_STEREO3D_FRAMESEQUENCE,
+
+    /**
+     * Views are packed in a checkerboard-like structure per pixel.
+     *
+     *    LRLRLRLR
+     *    RLRLRLRL
+     *    LRLRLRLR
+     *    ...
+     */
+    AV_STEREO3D_CHECKERBOARD,
+
+    /**
+     * Views are next to each other, but when upscaling
+     * apply a checkerboard pattern.
+     *
+     *     LLLLRRRR          L L L L    R R R R
+     *     LLLLRRRR    =>     L L L L  R R R R
+     *     LLLLRRRR          L L L L    R R R R
+     *     LLLLRRRR           L L L L  R R R R
+     */
+    AV_STEREO3D_SIDEBYSIDE_QUINCUNX,
+
+    /**
+     * Views are packed per line, as if interlaced.
+     *
+     *    LLLLLLLL
+     *    RRRRRRRR
+     *    LLLLLLLL
+     *    ...
+     */
+    AV_STEREO3D_LINES,
+
+    /**
+     * Views are packed per column.
+     *
+     *    LRLRLRLR
+     *    LRLRLRLR
+     *    LRLRLRLR
+     *    ...
+     */
+    AV_STEREO3D_COLUMNS,
+};
+
+
+/**
+ * Inverted views, Right/Bottom represents the left view.
+ */
+#define AV_STEREO3D_FLAG_INVERT     (1 << 0)
+
+/**
+ * Stereo 3D type: this structure describes how two videos are packed
+ * within a single video surface, with additional information as needed.
+ *
+ * @note The struct must be allocated with av_stereo3d_alloc() and
+ *       its size is not a part of the public ABI.
+ */
+typedef struct AVStereo3D {
+    /**
+     * How views are packed within the video.
+     */
+    enum AVStereo3DType type;
+
+    /**
+     * Additional information about the frame packing.
+     */
+    int flags;
+} AVStereo3D;
+
+/**
+ * Allocate an AVStereo3D structure and set its fields to default values.
+ * The resulting struct can be freed using av_freep().
+ *
+ * @return An AVStereo3D filled with default values or NULL on failure.
+ */
+AVStereo3D *av_stereo3d_alloc(void);
+
+/**
+ * Allocate a complete AVFrameSideData and add it to the frame.
+ *
+ * @param frame The frame which side data is added to.
+ *
+ * @return The AVStereo3D structure to be filled by caller.
+ */
+AVStereo3D *av_stereo3d_create_side_data(AVFrame *frame);
+
+/**
+ * Provide a human-readable name of a given stereo3d type.
+ *
+ * @param type The input stereo3d type value.
+ *
+ * @return The name of the stereo3d value, or "unknown".
+ */
+const char *av_stereo3d_type_name(unsigned int type);
+
+/**
+ * Get the AVStereo3DType from a human-readable name.
+ *
+ * @param name The input string.
+ *
+ * @return The AVStereo3DType value, or -1 if not found.
+ */
+int av_stereo3d_from_name(const char *name);
+
+#endif /* AVUTIL_STEREO3D_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tea.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tea.h
new file mode 100644
index 0000000..dd929bd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tea.h
@@ -0,0 +1,71 @@
+/*
+ * A 32-bit implementation of the TEA algorithm
+ * Copyright (c) 2015 Vesselin Bontchev
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TEA_H
+#define AVUTIL_TEA_H
+
+#include <stdint.h>
+
+/**
+ * @file
+ * @brief Public header for libavutil TEA algorithm
+ * @defgroup lavu_tea TEA
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+extern const int av_tea_size;
+
+struct AVTEA;
+
+/**
+ * Allocate an AVTEA context.
+ * To free the struct: av_free(ptr)
+ */
+struct AVTEA *av_tea_alloc(void);
+
+/**
+ * Initialize an AVTEA context.
+ *
+ * @param ctx an AVTEA context
+ * @param key a key of 16 bytes used for encryption/decryption
+ * @param rounds the number of rounds in TEA (64 is the "standard")
+ */
+void av_tea_init(struct AVTEA *ctx, const uint8_t key[16], int rounds);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context.
+ *
+ * @param ctx an AVTEA context
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param count number of 8 byte blocks
+ * @param iv initialization vector for CBC mode, if NULL then ECB will be used
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_tea_crypt(struct AVTEA *ctx, uint8_t *dst, const uint8_t *src,
+                  int count, uint8_t *iv, int decrypt);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_TEA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/threadmessage.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/threadmessage.h
new file mode 100644
index 0000000..8480a0a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/threadmessage.h
@@ -0,0 +1,107 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_THREADMESSAGE_H
+#define AVUTIL_THREADMESSAGE_H
+
+typedef struct AVThreadMessageQueue AVThreadMessageQueue;
+
+typedef enum AVThreadMessageFlags {
+
+    /**
+     * Perform non-blocking operation.
+     * If this flag is set, send and recv operations are non-blocking and
+     * return AVERROR(EAGAIN) immediately if they can not proceed.
+     */
+    AV_THREAD_MESSAGE_NONBLOCK = 1,
+
+} AVThreadMessageFlags;
+
+/**
+ * Allocate a new message queue.
+ *
+ * @param mq      pointer to the message queue
+ * @param nelem   maximum number of elements in the queue
+ * @param elsize  size of each element in the queue
+ * @return  >=0 for success; <0 for error, in particular AVERROR(ENOSYS) if
+ *          lavu was built without thread support
+ */
+int av_thread_message_queue_alloc(AVThreadMessageQueue **mq,
+                                  unsigned nelem,
+                                  unsigned elsize);
+
+/**
+ * Free a message queue.
+ *
+ * The message queue must no longer be in use by another thread.
+ */
+void av_thread_message_queue_free(AVThreadMessageQueue **mq);
+
+/**
+ * Send a message on the queue.
+ */
+int av_thread_message_queue_send(AVThreadMessageQueue *mq,
+                                 void *msg,
+                                 unsigned flags);
+
+/**
+ * Receive a message from the queue.
+ */
+int av_thread_message_queue_recv(AVThreadMessageQueue *mq,
+                                 void *msg,
+                                 unsigned flags);
+
+/**
+ * Set the sending error code.
+ *
+ * If the error code is set to non-zero, av_thread_message_queue_send() will
+ * return it immediately. Conventional values, such as AVERROR_EOF or
+ * AVERROR(EAGAIN), can be used to cause the sending thread to stop or
+ * suspend its operation.
+ */
+void av_thread_message_queue_set_err_send(AVThreadMessageQueue *mq,
+                                          int err);
+
+/**
+ * Set the receiving error code.
+ *
+ * If the error code is set to non-zero, av_thread_message_queue_recv() will
+ * return it immediately when there are no longer available messages.
+ * Conventional values, such as AVERROR_EOF or AVERROR(EAGAIN), can be used
+ * to cause the receiving thread to stop or suspend its operation.
+ */
+void av_thread_message_queue_set_err_recv(AVThreadMessageQueue *mq,
+                                          int err);
+
+/**
+ * Set the optional free message callback function which will be called if an
+ * operation is removing messages from the queue.
+ */
+void av_thread_message_queue_set_free_func(AVThreadMessageQueue *mq,
+                                           void (*free_func)(void *msg));
+
+/**
+ * Flush the message queue
+ *
+ * This function is mostly equivalent to reading and free-ing every message
+ * except that it will be done in a single operation (no lock/unlock between
+ * reads).
+ */
+void av_thread_message_flush(AVThreadMessageQueue *mq);
+
+#endif /* AVUTIL_THREADMESSAGE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/time.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/time.h
new file mode 100644
index 0000000..dc169b0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/time.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2000-2003 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TIME_H
+#define AVUTIL_TIME_H
+
+#include <stdint.h>
+
+/**
+ * Get the current time in microseconds.
+ */
+int64_t av_gettime(void);
+
+/**
+ * Get the current time in microseconds since some unspecified starting point.
+ * On platforms that support it, the time comes from a monotonic clock.
+ * This property makes this time source ideal for measuring relative time.
+ * The returned values may not be monotonic on platforms where a monotonic
+ * clock is not available.
+ */
+int64_t av_gettime_relative(void);
+
+/**
+ * Indicates with a boolean result if the av_gettime_relative() time source
+ * is monotonic.
+ */
+int av_gettime_relative_is_monotonic(void);
+
+/**
+ * Sleep for a period of time.  Although the duration is expressed in
+ * microseconds, the actual delay may be rounded to the precision of the
+ * system timer.
+ *
+ * @param  usec Number of microseconds to sleep.
+ * @return zero on success or (negative) error code.
+ */
+int av_usleep(unsigned usec);
+
+#endif /* AVUTIL_TIME_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timecode.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timecode.h
new file mode 100644
index 0000000..56e3975
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timecode.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2006 Smartjog S.A.S, Baptiste Coudurier <baptiste.coudurier@gmail.com>
+ * Copyright (c) 2011-2012 Smartjog S.A.S, Clément Bœsch <clement.boesch@smartjog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Timecode helpers header
+ */
+
+#ifndef AVUTIL_TIMECODE_H
+#define AVUTIL_TIMECODE_H
+
+#include <stdint.h>
+#include "rational.h"
+
+#define AV_TIMECODE_STR_SIZE 16
+
+enum AVTimecodeFlag {
+    AV_TIMECODE_FLAG_DROPFRAME      = 1<<0, ///< timecode is drop frame
+    AV_TIMECODE_FLAG_24HOURSMAX     = 1<<1, ///< timecode wraps after 24 hours
+    AV_TIMECODE_FLAG_ALLOWNEGATIVE  = 1<<2, ///< negative time values are allowed
+};
+
+typedef struct {
+    int start;          ///< timecode frame start (first base frame number)
+    uint32_t flags;     ///< flags such as drop frame, +24 hours support, ... (see AVTimecodeFlag)
+    AVRational rate;    ///< frame rate in rational form
+    unsigned fps;       ///< frames per second; must be consistent with the rate field
+} AVTimecode;
+
+/**
+ * Adjust frame number for NTSC drop frame time code.
+ *
+ * @param framenum frame number to adjust
+ * @param fps      frames per second, 30 or 60
+ * @return         adjusted frame number
+ * @warning        adjustment is only valid in NTSC 29.97 and 59.94
+ */
+int av_timecode_adjust_ntsc_framenum2(int framenum, int fps);
+
+/**
+ * Convert frame number to SMPTE 12M binary representation.
+ *
+ * @param tc       timecode data correctly initialized
+ * @param framenum frame number
+ * @return         the SMPTE binary representation
+ *
+ * @note Frame number adjustment is automatically done in case of drop timecode,
+ *       you do NOT have to call av_timecode_adjust_ntsc_framenum2().
+ * @note The frame number is relative to tc->start.
+ * @note Color frame (CF), binary group flags (BGF) and biphase mark polarity
+ *       correction (PC) bits are set to zero.
+ */
+uint32_t av_timecode_get_smpte_from_framenum(const AVTimecode *tc, int framenum);
+
+/**
+ * Load timecode string in buf.
+ *
+ * @param buf      destination buffer, must be at least AV_TIMECODE_STR_SIZE long
+ * @param tc       timecode data correctly initialized
+ * @param framenum frame number
+ * @return         the buf parameter
+ *
+ * @note Timecode representation can be a negative timecode and have more than
+ *       24 hours, but will only be honored if the flags are correctly set.
+ * @note The frame number is relative to tc->start.
+ */
+char *av_timecode_make_string(const AVTimecode *tc, char *buf, int framenum);
+
+/**
+ * Get the timecode string from the SMPTE timecode format.
+ *
+ * @param buf        destination buffer, must be at least AV_TIMECODE_STR_SIZE long
+ * @param tcsmpte    the 32-bit SMPTE timecode
+ * @param prevent_df prevent the use of a drop flag when it is known the DF bit
+ *                   is arbitrary
+ * @return           the buf parameter
+ */
+char *av_timecode_make_smpte_tc_string(char *buf, uint32_t tcsmpte, int prevent_df);
+
+/**
+ * Get the timecode string from the 25-bit timecode format (MPEG GOP format).
+ *
+ * @param buf     destination buffer, must be at least AV_TIMECODE_STR_SIZE long
+ * @param tc25bit the 25-bits timecode
+ * @return        the buf parameter
+ */
+char *av_timecode_make_mpeg_tc_string(char *buf, uint32_t tc25bit);
+
+/**
+ * Init a timecode struct with the passed parameters.
+ *
+ * @param log_ctx     a pointer to an arbitrary struct of which the first field
+ *                    is a pointer to an AVClass struct (used for av_log)
+ * @param tc          pointer to an allocated AVTimecode
+ * @param rate        frame rate in rational form
+ * @param flags       miscellaneous flags such as drop frame, +24 hours, ...
+ *                    (see AVTimecodeFlag)
+ * @param frame_start the first frame number
+ * @return            0 on success, AVERROR otherwise
+ */
+int av_timecode_init(AVTimecode *tc, AVRational rate, int flags, int frame_start, void *log_ctx);
+
+/**
+ * Parse timecode representation (hh:mm:ss[:;.]ff).
+ *
+ * @param log_ctx a pointer to an arbitrary struct of which the first field is a
+ *                pointer to an AVClass struct (used for av_log).
+ * @param tc      pointer to an allocated AVTimecode
+ * @param rate    frame rate in rational form
+ * @param str     timecode string which will determine the frame start
+ * @return        0 on success, AVERROR otherwise
+ */
+int av_timecode_init_from_string(AVTimecode *tc, AVRational rate, const char *str, void *log_ctx);
+
+/**
+ * Check if the timecode feature is available for the given frame rate
+ *
+ * @return 0 if supported, <0 otherwise
+ */
+int av_timecode_check_frame_rate(AVRational rate);
+
+#endif /* AVUTIL_TIMECODE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timestamp.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timestamp.h
new file mode 100644
index 0000000..e082f01
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/timestamp.h
@@ -0,0 +1,78 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * timestamp utils, mostly useful for debugging/logging purposes
+ */
+
+#ifndef AVUTIL_TIMESTAMP_H
+#define AVUTIL_TIMESTAMP_H
+
+#include "common.h"
+
+#if defined(__cplusplus) && !defined(__STDC_FORMAT_MACROS) && !defined(PRId64)
+#error missing -D__STDC_FORMAT_MACROS / #define __STDC_FORMAT_MACROS
+#endif
+
+#define AV_TS_MAX_STRING_SIZE 32
+
+/**
+ * Write a textual representation of a timestamp into a caller-supplied
+ * buffer: the literal "NOPTS" for AV_NOPTS_VALUE, else the decimal value.
+ *
+ * @param buf destination buffer of at least AV_TS_MAX_STRING_SIZE bytes
+ * @param ts  the timestamp to format
+ * @return buf
+ */
+static inline char *av_ts_make_string(char *buf, int64_t ts)
+{
+    if (ts != AV_NOPTS_VALUE) snprintf(buf, AV_TS_MAX_STRING_SIZE, "%" PRId64, ts);
+    else                      snprintf(buf, AV_TS_MAX_STRING_SIZE, "NOPTS");
+    return buf;
+}
+
+/**
+ * Convenience macro, the return value should be used only directly in
+ * function arguments but never stand-alone.
+ */
+#define av_ts2str(ts) av_ts_make_string((char[AV_TS_MAX_STRING_SIZE]){0}, ts)
+
+/**
+ * Write a timestamp, scaled to a time value by its timebase, into a
+ * caller-supplied buffer ("NOPTS" for AV_NOPTS_VALUE, else "%.6g" format).
+ *
+ * @param buf destination buffer of at least AV_TS_MAX_STRING_SIZE bytes
+ * @param ts  the timestamp to format
+ * @param tb  pointer to the timebase of ts; read only when ts is not AV_NOPTS_VALUE
+ * @return buf
+ */
+static inline char *av_ts_make_time_string(char *buf, int64_t ts, AVRational *tb)
+{
+    if (ts != AV_NOPTS_VALUE) snprintf(buf, AV_TS_MAX_STRING_SIZE, "%.6g", av_q2d(*tb) * ts);
+    else                      snprintf(buf, AV_TS_MAX_STRING_SIZE, "NOPTS");
+    return buf;
+}
+
+/**
+ * Convenience macro, the return value should be used only directly in
+ * function arguments but never stand-alone.
+ */
+#define av_ts2timestr(ts, tb) av_ts_make_time_string((char[AV_TS_MAX_STRING_SIZE]){0}, ts, tb)
+
+#endif /* AVUTIL_TIMESTAMP_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tree.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tree.h
new file mode 100644
index 0000000..d5e0aeb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/tree.h
@@ -0,0 +1,138 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * A tree container.
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVUTIL_TREE_H
+#define AVUTIL_TREE_H
+
+#include "attributes.h"
+#include "version.h"
+
+/**
+ * @addtogroup lavu_tree AVTree
+ * @ingroup lavu_data
+ *
+ * Low-complexity tree container
+ *
+ * Insertion, removal, finding equal, largest which is smaller than and
+ * smallest which is larger than, all have O(log n) worst-case complexity.
+ * @{
+ */
+
+
+struct AVTreeNode;
+extern const int av_tree_node_size;
+
+/**
+ * Allocate an AVTreeNode.
+ */
+struct AVTreeNode *av_tree_node_alloc(void);
+
+/**
+ * Find an element.
+ * @param root a pointer to the root node of the tree
+ * @param next If next is not NULL, then next[0] will contain the previous
+ *             element and next[1] the next element. If either does not exist,
+ *             then the corresponding entry in next is unchanged.
+ * @param cmp compare function used to compare elements in the tree,
+ *            API identical to that of Standard C's qsort
+ *            It is guaranteed that the first and only the first argument to cmp()
+ *            will be the key parameter to av_tree_find(), thus it could if the
+ *            user wants, be a different type (like an opaque context).
+ * @return An element with cmp(key, elem) == 0 or NULL if no such element
+ *         exists in the tree.
+ */
+void *av_tree_find(const struct AVTreeNode *root, void *key,
+                   int (*cmp)(const void *key, const void *b), void *next[2]);
+
+/**
+ * Insert or remove an element.
+ *
+ * If *next is NULL, then the supplied element will be removed if it exists.
+ * If *next is non-NULL, then the supplied element will be inserted, unless
+ * it already exists in the tree.
+ *
+ * @param rootp A pointer to a pointer to the root node of the tree; note that
+ *              the root node can change during insertions, this is required
+ *              to keep the tree balanced.
+ * @param key  pointer to the element key to insert in the tree
+ * @param next Used to allocate and free AVTreeNodes. For insertion the user
+ *             must set it to an allocated and zeroed object of at least
+ *             av_tree_node_size bytes size. av_tree_insert() will set it to
+ *             NULL if it has been consumed.
+ *             For deleting elements *next is set to NULL by the user and
+ *             av_tree_insert() will set it to the AVTreeNode which was
+ *             used for the removed element.
+ *             This allows the use of flat arrays, which have
+ *             lower overhead compared to many malloced elements.
+ *             You might want to define a function like:
+ *             @code
+ *             void *tree_insert(struct AVTreeNode **rootp, void *key,
+ *                               int (*cmp)(void *key, const void *b),
+ *                               AVTreeNode **next)
+ *             {
+ *                 if (!*next)
+ *                     *next = av_mallocz(av_tree_node_size);
+ *                 return av_tree_insert(rootp, key, cmp, next);
+ *             }
+ *             void *tree_remove(struct AVTreeNode **rootp, void *key,
+ *                               int (*cmp)(void *key, const void *b), AVTreeNode **next)
+ *             {
+ *                 av_freep(next);
+ *                 return av_tree_insert(rootp, key, cmp, next);
+ *             }
+ *             @endcode
+ * @param cmp compare function used to compare elements in the tree, API identical
+ *            to that of Standard C's qsort
+ * @return If no insertion happened, the found element; if an insertion or
+ *         removal happened, then either key or NULL will be returned.
+ *         Which one it is depends on the tree state and the implementation. You
+ *         should make no assumptions that it's one or the other in the code.
+ */
+void *av_tree_insert(struct AVTreeNode **rootp, void *key,
+                     int (*cmp)(const void *key, const void *b),
+                     struct AVTreeNode **next);
+
+void av_tree_destroy(struct AVTreeNode *t);
+
+/**
+ * Apply enu(opaque, &elem) to all the elements in the tree in a given range.
+ *
+ * @param cmp a comparison function that returns < 0 for an element below the
+ *            range, > 0 for an element above the range and == 0 for an
+ *            element inside the range
+ *
+ * @note The cmp function should use the same ordering used to construct the
+ *       tree.
+ */
+void av_tree_enumerate(struct AVTreeNode *t, void *opaque,
+                       int (*cmp)(void *opaque, void *elem),
+                       int (*enu)(void *opaque, void *elem));
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_TREE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/twofish.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/twofish.h
new file mode 100644
index 0000000..813cfec
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/twofish.h
@@ -0,0 +1,70 @@
+/*
+ * An implementation of the TwoFish algorithm
+ * Copyright (c) 2015 Supraja Meedinti
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_TWOFISH_H
+#define AVUTIL_TWOFISH_H
+
+#include <stdint.h>
+
+
+/**
+  * @file
+  * @brief Public header for libavutil TWOFISH algorithm
+  * @defgroup lavu_twofish TWOFISH
+  * @ingroup lavu_crypto
+  * @{
+  */
+
+extern const int av_twofish_size;
+
+struct AVTWOFISH;
+
+/**
+  * Allocate an AVTWOFISH context
+  * To free the struct: av_free(ptr)
+  */
+struct AVTWOFISH *av_twofish_alloc(void);
+
+/**
+  * Initialize an AVTWOFISH context.
+  *
+  * @param ctx an AVTWOFISH context
+  * @param key a key of size ranging from 1 to 32 bytes used for encryption/decryption
+  * @param key_bits number of key bits: 128, 192 or 256. If less than required, the key is padded with zeroes to the nearest valid value; the return value is 0 if key_bits is 128/192/256, -1 if less than 0, 1 otherwise
+ */
+int av_twofish_init(struct AVTWOFISH *ctx, const uint8_t *key, int key_bits);
+
+/**
+  * Encrypt or decrypt a buffer using a previously initialized context
+  *
+  * @param ctx an AVTWOFISH context
+  * @param dst destination array, can be equal to src
+  * @param src source array, can be equal to dst
+  * @param count number of 16 byte blocks
+  * @param iv initialization vector for CBC mode, NULL for ECB mode
+  * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_twofish_crypt(struct AVTWOFISH *ctx, uint8_t *dst, const uint8_t *src, int count, uint8_t* iv, int decrypt);
+
+/**
+ * @}
+ */
+#endif /* AVUTIL_TWOFISH_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/version.h
new file mode 100644
index 0000000..abea216
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/version.h
@@ -0,0 +1,145 @@
+/*
+ * copyright (c) 2003 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * @ingroup lavu
+ * Libavutil version macros
+ */
+
+#ifndef AVUTIL_VERSION_H
+#define AVUTIL_VERSION_H
+
+#include "macros.h"
+
+/**
+ * @addtogroup version_utils
+ *
+ * Useful to check and match library version in order to maintain
+ * backward compatibility.
+ *
+ * The FFmpeg libraries follow a versioning scheme very similar to
+ * Semantic Versioning (http://semver.org/)
+ * The difference is that the component called PATCH is called MICRO in FFmpeg
+ * and its value is reset to 100 instead of 0 to keep it above or equal to 100.
+ * Also we do not increase MICRO for every bugfix or change in git master.
+ *
+ * Prior to FFmpeg 3.2 point releases did not change any lib version number to
+ * avoid aliasing different git master checkouts.
+ * Starting with FFmpeg 3.2, the released library versions will occupy
+ * a separate MAJOR.MINOR that is not used on the master development branch.
+ * That is if we branch a release of master 55.10.123 we will bump to 55.11.100
+ * for the release and master will continue at 55.12.100 after it. Each new
+ * point release will then bump the MICRO improving the usefulness of the lib
+ * versions.
+ *
+ * @{
+ */
+
+#define AV_VERSION_INT(a, b, c) ((a)<<16 | (b)<<8 | (c))
+#define AV_VERSION_DOT(a, b, c) a ##.## b ##.## c
+#define AV_VERSION(a, b, c) AV_VERSION_DOT(a, b, c)
+
+/**
+ * Extract version components from the full ::AV_VERSION_INT int as returned
+ * by functions like ::avformat_version() and ::avcodec_version()
+ */
+#define AV_VERSION_MAJOR(a) ((a) >> 16)
+#define AV_VERSION_MINOR(a) (((a) & 0x00FF00) >> 8)
+#define AV_VERSION_MICRO(a) ((a) & 0xFF)
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup lavu_ver Version and Build diagnostics
+ *
+ * Macros and functions useful to check at compile time and at runtime
+ * which version of libavutil is in use.
+ *
+ * @{
+ */
+
+#define LIBAVUTIL_VERSION_MAJOR  55
+#define LIBAVUTIL_VERSION_MINOR  58
+#define LIBAVUTIL_VERSION_MICRO 100
+
+#define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
+                                               LIBAVUTIL_VERSION_MINOR, \
+                                               LIBAVUTIL_VERSION_MICRO)
+#define LIBAVUTIL_VERSION       AV_VERSION(LIBAVUTIL_VERSION_MAJOR,     \
+                                           LIBAVUTIL_VERSION_MINOR,     \
+                                           LIBAVUTIL_VERSION_MICRO)
+#define LIBAVUTIL_BUILD         LIBAVUTIL_VERSION_INT
+
+#define LIBAVUTIL_IDENT         "Lavu" AV_STRINGIFY(LIBAVUTIL_VERSION)
+
+/**
+ * @defgroup lavu_depr_guards Deprecation Guards
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ *
+ * @note When bumping the major version it is recommended to manually
+ * disable each FF_API_* in its own commit instead of disabling them all
+ * at once through the bump. This improves the git bisect-ability of the change.
+ *
+ * @{
+ */
+
+#ifndef FF_API_VDPAU
+#define FF_API_VDPAU                    (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_XVMC
+#define FF_API_XVMC                     (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_OPT_TYPE_METADATA
+#define FF_API_OPT_TYPE_METADATA        (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_DLOG
+#define FF_API_DLOG                     (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_VAAPI
+#define FF_API_VAAPI                    (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_FRAME_QP
+#define FF_API_FRAME_QP                 (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_PLUS1_MINUS1
+#define FF_API_PLUS1_MINUS1             (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_ERROR_FRAME
+#define FF_API_ERROR_FRAME              (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_CRC_BIG_TABLE
+#define FF_API_CRC_BIG_TABLE            (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+#ifndef FF_API_PKT_PTS
+#define FF_API_PKT_PTS                  (LIBAVUTIL_VERSION_MAJOR < 56)
+#endif
+
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* AVUTIL_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/xtea.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/xtea.h
new file mode 100644
index 0000000..735427c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libavutil/xtea.h
@@ -0,0 +1,94 @@
+/*
+ * A 32-bit implementation of the XTEA algorithm
+ * Copyright (c) 2012 Samuel Pitoiset
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_XTEA_H
+#define AVUTIL_XTEA_H
+
+#include <stdint.h>
+
+/**
+ * @file
+ * @brief Public header for libavutil XTEA algorithm
+ * @defgroup lavu_xtea XTEA
+ * @ingroup lavu_crypto
+ * @{
+ */
+
+typedef struct AVXTEA {
+    uint32_t key[16];
+} AVXTEA;
+
+/**
+ * Allocate an AVXTEA context.
+ */
+AVXTEA *av_xtea_alloc(void);
+
+/**
+ * Initialize an AVXTEA context.
+ *
+ * @param ctx an AVXTEA context
+ * @param key a key of 16 bytes used for encryption/decryption,
+ *            interpreted as big endian 32 bit numbers
+ */
+void av_xtea_init(struct AVXTEA *ctx, const uint8_t key[16]);
+
+/**
+ * Initialize an AVXTEA context.
+ *
+ * @param ctx an AVXTEA context
+ * @param key a key of 16 bytes used for encryption/decryption,
+ *            interpreted as little endian 32 bit numbers
+ */
+void av_xtea_le_init(struct AVXTEA *ctx, const uint8_t key[16]);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context,
+ * in big endian format.
+ *
+ * @param ctx an AVXTEA context
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param count number of 8 byte blocks
+ * @param iv initialization vector for CBC mode, if NULL then ECB will be used
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_xtea_crypt(struct AVXTEA *ctx, uint8_t *dst, const uint8_t *src,
+                   int count, uint8_t *iv, int decrypt);
+
+/**
+ * Encrypt or decrypt a buffer using a previously initialized context,
+ * in little endian format.
+ *
+ * @param ctx an AVXTEA context
+ * @param dst destination array, can be equal to src
+ * @param src source array, can be equal to dst
+ * @param count number of 8 byte blocks
+ * @param iv initialization vector for CBC mode, if NULL then ECB will be used
+ * @param decrypt 0 for encryption, 1 for decryption
+ */
+void av_xtea_le_crypt(struct AVXTEA *ctx, uint8_t *dst, const uint8_t *src,
+                      int count, uint8_t *iv, int decrypt);
+
+/**
+ * @}
+ */
+
+#endif /* AVUTIL_XTEA_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/swresample.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/swresample.h
new file mode 100644
index 0000000..a8db5c2
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/swresample.h
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) 2011-2013 Michael Niedermayer (michaelni@gmx.at)
+ *
+ * This file is part of libswresample
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWRESAMPLE_SWRESAMPLE_H
+#define SWRESAMPLE_SWRESAMPLE_H
+
+/**
+ * @file
+ * @ingroup lswr
+ * libswresample public header
+ */
+
+/**
+ * @defgroup lswr libswresample
+ * @{
+ *
+ * Audio resampling, sample format conversion and mixing library.
+ *
+ * Interaction with lswr is done through SwrContext, which is
+ * allocated with swr_alloc() or swr_alloc_set_opts(). It is opaque, so all parameters
+ * must be set with the @ref avoptions API.
+ *
+ * The first thing you will need to do in order to use lswr is to allocate
+ * SwrContext. This can be done with swr_alloc() or swr_alloc_set_opts(). If you
+ * are using the former, you must set options through the @ref avoptions API.
+ * The latter function provides the same feature, but it allows you to set some
+ * common options in the same statement.
+ *
+ * For example the following code will setup conversion from planar float sample
+ * format to interleaved signed 16-bit integer, downsampling from 48kHz to
+ * 44.1kHz and downmixing from 5.1 channels to stereo (using the default mixing
+ * matrix). This is using the swr_alloc() function.
+ * @code
+ * SwrContext *swr = swr_alloc();
+ * av_opt_set_channel_layout(swr, "in_channel_layout",  AV_CH_LAYOUT_5POINT1, 0);
+ * av_opt_set_channel_layout(swr, "out_channel_layout", AV_CH_LAYOUT_STEREO,  0);
+ * av_opt_set_int(swr, "in_sample_rate",     48000,                0);
+ * av_opt_set_int(swr, "out_sample_rate",    44100,                0);
+ * av_opt_set_sample_fmt(swr, "in_sample_fmt",  AV_SAMPLE_FMT_FLTP, 0);
+ * av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16,  0);
+ * @endcode
+ *
+ * The same job can be done using swr_alloc_set_opts() as well:
+ * @code
+ * SwrContext *swr = swr_alloc_set_opts(NULL,  // we're allocating a new context
+ *                       AV_CH_LAYOUT_STEREO,  // out_ch_layout
+ *                       AV_SAMPLE_FMT_S16,    // out_sample_fmt
+ *                       44100,                // out_sample_rate
+ *                       AV_CH_LAYOUT_5POINT1, // in_ch_layout
+ *                       AV_SAMPLE_FMT_FLTP,   // in_sample_fmt
+ *                       48000,                // in_sample_rate
+ *                       0,                    // log_offset
+ *                       NULL);                // log_ctx
+ * @endcode
+ *
+ * Once all values have been set, it must be initialized with swr_init(). If
+ * you need to change the conversion parameters, you can change the parameters
+ * using @ref AVOptions, as described above in the first example; or by using
+ * swr_alloc_set_opts(), but with the first argument the allocated context.
+ * You must then call swr_init() again.
+ *
+ * The conversion itself is done by repeatedly calling swr_convert().
+ * Note that the samples may get buffered in swr if you provide insufficient
+ * output space or if sample rate conversion is done, which requires "future"
+ * samples. Samples that do not require future input can be retrieved at any
+ * time by using swr_convert() (in_count can be set to 0).
+ * At the end of conversion the resampling buffer can be flushed by calling
+ * swr_convert() with NULL in and 0 in_count.
+ *
+ * The samples used in the conversion process can be managed with the libavutil
+ * @ref lavu_sampmanip "samples manipulation" API, including av_samples_alloc()
+ * function used in the following example.
+ *
+ * The delay between input and output, can at any time be found by using
+ * swr_get_delay().
+ *
+ * The following code demonstrates the conversion loop assuming the parameters
+ * from above and caller-defined functions get_input() and handle_output():
+ * @code
+ * uint8_t **input;
+ * int in_samples;
+ *
+ * while (get_input(&input, &in_samples)) {
+ *     uint8_t *output;
+ *     int out_samples = av_rescale_rnd(swr_get_delay(swr, 48000) +
+ *                                      in_samples, 44100, 48000, AV_ROUND_UP);
+ *     av_samples_alloc(&output, NULL, 2, out_samples,
+ *                      AV_SAMPLE_FMT_S16, 0);
+ *     out_samples = swr_convert(swr, &output, out_samples,
+ *                                      input, in_samples);
+ *     handle_output(output, out_samples);
+ *     av_freep(&output);
+ * }
+ * @endcode
+ *
+ * When the conversion is finished, the conversion
+ * context and everything associated with it must be freed with swr_free().
+ * A swr_close() function is also available, but it exists mainly for
+ * compatibility with libavresample, and is not required to be called.
+ *
+ * There will be no memory leak if the data is not completely flushed before
+ * swr_free().
+ */
+
+#include <stdint.h>
+#include "libavutil/channel_layout.h"
+#include "libavutil/frame.h"
+#include "libavutil/samplefmt.h"
+
+#include "libswresample/version.h"
+
+#if LIBSWRESAMPLE_VERSION_MAJOR < 1
+#define SWR_CH_MAX 32   ///< Maximum number of channels
+#endif
+
+/**
+ * @name Option constants
+ * These constants are used for the @ref avoptions interface for lswr.
+ * @{
+ *
+ */
+
+#define SWR_FLAG_RESAMPLE 1 ///< Force resampling even if equal sample rate
+//TODO use int resample ?
+//long term TODO can we enable this dynamically?
+
+/** Dithering algorithms */
+enum SwrDitherType {
+    SWR_DITHER_NONE = 0,
+    SWR_DITHER_RECTANGULAR,
+    SWR_DITHER_TRIANGULAR,
+    SWR_DITHER_TRIANGULAR_HIGHPASS,
+
+    SWR_DITHER_NS = 64,         ///< not part of API/ABI
+    SWR_DITHER_NS_LIPSHITZ,
+    SWR_DITHER_NS_F_WEIGHTED,
+    SWR_DITHER_NS_MODIFIED_E_WEIGHTED,
+    SWR_DITHER_NS_IMPROVED_E_WEIGHTED,
+    SWR_DITHER_NS_SHIBATA,
+    SWR_DITHER_NS_LOW_SHIBATA,
+    SWR_DITHER_NS_HIGH_SHIBATA,
+    SWR_DITHER_NB,              ///< not part of API/ABI
+};
+
+/** Resampling Engines */
+enum SwrEngine {
+    SWR_ENGINE_SWR,             /**< SW Resampler */
+    SWR_ENGINE_SOXR,            /**< SoX Resampler */
+    SWR_ENGINE_NB,              ///< not part of API/ABI
+};
+
+/** Resampling Filter Types */
+enum SwrFilterType {
+    SWR_FILTER_TYPE_CUBIC,              /**< Cubic */
+    SWR_FILTER_TYPE_BLACKMAN_NUTTALL,   /**< Blackman Nuttall windowed sinc */
+    SWR_FILTER_TYPE_KAISER,             /**< Kaiser windowed sinc */
+};
+
+/**
+ * @}
+ */
+
+/**
+ * The libswresample context. Unlike libavcodec and libavformat, this structure
+ * is opaque. This means that if you would like to set options, you must use
+ * the @ref avoptions API and cannot directly set values to members of the
+ * structure.
+ */
+typedef struct SwrContext SwrContext;
+
+/**
+ * Get the AVClass for SwrContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ * @return the AVClass of SwrContext
+ */
+const AVClass *swr_get_class(void);
+
+/**
+ * @name SwrContext constructor functions
+ * @{
+ */
+
+/**
+ * Allocate SwrContext.
+ *
+ * If you use this function you will need to set the parameters (manually or
+ * with swr_alloc_set_opts()) before calling swr_init().
+ *
+ * @see swr_alloc_set_opts(), swr_init(), swr_free()
+ * @return NULL on error, allocated context otherwise
+ */
+struct SwrContext *swr_alloc(void);
+
+/**
+ * Initialize context after user parameters have been set.
+ * @note The context must be configured using the AVOption API.
+ *
+ * @see av_opt_set_int()
+ * @see av_opt_set_dict()
+ *
+ * @param[in,out]   s Swr context to initialize
+ * @return AVERROR error code in case of failure.
+ */
+int swr_init(struct SwrContext *s);
+
+/**
+ * Check whether an swr context has been initialized or not.
+ *
+ * @param[in]       s Swr context to check
+ * @see swr_init()
+ * @return positive if it has been initialized, 0 if not initialized
+ */
+int swr_is_initialized(struct SwrContext *s);
+
+/**
+ * Allocate SwrContext if needed and set/reset common parameters.
+ *
+ * This function does not require s to be allocated with swr_alloc(). On the
+ * other hand, swr_alloc() can use swr_alloc_set_opts() to set the parameters
+ * on the allocated context.
+ *
+ * @param s               existing Swr context if available, or NULL if not
+ * @param out_ch_layout   output channel layout (AV_CH_LAYOUT_*)
+ * @param out_sample_fmt  output sample format (AV_SAMPLE_FMT_*).
+ * @param out_sample_rate output sample rate (frequency in Hz)
+ * @param in_ch_layout    input channel layout (AV_CH_LAYOUT_*)
+ * @param in_sample_fmt   input sample format (AV_SAMPLE_FMT_*).
+ * @param in_sample_rate  input sample rate (frequency in Hz)
+ * @param log_offset      logging level offset
+ * @param log_ctx         parent logging context, can be NULL
+ *
+ * @see swr_init(), swr_free()
+ * @return NULL on error, allocated context otherwise
+ */
+struct SwrContext *swr_alloc_set_opts(struct SwrContext *s,
+                                      int64_t out_ch_layout, enum AVSampleFormat out_sample_fmt, int out_sample_rate,
+                                      int64_t  in_ch_layout, enum AVSampleFormat  in_sample_fmt, int  in_sample_rate,
+                                      int log_offset, void *log_ctx);
+
+/**
+ * @}
+ *
+ * @name SwrContext destructor functions
+ * @{
+ */
+
+/**
+ * Free the given SwrContext and set the pointer to NULL.
+ *
+ * @param[in] s a pointer to a pointer to Swr context
+ */
+void swr_free(struct SwrContext **s);
+
+/**
+ * Closes the context so that swr_is_initialized() returns 0.
+ *
+ * The context can be brought back to life by running swr_init(),
+ * swr_init() can also be used without swr_close().
+ * This function is mainly provided for simplifying the usecase
+ * where one tries to support libavresample and libswresample.
+ *
+ * @param[in,out] s Swr context to be closed
+ */
+void swr_close(struct SwrContext *s);
+
+/**
+ * @}
+ *
+ * @name Core conversion functions
+ * @{
+ */
+
+/** Convert audio.
+ *
+ * in and in_count can be set to 0 to flush the last few samples out at the
+ * end.
+ *
+ * If more input is provided than output space, then the input will be buffered.
+ * You can avoid this buffering by using swr_get_out_samples() to retrieve an
+ * upper bound on the required number of output samples for the given number of
+ * input samples. Conversion will run directly without copying whenever possible.
+ *
+ * @param s         allocated Swr context, with parameters set
+ * @param out       output buffers, only the first one need be set in case of packed audio
+ * @param out_count amount of space available for output in samples per channel
+ * @param in        input buffers, only the first one need to be set in case of packed audio
+ * @param in_count  number of input samples available in one channel
+ *
+ * @return number of samples output per channel, negative value on error
+ */
+int swr_convert(struct SwrContext *s, uint8_t **out, int out_count,
+                                const uint8_t **in , int in_count);
+
+/**
+ * Convert the next timestamp from input to output.
+ * Timestamps are in 1/(in_sample_rate * out_sample_rate) units.
+ *
+ * @note There are 2 slightly differently behaving modes.
+ *       @li When automatic timestamp compensation is not used
+ *           (min_compensation >= FLT_MAX), timestamps will be passed through
+ *           with delays compensated.
+ *       @li When automatic timestamp compensation is used
+ *           (min_compensation < FLT_MAX), the output timestamps will match
+ *           output sample numbers.
+ *           See ffmpeg-resampler(1) for the two modes of compensation.
+ *
+ * @param[in] s     initialized Swr context
+ * @param[in] pts   timestamp for the next input sample, INT64_MIN if unknown
+ * @see swr_set_compensation(), swr_drop_output(), and swr_inject_silence() are
+ *      functions used internally for timestamp compensation.
+ * @return the output timestamp for the next output sample
+ */
+int64_t swr_next_pts(struct SwrContext *s, int64_t pts);
+
+/**
+ * @}
+ *
+ * @name Low-level option setting functions
+ * These functions provide a means to set low-level options that cannot be set
+ * with the AVOption API.
+ * @{
+ */
+
+/**
+ * Activate resampling compensation ("soft" compensation). This function is
+ * internally called when needed in swr_next_pts().
+ *
+ * @param[in,out] s             allocated Swr context. If it is not initialized,
+ *                              or SWR_FLAG_RESAMPLE is not set, swr_init() is
+ *                              called with the flag set.
+ * @param[in]     sample_delta  delta in PTS per sample
+ * @param[in]     compensation_distance number of samples to compensate for
+ * @return    >= 0 on success, AVERROR error codes if:
+ *            @li @c s is NULL,
+ *            @li @c compensation_distance is less than 0,
+ *            @li @c compensation_distance is 0 but sample_delta is not,
+ *            @li compensation unsupported by resampler, or
+ *            @li swr_init() fails when called.
+ */
+int swr_set_compensation(struct SwrContext *s, int sample_delta, int compensation_distance);
+
+/**
+ * Set a customized input channel mapping.
+ *
+ * @param[in,out] s           allocated Swr context, not yet initialized
+ * @param[in]     channel_map customized input channel mapping (array of channel
+ *                            indexes, -1 for a muted channel)
+ * @return >= 0 on success, or AVERROR error code in case of failure.
+ */
+int swr_set_channel_mapping(struct SwrContext *s, const int *channel_map);
+
+/**
+ * Generate a channel mixing matrix.
+ *
+ * This function is the one used internally by libswresample for building the
+ * default mixing matrix. It is made public just as a utility function for
+ * building custom matrices.
+ *
+ * @param in_layout           input channel layout
+ * @param out_layout          output channel layout
+ * @param center_mix_level    mix level for the center channel
+ * @param surround_mix_level  mix level for the surround channel(s)
+ * @param lfe_mix_level       mix level for the low-frequency effects channel
+ * @param rematrix_maxval     if 1.0, coefficients will be normalized to prevent
+ *                            overflow. if INT_MAX, coefficients will not be
+ *                            normalized.
+ * @param rematrix_volume     NOTE(review): not documented in this header;
+ *                            presumably a volume scale applied to the
+ *                            coefficients - confirm against the libswresample
+ *                            implementation before relying on it.
+ * @param[out] matrix         mixing coefficients; matrix[i + stride * o] is
+ *                            the weight of input channel i in output channel o.
+ * @param stride              offset between lines of the matrix array, i.e.
+ *                            between the coefficients of adjacent output
+ *                            channels (see the indexing formula for matrix)
+ * @param matrix_encoding     matrixed stereo downmix mode (e.g. dplii)
+ * @param log_ctx             parent logging context, can be NULL
+ * @return                    0 on success, negative AVERROR code on failure
+ */
+int swr_build_matrix(uint64_t in_layout, uint64_t out_layout,
+                     double center_mix_level, double surround_mix_level,
+                     double lfe_mix_level, double rematrix_maxval,
+                     double rematrix_volume, double *matrix,
+                     int stride, enum AVMatrixEncoding matrix_encoding,
+                     void *log_ctx);
+
+/**
+ * Set a customized remix matrix.
+ *
+ * @param s       allocated Swr context, not yet initialized
+ * @param matrix  remix coefficients; matrix[i + stride * o] is
+ *                the weight of input channel i in output channel o
+ * @param stride  offset between lines of the matrix
+ * @return  >= 0 on success, or AVERROR error code in case of failure.
+ */
+int swr_set_matrix(struct SwrContext *s, const double *matrix, int stride);
+
+/**
+ * @}
+ *
+ * @name Sample handling functions
+ * @{
+ */
+
+/**
+ * Drops the specified number of output samples.
+ *
+ * This function, along with swr_inject_silence(), is called by swr_next_pts()
+ * if needed for "hard" compensation.
+ *
+ * @param s     allocated Swr context
+ * @param count number of samples to be dropped
+ *
+ * @return >= 0 on success, or a negative AVERROR code on failure
+ */
+int swr_drop_output(struct SwrContext *s, int count);
+
+/**
+ * Injects the specified number of silence samples.
+ *
+ * This function, along with swr_drop_output(), is called by swr_next_pts()
+ * if needed for "hard" compensation.
+ *
+ * @param s     allocated Swr context
+ * @param count number of silence samples to be injected
+ *
+ * @return >= 0 on success, or a negative AVERROR code on failure
+ */
+int swr_inject_silence(struct SwrContext *s, int count);
+
+/**
+ * Gets the delay the next input sample will experience relative to the next output sample.
+ *
+ * Swresample can buffer data if more input has been provided than available
+ * output space, also converting between sample rates needs a delay.
+ * This function returns the sum of all such delays.
+ * The exact delay is not necessarily an integer value in either input or
+ * output sample rate. Especially when downsampling by a large value, the
+ * output sample rate may be a poor choice to represent the delay, similarly
+ * for upsampling and the input sample rate.
+ *
+ * @param s     swr context
+ * @param base  timebase in which the returned delay will be:
+ *              @li if it's set to 1 the returned delay is in seconds
+ *              @li if it's set to 1000 the returned delay is in milliseconds
+ *              @li if it's set to the input sample rate then the returned
+ *                  delay is in input samples
+ *              @li if it's set to the output sample rate then the returned
+ *                  delay is in output samples
+ *              @li if it's the least common multiple of in_sample_rate and
+ *                  out_sample_rate then an exact rounding-free delay will be
+ *                  returned
+ * @returns     the delay in 1 / @c base units.
+ */
+int64_t swr_get_delay(struct SwrContext *s, int64_t base);
+
+/**
+ * Find an upper bound on the number of samples that the next swr_convert()
+ * call will output, if called with in_samples of input samples. This
+ * depends on the internal state, and anything changing the internal state
+ * (like further swr_convert() calls) may change the number of samples
+ * swr_get_out_samples() returns for the same number of input samples.
+ *
+ * @param s             Swr context whose internal state determines the bound
+ * @param in_samples    number of input samples.
+ * @note any call to swr_inject_silence(), swr_convert(), swr_next_pts()
+ *       or swr_set_compensation() invalidates this limit
+ * @note it is recommended to pass the correct available buffer size
+ *       to all functions like swr_convert() even if swr_get_out_samples()
+ *       indicates that less would be used.
+ * @returns an upper bound on the number of samples that the next swr_convert
+ *          will output or a negative value to indicate an error
+ */
+int swr_get_out_samples(struct SwrContext *s, int in_samples);
+
+/**
+ * @}
+ *
+ * @name Configuration accessors
+ * @{
+ */
+
+/**
+ * Return the @ref LIBSWRESAMPLE_VERSION_INT constant.
+ *
+ * This is useful to check if the build-time libswresample has the same version
+ * as the run-time one.
+ *
+ * @returns     the unsigned int-typed version
+ */
+unsigned swresample_version(void);
+
+/**
+ * Return the swr build-time configuration.
+ *
+ * @returns     the build-time @c ./configure flags
+ */
+const char *swresample_configuration(void);
+
+/**
+ * Return the swr license.
+ *
+ * @returns     the license of libswresample, determined at build-time
+ */
+const char *swresample_license(void);
+
+/**
+ * @}
+ *
+ * @name AVFrame based API
+ * @{
+ */
+
+/**
+ * Convert the samples in the input AVFrame and write them to the output AVFrame.
+ *
+ * Input and output AVFrames must have channel_layout, sample_rate and format set.
+ *
+ * If the output AVFrame does not have the data pointers allocated, the
+ * nb_samples field will be set and av_frame_get_buffer() is called to
+ * allocate the frame.
+ *
+ * The output AVFrame can be NULL or have fewer allocated samples than required.
+ * In this case, any remaining samples not written to the output will be added
+ * to an internal FIFO buffer, to be returned at the next call to this function
+ * or to swr_convert().
+ *
+ * If converting sample rate, there may be data remaining in the internal
+ * resampling delay buffer. swr_get_delay() tells the number of
+ * remaining samples. To get this data as output, call this function or
+ * swr_convert() with NULL input.
+ *
+ * If the SwrContext configuration does not match the output and
+ * input AVFrame settings, the conversion does not take place and, depending on
+ * which AVFrame is not matching, AVERROR_OUTPUT_CHANGED, AVERROR_INPUT_CHANGED
+ * or the result of a bitwise-OR of them is returned.
+ *
+ * @see swr_convert()
+ * @see swr_get_delay()
+ *
+ * @param swr             audio resample context
+ * @param output          output AVFrame
+ * @param input           input AVFrame
+ * @return                0 on success, AVERROR on failure or nonmatching
+ *                        configuration.
+ */
+int swr_convert_frame(SwrContext *swr,
+                      AVFrame *output, const AVFrame *input);
+
+/**
+ * Configure or reconfigure the SwrContext using the information
+ * provided by the AVFrames.
+ *
+ * The original resampling context is reset even on failure.
+ * The function calls swr_close() internally if the context is open.
+ *
+ * @see swr_close()
+ *
+ * @param swr             audio resample context
+ * @param out             output AVFrame
+ * @param in              input AVFrame
+ * @return                0 on success, AVERROR on failure.
+ */
+int swr_config_frame(SwrContext *swr, const AVFrame *out, const AVFrame *in);
+
+/**
+ * @}
+ * @}
+ */
+
+#endif /* SWRESAMPLE_SWRESAMPLE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/version.h
new file mode 100644
index 0000000..fb76f56
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswresample/version.h
@@ -0,0 +1,45 @@
+/*
+ * Version macros.
+ *
+ * This file is part of libswresample
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWRESAMPLE_VERSION_H
+#define SWRESAMPLE_VERSION_H
+
+/**
+ * @file
+ * Libswresample version macros
+ */
+
+#include "libavutil/avutil.h"
+
+#define LIBSWRESAMPLE_VERSION_MAJOR   2
+#define LIBSWRESAMPLE_VERSION_MINOR   7
+#define LIBSWRESAMPLE_VERSION_MICRO 100
+
+#define LIBSWRESAMPLE_VERSION_INT  AV_VERSION_INT(LIBSWRESAMPLE_VERSION_MAJOR, \
+                                                  LIBSWRESAMPLE_VERSION_MINOR, \
+                                                  LIBSWRESAMPLE_VERSION_MICRO)
+#define LIBSWRESAMPLE_VERSION      AV_VERSION(LIBSWRESAMPLE_VERSION_MAJOR, \
+                                              LIBSWRESAMPLE_VERSION_MINOR, \
+                                              LIBSWRESAMPLE_VERSION_MICRO)
+#define LIBSWRESAMPLE_BUILD        LIBSWRESAMPLE_VERSION_INT
+
+#define LIBSWRESAMPLE_IDENT        "SwR" AV_STRINGIFY(LIBSWRESAMPLE_VERSION)
+
+#endif /* SWRESAMPLE_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/swscale.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/swscale.h
new file mode 100644
index 0000000..7713f51
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/swscale.h
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_SWSCALE_H
+#define SWSCALE_SWSCALE_H
+
+/**
+ * @file
+ * @ingroup libsws
+ * external API header
+ */
+
+#include <stdint.h>
+
+#include "libavutil/avutil.h"
+#include "libavutil/log.h"
+#include "libavutil/pixfmt.h"
+#include "version.h"
+
+/**
+ * @defgroup libsws libswscale
+ * Color conversion and scaling library.
+ *
+ * @{
+ *
+ * Return the LIBSWSCALE_VERSION_INT constant.
+ */
+unsigned swscale_version(void);
+
+/**
+ * Return the libswscale build-time configuration.
+ */
+const char *swscale_configuration(void);
+
+/**
+ * Return the libswscale license.
+ */
+const char *swscale_license(void);
+
+/* values for the flags, the stuff on the command line is different */
+#define SWS_FAST_BILINEAR     1
+#define SWS_BILINEAR          2
+#define SWS_BICUBIC           4
+#define SWS_X                 8
+#define SWS_POINT          0x10
+#define SWS_AREA           0x20
+#define SWS_BICUBLIN       0x40
+#define SWS_GAUSS          0x80
+#define SWS_SINC          0x100
+#define SWS_LANCZOS       0x200
+#define SWS_SPLINE        0x400
+
+#define SWS_SRC_V_CHR_DROP_MASK     0x30000
+#define SWS_SRC_V_CHR_DROP_SHIFT    16
+
+#define SWS_PARAM_DEFAULT           123456
+
+#define SWS_PRINT_INFO              0x1000
+
+//the following 3 flags are not completely implemented
+//internal chrominance subsampling info
+#define SWS_FULL_CHR_H_INT    0x2000
+//input subsampling info
+#define SWS_FULL_CHR_H_INP    0x4000
+#define SWS_DIRECT_BGR        0x8000
+#define SWS_ACCURATE_RND      0x40000
+#define SWS_BITEXACT          0x80000
+#define SWS_ERROR_DIFFUSION  0x800000
+
+#define SWS_MAX_REDUCE_CUTOFF 0.002
+
+#define SWS_CS_ITU709         1
+#define SWS_CS_FCC            4
+#define SWS_CS_ITU601         5
+#define SWS_CS_ITU624         5
+#define SWS_CS_SMPTE170M      5
+#define SWS_CS_SMPTE240M      7
+#define SWS_CS_DEFAULT        5
+#define SWS_CS_BT2020         9
+
+/**
+ * Return a pointer to yuv<->rgb coefficients for the given colorspace
+ * suitable for sws_setColorspaceDetails().
+ *
+ * @param colorspace One of the SWS_CS_* macros. If invalid,
+ * SWS_CS_DEFAULT is used.
+ */
+const int *sws_getCoefficients(int colorspace);
+
+// when used for filters they must have an odd number of elements
+// coeffs cannot be shared between vectors
+typedef struct SwsVector {
+    double *coeff;              ///< pointer to the list of coefficients
+    int length;                 ///< number of coefficients in the vector
+} SwsVector;
+
+// vectors can be shared
+typedef struct SwsFilter {
+    SwsVector *lumH;
+    SwsVector *lumV;
+    SwsVector *chrH;
+    SwsVector *chrV;
+} SwsFilter;
+
+struct SwsContext;
+
+/**
+ * Return a positive value if pix_fmt is a supported input format, 0
+ * otherwise.
+ */
+int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
+
+/**
+ * Return a positive value if pix_fmt is a supported output format, 0
+ * otherwise.
+ */
+int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
+
+/**
+ * Check whether an endianness conversion is supported for a pixel format.
+ *
+ * @param[in]  pix_fmt the pixel format
+ * @return a positive value if an endianness conversion for pix_fmt is
+ * supported, 0 otherwise.
+ */
+int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
+
+/**
+ * Allocate an empty SwsContext. This must be filled and passed to
+ * sws_init_context(). For filling see AVOptions, options.c and
+ * sws_setColorspaceDetails().
+ */
+struct SwsContext *sws_alloc_context(void);
+
+/**
+ * Initialize the swscaler context sws_context.
+ *
+ * @return zero or positive value on success, a negative value on
+ * error
+ */
+av_warn_unused_result
+int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
+
+/**
+ * Free the swscaler context swsContext.
+ * If swsContext is NULL, then does nothing.
+ */
+void sws_freeContext(struct SwsContext *swsContext);
+
+/**
+ * Allocate and return an SwsContext. You need it to perform
+ * scaling/conversion operations using sws_scale().
+ *
+ * @param srcW the width of the source image
+ * @param srcH the height of the source image
+ * @param srcFormat the source image format
+ * @param dstW the width of the destination image
+ * @param dstH the height of the destination image
+ * @param dstFormat the destination image format
+ * @param flags specify which algorithm and options to use for rescaling
+ * @param srcFilter NOTE(review): not documented in this header; presumably an
+ *              optional filter applied on the source side - confirm against
+ *              the libswscale implementation (including whether NULL is
+ *              accepted)
+ * @param dstFilter NOTE(review): not documented in this header; presumably an
+ *              optional filter applied on the destination side - confirm
+ *              against the libswscale implementation
+ * @param param extra parameters to tune the used scaler
+ *              For SWS_BICUBIC param[0] and [1] tune the shape of the basis
+ *              function, param[0] tunes f(1) and param[1] f´(1)
+ *              For SWS_GAUSS param[0] tunes the exponent and thus cutoff
+ *              frequency
+ *              For SWS_LANCZOS param[0] tunes the width of the window function
+ * @return a pointer to an allocated context, or NULL in case of error
+ * @note this function is to be removed after a saner alternative is
+ *       written
+ */
+struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
+                                  int dstW, int dstH, enum AVPixelFormat dstFormat,
+                                  int flags, SwsFilter *srcFilter,
+                                  SwsFilter *dstFilter, const double *param);
+
+/**
+ * Scale the image slice in srcSlice and put the resulting scaled
+ * slice in the image in dst. A slice is a sequence of consecutive
+ * rows in an image.
+ *
+ * Slices have to be provided in sequential order, either in
+ * top-bottom or bottom-top order. If slices are provided in
+ * non-sequential order the behavior of the function is undefined.
+ *
+ * @param c         the scaling context previously created with
+ *                  sws_getContext()
+ * @param srcSlice  the array containing the pointers to the planes of
+ *                  the source slice
+ * @param srcStride the array containing the strides for each plane of
+ *                  the source image
+ * @param srcSliceY the position in the source image of the slice to
+ *                  process, that is the number (counted starting from
+ *                  zero) in the image of the first row of the slice
+ * @param srcSliceH the height of the source slice, that is the number
+ *                  of rows in the slice
+ * @param dst       the array containing the pointers to the planes of
+ *                  the destination image
+ * @param dstStride the array containing the strides for each plane of
+ *                  the destination image
+ * @return          the height of the output slice
+ */
+int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
+              const int srcStride[], int srcSliceY, int srcSliceH,
+              uint8_t *const dst[], const int dstStride[]);
+
+/**
+ * @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
+ * @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
+ * @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
+ * @param brightness 16.16 fixed point brightness correction
+ * @param contrast 16.16 fixed point contrast correction
+ * @param saturation 16.16 fixed point saturation correction
+ * @return -1 if not supported
+ */
+int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
+                             int srcRange, const int table[4], int dstRange,
+                             int brightness, int contrast, int saturation);
+
+/**
+ * @return -1 if not supported
+ */
+int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
+                             int *srcRange, int **table, int *dstRange,
+                             int *brightness, int *contrast, int *saturation);
+
+/**
+ * Allocate and return an uninitialized vector with length coefficients.
+ */
+SwsVector *sws_allocVec(int length);
+
+/**
+ * Return a normalized Gaussian curve used for filtering.
+ * quality = 3 is high quality; lower values give lower quality.
+ */
+SwsVector *sws_getGaussianVec(double variance, double quality);
+
+/**
+ * Scale all the coefficients of a by the scalar value.
+ */
+void sws_scaleVec(SwsVector *a, double scalar);
+
+/**
+ * Scale all the coefficients of a so that their sum equals height.
+ */
+void sws_normalizeVec(SwsVector *a, double height);
+
+#if FF_API_SWS_VECTOR
+attribute_deprecated SwsVector *sws_getConstVec(double c, int length);
+attribute_deprecated SwsVector *sws_getIdentityVec(void);
+attribute_deprecated void sws_convVec(SwsVector *a, SwsVector *b);
+attribute_deprecated void sws_addVec(SwsVector *a, SwsVector *b);
+attribute_deprecated void sws_subVec(SwsVector *a, SwsVector *b);
+attribute_deprecated void sws_shiftVec(SwsVector *a, int shift);
+attribute_deprecated SwsVector *sws_cloneVec(SwsVector *a);
+attribute_deprecated void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
+#endif
+
+void sws_freeVec(SwsVector *a);
+
+SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
+                                float lumaSharpen, float chromaSharpen,
+                                float chromaHShift, float chromaVShift,
+                                int verbose);
+void sws_freeFilter(SwsFilter *filter);
+
+/**
+ * Check if context can be reused, otherwise reallocate a new one.
+ *
+ * If context is NULL, just calls sws_getContext() to get a new
+ * context. Otherwise, checks if the parameters are the ones already
+ * saved in context. If that is the case, returns the current
+ * context. Otherwise, frees context and gets a new context with
+ * the new parameters.
+ *
+ * Be warned that srcFilter and dstFilter are not checked, they
+ * are assumed to remain the same.
+ */
+struct SwsContext *sws_getCachedContext(struct SwsContext *context,
+                                        int srcW, int srcH, enum AVPixelFormat srcFormat,
+                                        int dstW, int dstH, enum AVPixelFormat dstFormat,
+                                        int flags, SwsFilter *srcFilter,
+                                        SwsFilter *dstFilter, const double *param);
+
+/**
+ * Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
+ *
+ * The output frame will have the same packed format as the palette.
+ *
+ * @param src        source frame buffer
+ * @param dst        destination frame buffer
+ * @param num_pixels number of pixels to convert
+ * @param palette    array with [256] entries, which must match color arrangement (RGB or BGR) of src
+ */
+void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
+
+/**
+ * Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
+ *
+ * With the palette format "ABCD", the destination frame ends up with the format "ABC".
+ *
+ * @param src        source frame buffer
+ * @param dst        destination frame buffer
+ * @param num_pixels number of pixels to convert
+ * @param palette    array with [256] entries, which must match color arrangement (RGB or BGR) of src
+ */
+void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
+
+/**
+ * Get the AVClass for swsContext. It can be used in combination with
+ * AV_OPT_SEARCH_FAKE_OBJ for examining options.
+ *
+ * @see av_opt_find().
+ */
+const AVClass *sws_get_class(void);
+
+/**
+ * @}
+ */
+
+#endif /* SWSCALE_SWSCALE_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/version.h b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/version.h
new file mode 100644
index 0000000..c1090ca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ffmpeg/libswscale/version.h
@@ -0,0 +1,53 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef SWSCALE_VERSION_H
+#define SWSCALE_VERSION_H
+
+/**
+ * @file
+ * swscale version macros
+ */
+
+#include "libavutil/version.h"
+
+#define LIBSWSCALE_VERSION_MAJOR   4
+#define LIBSWSCALE_VERSION_MINOR   6
+#define LIBSWSCALE_VERSION_MICRO 100
+
+#define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
+                                               LIBSWSCALE_VERSION_MINOR, \
+                                               LIBSWSCALE_VERSION_MICRO)
+#define LIBSWSCALE_VERSION      AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
+                                           LIBSWSCALE_VERSION_MINOR, \
+                                           LIBSWSCALE_VERSION_MICRO)
+#define LIBSWSCALE_BUILD        LIBSWSCALE_VERSION_INT
+
+#define LIBSWSCALE_IDENT        "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
+
+/**
+ * FF_API_* defines may be placed below to indicate public API that will be
+ * dropped at a future version bump. The defines themselves are not part of
+ * the public API and may change, break or disappear at any time.
+ */
+
+#ifndef FF_API_SWS_VECTOR
+#define FF_API_SWS_VECTOR            (LIBSWSCALE_VERSION_MAJOR < 6)
+#endif
+
+#endif /* SWSCALE_VERSION_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// we have some optimized kernels that may overread buffer a bit in loop
+// it is common to interleave next-loop data load with arithmetic instructions
+// allocating more bytes keeps us safe from SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Aligns a pointer to the specified number of bytes
+// ptr Pointer to align; the aligned (rounded-up) pointer is returned
+// n Alignment size that must be a power of two
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater than or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n;
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size)
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
+    adata[-1] = udata;
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr)
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1];
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// exchange-add operation for atomic operations on reference counters
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0;
+    virtual void fastFree(void* ptr) = 0;
+};
+
+class PoolAllocatorPrivate;
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&);
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d;
+};
+
+class UnlockedPoolAllocatorPrivate;
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d;
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate;
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&);
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d;
+};
+
+class VkWeightAllocatorPrivate;
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&);
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d;
+};
+
+class VkStagingAllocatorPrivate;
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // ratio range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&);
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d;
+};
+
+class VkWeightStagingAllocatorPrivate;
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init();
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb;
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get the current timestamp in ms
+NCNN_EXPORT double get_current_time();
+
+// sleep milliseconds
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // empty
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as output
+    int producer;
+    // index of the layer that needs this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis; /* untyped context pointer; presumably the backing implementation object - TODO confirm */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* allocation callback; receives the allocator handle and a size */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr); /* release callback; receives the allocator handle and a pointer */
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) ((X) | ((Y) << 16)) /* packs X into the low bits and Y shifted left by 16; arguments are parenthesized so expression arguments expand with the intended precedence */
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis; /* untyped context pointer; presumably the backing implementation object - TODO confirm */
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* text scan callback; member only exists when NCNN_STRING is enabled */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* binary read callback; return value is presumably the number of bytes read - TODO confirm */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis; /* untyped context pointer; presumably the backing implementation object - TODO confirm */
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* load callback taking one extent (w); meaning of `type` is not shown in this header */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type); /* load callback taking two extents (w, h) */
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type); /* load callback taking three extents (w, h, c) */
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis; /* untyped context pointer; presumably the backing implementation object - TODO confirm */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd); /* callback receiving a parameter dictionary handle */
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb); /* callback receiving a model binary handle */
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt); /* paired with destroy_pipeline below */
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* one input mat, one output mat written through top_blob */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* arrays of mats; presumably n inputs and n2 outputs - TODO confirm */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* single mat used as both input and output */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* array of mats used as both input and output; presumably n entries - TODO confirm */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t; /* handle type: pointer to the struct declared below */
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator; /* function pointer: takes userdata, returns a layer handle */
+    ncnn_layer_destroyer_t destroyer; /* function pointer: takes a layer handle and userdata */
+    void* userdata; /* untyped pointer; presumably the value passed to creator/destroyer - TODO confirm */
+    ncnn_net_custom_layer_factory_t next; /* pointer to another factory record; presumably a singly linked list - TODO confirm */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t; /* handle type: pointer to the struct declared below */
+struct __ncnn_net_t
+{
+    void* pthis; /* untyped context pointer; presumably the backing implementation object - TODO confirm */
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* handle to a custom layer factory record (type declared above in this header) */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT    0
+#define NCNN_BORDER_REPLICATE   1
+#define NCNN_BORDER_REFLECT     2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkCompute // declares record_* operations over Mat / VkMat / VkImageMat plus submit_and_wait() and reset()
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public: // record_* overloads, one per source/destination type combination
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // returns int; presumably 0 on success - TODO confirm against the implementation
+
+    int reset(); // returns int; presumably 0 on success - TODO confirm against the implementation
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev; // device pointer; same type as the constructor argument
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // pointer to the forward-declared VkComputePrivate; the pointer itself is const
+};
+
+class VkTransferPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkTransfer // declares record_upload() from Mat to VkMat / VkImageMat plus submit_and_wait()
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true; its effect is defined in the implementation, not in this header
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait(); // returns int; presumably 0 on success - TODO confirm against the implementation
+
+protected:
+    const VulkanDevice* vkdev; // device pointer; same type as the constructor argument
+
+private:
+    VkTransferPrivate* const d; // pointer to the forward-declared VkTransferPrivate; the pointer itself is const
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of cpus that can be enabled, disabled and queried individually by index
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public: // platform-specific storage; which member exists depends on the target platform macros below
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif // _WIN32 && !__MINGW32__
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set;
+#endif // __ANDROID__ || __linux__
+#if __APPLE__
+    unsigned int policy;
+#endif // __APPLE__
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads to the little clusters if powersave is enabled
+// affects HMP architecture cpus such as ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled (default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// the setter function returns 0 on success
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper: base class with virtual scan / read / reference entry points
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text
+    // returns 1 if the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get a reference to model data
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader constructed from a FILE*; only compiled when NCNN_STDIO is enabled
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // redeclares DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // redeclares DataReader::read
+
+private: // copy operations are declared private, presumably to make the class non-copyable
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // pointer to the forward-declared private type; the pointer itself is const
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader constructed from an in-memory byte pointer
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // takes the pointer by non-const reference; presumably advanced as data is consumed - TODO confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // redeclares DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // redeclares DataReader::read
+    virtual size_t reference(size_t size, const void** buf) const; // redeclares DataReader::reference
+
+private: // copy operations are declared private, presumably to make the class non-copyable
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // pointer to the forward-declared private type; the pointer itself is const
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader constructed from an AAsset*; only compiled for NCNN_PLATFORM_API with __ANDROID_API__ >= 9
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset);
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // redeclares DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // redeclares DataReader::read
+
+private: // copy operations are declared private, presumably to make the class non-copyable
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // pointer to the forward-declared private type; the pointer itself is const
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create the global VkInstance and initialize the objects needed for GPU computation.
+// This creates the VkInstance, checks which extensions the Vulkan instance supports,
+// sets up the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance(); // NOTE(review): return convention is not documented here - presumably 0 on success; confirm
+
+// Get the global VkInstance
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy the VkInstance and free the memory of the associated objects
+// Usually called once, when the main program exits
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability flags, one int per extension (NOTE(review): presumably nonzero when the extension is available - confirm)
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix function pointer
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities function pointer
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2 function pointers
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2 function pointers
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface function pointers
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface function pointer, only declared for Android API level 26 and above
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix function pointer
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info: gpu count and the index used as the default device
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index(); // also used as the default argument of get_gpu_info(), VulkanDevice() and get_gpu_device()
+
+class GpuInfoPrivate; // opaque implementation type, only forward-declared here
+class NCNN_EXPORT GpuInfo
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device handle
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties of the physical device
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // device identification: versions, vendor/device id, name, pipeline cache uuid
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const; // returns one of the device type codes listed above
+
+    // hardware limits
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family index for compute / graphics / transfer
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const; // queue counts for the same three categories
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property (NOTE(review): presumably true when compute and transfer use the same queue family - confirm)
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup size and supported subgroup operations
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // known-bug flags (bug is not feature)
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 features
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix features
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // per-device extension capability, one query per extension (these return int, not bool)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const; // only declared for Android API level 26 and above
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private: // copying is disabled: copy constructor and copy assignment are declared private
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // grants create_gpu_instance() access to the private members (NOTE(review): presumably so it can fill in d - confirm)
+    GpuInfoPrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); // the default argument is evaluated per call, so omitting device_index calls get_default_gpu_index()
+
+class VkAllocator; // forward declarations of types used only by pointer or reference in this class
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // opaque implementation type, only forward-declared here
+class NCNN_EXPORT VulkanDevice
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // not marked explicit; omitting device_index calls get_default_gpu_index()
+    ~VulkanDevice();
+
+    const GpuInfo& info; // reference member (NOTE(review): presumably bound to the GpuInfo of device_index - confirm)
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // same as above, but with a fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helpers for creating a pipeline; results are written through the trailing pointer argument
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // NOTE(review): presumably each acquired queue must be handed back with reclaim_queue() - confirm
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocators on this device, in acquire / reclaim pairs
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer and dummy images
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation for the given shape
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operators: convert packing between buffer (VkMat) and image (VkImageMat) storage, all four src/dst combinations
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (this and the following groups are public function pointer members, one group per extension)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_buffer_device_address
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
+    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
+    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+    // VK_EXT_buffer_device_address
+    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer, these members only exist for Android API level 26 and above
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension setup (NOTE(review): presumably fills in the function pointer members above - confirm)
+    int init_device_extension();
+
+private: // copying is disabled: copy constructor and copy assignment are declared private
+    VulkanDevice(const VulkanDevice&);
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); // omitting device_index calls get_default_gpu_index(); NOTE(review): ownership of the returned pointer is not visible here - confirm
+
+// online spirv compilation, three overloads that all write the result into spirv
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv); // source given as a string without a length argument
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv); // source given as data plus explicit size
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv); // source selected by shader type index
+
+// info from spirv; this is the output argument type of resolve_shader_info()
+class NCNN_EXPORT ShaderInfo
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // binding type codes: 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // fixed capacity of 16 entries (upstream note: 16 is large enough I think ...)
+
+    int reserved_0; // reserved fields, no meaning documented in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); // shader_info is the output argument; NOTE(review): presumably returns 0 on success - confirm
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer.h
new file mode 100644
index 0000000..f0418a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer.h
@@ -0,0 +1,222 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer
+{
+public:
+    // default constructor
+    Layer();
+    // virtual destructor, so a derived layer can be deleted through a Layer*
+    virtual ~Layer();
+
+    // load layer specific parameters from the parsed dict
+    // return 0 on success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from the model binary
+    // return 0 on success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 on success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup
+    // return 0 on success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // the layer has exactly one input blob and one output blob
+    bool one_blob_only;
+
+    // supports inplace inference
+    bool support_inplace;
+
+    // supports vulkan compute
+    bool support_vulkan;
+
+    // accepts input blobs with packed storage
+    bool support_packing;
+
+    // accepts bf16
+    bool support_bf16_storage;
+
+    // accepts fp16
+    bool support_fp16_storage;
+
+    // accepts int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // reserved flags from here on, no meaning documented in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // set of disabled features (NOTE(review): presumably a bitmask, going by the name - confirm)
+    int featmask;
+
+public:
+    // implement inference: multi-blob overload first, single-blob overload second
+    // return 0 on success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference: the blob(s) are both input and output
+    // return 0 on success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blobs from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on buffer storage (VkMat)
+    // return 0 on success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on image storage (VkImageMat)
+    // return 0 on success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on buffer storage (VkMat)
+    // return 0 on success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on image storage (VkImageMat)
+    // return 0 on success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // vulkan device, assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name, only present when NCNN_STRING is enabled
+    std::string type;
+    // layer name, only present when NCNN_STRING is enabled
+    std::string name;
+#endif // NCNN_STRING
+    // indexes of the blobs this layer needs as input
+    std::vector<int> bottoms;
+    // indexes of the blobs this layer produces as output
+    std::vector<int> tops;
+    // shape hints for the bottom and top blobs
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function types (NOTE(review): the void* argument is presumably the userdata of the registry entry - confirm)
+typedef Layer* (*layer_creator_func)(void*); // returns the created layer
+typedef void (*layer_destroyer_func)(Layer*, void*); // receives the layer to dispose of
+
+struct layer_registry_entry // registry entry holding a creator only (no destroyer, no userdata)
+{
+#if NCNN_STRING
+    // layer type name, only present when NCNN_STRING is enabled
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // registry entry holding creator, destroyer and userdata
+{
+#if NCNN_STRING
+    // layer type name, only present when NCNN_STRING is enabled
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably passed as the void* argument of creator and destroyer - confirm
+};
+
+struct overwrite_builtin_layer_registry_entry // registry entry keyed by type index instead of by name
+{
+    // layer type index
+    int typeindex;
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably passed as the void* argument of creator and destroyer - confirm
+};
+
+#if NCNN_STRING
+// get the layer type index from the type name (only declared when NCNN_STRING is enabled)
+NCNN_EXPORT int layer_to_index(const char* type);
+// create a layer from its type name (only declared when NCNN_STRING is enabled)
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create a layer from its layer type index
+NCNN_EXPORT Layer* create_layer(int index);
+// DEFINE_LAYER_CREATOR(name) defines name##_layer_creator, matching layer_creator_func: it ignores its void* argument and returns new name
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+// DEFINE_LAYER_DESTROYER(name) defines name##_layer_destroyer, matching layer_destroyer_func: it ignores its void* argument and deletes the layer
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the namespace scopes the enumerators, so they are spelled LayerShaderType::X
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // the enumerator list is textually included from this header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..aac8803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,5 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // the namespace scopes the enumerators, so they are spelled LayerType::X
+enum LayerType
+{
+#include "layer_type_enum.h" // the built-in enumerator list is textually included from this header
+    CustomBit = (1 << 8), // 256 (NOTE(review): presumably OR-ed into a type index to mark custom layers - confirm)
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..97153ed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/layer_type_enum.h
@@ -0,0 +1,109 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+CumulativeSum = 98,
+CopyTo = 99,
+Erf = 100,
+Diag = 101,
+CELU = 102,
+Shrink = 103,
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the multi-dimensional matrix, up to four dimensions (w, h, d, c)
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim (w, h, c)
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube (w, h, d, c)
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy
+    Mat(const Mat& m);
+    // external vec
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16,
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip (the method name keeps its original spelling)
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when data points to user-allocated memory, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+#if NCNN_VULKAN
+
+// the multi-dimensional matrix, up to four dimensions (w, h, d, c), vulkan version
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when data points to user-allocated memory, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+class NCNN_EXPORT VkImageMat
+{
+public:
+    // empty
+    VkImageMat();
+    // vec
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // when data points to user-allocated memory, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameter
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation, e.g. 6 means rotating from orientation 6 to orientation 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameter
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inversion affine transform matrix
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenience wrapper for yuv420sp (nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenience wrapper for yuv420sp (nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenience wrapper for yuv420sp (nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to bfloat16 (brain half) by keeping only the upper 16 bits of the
+// 32-bit float pattern; the lower 16 bits are simply dropped (no rounding)
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float's bits as a 32-bit unsigned integer
+    union
+    {
+        unsigned int u;
+        float f;
+    } bits;
+    bits.f = value;
+
+    // high half : low half = 16 : 16, only the high half is returned
+    const unsigned int high_half = bits.u >> 16;
+    return high_half;
+}
+// convert bfloat16 (brain half) to float: the 16 stored bits become the upper half of
+// the 32-bit float pattern and the lower half is zero
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    // widen to unsigned int before shifting: a bare `value << 16` is evaluated in signed int
+    // after integral promotion, and for inputs with the top bit set (negative bfloat16 values)
+    // the shifted result does not fit in int
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+// Border handling modes.
+// NOTE(review): presumably these are the values passed as the `type` argument of
+// copy_make_border / copy_make_border_3d declared below - confirm against the implementation.
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233, // negative value, distinct from the 0..2 modes above
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+// default constructor: no buffer, no refcount, every dimension and size field zero
+NCNN_FORCEINLINE Mat::Mat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+}
+
+// 1-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// 2-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// 3-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// 4-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// 1-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+// 2-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// 3-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// 4-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// copy constructor: shares m's buffer (no element copy) and bumps the shared refcount via addref()
+NCNN_FORCEINLINE Mat::Mat(const Mat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator),
+      dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c),
+      cstep(m.cstep)
+{
+    addref();
+}
+
+// The constructors below wrap caller-supplied memory. refcount stays 0, so release()
+// never frees _data. elempack is fixed to 1.
+
+// 1-D external data: cstep is the width
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1),
+      cstep(_w)
+{
+}
+
+// 2-D external data: cstep is w * h, computed in size_t
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1),
+      cstep((size_t)_w * _h)
+{
+}
+
+// 3-D external data: each channel's byte size is rounded up with alignSize(..., 16),
+// then converted back to an element count
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c),
+      cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// 4-D external data: same channel-step rounding as the 3-D case, over w * h * d
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c),
+      cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+// The constructors below wrap caller-supplied memory with an explicit elempack.
+// refcount stays 0, so release() never frees _data.
+
+// 1-D external data: cstep is the width
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1),
+      cstep(_w)
+{
+}
+
+// 2-D external data: cstep is w * h, computed in size_t
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1),
+      cstep((size_t)_w * _h)
+{
+}
+
+// 3-D external data: each channel's byte size is rounded up with alignSize(..., 16),
+// then converted back to an element count
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c),
+      cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// 4-D external data: same channel-step rounding as the 3-D case, over w * h * d
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c),
+      cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+// destructor: drops this object's reference through release()
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    this->release();
+}
+
+// set total() float elements of the buffer to _v
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    int remain = (int)total();
+    float* out = (float*)data;
+
+#if __ARM_NEON
+    // store four floats at a time while at least four remain
+    const float32x4_t _vv = vdupq_n_f32(_v);
+    while (remain >= 4)
+    {
+        vst1q_f32(out, _vv);
+        out += 4;
+        remain -= 4;
+    }
+#endif // __ARM_NEON
+    // scalar tail (or the whole buffer without NEON)
+    while (remain > 0)
+    {
+        *out++ = _v;
+        remain--;
+    }
+}
+
+// set total() int elements of the buffer to _v
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    int remain = (int)total();
+    int* out = (int*)data;
+
+#if __ARM_NEON
+    // store four ints at a time while at least four remain
+    const int32x4_t _vv = vdupq_n_s32(_v);
+    while (remain >= 4)
+    {
+        vst1q_s32(out, _vv);
+        out += 4;
+        remain -= 4;
+    }
+#endif // __ARM_NEON
+    // scalar tail (or the whole buffer without NEON)
+    while (remain > 0)
+    {
+        *out++ = _v;
+        remain--;
+    }
+}
+
+#if __ARM_NEON
+// NEON vector fills: each overload stores the whole vector once per element counted by
+// total(), advancing the output pointer by the vector's lane count.
+
+// 4 x float per element
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_f32(out, _v);
+        out += 4;
+    }
+}
+
+// 4 x 16-bit per element
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    unsigned short* out = (unsigned short*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1_u16(out, _v);
+        out += 4;
+    }
+}
+
+// 4 x int per element
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    int* out = (int*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_s32(out, _v);
+        out += 4;
+    }
+}
+
+// 8 x int per element: _v0 goes to the first four ints, _v1 to the next four
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    int* out = (int*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_s32(out, _v0);
+        vst1q_s32(out + 4, _v1);
+        out += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// 4 x fp16 per element: stores _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    __fp16* out = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1_f16(out, _v);
+        out += 4;
+    }
+}
+
+// 8 x fp16 per element: stores _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    __fp16* out = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_f16(out, _v);
+        out += 8;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+// 16 x float per element: unaligned store of _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm512_storeu_ps(out, _v);
+        out += 16;
+    }
+}
+#endif // __AVX512F__
+// 8 x float per element: unaligned store of _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm256_storeu_ps(out, _v);
+        out += 8;
+    }
+}
+#endif // __AVX__
+// 4 x float per element: unaligned store of _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm_storeu_ps(out, _v);
+        out += 4;
+    }
+}
+// 8 x 16-bit per element: stores the 128-bit integer vector _v once per element counted by
+// total(), advancing 8 unsigned shorts (16 bytes) each time
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        // unaligned store, like the other SSE/AVX fill overloads: a Mat may wrap caller-supplied
+        // memory (external-data constructors), so 16-byte alignment of ptr is not guaranteed
+        _mm_storeu_si128((__m128i*)ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+// MSA, 4 x float per element: stores _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        __msa_st_w((v4i32)_v, out, 0);
+        out += 4;
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+// LSX, 4 x float per element: stores _v once per element counted by total()
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        __lsx_vst(_v, out, 0);
+        out += 4;
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+// RISC-V vector fills: packn is cpu_riscv_vlenb() divided by the lane size in bytes;
+// each overload stores _v once per element counted by total(), advancing by packn lanes.
+
+// 32-bit float lanes
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(packn);
+
+    float* out = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse32_v_f32m1(out, _v, vl);
+        out += packn;
+    }
+}
+
+// 16-bit unsigned lanes
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    unsigned short* out = (unsigned short*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse16_v_u16m1(out, _v, vl);
+        out += packn;
+    }
+}
+
+// 8-bit signed lanes
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    signed char* out = (signed char*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse8_v_i8m1(out, _v, vl);
+        out += packn;
+    }
+}
+#if __riscv_zfh
+// RISC-V vector fill with 16-bit float lanes: packn is cpu_riscv_vlenb() / 2;
+// stores _v once per element counted by total(), advancing by packn lanes
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    __fp16* out = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse16_v_f16m1(out, _v, vl);
+        out += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+// generic fill: treats the buffer as an array of T and assigns _v to total() consecutive slots
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    T* out = (T*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        *out++ = _v;
+    }
+}
+
+// assignment: share m's buffer instead of copying elements.
+// m's refcount is incremented before our own reference is released, so the shared buffer
+// cannot be freed in between when both objects already point at it.
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    if (this != &m)
+    {
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        // buffer and ownership
+        data = m.data;
+        refcount = m.refcount;
+        allocator = m.allocator;
+
+        // element layout
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    return *this;
+}
+
+// add one reference; a Mat without a refcount pointer (empty or wrapping external data) is left alone
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// drop this object's reference and reset it to the empty state.
+// The buffer is freed only when a refcount exists and NCNN_XADD(refcount, -1) reports 1;
+// it goes back through the Mat's allocator when one is set, otherwise through fastFree().
+NCNN_FORCEINLINE void Mat::release()
+{
+    if (refcount)
+    {
+        const bool free_buffer = NCNN_XADD(refcount, -1) == 1;
+        if (free_buffer)
+        {
+            if (allocator)
+                allocator->fastFree(data);
+            else
+                fastFree(data);
+        }
+    }
+
+    // forget the buffer (the allocator member is left untouched)
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    cstep = 0;
+}
+
+// true when there is no buffer or total() is zero
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// element count including per-channel padding: channel step times channel count
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    const size_t count = cstep * c;
+    return count;
+}
+
+// bits per packed lane: elemsize * 8 / elempack, or 0 when elempack is 0
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    if (!elempack)
+        return 0;
+
+    return static_cast<int>(elemsize * 8) / elempack;
+}
+
+// data-less Mat (null buffer) describing this Mat's shape with elempack folded into the
+// outermost dimension; an empty Mat when dims is not 1..4
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// view of channel _c: points into this Mat's buffer at cstep * _c elements, has no refcount,
+// and has one dimension fewer. For a 4-D source the view's cstep is w * h.
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+        view.cstep = (size_t)w * h;
+    return view;
+}
+
+// const overload of channel(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+        view.cstep = (size_t)w * h;
+    return view;
+}
+
+// 2-D view (w x h) of depth slice z, starting w * h * z elements into this Mat's buffer
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// const overload of depth(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// pointer to the start of row y, viewed as float (byte offset w * y * elemsize)
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    unsigned char* base = (unsigned char*)data;
+    return (float*)(base + (size_t)w * y * elemsize);
+}
+
+// const overload of row()
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    unsigned char* base = (unsigned char*)data;
+    return (const float*)(base + (size_t)w * y * elemsize);
+}
+
+// pointer to the start of row y, viewed as T (byte offset w * y * elemsize)
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    unsigned char* base = (unsigned char*)data;
+    return (T*)(base + (size_t)w * y * elemsize);
+}
+
+// const overload of row<T>()
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    unsigned char* base = (unsigned char*)data;
+    return (const T*)(base + (size_t)w * y * elemsize);
+}
+
+// view of `channels` consecutive channels starting at channel _c; keeps this Mat's dims
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// const overload of channel_range(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// view of `depths` consecutive depth slices starting at slice z; the view's cstep is
+// overridden to w * h after construction
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// const overload of depth_range(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// 2-D view (w x rows) starting at row y of this Mat's buffer
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * y * elemsize;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// const overload of row_range(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * y * elemsize;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// 1-D view of n elements starting at element x of this Mat's buffer
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    unsigned char* base = (unsigned char*)data + x * elemsize;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// const overload of range(): same view, returned as const Mat
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    unsigned char* base = (unsigned char*)data + x * elemsize;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// conversion to a typed pointer: the buffer address reinterpreted as T*
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    T* typed = (T*)data;
+    return typed;
+}
+
+// const conversion: the buffer address reinterpreted as const T*
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    const T* typed = (const T*)data;
+    return typed;
+}
+
+// i-th float of the buffer, by flat index (no bounds check)
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    float* base = (float*)data;
+    return base[i];
+}
+
+// const overload of operator[]
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    const float* base = (const float*)data;
+    return base[i];
+}
+
+#if NCNN_VULKAN
+
+// default constructor: no buffer, no refcount, every dimension and size field zero
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+}
+
+// 1-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// 2-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// 3-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// 4-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// 1-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+// 2-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// 3-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// 4-D constructor with explicit elempack: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0),
+      cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// copy constructor: shares m's buffer and bumps the shared refcount via addref()
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator),
+      dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c),
+      cstep(m.cstep)
+{
+    addref();
+}
+
+// 1-D wrapper around caller-supplied buffer memory: refcount stays 0, elempack is 1,
+// cstep is the width
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1),
+      cstep(_w)
+{
+}
+
+// 2-D wrapper around caller-supplied buffer memory: refcount stays 0, elempack is 1,
+// cstep is w * h
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // widen before multiplying so a large w * h cannot overflow int
+    // (same form as the matching Mat constructor)
+    cstep = (size_t)w * h;
+}
+
+// 3-D wrapper around caller-supplied buffer memory: refcount stays 0, elempack is 1.
+// Each channel's byte size is rounded up with alignSize(..., 16), then converted back
+// to an element count.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // widen before multiplying: `w * h` alone is evaluated in int and can overflow
+    // before elemsize promotes the product to size_t (same form as the matching Mat constructor)
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+// 4-D wrapper around caller-supplied buffer memory: refcount stays 0, elempack is 1.
+// Each channel's byte size (w * h * d elements) is rounded up with alignSize(..., 16),
+// then converted back to an element count.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // widen before multiplying: `w * h * d` alone is evaluated in int and can overflow
+    // before elemsize promotes the product to size_t (same form as the matching Mat constructor)
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// 1-D wrapper around caller-supplied buffer memory with explicit elempack:
+// refcount stays 0, cstep is the width
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1),
+      cstep(_w)
+{
+}
+
+// 2-D wrapper around caller-supplied buffer memory with explicit elempack:
+// refcount stays 0, cstep is w * h
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // widen before multiplying so a large w * h cannot overflow int
+    // (same form as the matching Mat constructor)
+    cstep = (size_t)w * h;
+}
+
+// 3-D wrapper around caller-supplied buffer memory with explicit elempack: refcount stays 0.
+// Each channel's byte size is rounded up with alignSize(..., 16), then converted back
+// to an element count.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // widen before multiplying: `w * h` alone is evaluated in int and can overflow
+    // before elemsize promotes the product to size_t (same form as the matching Mat constructor)
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+// 4-D wrapper around caller-supplied buffer memory with explicit elempack: refcount stays 0.
+// Each channel's byte size (w * h * d elements) is rounded up with alignSize(..., 16),
+// then converted back to an element count.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // widen before multiplying: `w * h * d` alone is evaluated in int and can overflow
+    // before elemsize promotes the product to size_t (same form as the matching Mat constructor)
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// destructor: drops this object's reference through release()
+NCNN_FORCEINLINE VkMat::~VkMat()
+{
+    this->release();
+}
+
+// assignment: share m's buffer instead of copying it.
+// m's refcount is incremented before our own reference is released, so the shared buffer
+// cannot be freed in between when both objects already point at it.
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    if (this != &m)
+    {
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        // buffer and ownership
+        data = m.data;
+        refcount = m.refcount;
+        allocator = m.allocator;
+
+        // element layout
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    return *this;
+}
+
+// Wrap the mapped pointer of this buffer in a Mat with the same shape, elemsize and elempack.
+// The returned Mat has no refcount (external-data constructor), so it does not own the memory.
+// Returns an empty Mat when there is no allocator or buffer, when the allocator is not
+// mappable, or when dims is not 1..4.
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    // a default-constructed or released VkMat has null allocator/data,
+    // so check both before dereferencing either
+    if (!allocator || !data || !allocator->mappable)
+        return Mat();
+
+    void* ptr = mapped_ptr();
+
+    if (dims == 1)
+        return Mat(w, ptr, elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, ptr, elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, ptr, elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, ptr, elemsize, elempack, 0);
+
+    return Mat();
+}
+
+// Address of this buffer inside its mapped memory block: data->mapped_ptr plus data->offset.
+// Returns 0 when there is no allocator or buffer, or when the allocator is not mappable.
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    // a default-constructed or released VkMat has null allocator/data,
+    // so check both before dereferencing either
+    if (!allocator || !data || !allocator->mappable)
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset;
+}
+
+// add one reference; a VkMat without a refcount pointer is left alone
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// drop this object's reference and reset it to the empty state.
+// The buffer goes back to the allocator only when a refcount exists,
+// NCNN_XADD(refcount, -1) reports 1, and both allocator and data are non-null.
+NCNN_FORCEINLINE void VkMat::release()
+{
+    if (refcount)
+    {
+        const bool free_buffer = NCNN_XADD(refcount, -1) == 1;
+        if (free_buffer && allocator && data)
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    // forget the buffer (the allocator member is left untouched)
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    cstep = 0;
+}
+
+// true when there is no buffer or total() is zero
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// element count including per-channel padding: channel step times channel count
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    const size_t count = cstep * c;
+    return count;
+}
+
+// bits per packed lane: elemsize * 8 / elempack, or 0 when elempack is 0
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    if (!elempack)
+        return 0;
+
+    return static_cast<int>(elemsize) * 8 / elempack;
+}
+
+// data-less Mat (null buffer) describing this VkMat's shape with elempack folded into the
+// outermost dimension; an empty Mat when dims is not 1..4
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// the `buffer` field of the underlying buffer memory; data is dereferenced without a null check
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    return (*data).buffer;
+}
+
+// the `offset` field of the underlying buffer memory; data is dereferenced without a null check
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return (*data).offset;
+}
+
+// the `capacity` field of the underlying buffer memory; data is dereferenced without a null check
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return (*data).capacity;
+}
+
+// default constructor: no image memory, no refcount, every dimension and size field zero
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+// 1-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// 2-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// 3-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// 4-D constructor: zero every member, then hand the arguments to create()
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// Packed 3-D constructor: start from the all-zero state, then hand width,
+// height, channel count, element size, element packing and allocator to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    this->create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// Packed 4-D constructor: start from the all-zero state, then hand width,
+// height, depth, channel count, element size, element packing and allocator
+// to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0)
+{
+    this->create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// Copy constructor: copies every field of m (so both objects point at the
+// same image memory and reference counter) and then takes a reference.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator),
+      dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    // addref() is a no-op when m carries no reference counter.
+    this->addref();
+}
+
+// Wrap caller-supplied image memory as a 1-D mat with elempack 1.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+// Wrap caller-supplied image memory as a 2-D mat with elempack 1.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+// Wrap caller-supplied image memory as a 3-D mat with elempack 1.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+// Wrap caller-supplied image memory as a 4-D mat with elempack 1.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+// Wrap caller-supplied image memory as a 1-D mat with an explicit elempack.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+// Wrap caller-supplied image memory as a 2-D mat with an explicit elempack.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+// Wrap caller-supplied image memory as a 3-D mat with an explicit elempack.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+// Wrap caller-supplied image memory as a 4-D mat with an explicit elempack.
+// refcount stays null, so release() will not free _data through this object.
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+// Destructor: drops this object's reference through release().
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    this->release();
+}
+
+// Copy assignment: makes this object refer to m's image memory (no pixel
+// data is duplicated here). Self-assignment leaves the object untouched.
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (this != &m)
+    {
+        // Take the new reference before dropping the old one, so memory that
+        // both objects already share cannot be freed in between.
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        this->release();
+
+        // storage
+        data = m.data;
+        refcount = m.refcount;
+        allocator = m.allocator;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+    }
+
+    return *this;
+}
+
+// Return a Mat that views the mapped image memory with this mat's shape,
+// element size and packing.
+// Yields an empty Mat when there is nothing to view: no allocator, no image
+// memory, a non-mappable allocator, an unmapped image, or dims outside 1..4.
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    // An empty VkImageMat has null allocator/data (see the default
+    // constructor), so check the pointers before dereferencing them.
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr)
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+
+// Return the host address for this image: the mapped base pointer advanced
+// by bind_offset bytes. Returns 0 when the mat is empty (no allocator or no
+// image memory), the allocator is not mappable, or the memory is not mapped.
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    // An empty VkImageMat has null allocator/data (see the default
+    // constructor), so check the pointers before dereferencing them.
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr)
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset;
+}
+
+// Increment the shared reference counter; does nothing when there is none.
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// Drop this object's reference and reset it to the empty state.
+// The image memory is handed back to the allocator only when the counter
+// value read by NCNN_XADD was 1 and both allocator and data are set.
+// The allocator pointer itself is left as it was.
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    const bool last_reference = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_reference && allocator && data)
+        allocator->fastFree(data);
+
+    // forget storage
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    // forget shape
+    dims = 0;
+    w = h = d = c = 0;
+}
+
+// True when there is no image memory or the element count is zero.
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// Number of elements described by the shape: w * h * d * c.
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    // Widen the first factor so the whole product is computed in size_t.
+    // The dimensions are ints, and the previous int product could overflow
+    // (undefined behaviour) before being converted to the return type.
+    return (size_t)w * h * d * c;
+}
+
+// Bits per packed lane: elemsize * 8 / elempack, or 0 when elempack is 0
+// (avoids a division by zero on an empty mat).
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    if (elempack == 0)
+        return 0;
+
+    return static_cast<int>(elemsize) * 8 / elempack;
+}
+
+// Return a data-less Mat (null data pointer) carrying this mat's shape with
+// the packing folded back into the outermost dimension. dims outside 1..4
+// yields a default-constructed Mat.
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// Return the VkImage handle stored in the image memory record.
+// data is dereferenced without a null check, so the mat must not be empty.
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return (*data).image;
+}
+
+// Return the VkImageView handle stored in the image memory record.
+// data is dereferenced without a null check, so the mat must not be empty.
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return (*data).imageview;
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin // base class for weight loaders; every load() overload is virtual
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // meaning of the `type` argument taken by every load() overload (element type):
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D blob (vector) described by w
+    virtual Mat load(int w, int type) const;
+    // load a 2-D blob (image) described by w, h
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob described by w, h, c
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob (cube) described by w, h, d, c
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin // ModelBin constructed from a DataReader
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // overrides only the 1-D overload of ModelBin::load
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // private: copy construction is not available to client code
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&); // private: copy assignment is not available to client code
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin // ModelBin constructed from an array of Mat weights
+{
+public:
+    // construct from weight blob array
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // overrides only the 1-D overload of ModelBin::load
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // private: copy construction is not available to client code
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&); // private: copy assignment is not available to client code
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE /* static usage: both export macros expand to nothing */
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library (the attribute is the same as when using it) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library (the attribute is the same as when building it) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED /* may be pre-defined by the includer to override the attribute */
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED: disabled, so NCNN_NO_DEPRECATED is never defined here */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net // network container: loads structure (param) and weights (model), creates Extractors
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the loaders above, which return 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so external memory should be retained when used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the loaders above, which return 0 on success)
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenient load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenient load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenient load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif // NCNN_STRING
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor; // Extractor may use the protected/private members below
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private:
+    Net(const Net&); // private: copy construction is not available to client code
+    Net& operator=(const Net&); // private: copy assignment is not available to client code
+
+private:
+    NetPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor // inference handle obtained from Net::create_extractor(): set inputs with input(), read results with extract()
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name (VkMat variant)
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name (VkMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name (VkImageMat variant)
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name (VkImageMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index (VkMat variant)
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index (VkMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index (VkImageMat variant)
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index (VkImageMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // lets Net::create_extractor() use the protected constructor below
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // plain bundle of public inference settings (see Net::opt); NOTE(review): presumably the field order must match the prebuilt ncnn library - do not reorder
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // NOTE(review): undocumented here; presumably a gpu shader packing option - confirm
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // reserved slot
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // NOTE(review): undocumented here; presumably selects a local pool allocator - confirm
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7; // reserved slots (7..11)
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // parameter table addressed by integer id, holding int, float or array (Mat) values
+{
+public:
+    // empty
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type of the parameter stored under id
+    int type(int id) const;
+
+    // get int, def is the caller-supplied default
+    int get(int id, int def) const;
+    // get float, def is the caller-supplied default
+    float get(int id, float def) const;
+    // get array, def is the caller-supplied default
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net may call the protected loaders below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // wraps the Vulkan objects of one compute pipeline for a given VulkanDevice (only declared when NCNN_VULKAN is on)
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public: // local size configuration and pipeline creation
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // create from SPIR-V data
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // create from a shader type index
+
+public: // read-only accessors
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected: // setters matching the accessors above, available to subclasses only
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // device passed to the constructor - presumably not owned, TODO confirm
+
+private:
+    Pipeline(const Pipeline&); // private: copy construction is not available to client code
+    Pipeline& operator=(const Pipeline&); // private: copy assignment is not available to client code
+
+private:
+    PipelinePrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // private inheritance: Pipeline's interface is not exposed to users of this class
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload that additionally takes a target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute may access the privately inherited Pipeline members
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // NOTE(review): presumably mirrors the create() argument of the same name - confirm
+    int rotate_from; // NOTE(review): presumably mirrors the create() argument of the same name - confirm
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // hands out pipeline objects for a VulkanDevice through get_pipeline() (only declared when NCNN_VULKAN is on)
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // variant keyed by SPIR-V data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer arguments from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // variant keyed by shader type index and options
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer arguments from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev; // device passed to the constructor - presumably not owned, TODO confirm
+
+private:
+    PipelineCache(const PipelineCache&); // private: copy construction is not available to client code
+    PipelineCache& operator=(const PipelineCache&); // private: copy assignment is not available to client code
+
+private:
+    PipelineCachePrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/platform.h
new file mode 100644
index 0000000..8c46058
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/platform.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1 /* 1: NCNN_LOGE (defined further down) prints through fprintf(stderr, ...); 0: NCNN_LOGE expands to nothing */
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0 /* 0: the whole cv:: shim in simpleocv.h is compiled out */
+#define NCNN_SIMPLEOMP 0 /* 0: the omp_ and kmp_ declarations in simpleomp.h are compiled out */
+#define NCNN_SIMPLESTL 0 /* 0: this header includes <algorithm>, <list>, <vector>, <string> instead of simplestl.h */
+#define NCNN_SIMPLEMATH 0 /* 0: this header includes <math.h> and <fenv.h> instead of simplemath.h */
+#define NCNN_THREADS 1 /* 1: Mutex, ConditionVariable, Thread and ThreadLocalStorage wrap OS primitives; 0: they are no-op stubs */
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1 /* with __ANDROID_API__ >= 8, NCNN_LOGE additionally calls __android_log_print */
+#define NCNN_PIXEL 1 /* gates cv::resize in simpleocv.h (only matters when NCNN_SIMPLEOCV is enabled) */
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1 /* gates the cv:: drawing declarations in simpleocv.h (only matters when NCNN_SIMPLEOCV is enabled) */
+#define NCNN_VULKAN 0 /* 0: everything under "#if NCNN_VULKAN" is compiled out */
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_GNU_INLINE_ASM 1
+#define NCNN_AVX 0
+#define NCNN_XOP 0
+#define NCNN_FMA 0
+#define NCNN_F16C 0
+#define NCNN_AVX2 0
+#define NCNN_AVXVNNI 0
+#define NCNN_AVX512 0
+#define NCNN_AVX512VNNI 0
+#define NCNN_AVX512BF16 0
+#define NCNN_AVX512FP16 0
+#define NCNN_VFPV4 1
+#define NCNN_ARM82 1
+#define NCNN_ARM82DOT 1
+#define NCNN_ARM82FP16FML 1
+#define NCNN_ARM84BF16 1
+#define NCNN_ARM84I8MM 1
+#define NCNN_ARM86SVE 1
+#define NCNN_ARM86SVE2 1
+#define NCNN_ARM86SVEBF16 1
+#define NCNN_ARM86SVEI8MM 1
+#define NCNN_ARM86SVEF32MM 1
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1 /* 1: NCNN_FORCEINLINE (defined at the end of this header) asks for forced inlining where the compiler supports it */
+
+#define NCNN_VERSION_STRING "1.0.20231027"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // exclusive lock built on a Windows SRWLOCK
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // nothing is released here
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // its wait() passes &srwlock to SleepConditionVariableSRW
+    // NOTE SRWLock is available from windows vista
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Windows CONDITION_VARIABLE used together with the SRWLOCK-based Mutex
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&cv_); }
+    ~ConditionVariable() {}
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&cv_, &mutex.srwlock, INFINITE, 0); } // INFINITE: no timeout; the BOOL result is ignored
+    void signal() { WakeConditionVariable(&cv_); }
+    void broadcast() { WakeAllConditionVariable(&cv_); }
+private:
+    CONDITION_VARIABLE cv_;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // trampoline with the signature _beginthreadex expects
+class NCNN_EXPORT Thread
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { entry_ = start; entry_args_ = args; handle_ = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // entry_ and entry_args_ are stored before the thread is created because the trampoline reads them
+    ~Thread() {} // the handle is closed in join(), not here
+    void join() { WaitForSingleObject(handle_, INFINITE); CloseHandle(handle_); }
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* self = (Thread*)args;
+        self->entry_(self->entry_args_); // the void* returned by the entry function is discarded
+        return 0;
+    }
+    HANDLE handle_;
+    void* (*entry_)(void*);
+    void* entry_args_;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // one Win32 TLS index allocated for the lifetime of the object
+{
+public:
+    ThreadLocalStorage() { slot_ = TlsAlloc(); } // the result is not checked for failure
+    ~ThreadLocalStorage() { TlsFree(slot_); }
+    void* get() { return (void*)TlsGetValue(slot_); }
+    void set(void* value) { TlsSetValue(slot_, (LPVOID)value); }
+private:
+    DWORD slot_;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // wrapper around a pthread_mutex_t; return codes of the pthread calls are not checked
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0: no attribute object is supplied
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // its wait() passes &mutex to pthread_cond_wait
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread condition variable used together with ncnn::Mutex
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond_, 0); }
+    ~ConditionVariable() { pthread_cond_destroy(&cond_); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond_, &mutex.mutex); } // waits on the pthread mutex wrapped by `mutex`
+    void signal() { pthread_cond_signal(&cond_); }
+    void broadcast() { pthread_cond_broadcast(&cond_); }
+private:
+    pthread_cond_t cond_;
+};
+
+class NCNN_EXPORT Thread // creates a pthread that runs start(args) as soon as the object is constructed
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&tid_, 0, start, args); } // the result of pthread_create is not checked
+    ~Thread() {} // neither joins nor detaches
+    void join() { pthread_join(tid_, 0); } // the thread's return value is not collected
+private:
+    pthread_t tid_;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // one pthread key; no destructor callback is registered for the stored pointers
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key_, 0); }
+    ~ThreadLocalStorage() { pthread_key_delete(key_); }
+    void* get() { return pthread_getspecific(key_); }
+    void set(void* value) { pthread_setspecific(key_, value); }
+private:
+    pthread_key_t key_;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // NCNN_THREADS == 0 build: same interface, every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {}
+    void unlock() {}
+};
+
+class NCNN_EXPORT ConditionVariable // NCNN_THREADS == 0 build: same interface, every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately
+    void broadcast() {}
+    void signal() {}
+};
+
+class NCNN_EXPORT Thread // NCNN_THREADS == 0 build: same interface, nothing is ever started
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the start function is never called
+    ~Thread() {}
+    void join() {}
+};
+
+class NCNN_EXPORT ThreadLocalStorage // NCNN_THREADS == 0 build: a single plain pointer stands in for the per-thread slot
+{
+public:
+    ThreadLocalStorage() : stored_(0) {}
+    ~ThreadLocalStorage() {}
+    void* get() { return stored_; }
+    void set(void* value) { stored_ = value; }
+private:
+    void* stored_;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scoped lock: locks the mutex on construction and unlocks it on destruction
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : held_(_mutex) { held_.lock(); }
+    ~MutexLockGuard() { held_.unlock(); }
+private:
+    Mutex& held_; // referenced, not owned
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO
+#include <stdio.h> /* both NCNN_LOGE variants call fprintf, so include it before either is defined */
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else /* NCNN_PLATFORM_API && __ANDROID_API__ >= 8 */
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif /* NCNN_PLATFORM_API && __ANDROID_API__ >= 8 */
+#else /* !NCNN_STDIO: logging is compiled out */
+#define NCNN_LOGE(...)
+#endif
+
+
+#if NCNN_FORCE_INLINE
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__CLANG__)
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplemath.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplemath.h
new file mode 100644
index 0000000..fd7fa69
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* discrete functions (all float-only, including the unsuffixed names)
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float); /* float signature; no double overload is declared in this header */
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+NCNN_EXPORT float frac(float); /* not a <math.h> name; presumably the fractional part - TODO confirm against the implementation */
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float); /* float signature, like sqrtf */
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float); /* float signature, like logf */
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+NCNN_EXPORT float erf(float); /* float signature despite the unsuffixed name */
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+NCNN_EXPORT int msb(unsigned int); /* not a <math.h> name; presumably the index of the most significant set bit - TODO confirm */
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+NCNN_EXPORT void fesetround(int); /* NOTE: declared void here, whereas the <fenv.h> function of this name returns int */
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..54b22d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleocv.h
@@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar; // short aliases declared at global scope, outside namespace cv
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum // legacy CV_LOAD_IMAGE_* spellings; the values equal cv::ImreadModes declared later in this header
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum // legacy spelling; same value as cv::IMWRITE_JPEG_QUALITY declared later in this header
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) // generic form: plain conversion of v to _Tp, no clamping
+{
+    return (_Tp)v;
+}
+template<>
+inline uchar saturate_cast<uchar>(int v) // uchar form: clamps v into [0, UCHAR_MAX]
+{
+    return (uchar)(v < 0 ? 0 : (v > UCHAR_MAX ? UCHAR_MAX : v));
+}
+
+// Four-component value; components not passed to the constructor are 0.
+template<typename _Tp>
+struct Scalar_
+{
+    // All four components zero.
+    Scalar_() { v[0] = 0; v[1] = 0; v[2] = 0; v[3] = 0; }
+
+    // First component _v0, the other three zero.
+    Scalar_(_Tp _v0) { v[0] = _v0; v[1] = 0; v[2] = 0; v[3] = 0; }
+
+    // First three components given, the fourth zero.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+
+    // All four components given.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    // Read-only access to component i; i must be in [0, 3] and is not checked.
+    const _Tp& operator[](const int i) const
+    {
+        return v[i];
+    }
+
+    // Mutable access to component i, so `s[i] = value` updates the scalar
+    // itself instead of a temporary copy. i must be in [0, 3]; not checked.
+    _Tp& operator[](const int i)
+    {
+        return v[i];
+    }
+
+    // Component storage.
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // pair of public coordinates x and y
+{
+    // The default point is (0, 0).
+    Point_() : x(0), y(0) {}
+    Point_(_Tp _x, _Tp _y) : x(_x), y(_y) {}
+
+    // Conversion to a point with another coordinate type. Each coordinate is
+    // passed through saturate_cast<_Tp2>, whose parameter is an int.
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const
+    {
+        const _Tp2 cx = saturate_cast<_Tp2>(x);
+        const _Tp2 cy = saturate_cast<_Tp2>(y);
+        return Point_<_Tp2>(cx, cy);
+    }
+
+    // coordinates
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // pair of public extents width and height
+{
+    // The default size is 0 x 0.
+    Size_() : width(0), height(0) {}
+    Size_(_Tp _w, _Tp _h) : width(_w), height(_h) {}
+
+    // Conversion to a size with another component type. Each extent is
+    // passed through saturate_cast<_Tp2>, whose parameter is an int.
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const
+    {
+        const _Tp2 cw = saturate_cast<_Tp2>(width);
+        const _Tp2 ch = saturate_cast<_Tp2>(height);
+        return Size_<_Tp2>(cw, ch);
+    }
+
+    // extents
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as a corner (x, y) plus width and height
+{
+    // The default rectangle has every component set to 0.
+    Rect_() : x(0), y(0), width(0), height(0) {}
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h) : x(_x), y(_y), width(_w), height(_h) {}
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) : x(_p.x), y(_p.y), width(_size.width), height(_size.height) {}
+
+    // Conversion to a rectangle with another component type; all four
+    // components go through saturate_cast<_Tp2>, whose parameter is an int.
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const
+    {
+        const _Tp2 cx = saturate_cast<_Tp2>(x);
+        const _Tp2 cy = saturate_cast<_Tp2>(y);
+        const _Tp2 cw = saturate_cast<_Tp2>(width);
+        const _Tp2 ch = saturate_cast<_Tp2>(height);
+        return Rect_<_Tp2>(cx, cy, cw, ch);
+    }
+
+    // area: width times height, returned as _Tp
+    _Tp area() const
+    {
+        const _Tp result = width * height;
+        return result;
+    }
+
+    // components, in the same declaration order as before
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    // Intersection in place: a becomes the overlap of a and b, or the default (all-zero) rectangle when there is none.
+    const _Tp left = std::max(a.x, b.x);
+    const _Tp top = std::max(a.y, b.y);
+    const _Tp right = std::min(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::min(a.y + a.height, b.y + b.height);
+    a = Rect_<_Tp>(left, top, right - left, bottom - top);
+    if (a.height <= 0 || a.width <= 0) a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    // Union in place: a becomes the smallest rectangle covering both a and b (an empty operand is not special-cased).
+    const _Tp left = std::min(a.x, b.x), top = std::min(a.y, b.y);
+    const _Tp right = std::max(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::max(a.y + a.height, b.y + b.height);
+    a = Rect_<_Tp>(left, top, right - left, bottom - top);
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a); // work on a copy so neither operand is modified
+    return overlap &= b;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> cover(a); // work on a copy so neither operand is modified
+    return cover |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // these codes are passed as Mat's `flags` and stored in Mat::c, which Mat multiplies into its per-pixel size
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE(review): same value as CV_8UC4, so Mat::type() cannot tell the two apart
+
+struct NCNN_EXPORT Mat // minimal cv::Mat look-alike: a byte buffer with densely packed rows and optional reference counting
+{
+    Mat() // empty matrix that owns nothing
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // allocates a fresh buffer through create(); flags is stored as c
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy: shares m's buffer and increments its reference count; no pixel data is copied
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-owned memory: refcount stays null, so release() never frees it
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: shares m's buffer; m's count is raised before our own buffer is released
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fills every pixel with the first c components of s
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++) // Scalar has 4 components, so this assumes c <= 4
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drops the current buffer and allocates a new reference-counted one
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount)); // NOTE(review): the result is not checked for null
+            refcount = (int*)(((uchar*)data) + totalsize); // the counter lives directly behind the padded pixel data
+            *refcount = 1;
+        }
+    }
+
+    void release() // gives up this Mat's reference and resets it to the empty state
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1) // frees when the value compared here is 1, presumably the count before the decrement
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a newly allocated buffer
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const // same value as channels(): the stored flags
+    {
+        return c;
+    }
+
+    size_t total() const // buffer size in bytes: cols * rows * c
+    {
+        return (size_t)cols * rows * c; // multiply in size_t; the former int product overflowed once it exceeded INT_MAX
+    }
+
+    const uchar* ptr(int y) const // start of row y; rows are packed with no padding
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // the offset y * cols * c is counted in _Tp elements, not bytes
+    {
+        return (const _Tp*)data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)data + y * cols * c;
+    }
+
+    // roi: returns a new Mat holding a copy of the region; roi is not checked against the image bounds
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c;
+};
+
+enum ImreadModes // values equal the global CV_LOAD_IMAGE_* enumerators declared at the top of this header
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+// The functions below are declarations only; their behaviour is defined by the implementation, not by this header.
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+NCNN_EXPORT Mat imdecode(const std::vector<uchar>& buf, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags // same value as the global CV_IMWRITE_JPEG_QUALITY
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+// params defaults to an empty vector; presumably (flag, value) pairs as in OpenCV - TODO confirm against the implementation
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum // presumably passed as `thickness` to request a filled shape, as in OpenCV - TODO confirm
+{
+    FILLED = -1
+};
+// All drawing functions take the image by non-const reference; thickness defaults to 1.
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum // the only font face enumerator declared by this shim
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+// Unlike the shape functions above, putText takes color by value.
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+// baseLine is an output pointer; whether it may be null is not visible from this declaration.
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads(); /* every function in this list has C linkage when compiled as C++ (see the extern "C" wrapper) */
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+/* kmp_ prefix: presumably mirrors the LLVM OpenMP runtime extension of the same name - TODO confirm */
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions (NCNN_EXPORT is not defined by this header; the includer must provide it, as platform.h does via ncnn_export.h)
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// sized deallocation functions, declared only when compiling as C++14 or later
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions (counterparts of the placement allocation functions above)
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T>
+const T& max(const T& a, const T& b)
+{
+    if (a < b) return b; else return a; // when neither is smaller, the first argument is returned
+}
+
+template<typename T>
+const T& min(const T& a, const T& b)
+{
+    if (a > b) return b; else return a; // when neither is greater, the first argument is returned
+}
+
+template<typename T>
+void swap(T& a, T& b)
+{
+    const T held(a); // copy-based exchange: one copy construction, two copy assignments
+    a = b;
+    b = held;
+}
+
+// Two public members, first and second, held by value.
+template<typename T1, typename T2>
+struct pair
+{
+    // Value-initialises both members.
+    pair() : first(), second() {}
+
+    // Copies t1 into first and t2 into second.
+    pair(const T1& t1, const T2& t2) : first(t1), second(t2) {}
+
+    // first is declared, and therefore initialised, before second;
+    // the comparison operators below also look at first before second.
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return (x.first == y.first) ? (x.second == y.second) : false; // second is compared only when first matches
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return (x.first < y.first) ? true : ((y.first < x.first) ? false : (x.second < y.second)); // lexicographic, using only operator<
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x == y); // negation of the operator== above
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return y < x; // operator< with the operands swapped
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(y < x); // "not greater than"
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x < y); // "not less than"
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // convenience factory: T1 and T2 are deduced from the arguments
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+// Link of the doubly linked list below: a payload plus prev/next pointers.
+template<typename T>
+struct node
+{
+    node* prev_;
+    node* next_;
+    T data_;
+
+    // Unlinked node holding a value-initialised T.
+    node() : prev_(0), next_(0), data_() {}
+
+    // Unlinked node holding a copy of t.
+    node(const T& t) : prev_(0), next_(0), data_(t) {}
+};
+
+// Bidirectional cursor over node<T>. It only stores a pointer and never owns
+// or frees the node it refers to.
+template<typename T>
+struct iter_list
+{
+    // A default-constructed iterator refers to no node (curr_ == 0).
+    iter_list() : curr_(0) {}
+    iter_list(node<T>* n) : curr_(n) {}
+    iter_list(const iter_list& i) : curr_(i.curr_) {}
+    ~iter_list() {}
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    // Element access. These are const because they do not change which node
+    // the iterator refers to, so they also work through a const iterator.
+    // Dereferencing an iterator whose curr_ is 0 is invalid.
+    T& operator*() const
+    {
+        return curr_->data_;
+    }
+
+    T* operator->() const
+    {
+        return &(curr_->data_);
+    }
+
+    // Two iterators are equal when they refer to the same node. Const, so
+    // const iterators can be compared as well.
+    bool operator==(const iter_list& i) const
+    {
+        return curr_ == i.curr_;
+    }
+
+    bool operator!=(const iter_list& i) const
+    {
+        return curr_ != i.curr_;
+    }
+
+    // Pre-increment and pre-decrement: step to the next / previous node.
+    // Postfix forms are not provided.
+    iter_list& operator++()
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+
+    iter_list& operator--()
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_;
+};
+
+template<typename T>
+struct list // doubly linked list of node<T> that always ends in a sentinel node (tail_)
+{
+    typedef iter_list<T> iterator;
+
+    // Empty list: only the sentinel exists and head_ == tail_.
+    list()
+        : head_(new node<T>()), tail_(head_), count_(0)
+    {
+    }
+
+    // Destroys every element and then the sentinel; after clear() the
+    // sentinel is the node head_ points at.
+    ~list()
+    {
+        clear();
+        delete head_;
+    }
+
+    // Deep copy: appends a copy of each element of l, in order.
+    list(const list& l)
+        : head_(new node<T>()), tail_(head_), count_(0)
+    {
+        for (node<T>* n = l.head_; n != l.tail_; n = n->next_)
+        {
+            push_back(n->data_);
+        }
+    }
+
+    // Replaces the contents with copies of l's elements; self-assignment is a no-op.
+    list& operator=(const list& l)
+    {
+        if (this != &l)
+        {
+            clear();
+            for (node<T>* n = l.head_; n != l.tail_; n = n->next_)
+            {
+                push_back(n->data_);
+            }
+        }
+        return *this;
+    }
+
+    // Removes all elements; the sentinel stays allocated.
+    void clear()
+    {
+        while (!empty())
+        {
+            pop_front();
+        }
+    }
+
+    // Removes the first element; does nothing on an empty list.
+    void pop_front()
+    {
+        if (count_ == 0)
+        {
+            return;
+        }
+        head_ = head_->next_;
+        delete head_->prev_; // the node that was the head until the line above
+        head_->prev_ = 0;
+        --count_;
+    }
+
+    // Number of stored elements (the sentinel is not counted).
+    size_t size() const { return count_; }
+
+    // Iterator at the first element; equals end() when the list is empty.
+    iter_list<T> begin() const { return iter_list<T>(head_); }
+
+    // Iterator at the sentinel, i.e. one past the last element.
+    iter_list<T> end() const { return iter_list<T>(tail_); }
+
+    // True when no elements are stored.
+    bool empty() const { return count_ == 0; }
+
+    // Appends a copy of t directly in front of the sentinel.
+    void push_back(const T& t)
+    {
+        node<T>* fresh = new node<T>(t);
+        fresh->next_ = tail_;
+        if (count_ == 0)
+        {
+            // first element: it also becomes the head
+            head_ = fresh;
+            head_->prev_ = 0;
+            tail_->prev_ = fresh;
+            count_ = 1;
+        }
+        else
+        {
+            fresh->prev_ = tail_->prev_;
+            tail_->prev_->next_ = fresh;
+            tail_->prev_ = fresh;
+            ++count_;
+        }
+    }
+
+    // Unlinks and destroys the element at pos and returns an iterator to the
+    // element that followed it. Passing end() changes nothing and returns end().
+    iter_list<T> erase(iter_list<T> pos)
+    {
+        if (pos == end())
+        {
+            return pos;
+        }
+        node<T>* victim = pos.curr_;
+        node<T>* after = victim->next_;
+        if (victim == head_)
+        {
+            head_ = after;
+            after->prev_ = 0;
+        }
+        else
+        {
+            after->prev_ = victim->prev_;
+            victim->prev_->next_ = after;
+        }
+        delete victim;
+        --count_;
+        return iter_list<T>(after);
+    }
+
+protected:
+    node<T>* head_;
+    node<T>* tail_;
+    size_t count_;
+};
+
+template<typename T>
+struct greater // comparison functor: true when the first argument is greater, using T's operator>
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // comparison functor: true when the first argument is smaller, using T's operator<
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y);
+    }
+};
+
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // Leaves the (middle - first) smallest elements of [first, last), ordered by comp, in [first, middle).
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    // After the pass for position i, [first, i] already holds its final values, so
+    // each pass only has to bubble down to i; comparisons below i never swap.
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+                swap(*j, *(j - 1));
+        }
+    }
+}
+
+template<typename T>
+struct vector // minimal growable array: raw char storage, elements built with placement new and destroyed by explicit destructor calls
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T()) // starts with new_size copies of value
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v) // deep copy: resize() fills the slots with T(), then each one is overwritten by assignment
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v) // self-assignment: nothing to do
+        {
+            return *this;
+        }
+        resize(0); // destroy the current elements before refilling
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grow with copies of value, or destroy the surplus tail
+    {
+        try_alloc(new_size); // may replace the buffer; runs before any element is created or destroyed
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value);
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroys every element and releases the buffer
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // the buffer was allocated as char[], so it is released as char[]
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const
+    {
+        return data_[i]; // no bounds check
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t)
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t); // copy-construct the new element in the first free slot
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // inserts the range [b, e) in front of pos
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            // source range lies inside this vector: read from a copy, since try_alloc below may free the current buffer
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin(); // keep pos as an offset because try_alloc may replace the buffer
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); // open the gap by shifting the tail bitwise
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b; // the gap is filled with copy assignment, not placement new
+            pos++;
+            b++;
+        }
+        delete v; // temporary copy; still null when the source did not alias this vector
+    }
+
+    T* erase(T* pos) // destroys *pos and returns pos, which then addresses the element that followed it
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T)); // close the hole by shifting the tail down one slot
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+    void try_alloc(size_t new_size) // makes room for new_size elements, reallocating when needed
+    {
+        if (new_size * 3 / 2 > capacity_ / 2) // integer arithmetic: reallocate unless new_size * 3 / 2 fits in half the capacity
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // the whole new buffer starts zero-filled
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_); // existing elements are relocated bitwise; no copy constructors run
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char> // byte string stored in a vector<char>; size() excludes any terminator
+{
+    string()
+    {
+    }
+    string(const char* str) // str must be a non-null, NUL-terminated C string
+    {
+        size_t len = strlen(str);
+        resize(len);
+        memcpy(data_, str, len);
+    }
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : ""; // a default-constructed or "" string has no buffer; never hand out null
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0; // compare through c_str() so a buffer-less operand is not passed to strcmp as null
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends the characters of str1 to this string
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string joined(str1); // start from a copy of the left operand
+    joined += str2;      // operator+= inserts str2's characters at joined.end()
+    return joined;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/arm/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..0a5ea9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,449 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds against old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70 // backfill for older vulkan.h: subgroup properties, maintenance3 properties, descriptor-set-layout support
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; // KHR-suffixed aliases of the two structs above
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80 // backfill for older vulkan.h: 8-bit storage features and the *2KHR render-pass creation types
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95 // backfill for older vulkan.h: the shaderFloat16 / shaderInt8 feature-query struct
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97 // backfill for older vulkan.h: memory budget/priority, EXT buffer device address, validation features
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000
+#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT            (VkStructureType)1000238001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT  (VkStructureType)1000244000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT               (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT        (VkStructureType)1000244002
+#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                      (VkStructureType)1000247000
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT         (VkBufferCreateFlagBits)0x00000010 // create-flag bit; 0x00020000 is the usage bit on the next line
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT                  (VkBufferUsageFlagBits)0x00020000
+typedef uint64_t VkDeviceAddress;
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 memoryPriority;
+} VkPhysicalDeviceMemoryPriorityFeaturesEXT;
+typedef struct VkMemoryPriorityAllocateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    float priority;
+} VkMemoryPriorityAllocateInfoEXT;
+typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferAddressFeaturesEXT;
+typedef struct VkBufferDeviceAddressInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoEXT;
+typedef struct VkBufferDeviceAddressCreateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceSize deviceAddress;
+} VkBufferDeviceAddressCreateInfoEXT;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo);
+typedef enum VkValidationFeatureEnableEXT
+{
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0,
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+    VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1),
+    VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+typedef enum VkValidationFeatureDisableEXT
+{
+    VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0,
+    VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1,
+    VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2,
+    VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3,
+    VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4,
+    VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5,
+    VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6,
+    VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1),
+    VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+typedef struct VkValidationFeaturesEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t enabledValidationFeatureCount;
+    const VkValidationFeatureEnableEXT* pEnabledValidationFeatures;
+    uint32_t disabledValidationFeatureCount;
+    const VkValidationFeatureDisableEXT* pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // backfill for older vulkan.h: NV cooperative-matrix enums, structs and query entry point
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5, // the numbering skips 4
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#if VK_HEADER_VERSION < 121 // backfill for older vulkan.h: AMD device-coherent memory feature struct and property bits
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000
+#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000040
+#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000080 // distinct bit; must not alias DEVICE_COHERENT (0x40)
+typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 deviceCoherentMemory;
+} VkPhysicalDeviceCoherentMemoryFeaturesAMD;
+#endif // VK_HEADER_VERSION < 121
+
+#if VK_HEADER_VERSION < 129 // backfill for older vulkan.h: KHR buffer device address structs, flag bits and entry points
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR                     (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR      (VkStructureType)1000257002
+#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR    (VkStructureType)1000257003
+#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR      (VkStructureType)1000257004
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR               (VkBufferCreateFlagBits)0x00000010 // create-flag bit; 0x00020000 is the usage bit on the next line
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR                        (VkBufferUsageFlagBits)0x00020000
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR                            (VkMemoryAllocateFlagBits)0x00000002
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR             (VkMemoryAllocateFlagBits)0x00000004
+typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR;
+typedef struct VkBufferDeviceAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoKHR;
+typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkBufferOpaqueCaptureAddressCreateInfoKHR;
+typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkMemoryOpaqueCaptureAddressAllocateInfoKHR;
+typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceMemory memory;
+} VkDeviceMemoryOpaqueCaptureAddressInfoKHR;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo);
+#endif // VK_HEADER_VERSION < 129
+
+#if VK_HEADER_VERSION < 208 // backfill for older vulkan.h: instance-creation flag bits (portability enumeration)
+typedef enum VkInstanceCreateFlagBits
+{
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001,
+    VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkInstanceCreateFlagBits;
+#endif // VK_HEADER_VERSION < 208
+
+#if VK_HEADER_VERSION < 255 // backfill for older vulkan.h: KHR cooperative-matrix enums, structs and query entry point
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR   (VkStructureType)1000506000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR                 (VkStructureType)1000506001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002
+typedef enum VkComponentTypeKHR
+{
+    VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+    VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+    VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+    VK_COMPONENT_TYPE_SINT8_KHR = 3,
+    VK_COMPONENT_TYPE_SINT16_KHR = 4,
+    VK_COMPONENT_TYPE_SINT32_KHR = 5,
+    VK_COMPONENT_TYPE_SINT64_KHR = 6,
+    VK_COMPONENT_TYPE_UINT8_KHR = 7,
+    VK_COMPONENT_TYPE_UINT16_KHR = 8,
+    VK_COMPONENT_TYPE_UINT32_KHR = 9,
+    VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkComponentTypeKHR;
+typedef enum VkScopeKHR
+{
+    VK_SCOPE_DEVICE_KHR = 1,
+    VK_SCOPE_WORKGROUP_KHR = 2,
+    VK_SCOPE_SUBGROUP_KHR = 3,
+    VK_SCOPE_QUEUE_FAMILY_KHR = 5, // the numbering skips 4
+    VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkScopeKHR;
+typedef struct VkCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeKHR AType;
+    VkComponentTypeKHR BType;
+    VkComponentTypeKHR CType;
+    VkComponentTypeKHR ResultType;
+    VkBool32 saturatingAccumulation;
+    VkScopeKHR scope;
+} VkCooperativeMatrixPropertiesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties);
+#endif // VK_HEADER_VERSION < 255
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/cpu_provider_factory.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/cpu_provider_factory.h
new file mode 100644
index 0000000..2926786
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/cpu_provider_factory.h
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "onnxruntime_c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \param use_arena zero: false. non-zero: true.
+ */
+ORT_EXPORT
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_CPU, _In_ OrtSessionOptions* options, int use_arena)
+ORT_ALL_ARGS_NONNULL;
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/nnapi_provider_factory.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/nnapi_provider_factory.h
new file mode 100644
index 0000000..a0aeb5a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/nnapi_provider_factory.h
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "onnxruntime_c_api.h"
+
+// NNAPIFlags are boolean options that can be set for the NNAPI EP.
+// The enum values are bit flags and cannot be negative.
+// To build the uint32_t nnapi_flags argument for OrtSessionOptionsAppendExecutionProvider_Nnapi below:
+//   uint32_t nnapi_flags = 0;
+//   nnapi_flags |= NNAPI_FLAG_USE_FP16;
+enum NNAPIFlags {
+  NNAPI_FLAG_USE_NONE = 0x000,
+
+  // Use fp16 relaxation in the NNAPI EP; this may improve performance but may also reduce precision.
+  NNAPI_FLAG_USE_FP16 = 0x001,
+
+  // Use NCHW layout in the NNAPI EP; this is only available after Android API level 29.
+  // Please note that, for now, NNAPI performs worse with NCHW than with NHWC.
+  NNAPI_FLAG_USE_NCHW = 0x002,
+
+  // Prevent NNAPI from using CPU devices.
+  //
+  // NNAPI is more efficient using GPU or NPU for execution, and NNAPI might fall back to its own CPU implementation
+  // for operations not supported by GPU/NPU. The CPU implementation of NNAPI (which is called nnapi-reference)
+  // might be less efficient than ORT's optimized versions of the same operations. It might be advantageous to disable
+  // the NNAPI CPU fallback and handle execution using ORT kernels.
+  //
+  // For some models, if NNAPI would use CPU to execute an operation, and this flag is set, the execution of the
+  // model may fall back to ORT kernels.
+  //
+  // This option is only available after Android API level 29, and is ignored on Android API level 28 and below.
+  //
+  // For NNAPI device assignments, see https://developer.android.com/ndk/guides/neuralnetworks#device-assignment
+  // For NNAPI CPU fallback, see https://developer.android.com/ndk/guides/neuralnetworks#cpu-fallback
+  //
+  // Please note that the NNAPI EP will return an error status if both the NNAPI_FLAG_CPU_DISABLED
+  // and NNAPI_FLAG_CPU_ONLY flags are set.
+  NNAPI_FLAG_CPU_DISABLED = 0x004,
+
+  // Use CPU only in the NNAPI EP; this may decrease performance but provides
+  // reference output values without precision loss, which is useful for validation.
+  //
+  // Please note that the NNAPI EP will return an error status if both the NNAPI_FLAG_CPU_DISABLED
+  // and NNAPI_FLAG_CPU_ONLY flags are set.
+  NNAPI_FLAG_CPU_ONLY = 0x008,
+
+  // Keep NNAPI_FLAG_LAST at the end of the enum definition
+  // and assign the last NNAPIFlag to it.
+  NNAPI_FLAG_LAST = NNAPI_FLAG_CPU_ONLY,
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ORT_EXPORT ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Nnapi,
+                          _In_ OrtSessionOptions* options, uint32_t nnapi_flags);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_c_api.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_c_api.h
new file mode 100644
index 0000000..456a116
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_c_api.h
@@ -0,0 +1,4550 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// See docs\c_cxx\README.md on generating the Doxygen documentation from this file
+
+/** \mainpage ONNX Runtime
+ *
+ * ONNX Runtime is a high-performance inference and training graph execution engine for deep learning models.
+ *
+ * ONNX Runtime's C, C++ APIs offer an easy to use interface to onboard and execute onnx models.
+ * - \subpage c_cpp_api "Core C, C++ APIs"
+ * - \subpage training_c_cpp_api "Training C, C++ APIs for on-device training"
+ *
+ * \page c_cpp_api Core C, C++ APIs
+ * <h1>C</h1>
+ *
+ * ::OrtApi - Click here to go to the structure with all C API functions.
+ *
+ * <h1>C++</h1>
+ *
+ * ::Ort - Click here to go to the namespace holding all of the C++ wrapper classes
+ *
+ * It is a set of header only wrapper classes around the C API. The goal is to turn the C style return value error codes into C++ exceptions, and to
+ * automate memory management through standard C++ RAII principles.
+ *
+ * \addtogroup Global
+ * ONNX Runtime C API
+ * @{
+ */
+
+#pragma once
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+/** \brief The API version defined in this header
+ *
+ * This value is used by some API functions to behave as this version of the header expects.
+ */
+#define ORT_API_VERSION 16
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//! @}
+// SAL2 Definitions
+#ifndef _WIN32
+#define _In_
+#define _In_z_
+#define _In_opt_
+#define _In_opt_z_
+#define _Out_
+#define _Outptr_
+#define _Out_opt_
+#define _Inout_
+#define _Inout_opt_
+#define _Frees_ptr_opt_
+#define _Ret_maybenull_
+#define _Ret_notnull_
+#define _Check_return_
+#define _Outptr_result_maybenull_
+#define _In_reads_(X)
+#define _Inout_updates_(X)
+#define _Out_writes_(X)
+#define _Inout_updates_all_(X)
+#define _Out_writes_bytes_all_(X)
+#define _Out_writes_all_(X)
+#define _Success_(X)
+#define _Outptr_result_buffer_maybenull_(X)
+#define ORT_ALL_ARGS_NONNULL __attribute__((nonnull))
+#else
+#include <specstrings.h>
+#define ORT_ALL_ARGS_NONNULL
+#endif
+
+#ifdef _WIN32
+// Define ORT_DLL_IMPORT if your program is dynamically linked to Ort.
+// dllexport is not used, we use a .def file.
+#ifdef ORT_DLL_IMPORT
+#define ORT_EXPORT __declspec(dllimport)
+#else
+#define ORT_EXPORT
+#endif
+#define ORT_API_CALL _stdcall
+#define ORT_MUST_USE_RESULT
+#define ORTCHAR_T wchar_t
+#else
+// To make symbols visible on macOS/iOS
+#ifdef __APPLE__
+#define ORT_EXPORT __attribute__((visibility("default")))
+#else
+#define ORT_EXPORT
+#endif
+#define ORT_API_CALL
+#define ORT_MUST_USE_RESULT __attribute__((warn_unused_result))
+#define ORTCHAR_T char
+#endif
+
+/// ORTCHAR_T, ORT_TSTR are reserved specifically for path handling.
+/// All other strings are UTF-8 encoded, use char and std::string
+#ifndef ORT_TSTR
+#ifdef _WIN32
+#define ORT_TSTR(X) L##X
+// When X is a macro, L##X is not defined. In this case, we need to use ORT_TSTR_ON_MACRO.
+#define ORT_TSTR_ON_MACRO(X) L"" X
+#else
+#define ORT_TSTR(X) X
+#define ORT_TSTR_ON_MACRO(X) X
+#endif
+#endif
+
+// On Windows, ORT_FILE is a wchar_t version of the __FILE__ macro.
+// Otherwise, ORT_FILE is equivalent to __FILE__.
+#ifndef ORT_FILE
+#define ORT_FILE_INTERNAL(x) ORT_TSTR(x)
+#define ORT_FILE ORT_FILE_INTERNAL(__FILE__)
+#endif
+
+// Any pointer marked with _In_ or _Out_, cannot be NULL.
+
+// Windows users should use unicode paths when possible to bypass the MAX_PATH limitation
+// Every pointer marked with _In_ or _Out_, cannot be NULL. Caller should ensure that.
+// for ReleaseXXX(...) functions, they can accept NULL pointer.
+
+#ifdef __cplusplus
+// For any compiler with C++11 support, MSVC 2015 and greater, or Clang version supporting noexcept.
+// Such complex condition is needed because compilers set __cplusplus value differently.
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+#if ((__cplusplus >= 201103L) || (_MSC_VER >= 1900) || (defined(__has_feature) && __has_feature(cxx_noexcept)))
+#define NO_EXCEPTION noexcept
+#else
+#define NO_EXCEPTION throw()
+#endif
+#else
+#define NO_EXCEPTION
+#endif
+
+// __VA_ARGS__ on Windows and Linux are different
+#define ORT_API(RETURN_TYPE, NAME, ...) RETURN_TYPE ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
+
+#define ORT_API_STATUS(NAME, ...)                                                                   \
+  _Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) \
+  NO_EXCEPTION ORT_MUST_USE_RESULT
+
+// XXX: Unfortunately, SAL annotations are known to not work with function pointers
+#define ORT_API2_STATUS(NAME, ...) \
+  _Check_return_ _Ret_maybenull_ OrtStatusPtr(ORT_API_CALL* NAME)(__VA_ARGS__) NO_EXCEPTION ORT_MUST_USE_RESULT
+
+// Used in *.cc files. Almost the same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT and ORT_EXPORT
+#define ORT_API_STATUS_IMPL(NAME, ...) \
+  _Success_(return == 0) _Check_return_ _Ret_maybenull_ OrtStatusPtr ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
+
+#define ORT_CLASS_RELEASE(X) void(ORT_API_CALL * Release##X)(_Frees_ptr_opt_ Ort##X * input)
+
+#ifdef __DOXYGEN__
+#undef ORT_API_STATUS
+#define ORT_API_STATUS(NAME, ...) OrtStatus* NAME(__VA_ARGS__)
+#undef ORT_API2_STATUS
+#define ORT_API2_STATUS(NAME, ...) OrtStatus* NAME(__VA_ARGS__)
+#undef ORT_CLASS_RELEASE
+#define ORT_CLASS_RELEASE(X) void Release##X(Ort##X* input)
+#undef NO_EXCEPTION
+#define NO_EXCEPTION
+#endif
+/** \addtogroup Global
+ * ONNX Runtime C API
+ * @{
+ */
+
+/** Copied from TensorProto::DataType
+ * Currently, Ort doesn't support complex64, complex128
+ */
+typedef enum ONNXTensorElementDataType {
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED,
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,   // maps to c type float
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8,   // maps to c type uint8_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8,    // maps to c type int8_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16,  // maps to c type uint16_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16,   // maps to c type int16_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32,   // maps to c type int32_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64,   // maps to c type int64_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING,  // maps to c++ type std::string
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL,
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE,      // maps to c type double
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32,      // maps to c type uint32_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64,      // maps to c type uint64_t
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64,   // complex with float32 real and imaginary components
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128,  // complex with float64 real and imaginary components
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16,    // Non-IEEE floating-point format based on IEEE754 single-precision
+  // float 8 types were introduced in onnx 1.14, see https://onnx.ai/onnx/technical/float8.html
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,    // float 8 variant E4M3FN, see the float8 link above
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ,  // float 8 variant E4M3FNUZ, see the float8 link above
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2,      // float 8 variant E5M2, see the float8 link above
+  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ   // float 8 variant E5M2FNUZ, see the float8 link above
+} ONNXTensorElementDataType;
+
+// Synced with onnx TypeProto oneof
+typedef enum ONNXType {
+  ONNX_TYPE_UNKNOWN,
+  ONNX_TYPE_TENSOR,
+  ONNX_TYPE_SEQUENCE,
+  ONNX_TYPE_MAP,
+  ONNX_TYPE_OPAQUE,
+  ONNX_TYPE_SPARSETENSOR,
+  ONNX_TYPE_OPTIONAL
+} ONNXType;
+
+// These types are synced with internal
+// SparseFormatFlags
+typedef enum OrtSparseFormat {
+  ORT_SPARSE_UNDEFINED = 0,
+  ORT_SPARSE_COO = 0x1,
+  ORT_SPARSE_CSRC = 0x2,
+  ORT_SPARSE_BLOCK_SPARSE = 0x4
+} OrtSparseFormat;
+
+// Enum allows to query sparse tensor indices
+enum OrtSparseIndicesFormat {
+  ORT_SPARSE_COO_INDICES,
+  ORT_SPARSE_CSR_INNER_INDICES,
+  ORT_SPARSE_CSR_OUTER_INDICES,
+  ORT_SPARSE_BLOCK_SPARSE_INDICES
+};
+
+/** \brief Logging severity levels
+ *
+ * In typical API usage, specifying a logging severity level specifies the minimum severity of log messages to show.
+ */
+typedef enum OrtLoggingLevel {
+  ORT_LOGGING_LEVEL_VERBOSE,  ///< Verbose informational messages (least severe).
+  ORT_LOGGING_LEVEL_INFO,     ///< Informational messages.
+  ORT_LOGGING_LEVEL_WARNING,  ///< Warning messages.
+  ORT_LOGGING_LEVEL_ERROR,    ///< Error messages.
+  ORT_LOGGING_LEVEL_FATAL,    ///< Fatal error messages (most severe).
+} OrtLoggingLevel;
+
+typedef enum OrtErrorCode {
+  ORT_OK,
+  ORT_FAIL,
+  ORT_INVALID_ARGUMENT,
+  ORT_NO_SUCHFILE,
+  ORT_NO_MODEL,
+  ORT_ENGINE_ERROR,
+  ORT_RUNTIME_EXCEPTION,
+  ORT_INVALID_PROTOBUF,
+  ORT_MODEL_LOADED,
+  ORT_NOT_IMPLEMENTED,
+  ORT_INVALID_GRAPH,
+  ORT_EP_FAIL,
+} OrtErrorCode;
+
+typedef enum OrtOpAttrType {
+  ORT_OP_ATTR_UNDEFINED = 0,
+  ORT_OP_ATTR_INT,
+  ORT_OP_ATTR_INTS,
+  ORT_OP_ATTR_FLOAT,
+  ORT_OP_ATTR_FLOATS,
+  ORT_OP_ATTR_STRING,
+  ORT_OP_ATTR_STRINGS,
+} OrtOpAttrType;
+
+//! @}
+#define ORT_RUNTIME_CLASS(X) \
+  struct Ort##X;             \
+  typedef struct Ort##X Ort##X;
+
+/** \addtogroup Global
+ * ONNX Runtime C API
+ * @{
+ */
+// The actual types defined have an Ort prefix
+ORT_RUNTIME_CLASS(Env);
+ORT_RUNTIME_CLASS(Status);  // nullptr for Status* indicates success
+ORT_RUNTIME_CLASS(MemoryInfo);
+ORT_RUNTIME_CLASS(IoBinding);
+ORT_RUNTIME_CLASS(Session);  // Don't call ReleaseSession from Dllmain (because session owns a thread pool)
+ORT_RUNTIME_CLASS(Value);
+ORT_RUNTIME_CLASS(RunOptions);
+ORT_RUNTIME_CLASS(TypeInfo);
+ORT_RUNTIME_CLASS(TensorTypeAndShapeInfo);
+ORT_RUNTIME_CLASS(MapTypeInfo);
+ORT_RUNTIME_CLASS(SequenceTypeInfo);
+ORT_RUNTIME_CLASS(OptionalTypeInfo);
+ORT_RUNTIME_CLASS(SessionOptions);
+ORT_RUNTIME_CLASS(CustomOpDomain);
+ORT_RUNTIME_CLASS(ModelMetadata);
+ORT_RUNTIME_CLASS(ThreadPoolParams);
+ORT_RUNTIME_CLASS(ThreadingOptions);
+ORT_RUNTIME_CLASS(ArenaCfg);
+ORT_RUNTIME_CLASS(PrepackedWeightsContainer);
+ORT_RUNTIME_CLASS(TensorRTProviderOptionsV2);
+ORT_RUNTIME_CLASS(CUDAProviderOptionsV2);
+ORT_RUNTIME_CLASS(CANNProviderOptions);
+ORT_RUNTIME_CLASS(DnnlProviderOptions);
+ORT_RUNTIME_CLASS(Op);
+ORT_RUNTIME_CLASS(OpAttr);
+ORT_RUNTIME_CLASS(Logger);
+
+#ifdef _WIN32
+typedef _Return_type_success_(return == 0) OrtStatus* OrtStatusPtr;
+#else
+typedef OrtStatus* OrtStatusPtr;
+#endif
+
+/** \brief Memory allocation interface
+ *
+ * Structure of function pointers that defines a memory allocator. This can be created and filled in by the user for custom allocators.
+ *
+ * When an allocator is passed to any function, be sure that the allocator object is not destroyed until the last allocated object using it is freed.
+ */
+typedef struct OrtAllocator {
+  uint32_t version;                                                                   ///< Must be initialized to ORT_API_VERSION
+  void*(ORT_API_CALL* Alloc)(struct OrtAllocator* this_, size_t size);                ///< Returns a pointer to an allocated block of `size` bytes
+  void(ORT_API_CALL* Free)(struct OrtAllocator* this_, void* p);                      ///< Free a block of memory previously allocated with OrtAllocator::Alloc
+  const struct OrtMemoryInfo*(ORT_API_CALL* Info)(const struct OrtAllocator* this_);  ///< Return a pointer to an ::OrtMemoryInfo that describes this allocator
+} OrtAllocator;
+
+typedef void(ORT_API_CALL* OrtLoggingFunction)(
+    void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location,
+    const char* message);
+
+/** \brief Graph optimization level
+ *
+ * Refer to https://www.onnxruntime.ai/docs/performance/graph-optimizations.html#graph-optimization-levels
+ * for an in-depth understanding of the Graph Optimization Levels.
+ */
+typedef enum GraphOptimizationLevel {
+  ORT_DISABLE_ALL = 0,
+  ORT_ENABLE_BASIC = 1,
+  ORT_ENABLE_EXTENDED = 2,
+  ORT_ENABLE_ALL = 99
+} GraphOptimizationLevel;
+
+typedef enum ExecutionMode {
+  ORT_SEQUENTIAL = 0,
+  ORT_PARALLEL = 1,
+} ExecutionMode;
+
+/** \brief Language projection identifiers
+ * \see OrtApi::SetLanguageProjection
+ */
+typedef enum OrtLanguageProjection {
+  ORT_PROJECTION_C = 0,
+  ORT_PROJECTION_CPLUSPLUS = 1,
+  ORT_PROJECTION_CSHARP = 2,
+  ORT_PROJECTION_PYTHON = 3,
+  ORT_PROJECTION_JAVA = 4,
+  ORT_PROJECTION_WINML = 5,
+  ORT_PROJECTION_NODEJS = 6,
+} OrtLanguageProjection;
+
+struct OrtKernelInfo;
+typedef struct OrtKernelInfo OrtKernelInfo;
+struct OrtKernelContext;
+typedef struct OrtKernelContext OrtKernelContext;
+struct OrtCustomOp;
+typedef struct OrtCustomOp OrtCustomOp;
+
+typedef enum OrtAllocatorType {
+  OrtInvalidAllocator = -1,
+  OrtDeviceAllocator = 0,
+  OrtArenaAllocator = 1
+} OrtAllocatorType;
+
+/** \brief Memory types for allocated memory, execution provider specific types should be extended in each provider.
+ */
+// Whenever this enum is updated, please also update the MakeKey function in onnxruntime/core/framework/execution_provider.cc
+typedef enum OrtMemType {
+  OrtMemTypeCPUInput = -2,              ///< Any CPU memory used by non-CPU execution provider
+  OrtMemTypeCPUOutput = -1,             ///< CPU accessible memory outputted by non-CPU execution provider, i.e. CUDA_PINNED
+  OrtMemTypeCPU = OrtMemTypeCPUOutput,  ///< Alias of OrtMemTypeCPUOutput: temporary CPU accessible memory allocated by non-CPU execution provider, i.e. CUDA_PINNED
+  OrtMemTypeDefault = 0,                ///< The default allocator for execution provider
+} OrtMemType;
+
+/** \brief This mimics OrtDevice type constants so they can be returned in the API
+ */
+typedef enum OrtMemoryInfoDeviceType {
+  OrtMemoryInfoDeviceType_CPU = 0,
+  OrtMemoryInfoDeviceType_GPU = 1,
+  OrtMemoryInfoDeviceType_FPGA = 2
+} OrtMemoryInfoDeviceType;
+
+/** \brief Algorithm to use for cuDNN Convolution Op
+ */
+typedef enum OrtCudnnConvAlgoSearch {
+  OrtCudnnConvAlgoSearchExhaustive,  // expensive exhaustive benchmarking using cudnnFindConvolutionForwardAlgorithmEx
+  OrtCudnnConvAlgoSearchHeuristic,   // lightweight heuristic based search using cudnnGetConvolutionForwardAlgorithm_v7
+  OrtCudnnConvAlgoSearchDefault,     // default algorithm using CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
+} OrtCudnnConvAlgoSearch;
+
+/** \brief CUDA Provider Options
+ *
+ * \see OrtApi::SessionOptionsAppendExecutionProvider_CUDA
+ */
+typedef struct OrtCUDAProviderOptions {
+#ifdef __cplusplus
+  OrtCUDAProviderOptions()
+      : device_id{},
+        cudnn_conv_algo_search{OrtCudnnConvAlgoSearchExhaustive},
+        gpu_mem_limit{SIZE_MAX},
+        arena_extend_strategy{},
+        do_copy_in_default_stream{1},
+        has_user_compute_stream{},
+        user_compute_stream{},
+        default_memory_arena_cfg{},
+        tunable_op_enable{false},  // int flag: false converts to 0 (disabled)
+        tunable_op_tuning_enable{false},  // int flag: false converts to 0 (disabled)
+        tunable_op_max_tuning_duration_ms{} {}
+#endif
+
+  /** \brief CUDA device Id
+   *   Defaults to 0.
+   */
+  int device_id;
+
+  /** \brief CUDA Convolution algorithm search configuration.
+   *   See enum OrtCudnnConvAlgoSearch for more details.
+   *   Defaults to OrtCudnnConvAlgoSearchExhaustive.
+   */
+  OrtCudnnConvAlgoSearch cudnn_conv_algo_search;
+
+  /** \brief CUDA memory limit (To use all possible memory pass in maximum size_t)
+   *   Defaults to SIZE_MAX.
+   *   \note If a ::OrtArenaCfg has been applied, it will override this field
+   */
+  size_t gpu_mem_limit;
+
+  /** \brief Strategy used to grow the memory arena
+   *   0 = kNextPowerOfTwo<br>
+   *   1 = kSameAsRequested<br>
+   *   Defaults to 0.
+   *   \note If a ::OrtArenaCfg has been applied, it will override this field
+   */
+  int arena_extend_strategy;
+
+  /** \brief Flag indicating if copying needs to take place on the same stream as the compute stream in the CUDA EP
+   *   0 = Use separate streams for copying and compute.
+   *   1 = Use the same stream for copying and compute.
+   *   Defaults to 1.
+   *   WARNING: Setting this to 0 may result in data races for some models.
+   *   Please see issue #4829 for more details.
+   */
+  int do_copy_in_default_stream;
+
+  /** \brief Flag indicating if there is a user provided compute stream
+   *   Defaults to 0.
+   */
+  int has_user_compute_stream;
+
+  /** \brief User provided compute stream.
+   *   If provided, please set `has_user_compute_stream` to 1.
+   */
+  void* user_compute_stream;
+
+  /** \brief CUDA memory arena configuration parameters
+   */
+  OrtArenaCfg* default_memory_arena_cfg;
+
+  /** \brief Enable the use of TunableOp.
+   *   Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default.
+   *   This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_ENABLE.
+   */
+  int tunable_op_enable;
+
+  /** \brief Enable TunableOp for tuning.
+   *   Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default.
+   *   This option can be overridden by environment variable ORT_CUDA_TUNABLE_OP_TUNING_ENABLE.
+   */
+  int tunable_op_tuning_enable;
+
+  /** \brief Max tuning duration time limit for each instance of TunableOp.
+   *   Defaults to 0 to disable the limit.
+   */
+  int tunable_op_max_tuning_duration_ms;
+
+} OrtCUDAProviderOptions;
+
+/** \brief ROCM Provider Options
+ *
+ * \see OrtApi::SessionOptionsAppendExecutionProvider_ROCM
+ */
+typedef struct OrtROCMProviderOptions {
+#ifdef __cplusplus
+  OrtROCMProviderOptions()
+      : device_id{},
+        miopen_conv_exhaustive_search{0},
+        gpu_mem_limit{SIZE_MAX},
+        arena_extend_strategy{},
+        do_copy_in_default_stream{1},
+        has_user_compute_stream{},
+        user_compute_stream{},
+        default_memory_arena_cfg{},
+        tunable_op_enable{false},  // int flag: false converts to 0 (disabled)
+        tunable_op_tuning_enable{false},  // int flag: false converts to 0 (disabled)
+        tunable_op_max_tuning_duration_ms{} {}
+#endif
+
+  /** \brief ROCM device Id
+   *   Defaults to 0.
+   */
+  int device_id;
+
+  /** \brief ROCM MIOpen Convolution algorithm exhaustive search option.
+   *   Defaults to 0 (false).
+   */
+  int miopen_conv_exhaustive_search;
+
+  /** \brief ROCM memory limit (To use all possible memory pass in maximum size_t)
+   *   Defaults to SIZE_MAX.
+   *   \note If a ::OrtArenaCfg has been applied, it will override this field
+   */
+  size_t gpu_mem_limit;
+
+  /** \brief Strategy used to grow the memory arena
+   *   0 = kNextPowerOfTwo<br>
+   *   1 = kSameAsRequested<br>
+   *   Defaults to 0.
+   *   \note If a ::OrtArenaCfg has been applied, it will override this field
+   */
+  int arena_extend_strategy;
+
+  /** \brief Flag indicating if copying needs to take place on the same stream as the compute stream in the ROCM EP
+   *   0 = Use separate streams for copying and compute.
+   *   1 = Use the same stream for copying and compute.
+   *   Defaults to 1.
+   *   WARNING: Setting this to 0 may result in data races for some models.
+   *   Please see issue #4829 for more details.
+   */
+  int do_copy_in_default_stream;
+
+  /** \brief Flag indicating if there is a user provided compute stream
+   *   Defaults to 0.
+   */
+  int has_user_compute_stream;
+
+  /** \brief User provided compute stream.
+   *   If provided, please set `has_user_compute_stream` to 1.
+   */
+  void* user_compute_stream;
+
+  /** \brief ROCM memory arena configuration parameters
+   */
+  OrtArenaCfg* default_memory_arena_cfg;
+
+  /** \brief Enable the use of TunableOp.
+   *   Set it to 1/0 to enable/disable TunableOp. Otherwise, it is disabled by default.
+   *   This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_ENABLE.
+   */
+  int tunable_op_enable;
+
+  /** \brief Enable TunableOp for tuning.
+   *   Set it to 1/0 to enable/disable TunableOp tuning. Otherwise, it is disabled by default.
+   *   This option can be overridden by environment variable ORT_ROCM_TUNABLE_OP_TUNING_ENABLE.
+   */
+  int tunable_op_tuning_enable;
+
+  /** \brief Max tuning duration time limit for each instance of TunableOp.
+   *   Defaults to 0 to disable the limit.
+   */
+  int tunable_op_max_tuning_duration_ms;
+
+} OrtROCMProviderOptions;
+
+/** \brief TensorRT Provider Options
+ *
+ * \see OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
+ */
+typedef struct OrtTensorRTProviderOptions {
+  int device_id;                                ///< CUDA device id (0 = default device)
+  int has_user_compute_stream;                  // indicator of user specified CUDA compute stream.
+  void* user_compute_stream;                    // user specified CUDA compute stream.
+  int trt_max_partition_iterations;             // maximum iterations for TensorRT parser to get capability
+  int trt_min_subgraph_size;                    // minimum size of TensorRT subgraphs
+  size_t trt_max_workspace_size;                // maximum workspace size for TensorRT.
+  int trt_fp16_enable;                          // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
+  int trt_int8_enable;                          // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
+  const char* trt_int8_calibration_table_name;  // TensorRT INT8 calibration table name.
+  int trt_int8_use_native_calibration_table;    // use native TensorRT generated calibration table. Default 0 = false, nonzero = true
+  int trt_dla_enable;                           // enable DLA. Default 0 = false, nonzero = true
+  int trt_dla_core;                             // DLA core number. Default 0
+  int trt_dump_subgraphs;                       // dump TRT subgraph. Default 0 = false, nonzero = true
+  int trt_engine_cache_enable;                  // enable engine caching. Default 0 = false, nonzero = true
+  const char* trt_engine_cache_path;            // specify engine cache path
+  int trt_engine_decryption_enable;             // enable engine decryption. Default 0 = false, nonzero = true
+  const char* trt_engine_decryption_lib_path;   // specify engine decryption library path
+  int trt_force_sequential_engine_build;        // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
+  // This is the legacy struct; do not add new fields here.
+  // For a new field that can be represented by a string, add it in include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h
+  // For a non-string field, a new separate API must be created to handle it.
+} OrtTensorRTProviderOptions;
+
+/** \brief MIGraphX Provider Options
+ *
+ * \see OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX
+ */
+typedef struct OrtMIGraphXProviderOptions {
+  int device_id;             // hip device id.
+  int migraphx_fp16_enable;  // enable MIGraphX FP16 precision. Default 0 = false, nonzero = true
+  int migraphx_int8_enable;  // enable MIGraphX INT8 precision. Default 0 = false, nonzero = true
+} OrtMIGraphXProviderOptions;
+
+/** \brief OpenVINO Provider Options
+ *
+ * \see OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO
+ */
+typedef struct OrtOpenVINOProviderOptions {
+#ifdef __cplusplus
+  OrtOpenVINOProviderOptions() : device_type{},
+                                 enable_vpu_fast_compile{},
+                                 device_id{},
+                                 num_of_threads{},
+                                 cache_dir{},
+                                 context{},
+                                 enable_opencl_throttling{},
+                                 enable_dynamic_shapes{} {}
+#endif
+  /** \brief Device type string
+   *
+   * Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
+   */
+  const char* device_type;
+  unsigned char enable_vpu_fast_compile;  ///< 0 = disabled, nonzero = enabled
+  const char* device_id;
+  size_t num_of_threads;  ///< 0 = Use default number of threads
+  const char* cache_dir;  // path is set to empty by default
+  void* context;
+  unsigned char enable_opencl_throttling;  ///< 0 = disabled, nonzero = enabled
+  unsigned char enable_dynamic_shapes;     ///< 0 = disabled, nonzero = enabled
+} OrtOpenVINOProviderOptions;
+
+struct OrtApi;
+typedef struct OrtApi OrtApi;
+
+struct OrtTrainingApi;
+typedef struct OrtTrainingApi OrtTrainingApi;
+
+/** \brief The helper interface to get the right version of OrtApi
+ *
+ * Get a pointer to this structure through ::OrtGetApiBase
+ */
+struct OrtApiBase {
+  /** \brief Get a pointer to the requested version of the ::OrtApi
+   *
+   * \param[in] version Must be ::ORT_API_VERSION
+   * \return The ::OrtApi for the version requested, nullptr will be returned if this version is unsupported, for example when using a runtime
+   *   older than the version created with this header file.
+   *
+   * One can call GetVersionString() to get the version of the Onnxruntime library for logging
+   * and error reporting purposes.
+   */
+  const OrtApi*(ORT_API_CALL* GetApi)(uint32_t version)NO_EXCEPTION;
+
+  /** \brief Returns a null terminated string of the version of the Onnxruntime library (eg: "1.8.1")
+   *
+   *  \return UTF-8 encoded version string. Do not deallocate the returned buffer.
+   */
+  const char*(ORT_API_CALL* GetVersionString)(void)NO_EXCEPTION;
+};
+
+typedef struct OrtApiBase OrtApiBase;
+
+/** \brief The Onnxruntime library's entry point to access the C API
+ *
+ * Call this to get a pointer to an ::OrtApiBase
+ */
+ORT_EXPORT const OrtApiBase* ORT_API_CALL OrtGetApiBase(void) NO_EXCEPTION;
+
+/** \brief Thread work loop function
+ *
+ * Onnxruntime will provide the working loop on custom thread creation
+ * Argument is an onnxruntime built-in type which will be provided when thread pool calls OrtCustomCreateThreadFn
+ */
+typedef void (*OrtThreadWorkerFn)(void* ort_worker_fn_param);
+
+typedef const struct OrtCustomHandleType {
+  char __place_holder;
+}* OrtCustomThreadHandle;
+
+/** \brief Ort custom thread creation function
+ *
+ * The function should return a thread handle to be used in onnxruntime thread pools
+ * Onnxruntime will throw exception on return value of nullptr or 0, indicating that the function failed to create a thread
+ */
+typedef OrtCustomThreadHandle (*OrtCustomCreateThreadFn)(void* ort_custom_thread_creation_options, OrtThreadWorkerFn ort_thread_worker_fn, void* ort_worker_fn_param);
+
+/** \brief Custom thread join function
+ *
+ * Onnxruntime thread pool destructor will call the function to join a custom thread.
+ * Argument ort_custom_thread_handle is the value returned by OrtCustomCreateThreadFn
+ */
+typedef void (*OrtCustomJoinThreadFn)(OrtCustomThreadHandle ort_custom_thread_handle);
+
+typedef OrtStatus*(ORT_API_CALL* RegisterCustomOpsFn)(OrtSessionOptions* options, const OrtApiBase* api);
+
+/** \brief Callback function for RunAsync
+ *
+ * \param[in] user_data User specific data that is passed back to the callback
+ * \param[out] outputs On success, outputs host inference results, on error, the value will be nullptr
+ * \param[out] num_outputs Number of outputs, on error, the value will be zero
+ * \param[out] status On error, status will provide details
+ */
+typedef void (*RunAsyncCallbackFn)(void* user_data, OrtValue** outputs, size_t num_outputs, OrtStatusPtr status);
+
+/** \brief The C API
+ *
+ * All C API functions are defined inside this structure as pointers to functions.
+ * Call OrtApiBase::GetApi to get a pointer to it
+ *
+ * \nosubgrouping
+ */
+struct OrtApi {
+  /// \name OrtStatus
+  /// @{
+
+  /**
+   * \brief Create an OrtStatus from a null terminated string
+   *
+   * \param[in] code
+   * \param[in] msg A null-terminated string. Its contents will be copied.
+   * \return A new OrtStatus object, must be destroyed with OrtApi::ReleaseStatus
+   */
+  OrtStatus*(ORT_API_CALL* CreateStatus)(OrtErrorCode code, _In_ const char* msg)NO_EXCEPTION ORT_ALL_ARGS_NONNULL;
+
+  /** \brief Get OrtErrorCode from OrtStatus
+   *
+   * \param[in] status
+   * \return OrtErrorCode that \p status was created with
+   */
+  OrtErrorCode(ORT_API_CALL* GetErrorCode)(_In_ const OrtStatus* status) NO_EXCEPTION ORT_ALL_ARGS_NONNULL;
+
+  /** \brief Get error string from OrtStatus
+   *
+   * \param[in] status
+   * \return The error message inside the `status`. Do not free the returned value.
+   */
+  const char*(ORT_API_CALL* GetErrorMessage)(_In_ const OrtStatus* status)NO_EXCEPTION ORT_ALL_ARGS_NONNULL;
+
+  /// @}
+  /// \name OrtEnv
+  /// @{
+
+  /** \brief Create an OrtEnv
+   *
+   * \param[in] log_severity_level The log severity level.
+   * \param[in] logid The log identifier.
+   * \param[out] out Returned newly created OrtEnv. Must be freed with OrtApi::ReleaseEnv
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateEnv, OrtLoggingLevel log_severity_level, _In_ const char* logid, _Outptr_ OrtEnv** out);
+
+  /** \brief Create an OrtEnv
+   *
+   * \param[in] logging_function A pointer to a logging function.
+   * \param[in] logger_param A pointer to arbitrary data passed as the ::OrtLoggingFunction `param` parameter to
+   *                         `logging_function`.
+   * \param[in] log_severity_level The log severity level.
+   * \param[in] logid The log identifier.
+   * \param[out] out Returned newly created OrtEnv. Must be freed with OrtApi::ReleaseEnv
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateEnvWithCustomLogger, OrtLoggingFunction logging_function, _In_opt_ void* logger_param,
+                  OrtLoggingLevel log_severity_level, _In_ const char* logid, _Outptr_ OrtEnv** out);
+
+  /** \brief Enable Telemetry
+   *
+   * \note Telemetry events are on by default since they are lightweight
+   * \param[in] env
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(EnableTelemetryEvents, _In_ const OrtEnv* env);
+  /** \brief Disable Telemetry
+   *
+   * \see OrtApi::EnableTelemetryEvents
+   * \param[in] env
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(DisableTelemetryEvents, _In_ const OrtEnv* env);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief Create an OrtSession from a model file
+   *
+   * \param[in] env
+   * \param[in] model_path
+   * \param[in] options
+   * \param[out] out Returned newly created OrtSession. Must be freed with OrtApi::ReleaseSession
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  // TODO: document the path separator convention? '/' vs '\'
+  // TODO: should specify the access characteristics of model_path. Is this read only during the
+  // execution of CreateSession, or does the OrtSession retain a handle to the file/directory
+  // and continue to access throughout the OrtSession lifetime?
+  //  What sort of access is needed to model_path : read or read/write?
+  ORT_API2_STATUS(CreateSession, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+                  _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
+
+  /** \brief Create an OrtSession from memory
+   *
+   * \param[in] env
+   * \param[in] model_data
+   * \param[in] model_data_length
+   * \param[in] options
+   * \param[out] out Returned newly created OrtSession. Must be freed with OrtApi::ReleaseSession
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSessionFromArray, _In_ const OrtEnv* env, _In_ const void* model_data, size_t model_data_length,
+                  _In_ const OrtSessionOptions* options, _Outptr_ OrtSession** out);
+
+  /** \brief Run the model in an ::OrtSession
+   *
+   * Will not return until the model run has completed. Multiple threads might be used to run the model based on
+   * the options in the ::OrtSession and settings used when creating the ::OrtEnv
+   *
+   * \param[in] session
+   * \param[in] run_options If nullptr, will use a default ::OrtRunOptions
+   * \param[in] input_names Array of null terminated UTF8 encoded strings of the input names
+   * \param[in] inputs Array of ::OrtValue%s of the input values
+   * \param[in] input_len Number of elements in the input_names and inputs arrays
+   * \param[in] output_names Array of null terminated UTF8 encoded strings of the output names
+   * \param[in] output_names_len Number of elements in the output_names and outputs array
+   * \param[out] outputs Array of ::OrtValue%s that the outputs are stored in. This can also be
+   *     an array of nullptr values, in this case ::OrtValue objects will be allocated and pointers
+   *     to them will be set into the `outputs` array.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(Run, _Inout_ OrtSession* session, _In_opt_ const OrtRunOptions* run_options,
+                  _In_reads_(input_len) const char* const* input_names,
+                  _In_reads_(input_len) const OrtValue* const* inputs, size_t input_len,
+                  _In_reads_(output_names_len) const char* const* output_names, size_t output_names_len,
+                  _Inout_updates_all_(output_names_len) OrtValue** outputs);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Create an ::OrtSessionOptions object
+   *
+   * To use additional providers, you must build ORT with the extra providers enabled. Then call one of these
+   * functions to enable them in the session:<br>
+   *   OrtSessionOptionsAppendExecutionProvider_CPU<br>
+   *   OrtSessionOptionsAppendExecutionProvider_CUDA<br>
+   *   OrtSessionOptionsAppendExecutionProvider_(remaining providers...)<br>
+   * The order they are called indicates the preference order as well. In other words call this method
+   * on your most preferred execution provider first followed by the less preferred ones.
+   * If none are called Ort will use its internal CPU execution provider.
+   *
+   * \param[out] options The newly created OrtSessionOptions. Must be freed with OrtApi::ReleaseSessionOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSessionOptions, _Outptr_ OrtSessionOptions** options);
+
+  /** \brief Set filepath to save optimized model after graph level transformations
+   *
+   * \param[in] options
+   * \param[in] optimized_model_filepath
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetOptimizedModelFilePath, _Inout_ OrtSessionOptions* options,
+                  _In_ const ORTCHAR_T* optimized_model_filepath);
+
+  /** \brief Create a copy of an existing ::OrtSessionOptions
+   *
+   * \param[in] in_options OrtSessionOptions to copy
+   * \param[out] out_options Returned newly created ::OrtSessionOptions. Must be freed with OrtApi::ReleaseSessionOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CloneSessionOptions, _In_ const OrtSessionOptions* in_options,
+                  _Outptr_ OrtSessionOptions** out_options);
+
+  /** \brief Set execution mode
+   *
+   * Controls whether you want to execute operators in your graph sequentially or in parallel. Usually when the model
+   *  has many branches, setting this option to ExecutionMode.ORT_PARALLEL will give you better performance.
+   *  See [docs/ONNX_Runtime_Perf_Tuning.md] for more details.
+   *
+   * \param[in] options
+   * \param[in] execution_mode
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetSessionExecutionMode, _Inout_ OrtSessionOptions* options, ExecutionMode execution_mode);
+
+  /** \brief Enable profiling for a session
+   *
+   * \param[in] options
+   * \param[in] profile_file_prefix
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(EnableProfiling, _Inout_ OrtSessionOptions* options, _In_ const ORTCHAR_T* profile_file_prefix);
+
+  /** \brief Disable profiling for a session
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(DisableProfiling, _Inout_ OrtSessionOptions* options);
+
+  /** \brief Enable the memory pattern optimization
+   *
+   * The idea is if the input shapes are the same, we could trace the internal memory allocation
+   * and generate a memory pattern for future request. So next time we could just do one allocation
+   * with a big chunk for all the internal memory allocation.
+   * \note Memory pattern optimization is only available when Sequential Execution mode is enabled (see OrtApi::SetSessionExecutionMode)
+   *
+   * \see OrtApi::DisableMemPattern
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(EnableMemPattern, _Inout_ OrtSessionOptions* options);
+
+  /** \brief Disable the memory pattern optimization
+   *
+   * \see OrtApi::EnableMemPattern
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(DisableMemPattern, _Inout_ OrtSessionOptions* options);
+
+  /** \brief Enable the memory arena on CPU
+   *
+   * Arena may pre-allocate memory for future usage.
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(EnableCpuMemArena, _Inout_ OrtSessionOptions* options);
+
+  /** \brief Disable the memory arena on CPU
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(DisableCpuMemArena, _Inout_ OrtSessionOptions* options);
+
+  /** \brief Set session log id
+   *
+   * \param[in] options
+   * \param[in] logid The log identifier.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetSessionLogId, _Inout_ OrtSessionOptions* options, const char* logid);
+
+  /** \brief Set session log verbosity level
+   *
+   * Applies to session load, initialization, etc
+   *
+   * \param[in] options
+   * \param[in] session_log_verbosity_level \snippet{doc} snippets.dox Log Verbosity Level
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetSessionLogVerbosityLevel, _Inout_ OrtSessionOptions* options, int session_log_verbosity_level);
+
+  /** \brief Set session log severity level
+   *
+   * \param[in] options
+   * \param[in] session_log_severity_level The log severity level (refer to ::OrtLoggingLevel for possible values).
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetSessionLogSeverityLevel, _Inout_ OrtSessionOptions* options, int session_log_severity_level);
+
+  /** \brief Set the optimization level to apply when loading a graph
+   *
+   * Please see https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html for an in-depth explanation
+   * \param[in,out] options The session options object
+   * \param[in] graph_optimization_level The optimization level
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetSessionGraphOptimizationLevel, _Inout_ OrtSessionOptions* options,
+                  GraphOptimizationLevel graph_optimization_level);
+
+  /** \brief Sets the number of threads used to parallelize the execution within nodes
+   *
+   * When running a single node operation, ex. add, this sets the maximum number of threads to use.
+   *
+   * \note If built with OpenMP, this has no effect on the number of threads used. In this case
+   *       use the OpenMP env variables to configure the number of intra op num threads.
+   *
+   * \param[in] options
+   * \param[in] intra_op_num_threads Number of threads to use<br>
+   *   A value of 0 will use the default number of threads<br>
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetIntraOpNumThreads, _Inout_ OrtSessionOptions* options, int intra_op_num_threads);
+
+  /** \brief Sets the number of threads used to parallelize the execution of the graph
+   *
+   * If nodes can be run in parallel, this sets the maximum number of threads to use to run them in parallel.
+   *
+   * \note If sequential execution is enabled this value is ignored, it acts as if it was set to 1.
+   *
+   * \param[in] options
+   * \param[in] inter_op_num_threads Number of threads to use<br>
+   *   A value of 0 will use the default number of threads<br>
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetInterOpNumThreads, _Inout_ OrtSessionOptions* options, int inter_op_num_threads);
+
+  /// @}
+  /// \name OrtCustomOpDomain
+  /// @{
+
+  /** \brief Create a custom op domain
+   *
+   * \param[in] domain
+   * \param[out] out Newly created domain. Must be freed with OrtApi::ReleaseCustomOpDomain
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateCustomOpDomain, _In_ const char* domain, _Outptr_ OrtCustomOpDomain** out);
+
+  /** \brief Add a custom op to a custom op domain
+   *
+   * \note The OrtCustomOp* pointer must remain valid until the ::OrtCustomOpDomain using it is released
+   *
+   * \param[in] custom_op_domain
+   * \param[in] op
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CustomOpDomain_Add, _Inout_ OrtCustomOpDomain* custom_op_domain, _In_ const OrtCustomOp* op);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Add custom op domain to a session options
+   *
+   * \note The OrtCustomOpDomain* must not be deleted until all sessions using it are released
+   *
+   * \param[in] options
+   * \param[in] custom_op_domain
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(AddCustomOpDomain, _Inout_ OrtSessionOptions* options, _In_ OrtCustomOpDomain* custom_op_domain);
+
+  /** \deprecated Use OrtApi::RegisterCustomOpsLibrary_V2.
+   *
+   * Registers custom ops from a shared library.
+   *
+   * Loads a shared library (dll on windows, so on linux, etc) named 'library_path' and looks for this entry point:
+   *		OrtStatus* RegisterCustomOps(OrtSessionOptions * options, const OrtApiBase* api);
+   * It then passes in the provided session options to this function along with the api base.
+   * The handle to the loaded library is returned in library_handle. It can be freed by the caller after all sessions using the passed in
+   * session options are destroyed, or if an error occurs and it is non null.
+   *
+   * \param[in] options
+   * \param[in] library_path
+   * \param[out] library_handle OS specific handle to the loaded library (Use FreeLibrary on Windows, dlclose on Linux, etc.. to unload)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RegisterCustomOpsLibrary, _Inout_ OrtSessionOptions* options, _In_ const char* library_path, _Outptr_ void** library_handle);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief Get input count for a session
+   *
+   * This number must also match the number of inputs passed to OrtApi::Run
+   *
+   * \see OrtApi::SessionGetInputTypeInfo, OrtApi::SessionGetInputName, OrtApi::Run
+   *
+   * \param[in] session
+   * \param[out] out Number of inputs
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetInputCount, _In_ const OrtSession* session, _Out_ size_t* out);
+
+  /** \brief Get output count for a session
+   *
+   * This number must also match the number of outputs returned by OrtApi::Run
+   *
+   * \see OrtApi::SessionGetOutputTypeInfo, OrtApi::SessionGetOutputName, OrtApi::Run
+   *
+   * \param[in] session
+   * \param[out] out Number of outputs
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOutputCount, _In_ const OrtSession* session, _Out_ size_t* out);
+
+  /** \brief Get overridable initializer count
+   *
+   * \see OrtApi::SessionGetOverridableInitializerTypeInfo, OrtApi::SessionGetOverridableInitializerName
+   *
+   * \param[in] session
+   * \param[out] out Number of overridable initializers
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOverridableInitializerCount, _In_ const OrtSession* session, _Out_ size_t* out);
+
+  /** \brief Get input type information
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetInputCount returns (exclusive)
+   * \param[out] type_info Must be freed with OrtApi::ReleaseTypeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetInputTypeInfo, _In_ const OrtSession* session, size_t index, _Outptr_ OrtTypeInfo** type_info);
+
+  /** \brief Get output type information
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetOutputCount returns (exclusive)
+   * \param[out] type_info Must be freed with OrtApi::ReleaseTypeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOutputTypeInfo, _In_ const OrtSession* session, size_t index, _Outptr_ OrtTypeInfo** type_info);
+
+  /** \brief Get overridable initializer type information
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetOverridableInitializerCount returns (exclusive)
+   * \param[out] type_info Must be freed with OrtApi::ReleaseTypeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOverridableInitializerTypeInfo, _In_ const OrtSession* session, size_t index, _Outptr_ OrtTypeInfo** type_info);
+
+  /** \brief Get input name
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetInputCount returns (exclusive)
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated UTF-8 encoded string allocated using `allocator`. Must be freed using `allocator`.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetInputName, _In_ const OrtSession* session, size_t index, _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /** \brief Get output name
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetOutputCount returns (exclusive)
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated UTF-8 encoded string allocated using `allocator`. Must be freed using `allocator`.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOutputName, _In_ const OrtSession* session, size_t index, _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /** \brief Get overridable initializer name
+   *
+   * \param[in] session
+   * \param[in] index Must be between 0 (inclusive) and what OrtApi::SessionGetOverridableInitializerCount returns (exclusive)
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated UTF-8 encoded string allocated using `allocator`. Must be freed using `allocator`.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetOverridableInitializerName, _In_ const OrtSession* session, size_t index,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /// @}
+  /// \name OrtRunOptions
+  /// @{
+
+  /** \brief Create an OrtRunOptions
+   *
+   * \param[out] out Returned newly created ::OrtRunOptions. Must be freed with OrtApi::ReleaseRunOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateRunOptions, _Outptr_ OrtRunOptions** out);
+
+  /** \brief Set per-run log verbosity level
+   *
+   * \see OrtApi::RunOptionsGetRunLogVerbosityLevel
+   *
+   * \param[in] options
+   * \param[in] log_verbosity_level \snippet{doc} snippets.dox Log Verbosity Level
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RunOptionsSetRunLogVerbosityLevel, _Inout_ OrtRunOptions* options, int log_verbosity_level);
+
+  /** \brief Set per-run log severity level
+   *
+   * \see OrtApi::RunOptionsGetRunLogSeverityLevel
+   *
+   * \param[in] options
+   * \param[in] log_severity_level The log severity level (refer to ::OrtLoggingLevel for possible values).
+   */
+  ORT_API2_STATUS(RunOptionsSetRunLogSeverityLevel, _Inout_ OrtRunOptions* options, int log_severity_level);
+
+  /** \brief Set per-run tag
+   *
+   * This is used in a per-run log identifier.
+   *
+   * \see OrtApi::RunOptionsGetRunTag
+   *
+   * \param[in] options
+   * \param[in] run_tag The run tag.
+   */
+  ORT_API2_STATUS(RunOptionsSetRunTag, _Inout_ OrtRunOptions* options, _In_ const char* run_tag);
+
+  /** \brief Get per-run log verbosity level
+   *
+   * \see OrtApi::RunOptionsSetRunLogVerbosityLevel
+   *
+   * \param[in] options
+   * \param[out] log_verbosity_level \snippet{doc} snippets.dox Log Verbosity Level
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RunOptionsGetRunLogVerbosityLevel, _In_ const OrtRunOptions* options,
+                  _Out_ int* log_verbosity_level);
+
+  /** \brief Get per-run log severity level
+   *
+   * \see OrtApi::RunOptionsSetRunLogSeverityLevel
+   *
+   * \param[in] options
+   * \param[out] log_severity_level The log severity level (refer to ::OrtLoggingLevel for possible values).
+   */
+  ORT_API2_STATUS(RunOptionsGetRunLogSeverityLevel, _In_ const OrtRunOptions* options, _Out_ int* log_severity_level);
+
+  /** \brief Get per-run tag
+   *
+   * This is used in a per-run log identifier.
+   *
+   * \see OrtApi::RunOptionsSetRunTag
+   *
+   * \param[in] options
+   * \param[out] run_tag The run tag.
+   *                     Do not free this value, it is owned by `options`. It will be invalidated if the run tag
+   *                     changes (i.e., with OrtApi::RunOptionsSetRunTag) or `options` is freed.
+   */
+  ORT_API2_STATUS(RunOptionsGetRunTag, _In_ const OrtRunOptions* options, _Out_ const char** run_tag);
+
+  /** \brief Set terminate flag
+   *
+   * If a currently executing session needs to be force terminated, this can be called from another thread to force it to fail with an error.
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RunOptionsSetTerminate, _Inout_ OrtRunOptions* options);
+
+  /** \brief Clears the terminate flag
+   *
+   * Used so the OrtRunOptions instance can be used in a new OrtApi::Run call without it instantly terminating
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RunOptionsUnsetTerminate, _Inout_ OrtRunOptions* options);
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /** \brief Create a tensor
+   *
+   * Create a tensor using a supplied ::OrtAllocator
+   *
+   * \param[in] allocator
+   * \param[in] shape Pointer to the tensor shape dimensions.
+   * \param[in] shape_len The number of tensor shape dimensions.
+   * \param[in] type
+   * \param[out] out Returns newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateTensorAsOrtValue, _Inout_ OrtAllocator* allocator, _In_ const int64_t* shape, size_t shape_len,
+                  ONNXTensorElementDataType type, _Outptr_ OrtValue** out);
+
+  /** \brief Create a tensor backed by a user supplied buffer
+   *
+   * Create a tensor with user's buffer. You can fill the buffer either before calling this function or after.
+   * p_data is owned by caller. ReleaseValue won't release p_data.
+   *
+   * \param[in] info Memory description of where the p_data buffer resides (CPU vs GPU etc).
+   * \param[in] p_data Pointer to the data buffer.
+   * \param[in] p_data_len The number of bytes in the data buffer.
+   * \param[in] shape Pointer to the tensor shape dimensions.
+   * \param[in] shape_len The number of tensor shape dimensions.
+   * \param[in] type The data type.
+   * \param[out] out Returns newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateTensorWithDataAsOrtValue, _In_ const OrtMemoryInfo* info, _Inout_ void* p_data,
+                  size_t p_data_len, _In_ const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type,
+                  _Outptr_ OrtValue** out);
+
+  /** \brief Return if an ::OrtValue is a tensor type
+   *
+   * \param[in] value A tensor type (string tensors are not supported)
+   * \param[out] out Set to 1 iff ::OrtValue is a tensor, 0 otherwise
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(IsTensor, _In_ const OrtValue* value, _Out_ int* out);
+
+  /** \brief Get a pointer to the raw data inside a tensor
+   *
+   * Used to read/write/modify the internal tensor data directly.
+   * \note The returned pointer is valid until the \p value is destroyed.
+   *
+   * \param[in] value A tensor type (string tensors are not supported)
+   * \param[out] out Filled in with a pointer to the internal storage
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorMutableData, _In_ OrtValue* value, _Outptr_ void** out);
+
+  /** \brief Set all strings at once in a string tensor
+   *
+   * \param[in,out] value A tensor of type ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING
+   * \param[in] s An array of strings. Each string in this array must be null terminated.
+   * \param[in] s_len Count of strings in s (Must match the size of \p value's tensor shape)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(FillStringTensor, _Inout_ OrtValue* value, _In_ const char* const* s, size_t s_len);
+
+  /** \brief Get total byte length for all strings in a string tensor
+   *
+   * Typically used with OrtApi::GetStringTensorContent
+   *
+   * \param[in] value A tensor of type ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING
+   * \param[out] len Total byte length of all strings (does not include trailing nulls)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetStringTensorDataLength, _In_ const OrtValue* value, _Out_ size_t* len);
+
+  /** \brief Get all strings from a string tensor
+   *
+   * An example of the results:<br>
+   * Given \p value is a string tensor with the strings { "This" "is" "a" "test" }<br>
+   * \p s must have a size of 11 bytes<br>
+   * \p offsets must have 4 elements<br>
+   * After the call, these values will be filled in:<br>
+   * \p s will contain "Thisisatest"<br>
+   * \p offsets will contain { 0, 4, 6, 7 }<br>
+   * The length of the last string is just s_len - offsets[last]
+   *
+   * \param[in] value A tensor of type ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING
+   * \param[in] s Buffer to sequentially write all tensor strings to. Each string is NOT null-terminated.
+   * \param[in] s_len Number of bytes of buffer pointed to by \p s (Get it from OrtApi::GetStringTensorDataLength)
+   * \param[out] offsets Array of start offsets into the strings written to \p s
+   * \param[in] offsets_len Number of elements in offsets
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetStringTensorContent, _In_ const OrtValue* value, _Out_writes_bytes_all_(s_len) void* s,
+                  size_t s_len, _Out_writes_all_(offsets_len) size_t* offsets, size_t offsets_len);
+
+  /// @}
+  /// \name OrtTypeInfo
+  /// @{
+
+  /** \brief Get ::OrtTensorTypeAndShapeInfo from an ::OrtTypeInfo
+   *
+   * \param[in] type_info
+   * \param[out] out Do not free this value, it will be valid until type_info is freed.
+   *             If type_info does not represent tensor, this value will be set to nullptr.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CastTypeInfoToTensorInfo, _In_ const OrtTypeInfo* type_info,
+                  _Outptr_result_maybenull_ const OrtTensorTypeAndShapeInfo** out);
+
+  /** \brief Get ::ONNXType from ::OrtTypeInfo
+   *
+   * \param[in] type_info
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetOnnxTypeFromTypeInfo, _In_ const OrtTypeInfo* type_info, _Out_ enum ONNXType* out);
+
+  /// @}
+  /// \name OrtTensorTypeAndShapeInfo
+  /// @{
+
+  /** \brief Create an ::OrtTensorTypeAndShapeInfo object
+   *
+   * \param[out] out Returns newly created ::OrtTensorTypeAndShapeInfo. Must be freed with OrtApi::ReleaseTensorTypeAndShapeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateTensorTypeAndShapeInfo, _Outptr_ OrtTensorTypeAndShapeInfo** out);
+
+  /** \brief Set element type in ::OrtTensorTypeAndShapeInfo
+   *
+   * \param[in] info
+   * \param[in] type
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetTensorElementType, _Inout_ OrtTensorTypeAndShapeInfo* info, enum ONNXTensorElementDataType type);
+
+  /** \brief Set shape information in ::OrtTensorTypeAndShapeInfo
+   *
+   * \param[in] info
+   * \param[in] dim_values Array with `dim_count` elements. Can contain negative values.
+   * \param[in] dim_count Number of elements in `dim_values`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetDimensions, OrtTensorTypeAndShapeInfo* info, _In_ const int64_t* dim_values, size_t dim_count);
+
+  /** \brief Get element type in ::OrtTensorTypeAndShapeInfo
+   *
+   * \see OrtApi::SetTensorElementType
+   *
+   * \param[in] info
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorElementType, _In_ const OrtTensorTypeAndShapeInfo* info,
+                  _Out_ enum ONNXTensorElementDataType* out);
+
+  /** \brief Get dimension count in ::OrtTensorTypeAndShapeInfo
+   *
+   * \see OrtApi::GetDimensions
+   *
+   * \param[in] info
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetDimensionsCount, _In_ const OrtTensorTypeAndShapeInfo* info, _Out_ size_t* out);
+
+  /** \brief Get dimensions in ::OrtTensorTypeAndShapeInfo
+   *
+   * \param[in] info
+   * \param[out] dim_values Array with `dim_values_length` elements. On return, filled with the dimensions stored in the ::OrtTensorTypeAndShapeInfo
+   * \param[in] dim_values_length Number of elements in `dim_values`. Use OrtApi::GetDimensionsCount to get this value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetDimensions, _In_ const OrtTensorTypeAndShapeInfo* info, _Out_ int64_t* dim_values,
+                  size_t dim_values_length);
+
+  /** \brief Get symbolic dimension names in ::OrtTensorTypeAndShapeInfo
+   *
+   * \param[in] info
+   * \param[out] dim_params Array with `dim_params_length` elements. On return filled with pointers to null terminated strings of the dimension names
+   * \param[in] dim_params_length Number of elements in `dim_params`. Use OrtApi::GetDimensionsCount to get this value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSymbolicDimensions, _In_ const OrtTensorTypeAndShapeInfo* info,
+                  _Out_writes_all_(dim_params_length) const char* dim_params[], size_t dim_params_length);
+
+  /** \brief Get total number of elements in a tensor shape from an ::OrtTensorTypeAndShapeInfo
+   *
+   * Return the number of elements specified by the tensor shape (all dimensions multiplied by each other).
+   * For 0 dimensions, 1 is returned. If any dimension is less than 0, the result is always -1.
+   *
+   * Examples:<br>
+   * [] = 1<br>
+   * [1,3,4] = 12<br>
+   * [2,0,4] = 0<br>
+   * [-1,3,4] = -1<br>
+   *
+   * \param[in] info
+   * \param[out] out Number of elements
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorShapeElementCount, _In_ const OrtTensorTypeAndShapeInfo* info, _Out_ size_t* out);
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /** \brief Get type and shape information from a tensor ::OrtValue
+   *
+   * \param[in] value Must be a tensor (not a map/sequence/etc) or will return failure
+   * \param[out] out Newly created ::OrtTensorTypeAndShapeInfo. Must be freed with OrtApi::ReleaseTensorTypeAndShapeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorTypeAndShape, _In_ const OrtValue* value, _Outptr_ OrtTensorTypeAndShapeInfo** out);
+
+  /** \brief Get type information of an OrtValue
+   *
+   * \param[in] value
+   * \param[out] out Newly created ::OrtTypeInfo, which may be nullptr on return. Must be freed with OrtApi::ReleaseTypeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTypeInfo, _In_ const OrtValue* value, _Outptr_result_maybenull_ OrtTypeInfo** out);
+
+  /** \brief Get ONNXType of an ::OrtValue
+   *
+   * \param[in] value
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetValueType, _In_ const OrtValue* value, _Out_ enum ONNXType* out);
+
+  /// @}
+  /// \name OrtMemoryInfo
+  /// @{
+
+  /** \brief Create an ::OrtMemoryInfo
+   *
+   * \param[in] name
+   * \param[in] type
+   * \param[in] id
+   * \param[in] mem_type
+   * \param[out] out Newly created ::OrtMemoryInfo. Must be freed with OrtApi::ReleaseMemoryInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateMemoryInfo, _In_ const char* name, enum OrtAllocatorType type, int id,
+                  enum OrtMemType mem_type, _Outptr_ OrtMemoryInfo** out);
+
+  /** \brief Create an ::OrtMemoryInfo for CPU memory
+   *
+   * Special case version of OrtApi::CreateMemoryInfo for CPU based memory. Same as using OrtApi::CreateMemoryInfo with name = "Cpu" and id = 0.
+   *
+   * \param[in] type
+   * \param[in] mem_type
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateCpuMemoryInfo, enum OrtAllocatorType type, enum OrtMemType mem_type,
+                  _Outptr_ OrtMemoryInfo** out);
+
+  /** \brief Compare ::OrtMemoryInfo objects for equality
+   *
+   * Compares all settings of each ::OrtMemoryInfo for equality
+   *
+   * \param[in] info1
+   * \param[in] info2
+   * \param[out] out Set to 0 if equal, -1 if not equal
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CompareMemoryInfo, _In_ const OrtMemoryInfo* info1, _In_ const OrtMemoryInfo* info2, _Out_ int* out);
+
+  /** \brief Get name from ::OrtMemoryInfo
+   *
+   * \param[in] ptr
+   * \param[out] out Writes null terminated string to this pointer. Do NOT free the returned pointer. It is valid for the lifetime of the ::OrtMemoryInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(MemoryInfoGetName, _In_ const OrtMemoryInfo* ptr, _Out_ const char** out);
+
+  /** \brief Get the id from ::OrtMemoryInfo
+   */
+  ORT_API2_STATUS(MemoryInfoGetId, _In_ const OrtMemoryInfo* ptr, _Out_ int* out);
+
+  /** \brief Get the ::OrtMemType from ::OrtMemoryInfo
+   */
+  ORT_API2_STATUS(MemoryInfoGetMemType, _In_ const OrtMemoryInfo* ptr, _Out_ OrtMemType* out);
+
+  /** \brief Get the ::OrtAllocatorType from ::OrtMemoryInfo
+   */
+  ORT_API2_STATUS(MemoryInfoGetType, _In_ const OrtMemoryInfo* ptr, _Out_ OrtAllocatorType* out);
+
+  /// @}
+  /// \name OrtAllocator
+  /// @{
+
+  /// \brief Calls OrtAllocator::Alloc function
+  ORT_API2_STATUS(AllocatorAlloc, _Inout_ OrtAllocator* ort_allocator, size_t size, _Outptr_ void** out);
+  /// \brief Calls OrtAllocator::Free function
+  ORT_API2_STATUS(AllocatorFree, _Inout_ OrtAllocator* ort_allocator, void* p);
+  /// \brief Calls OrtAllocator::Info function
+  ORT_API2_STATUS(AllocatorGetInfo, _In_ const OrtAllocator* ort_allocator, _Outptr_ const struct OrtMemoryInfo** out);
+
+  /** \brief Get the default allocator
+   *
+   * The default allocator is a CPU based, non-arena allocator. Always returns the same pointer to the same default allocator.
+   *
+   * \param[out] out Returned value should NOT be freed
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetAllocatorWithDefaultOptions, _Outptr_ OrtAllocator** out);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Override session symbolic dimensions
+   *
+   * Override symbolic dimensions (by specific denotation strings) with actual values if known at session initialization time to enable
+   * optimizations that can take advantage of fixed values (such as memory planning, etc)
+   *
+   * \param[in] options
+   * \param[in] dim_denotation
+   * \param[in] dim_value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(AddFreeDimensionOverride, _Inout_ OrtSessionOptions* options, _In_ const char* dim_denotation,
+                  _In_ int64_t dim_value);
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /* Internal information (not seen in Doxygen)
+   *
+   * APIs to support non-tensor types - map and sequence.
+   * Currently only the following types are supported
+   * Note: the following types should be kept in sync with data_types.h
+   * Map types
+   * =========
+   * std::map<std::string, std::string>
+   * std::map<std::string, int64_t>
+   * std::map<std::string, float>
+   * std::map<std::string, double>
+   * std::map<int64_t, std::string>
+   * std::map<int64_t, int64_t>
+   * std::map<int64_t, float>
+   * std::map<int64_t, double>
+   *
+   * Sequence types
+   * ==============
+   * std::vector<std::string>
+   * std::vector<int64_t>
+   * std::vector<float>
+   * std::vector<double>
+   * std::vector<std::map<std::string, float>>
+   * std::vector<std::map<int64_t, float>>
+   */
+
+  /** \brief Get non tensor data from an ::OrtValue
+   *
+   * If `value` is of type ONNX_TYPE_MAP, you need to retrieve the keys and values
+   * separately. Use index=0 to retrieve keys and index=1 to retrieve values.
+   * If `value` is of type ONNX_TYPE_SEQUENCE, use index to retrieve the index'th element
+   * of the sequence.
+   *
+   * \param[in] value
+   * \param[in] index See above for usage based on `value` type
+   * \param[in] allocator Allocator used to allocate ::OrtValue
+   * \param[out] out Created ::OrtValue that holds the element requested. Must be freed with OrtApi::ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetValue, _In_ const OrtValue* value, int index, _Inout_ OrtAllocator* allocator,
+                  _Outptr_ OrtValue** out);
+
+  /** \brief Get non tensor value count from an ::OrtValue
+   *
+   * If `value` is of type ONNX_TYPE_MAP 2 will always be returned. For ONNX_TYPE_SEQUENCE
+   * the number of elements in the sequence will be returned
+   *
+   * \param[in] value
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetValueCount, _In_ const OrtValue* value, _Out_ size_t* out);
+
+  /** \brief Create a map or sequence ::OrtValue
+   *
+   * To construct a map (ONNX_TYPE_MAP), use num_values = 2 and `in` should be an array of 2 ::OrtValue%s
+   * representing keys and values.<br>
+   *
+   * To construct a sequence (ONNX_TYPE_SEQUENCE), use num_values = N where N is the number of the elements in the
+   * sequence. 'in' should be an array of N ::OrtValue%s.
+   *
+   * \param[in] in See above for details
+   * \param[in] num_values
+   * \param[in] value_type Must be either ONNX_TYPE_MAP or ONNX_TYPE_SEQUENCE
+   * \param[out] out Newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateValue, _In_reads_(num_values) const OrtValue* const* in, size_t num_values,
+                  enum ONNXType value_type, _Outptr_ OrtValue** out);
+
+  /** \brief Create an opaque (custom user defined type) ::OrtValue
+   *
+   * Constructs an ::OrtValue that contains a value of non-standard type created for
+   * experiments or while awaiting standardization. ::OrtValue in this case would contain
+   * an internal representation of the Opaque type. Opaque types are distinguished from
+   * each other by two strings 1) domain and 2) type name. The combination of the two
+   * must be unique, so the type representation is properly identified internally. The combination
+   * must be properly registered from within ORT at both compile/run time or by another API.
+   *
+   * To construct the ::OrtValue pass domain and type names, also a pointer to a data container
+   * the type of which must be known to both ORT and the client program. That data container may or may
+   * not match the internal representation of the Opaque type. The sizeof(data_container) is passed for
+   * verification purposes.
+   *
+   * \param[in] domain_name Null terminated string of the domain name
+   * \param[in] type_name Null terminated string of the type name
+   * \param[in] data_container User pointer to the data used to populate the ::OrtValue
+   * \param[in] data_container_size Size in bytes of what `data_container` points to
+   * \param[out] out Newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateOpaqueValue, _In_z_ const char* domain_name, _In_z_ const char* type_name,
+                  _In_ const void* data_container, size_t data_container_size, _Outptr_ OrtValue** out);
+
+  /** \brief Get internal data from an opaque (custom user defined type) ::OrtValue
+   *
+   * Copies internal data from an opaque value into a user provided buffer
+   *
+   * \see OrtApi::CreateOpaqueValue
+   *
+   * \param[in] domain_name Null terminated string of the domain name
+   * \param[in] type_name Null terminated string of the type name
+   * \param[in] in The opaque ::OrtValue
+   * \param[out] data_container Buffer to copy data into
+   * \param[in] data_container_size Size in bytes of the buffer pointed to by data_container. Must match the size of the internal buffer.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetOpaqueValue, _In_ const char* domain_name, _In_ const char* type_name, _In_ const OrtValue* in,
+                  _Out_ void* data_container, size_t data_container_size);
+
+  /// @}
+  /// \name OrtKernelInfo
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Get a float stored as an attribute in the graph node
+   *
+   * \param[in] info ::OrtKernelInfo instance
+   * \param[in] name Null terminated string of the name of the attribute
+   * \param[out] out Pointer to memory where the attribute will be stored
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttribute_float, _In_ const OrtKernelInfo* info, _In_ const char* name,
+                  _Out_ float* out);
+
+  /** \brief Fetch a 64-bit int stored as an attribute in the graph node
+   *
+   * \param[in] info ::OrtKernelInfo instance
+   * \param[in] name Null terminated string of the name of the attribute
+   * \param[out] out Pointer to memory where the attribute will be stored
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttribute_int64, _In_ const OrtKernelInfo* info, _In_ const char* name,
+                  _Out_ int64_t* out);
+
+  /** \brief Fetch a string stored as an attribute in the graph node
+   *
+   * If `out` is nullptr, the value of `size` is set to the true size of the string
+   * attribute, and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the actual string attribute's size,
+   * the value of `size` is set to the true size of the string attribute, the provided memory
+   * is filled with the attribute's contents, and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual string attribute's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the string attribute
+   * and a failure status is returned.
+   *
+   * \param[in] info ::OrtKernelInfo instance
+   * \param[in] name Null terminated string of the name of the attribute
+   * \param[out] out Pointer to memory where the attribute will be stored
+   * \param[in,out] size See above comments for details
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttribute_string, _In_ const OrtKernelInfo* info, _In_ const char* name, _Out_ char* out,
+                  _Inout_ size_t* size);
+
+  /// @}
+  /// \name OrtKernelContext
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Used for custom operators, get the input count of a kernel
+   *
+   * \see ::OrtCustomOp
+   */
+  ORT_API2_STATUS(KernelContext_GetInputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out);
+
+  /** \brief Used for custom operators, get the output count of a kernel
+   *
+   * \see ::OrtCustomOp
+   */
+  ORT_API2_STATUS(KernelContext_GetOutputCount, _In_ const OrtKernelContext* context, _Out_ size_t* out);
+
+  /** \brief Used for custom operators, get an input of a kernel
+   *
+   * \see ::OrtCustomOp
+   */
+  ORT_API2_STATUS(KernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index,
+                  _Out_ const OrtValue** out);
+
+  /** \brief Used for custom operators, get an output of a kernel
+   *
+   * \see ::OrtCustomOp
+   */
+  ORT_API2_STATUS(KernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index,
+                  _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out);
+
+  /// @}
+  /// \name OrtEnv
+  /// @{
+  ORT_CLASS_RELEASE(Env);
+  /// @}
+  /// \name OrtStatus
+  /// @{
+  ORT_CLASS_RELEASE(Status);
+  /// @}
+  /// \name OrtMemoryInfo
+  /// @{
+  ORT_CLASS_RELEASE(MemoryInfo);
+  /// @}
+  /// \name OrtSession
+  /// @{
+  ORT_CLASS_RELEASE(Session);  // Don't call ReleaseSession from Dllmain (because session owns a thread pool)
+  /// @}
+  /// \name OrtValue
+  /// @{
+  ORT_CLASS_RELEASE(Value);
+  /// @}
+  /// \name OrtRunOptions
+  /// @{
+  ORT_CLASS_RELEASE(RunOptions);
+  /// @}
+  /// \name OrtTypeInfo
+  /// @{
+  ORT_CLASS_RELEASE(TypeInfo);
+  /// @}
+  /// \name OrtTensorTypeAndShapeInfo
+  /// @{
+  ORT_CLASS_RELEASE(TensorTypeAndShapeInfo);
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+  ORT_CLASS_RELEASE(SessionOptions);
+  /// @}
+  /// \name OrtCustomOpDomain
+  /// @{
+  ORT_CLASS_RELEASE(CustomOpDomain);
+
+  /// @}
+  /// \name OrtTypeInfo
+  /// @{
+
+  /** \brief Get denotation from type information
+   *
+   * Augments ::OrtTypeInfo to return denotations on the type.
+   *
+   * This is used by WinML to determine if an input/output is intended to be an Image or a Tensor.
+   *
+   * \param[in] type_info
+   * \param[out] denotation Pointer to the null terminated denotation string is written to this pointer. This pointer is valid until the object is destroyed or the name is changed, do not free.
+   * \param[out] len Length in bytes of the string returned in `denotation`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetDenotationFromTypeInfo, _In_ const OrtTypeInfo* type_info, _Out_ const char** const denotation,
+                  _Out_ size_t* len);
+
+  /** \brief Get detailed map information from an ::OrtTypeInfo
+   *
+   * This augments ::OrtTypeInfo to return an ::OrtMapTypeInfo when the type is a map.
+   * The OrtMapTypeInfo has additional information about the map's key type and value type.
+   *
+   * This is used by WinML to support model reflection APIs.
+   *
+   * \param[in] type_info
+   * \param[out] out A pointer to the ::OrtMapTypeInfo. Do not free this value. If type_info
+   *             does not contain a map, this value will be set to nullptr.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CastTypeInfoToMapTypeInfo, _In_ const OrtTypeInfo* type_info,
+                  _Outptr_result_maybenull_ const OrtMapTypeInfo** out);
+
+  /** \brief Cast ::OrtTypeInfo to an ::OrtSequenceTypeInfo
+   *
+   * This api augments ::OrtTypeInfo to return an ::OrtSequenceTypeInfo when the type is a sequence.
+   * The ::OrtSequenceTypeInfo has additional information about the sequence's element type.
+   *
+   * This is used by WinML to support model reflection APIs.
+   *
+   * \param[in] type_info
+   * \param[out] out A pointer to the OrtSequenceTypeInfo. Do not free this value. If type_info
+   *             does not contain a sequence, this value will be set to nullptr.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CastTypeInfoToSequenceTypeInfo, _In_ const OrtTypeInfo* type_info,
+                  _Outptr_result_maybenull_ const OrtSequenceTypeInfo** out);
+
+  /// @}
+  /// \name OrtMapTypeInfo
+  /// @{
+
+  /** \brief Get key type from an ::OrtMapTypeInfo
+   *
+   * Key types are restricted to being scalar types.
+   *
+   * This is used by WinML to support model reflection APIs.
+   *
+   * \param[in] map_type_info
+   * \param[out] out
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetMapKeyType, _In_ const OrtMapTypeInfo* map_type_info, _Out_ enum ONNXTensorElementDataType* out);
+
+  /** \brief Get the value type from an ::OrtMapTypeInfo
+   *
+   * \param[in] map_type_info
+   * \param[out] type_info
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetMapValueType, _In_ const OrtMapTypeInfo* map_type_info, _Outptr_ OrtTypeInfo** type_info);
+
+  /// @}
+  /// \name OrtSequenceTypeInfo
+  /// @{
+
+  /** \brief Get element type from an ::OrtSequenceTypeInfo
+   *
+   * This is used by WinML to support model reflection APIs.
+   *
+   * \param[in] sequence_type_info
+   * \param[out] type_info
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSequenceElementType, _In_ const OrtSequenceTypeInfo* sequence_type_info,
+                  _Outptr_ OrtTypeInfo** type_info);
+
+  /// @}
+  /// \name OrtMapTypeInfo
+  /// @{
+  ORT_CLASS_RELEASE(MapTypeInfo);
+  /// @}
+  /// \name OrtSequenceTypeInfo
+  /// @{
+  ORT_CLASS_RELEASE(SequenceTypeInfo);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief End profiling and return filename of the profile data
+   *
+   * Profiling is turned on through OrtApi::EnableProfiling
+   *
+   * \param[in] session
+   * \param[in] allocator
+   * \param[out] out Null terminated string of the filename, allocated using `allocator`. Must be freed using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionEndProfiling, _In_ OrtSession* session, _Inout_ OrtAllocator* allocator, _Outptr_ char** out);
+
+  /** \brief Get ::OrtModelMetadata from an ::OrtSession
+   *
+   * \param[in] session
+   * \param[out] out Newly created ::OrtModelMetadata. Must be freed using OrtApi::ReleaseModelMetadata
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetModelMetadata, _In_ const OrtSession* session, _Outptr_ OrtModelMetadata** out);
+
+  /// @}
+  /// \name OrtModelMetadata
+  /// @{
+
+  /** \brief Get `producer name` from an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated string allocated using `allocator`. Must be freed using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetProducerName, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /** \brief Get `graph name` from an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated string allocated using `allocator`. Must be freed using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetGraphName, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /** \brief Get `domain` from an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated string allocated using `allocator`. Must be freed using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetDomain, _In_ const OrtModelMetadata* model_metadata, _Inout_ OrtAllocator* allocator,
+                  _Outptr_ char** value);
+
+  /** \brief Get `description` from an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[out] value Set to a null terminated string allocated using `allocator`. Must be freed using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetDescription, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /** \brief Return data for a key in the custom metadata map in an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[in] key Null terminated string
+   * \param[out] value Set to a null terminated string allocated using `allocator`. Must be freed using `allocator`
+   * `value` will be set to nullptr if the given key is not found in the custom metadata map.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataLookupCustomMetadataMap, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _In_ const char* key, _Outptr_result_maybenull_ char** value);
+
+  /** \brief Get version number from an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[out] value Set to the version number
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetVersion, _In_ const OrtModelMetadata* model_metadata, _Out_ int64_t* value);
+
+  ORT_CLASS_RELEASE(ModelMetadata);
+
+  /// @}
+  /// \name OrtEnv
+  /// @{
+
+  /** \brief Create an OrtEnv
+   *
+   * Create an environment with global threadpools that will be shared across sessions.
+   * Use this in conjunction with OrtApi::DisablePerSessionThreads or else the session will use
+   * its own thread pools.
+   *
+   * \param[in] log_severity_level The log severity level.
+   * \param[in] logid The log identifier.
+   * \param[in] tp_options
+   * \param[out] out Returned newly created OrtEnv. Must be freed with OrtApi::ReleaseEnv
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateEnvWithGlobalThreadPools, OrtLoggingLevel log_severity_level, _In_ const char* logid,
+                  _In_ const OrtThreadingOptions* tp_options, _Outptr_ OrtEnv** out);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Use global thread pool on a session
+   *
+   * Disable using per session thread pool and use the shared global threadpool.
+   * This should be used in conjunction with OrtApi::CreateEnvWithGlobalThreadPools.
+   *
+   * \param[in] options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(DisablePerSessionThreads, _Inout_ OrtSessionOptions* options);
+
+  /// @}
+  /// \name OrtThreadingOptions
+  /// @{
+
+  /** \brief Create an ::OrtThreadingOptions
+   *
+   * \param[out] out Newly created ::OrtThreadingOptions. Must be freed with OrtApi::ReleaseThreadingOptions
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateThreadingOptions, _Outptr_ OrtThreadingOptions** out);
+
+  ORT_CLASS_RELEASE(ThreadingOptions);
+
+  /// @}
+  /// \name OrtModelMetadata
+  /// @{
+
+  /** \brief Get the keys of the custom metadata map in an ::OrtModelMetadata
+   *
+   * \param[in] model_metadata
+   * \param[in] allocator
+   * \param[out] keys Array of null terminated strings (array count = num_keys) allocated using `allocator`.
+   *  The strings and the pointer array must be freed using `allocator`
+   *  `keys` will be set to nullptr if the custom metadata map is empty.
+   * \param[out] num_keys Set to the number of elements in the `keys` array
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetCustomMetadataMapKeys, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _Outptr_result_buffer_maybenull_(*num_keys) char*** keys, _Out_ int64_t* num_keys);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Override session symbolic dimensions by name
+   *
+   * Override symbolic dimensions (by specific name strings) with actual values
+   * if known at session initialization time to enable optimizations that can
+   * take advantage of fixed values (such as memory planning, etc)
+   *
+   */
+  ORT_API2_STATUS(AddFreeDimensionOverrideByName,
+                  _Inout_ OrtSessionOptions* options, _In_ const char* dim_name,
+                  _In_ int64_t dim_value);
+
+  /// @}
+  /// \name Misc
+  /// @{
+
+  /** \brief Get the names of all available providers
+   *
+   * \note The providers in the list are not guaranteed to be usable. They may fail to load due to missing system dependencies.
+   *    For example, if the CUDA/cuDNN libraries are not installed, the CUDA provider will report an error when it is added to the session options.
+   *
+   * \param[out] out_ptr Set to a pointer to an array of null terminated strings of the available providers. The entries and the
+   *    array itself must be freed using OrtApi::ReleaseAvailableProviders
+   * \param[out] provider_length Set to the number of entries in the `out_ptr` array
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetAvailableProviders, _Outptr_ char*** out_ptr, _Out_ int* provider_length);
+
+  /** \brief Release data from OrtApi::GetAvailableProviders. This API will never fail
+   * so you can rely on it in noexcept code.
+   *
+   * \param[in] ptr The `out_ptr` result from OrtApi::GetAvailableProviders.
+   * \param[in] providers_length The `provider_length` result from OrtApi::GetAvailableProviders
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ReleaseAvailableProviders, _In_ char** ptr,
+                  _In_ int providers_length);
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /** \brief Get the length of a single string in a string tensor
+   *
+   * \param[in] value A string tensor
+   * \param[in] index Index of the string in the tensor
+   * \param[out] out Set to number of bytes of the string element
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetStringTensorElementLength, _In_ const OrtValue* value, size_t index, _Out_ size_t* out);
+
+  /** \brief Get a single string from a string tensor
+   *
+   * \param[in] value A string tensor
+   * \param[in] s_len Number of bytes in the `s` buffer. Must match the value returned by OrtApi::GetStringTensorElementLength.
+   * \param[in] index Index of the string in the tensor
+   * \param[out] s The string element contents in UTF-8 encoding. The string is NOT null-terminated.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetStringTensorElement, _In_ const OrtValue* value, size_t s_len, size_t index, _Out_writes_bytes_all_(s_len) void* s);
+
+  /** \brief Set a single string in a string tensor
+   *
+   * \param[in] value A string tensor
+   * \param[in] s A null terminated UTF-8 encoded string
+   * \param[in] index Index of the string in the tensor to set
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(FillStringTensorElement, _Inout_ OrtValue* value, _In_ const char* s, size_t index);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Set a session configuration entry as a pair of strings
+   *
+   * If a configuration with same key exists, this will overwrite the configuration with the given config_value.
+   *
+   * The config_key and the format of config_value are defined in onnxruntime_session_options_config_keys.h
+   *
+   * \param[in] options
+   * \param[in] config_key A null terminated string representation of the config key
+   * \param[in] config_value A null terminated string representation of the config value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(AddSessionConfigEntry, _Inout_ OrtSessionOptions* options,
+                  _In_z_ const char* config_key, _In_z_ const char* config_value);
+
+  /// @}
+  /// \name OrtAllocator
+  /// @{
+
+  /** \brief Create an allocator for an ::OrtSession following an ::OrtMemoryInfo
+   *
+   * \param[in] session
+   * \param[in] mem_info valid ::OrtMemoryInfo instance
+   * \param[out] out Newly created ::OrtAllocator. Must be freed with OrtApi::ReleaseAllocator
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateAllocator, _In_ const OrtSession* session, _In_ const OrtMemoryInfo* mem_info,
+                  _Outptr_ OrtAllocator** out);
+
+  /** \brief Release an ::OrtAllocator obtained from OrtApi::CreateAllocator
+   */
+  ORT_CLASS_RELEASE(Allocator);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief Run a model using Io Bindings for the inputs & outputs
+   *
+   * \see OrtApi::Run
+   *
+   * \param[in] session
+   * \param[in] run_options
+   * \param[in] binding_ptr
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RunWithBinding, _Inout_ OrtSession* session, _In_ const OrtRunOptions* run_options, _In_ const OrtIoBinding* binding_ptr);
+
+  /** \brief Create an ::OrtIoBinding instance
+   *
+   * An IoBinding object allows one to bind pre-allocated ::OrtValue%s to input names.
+   * Thus if you want to use a raw on device buffer as input or output you can avoid
+   * extra copy during runtime.
+   *
+   * \param[in] session
+   * \param[out] out Newly created ::OrtIoBinding. Must be freed with OrtApi::ReleaseIoBinding
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateIoBinding, _Inout_ OrtSession* session, _Outptr_ OrtIoBinding** out);
+
+  /// @}
+  /// \name OrtIoBinding
+  /// @{
+
+  /** \brief Release an ::OrtIoBinding obtained from OrtApi::CreateIoBinding
+   */
+  ORT_CLASS_RELEASE(IoBinding);
+
+  /** \brief Bind an ::OrtValue to an ::OrtIoBinding input
+   *
+   * When using OrtApi::RunWithBinding this value is used for the named input
+   *
+   * \param[in] binding_ptr
+   * \param[in] name Name for the model input
+   * \param[in] val_ptr ::OrtValue of Tensor type.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(BindInput, _Inout_ OrtIoBinding* binding_ptr, _In_ const char* name, _In_ const OrtValue* val_ptr);
+
+  /** \brief Bind an ::OrtValue to an ::OrtIoBinding output
+   *
+   * When using OrtApi::RunWithBinding this value is used for the named output
+   *
+   * \param[in] binding_ptr
+   * \param[in] name Null terminated string of the model output name
+   * \param[in] val_ptr ::OrtValue of Tensor type.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(BindOutput, _Inout_ OrtIoBinding* binding_ptr, _In_ const char* name, _In_ const OrtValue* val_ptr);
+
+  /** \brief Bind an ::OrtIoBinding output to a device
+   *
+   * Binds the ::OrtValue to a device which is specified by ::OrtMemoryInfo.
+   * You can either create an instance of ::OrtMemoryInfo with a device id or obtain one from the allocator that you have created/are using
+   * This is useful when one or more outputs have dynamic shapes and it is hard to pre-allocate and bind a chunk of
+   * memory within ::OrtValue ahead of time.
+   *
+   * \see OrtApi::RunWithBinding
+   *
+   * \param[in] binding_ptr
+   * \param[in] name Null terminated string of the model output name
+   * \param[in] mem_info_ptr
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(BindOutputToDevice, _Inout_ OrtIoBinding* binding_ptr, _In_ const char* name, _In_ const OrtMemoryInfo* mem_info_ptr);
+
+  /** \brief Get the names of an ::OrtIoBinding's outputs
+   *
+   * Returns the names of the outputs in the order they were bound. This is useful after running the model
+   * with bound outputs because the returned names are in order in which output ::OrtValue are returned. This is useful if
+   * the order of outputs and their names is not known.
+   *
+   * \param[in] binding_ptr
+   * \param[in] allocator Allocator used to allocate contiguous buffers for output strings and lengths.
+   * \param[out] buffer Returns an array of non-null terminated UTF-8 strings. The number of strings stored is returned in the count parameter.
+   *   This buffer is allocated using `allocator` and must be freed using it.
+   * \param[out] lengths Returns an array of `count` lengths of the strings returned in `buffer`
+   *   This buffer is allocated using `allocator` and must be freed using it.
+   * \param[out] count Number of strings returned. If `binding_ptr` has no bound outputs, zero is returned,
+   *              no memory allocation is performed and buffer and lengths are set to nullptr.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetBoundOutputNames, _In_ const OrtIoBinding* binding_ptr, _In_ OrtAllocator* allocator,
+                  _Out_ char** buffer, _Out_writes_all_(count) size_t** lengths, _Out_ size_t* count);
+
+  /** \brief Get the output ::OrtValue objects from an ::OrtIoBinding
+   *
+   * Returns an array of pointers to individually allocated ::OrtValue%s that contain results of a model execution with OrtApi::RunWithBinding
+   * The array contains the same number of ::OrtValue%s and they are in the same order as they were bound with OrtApi::BindOutput
+   * or OrtApi::BindOutputToDevice.
+   *
+   * The returned ::OrtValue%s must be released using OrtApi::ReleaseValue after they are no longer needed.
+   * The array is allocated using the specified instance of the allocator and must be freed using the same allocator after
+   * all the ::OrtValue%s contained therein are individually released.
+   *
+   * \param[in] binding_ptr
+   * \param[in] allocator Allocator used to allocate output array
+   * \param[out] output Set to the allocated array of allocated ::OrtValue outputs. Set to nullptr if there are 0 outputs.
+   * \param[out] output_count Set to number of ::OrtValue%s returned
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetBoundOutputValues, _In_ const OrtIoBinding* binding_ptr, _In_ OrtAllocator* allocator,
+                  _Out_writes_all_(output_count) OrtValue*** output, _Out_ size_t* output_count);
+
+  /** \brief Clears any previously set Inputs for an ::OrtIoBinding
+   */
+  void(ORT_API_CALL* ClearBoundInputs)(_Inout_ OrtIoBinding* binding_ptr) NO_EXCEPTION ORT_ALL_ARGS_NONNULL;
+
+  /** \brief Clears any previously set Outputs for an ::OrtIoBinding
+   */
+  void(ORT_API_CALL* ClearBoundOutputs)(_Inout_ OrtIoBinding* binding_ptr) NO_EXCEPTION ORT_ALL_ARGS_NONNULL;
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /** \brief Direct memory access to a specified tensor element
+   *
+   * For example, given a tensor with shape of [3,224,224], a pointer to the element at location [2,150,128] can be retrieved
+   *
+   * This function only works for numeric type tensors (No strings, etc).
+   * This is a no-copy method whose returned pointer is valid until the passed in ::OrtValue is free'd.
+   *
+   * \param[in] value
+   * \param[in] location_values Pointer to an array of index values that specify an element's location relative to its shape
+   * \param[in] location_values_count Number of elements in location_values. Must match the number of elements in the tensor's shape.
+   * \param[out] out Set to a pointer to the element specified
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(TensorAt, _Inout_ OrtValue* value, const int64_t* location_values, size_t location_values_count, _Outptr_ void** out);
+
+  /// @}
+  /// \name OrtEnv
+  /// @{
+
+  /** \brief Create an allocator and register it with the ::OrtEnv
+   *
+   * Enables sharing the allocator between multiple sessions that use the same env instance.
+   * Lifetime of the created allocator will be valid for the duration of the environment.
+   * Returns an error if an allocator with the same ::OrtMemoryInfo is already registered.
+   *
+   * See https://onnxruntime.ai/docs/get-started/with-c.html for details.
+   *
+   * \param[in] env ::OrtEnv instance
+   * \param[in] mem_info
+   * \param[in] arena_cfg Pass nullptr for defaults
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateAndRegisterAllocator, _Inout_ OrtEnv* env, _In_ const OrtMemoryInfo* mem_info,
+                  _In_ const OrtArenaCfg* arena_cfg);
+
+  /** \brief Set language projection
+   *
+   * Set the language projection for collecting telemetry data when Env is created.
+   *
+   * The default is ORT_PROJECTION_C, which means that any language not in the list is also classified as C.
+   *
+   * \param[in] ort_env
+   * \param[in] projection
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetLanguageProjection, _In_ const OrtEnv* ort_env, _In_ OrtLanguageProjection projection);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief Return the time that profiling was started
+   *
+   * \note The timer precision varies per platform. On Windows and MacOS, the precision will be ~100ns
+   *
+   * \param[in] session
+   * \param[out] out nanoseconds of profiling's start time
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionGetProfilingStartTimeNs, _In_ const OrtSession* session, _Outptr_ uint64_t* out);
+
+  /// @}
+  /// \name OrtThreadingOptions
+  /// @{
+
+  /** \brief Set global intra-op thread count
+   *
+   * This configures the global thread pool options to be used in the call to OrtApi::CreateEnvWithGlobalThreadPools
+   *
+   * \param[in] tp_options
+   * \param[in] intra_op_num_threads Number of threads, special values:<br>
+   *    0 = Use default thread count<br>
+   *    1 = The invoking thread will be used; no threads will be created in the thread pool.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalIntraOpNumThreads, _Inout_ OrtThreadingOptions* tp_options, int intra_op_num_threads);
+
+  /** \brief Set global inter-op thread count
+   *
+   * This configures the global thread pool options to be used in the call to OrtApi::CreateEnvWithGlobalThreadPools
+   *
+   * \param[in] tp_options
+   * \param[in] inter_op_num_threads Number of threads, special values:<br>
+   *    0 = Use default thread count<br>
+   *    1 = The invoking thread will be used; no threads will be created in the thread pool.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalInterOpNumThreads, _Inout_ OrtThreadingOptions* tp_options, int inter_op_num_threads);
+
+  /** \brief Set global spin control options
+   *
+   * This will configure the global thread pool options to be used in the call to OrtApi::CreateEnvWithGlobalThreadPools.
+   * Allow spinning of thread pools when their queues are empty. This will set the value for both
+   * inter_op and intra_op threadpools.
+   *
+   * \param[in] tp_options
+   * \param[in] allow_spinning Valid values are 0 or 1.<br>
+   *   0 = It won't spin (recommended if CPU usage is high)<br>
+   *   1 = Threadpool will spin to wait for queue to become non-empty
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalSpinControl, _Inout_ OrtThreadingOptions* tp_options, int allow_spinning);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Add a pre-allocated initializer to a session
+   *
+   * If a model contains an initializer with a name that is same as the name passed to this call,
+   * ORT will use this initializer instance instead of deserializing one from the model file. This
+   * is useful when you want to share the same initializer across sessions.
+   *
+   * \param[in] options
+   * \param[in] name Null terminated string of the initializer name
+   * \param[in] val ::OrtValue containing the initializer. Its lifetime and the underlying initializer buffer must be
+   *   managed by the user (created using the OrtApi::CreateTensorWithDataAsOrtValue) and it must outlive the session object
+   *   to which it is added.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(AddInitializer, _Inout_ OrtSessionOptions* options, _In_z_ const char* name,
+                  _In_ const OrtValue* val);
+
+  /// @}
+  /// \name OrtEnv
+  /// @{
+
+  /**
+   * Create a custom environment with global threadpools and logger that will be shared across sessions.
+   * Use this in conjunction with OrtApi::DisablePerSessionThreads or else the session will use
+   * its own thread pools.
+   *
+   * \param[in] logging_function A pointer to a logging function.
+   * \param[in] logger_param A pointer to arbitrary data passed as the ::OrtLoggingFunction `param` parameter to
+   *                         `logging_function`.
+   * \param[in] log_severity_level The log severity level.
+   * \param[in] logid The log identifier.
+   * \param[in] tp_options
+   * \param[out] out Newly created OrtEnv. Must be freed with OrtApi::ReleaseEnv
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateEnvWithCustomLoggerAndGlobalThreadPools, OrtLoggingFunction logging_function, _In_opt_ void* logger_param, OrtLoggingLevel log_severity_level,
+                  _In_ const char* logid, _In_ const struct OrtThreadingOptions* tp_options, _Outptr_ OrtEnv** out);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Append CUDA provider to session options
+   *
+   * If CUDA is not available (due to a non CUDA enabled build, or if CUDA is not installed on the system), this function will return failure.
+   *
+   * \param[in] options
+   * \param[in] cuda_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_CUDA,
+                  _In_ OrtSessionOptions* options, _In_ const OrtCUDAProviderOptions* cuda_options);
+
+  /** \brief Append ROCM execution provider to the session options
+   *
+   * If ROCM is not available (due to a non ROCM enabled build, or if ROCM is not installed on the system), this function will return failure.
+   *
+   * \param[in] options
+   * \param[in] rocm_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_ROCM,
+                  _In_ OrtSessionOptions* options, _In_ const OrtROCMProviderOptions* rocm_options);
+
+  /** \brief Append OpenVINO execution provider to the session options
+   *
+   * If OpenVINO is not available (due to a non OpenVINO enabled build, or if OpenVINO is not installed on the system), this function will fail.
+   *
+   * \param[in] options
+   * \param[in] provider_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_OpenVINO,
+                  _In_ OrtSessionOptions* options, _In_ const OrtOpenVINOProviderOptions* provider_options);
+
+  /// @}
+  /// \name OrtThreadingOptions
+  /// @{
+
+  /** \brief Set threading flush-to-zero and denormal-as-zero
+   *
+   * Sets global thread pool options to be used in the call to OrtApi::CreateEnvWithGlobalThreadPools.
+   * Flush-to-zero and denormal-as-zero are applied to threads in both intra and inter global thread pool.
+   * \note This option is not needed if the models used have no denormals. Having no denormals is recommended as this option may hurt model accuracy.
+   *
+   * \param[in] tp_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalDenormalAsZero, _Inout_ OrtThreadingOptions* tp_options);
+
+  /// @}
+  /// \name OrtArenaCfg
+  /// @{
+
+  /** \deprecated Use OrtApi::CreateArenaCfgV2
+   *
+   * This will create the configuration of an arena that can eventually be used to define an arena based allocator's behavior
+   *
+   * \param[in] max_mem Use 0 to allow ORT to choose the default
+   * \param[in] arena_extend_strategy Use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
+   * \param[in] initial_chunk_size_bytes Use -1 to allow ORT to choose the default
+   * \param[in] max_dead_bytes_per_chunk Use -1 to allow ORT to choose the default
+   * \param[out] out Newly created ::OrtArenaCfg. Must be freed with OrtApi::ReleaseArenaCfg
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateArenaCfg, _In_ size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes,
+                  int max_dead_bytes_per_chunk, _Outptr_ OrtArenaCfg** out);
+
+  ORT_CLASS_RELEASE(ArenaCfg);
+
+  /// @}
+  /// \name OrtModelMetadata
+  /// @{
+
+  /**
+   * Use this to obtain the description of the graph present in the model
+   * (doc_string field of the GraphProto message within the ModelProto message).
+   * If it doesn't exist, an empty string will be returned.
+   *
+   * \param[in] model_metadata An instance of ::OrtModelMetadata
+   * \param[in] allocator Allocator used to allocate the string that will be returned back
+   * \param[out] value Set to a null terminated string allocated using `allocator`.  The caller is responsible for freeing it using `allocator`
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(ModelMetadataGetGraphDescription, _In_ const OrtModelMetadata* model_metadata,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** value);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Append TensorRT provider to session options
+   *
+   * If TensorRT is not available (due to a non TensorRT enabled build, or if TensorRT is not installed on the system), this function will return failure.
+   *
+   * \param[in] options
+   * \param[in] tensorrt_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT,
+                  _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options);
+
+  /// @}
+  /// \name Misc
+  /// @{
+
+  /** \brief Set current GPU device ID
+   *
+   * Set the current device id of the GPU execution provider (CUDA/tensorrt/rocm). The device id should be less
+   * than the total number of devices available. This is only useful when multiple-GPUs are installed and it is
+   * required to restrict execution to a single GPU.
+   *
+   * \param[in] device_id
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetCurrentGpuDeviceId, _In_ int device_id);
+
+  /** \brief Get current GPU device ID
+   *
+   * Get the current device id of the GPU execution provider (CUDA/tensorrt/rocm).
+   *
+   * \see OrtApi::SetCurrentGpuDeviceId
+   *
+   * \param[out] device_id
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetCurrentGpuDeviceId, _In_ int* device_id);
+
+  /// @}
+  /// \name OrtKernelInfo
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Fetch an array of float values stored as an attribute in the graph node
+   *
+   *
+   * If `out` is nullptr, the value of `size` is set to the true size of the attribute
+   * array's size, and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the actual attribute array's size,
+   * the value of `size` is set to the true size of the attribute array's size,
+   * the provided memory is filled with the attribute's contents,
+   * and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual attribute array's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the attribute array's size
+   * and a failure status is returned.
+   *
+   * \param[in] info instance
+   * \param[in] name name of the attribute to be parsed
+   * \param[out] out pointer to memory where the attribute's contents are to be stored
+   * \param[in, out] size actual size of attribute array
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttributeArray_float, _In_ const OrtKernelInfo* info, _In_ const char* name,
+                  _Out_ float* out, _Inout_ size_t* size);
+
+  /** \brief Fetch an array of int64_t values stored as an attribute in the graph node
+   *
+   * If `out` is nullptr, the value of `size` is set to the true size of the attribute
+   * array's size, and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the actual attribute array's size,
+   * the value of `size` is set to the true size of the attribute array's size,
+   * the provided memory is filled with the attribute's contents,
+   * and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual attribute array's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the attribute array's size
+   * and a failure status is returned.
+   *
+   * \param[in] info instance
+   * \param[in] name name of the attribute to be parsed
+   * \param[out] out pointer to memory where the attribute's contents are to be stored
+   * \param[in, out] size actual size of attribute array
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttributeArray_int64, _In_ const OrtKernelInfo* info, _In_ const char* name,
+                  _Out_ int64_t* out, _Inout_ size_t* size);
+
+  /// @}
+  /// \name OrtArenaCfg
+  /// @{
+
+  /** \brief Create an ::OrtArenaCfg
+   *
+   * Create the configuration of an arena that can eventually be used to define an arena based allocator's behavior.
+   *
+   * Supported keys are (See https://onnxruntime.ai/docs/get-started/with-c.html for details on what the
+   * following parameters mean and how to choose these values.):
+   * "max_mem": Maximum memory that can be allocated by the arena based allocator.
+   *  Use 0 for ORT to pick the best value. Default is 0.
+   * "arena_extend_strategy": 0 = kNextPowerOfTwo, 1 = kSameAsRequested.
+   *  Use -1 to allow ORT to choose the default.
+   * "initial_chunk_size_bytes": (Possible) Size of the first allocation in the arena.
+   *  Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default.
+   *  Ultimately, the first allocation size is determined by the allocation memory request.
+   * "max_dead_bytes_per_chunk": Threshold of unused memory in an allocated chunk of arena memory after
+   *  crossing which the current chunk is chunked into 2.
+   * "initial_growth_chunk_size_bytes": (Possible) Size of the second allocation in the arena.
+   *  Only relevant if arena strategy is `kNextPowerOfTwo`. Use -1 to allow ORT to choose the default.
+   * "max_power_of_two_extend_bytes": The maximum extend size if arena strategy is `kNextPowerOfTwo`.
+   *  It is not an allocation limit; it only limits the extension when the requested bytes are less than the limit.
+   *  When requested bytes is more than the limit, allocator will still return as requested.
+   *  Use -1 to allow ORT to choose the default 1GB for max_power_of_two_extend_bytes.
+   *  Ultimately, the allocation size is determined by the allocation memory request.
+   *  Further allocation sizes are governed by the arena extend strategy.
+   *
+   * \param[in] arena_config_keys Keys to configure the arena
+   * \param[in] arena_config_values Values to configure the arena
+   * \param[in] num_keys Number of keys in `arena_config_keys` and `arena_config_values`
+   * \param[out] out Newly created ::OrtArenaCfg. Must be freed with OrtApi::ReleaseArenaCfg
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateArenaCfgV2, _In_reads_(num_keys) const char* const* arena_config_keys,
+                  _In_reads_(num_keys) const size_t* arena_config_values, _In_ size_t num_keys,
+                  _Outptr_ OrtArenaCfg** out);
+
+  /// @}
+  /// \name OrtRunOptions
+  /// @{
+
+  /** \brief Set a single run configuration entry as a pair of strings
+   *
+   * If a configuration with same key exists, this will overwrite the configuration with the given config_value
+   *
+   * The config_key and the format of config_value are defined in onnxruntime_run_options_config_keys.h
+   *
+   * \param[in] options
+   * \param[in] config_key A null terminated string representation of the config key
+   * \param[in] config_value  A null terminated string representation of the config value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(AddRunConfigEntry, _Inout_ OrtRunOptions* options,
+                  _In_z_ const char* config_key, _In_z_ const char* config_value);
+
+  /// @}
+  /// \name OrtPrepackedWeightsContainer
+  /// @{
+
+  /** \brief Create an ::OrtPrepackedWeightsContainer
+   *
+   * This container will hold pre-packed buffers of shared initializers for sharing between sessions,
+   * i.e., if there are shared initializers that can be shared between sessions, the pre-packed buffers
+   * of these (if any) may possibly be shared to provide memory footprint savings. Pass this container
+   * to sessions that you would like to share pre-packed buffers of shared initializers at session
+   * creation time.
+   *
+   *  \param[out] out Newly created ::OrtPrepackedWeightsContainer. Must be freed with OrtApi::ReleasePrepackedWeightsContainer
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreatePrepackedWeightsContainer, _Outptr_ OrtPrepackedWeightsContainer** out);
+
+  /** \brief Release OrtPrepackedWeightsContainer instance
+   *
+   * \note instance must not be released until the sessions using it are released
+   */
+  ORT_CLASS_RELEASE(PrepackedWeightsContainer);
+
+  /// @}
+  /// \name OrtSession
+  /// @{
+
+  /** \brief Create session with prepacked weights container
+   *
+   * Same functionality offered by OrtApi::CreateSession except that a container that contains
+   * pre-packed weights' buffers is written into/read from by the created session.
+   * This is useful when used in conjunction with OrtApi::AddInitializer which injects
+   * shared initializer info into sessions. Wherever possible, the pre-packed versions of these
+   * shared initializers are cached in this container so that multiple sessions can just re-use
+   * these instead of duplicating these in memory.
+   *
+   * \param[in] env ::OrtEnv instance
+   * \param[in] model_path Null terminated string of the path (wchar on Windows, char otherwise)
+   * \param[in] options
+   * \param[in] prepacked_weights_container
+   * \param[out] out Newly created ::OrtSession. Must be freed with OrtApi::ReleaseSession
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSessionWithPrepackedWeightsContainer, _In_ const OrtEnv* env, _In_ const ORTCHAR_T* model_path,
+                  _In_ const OrtSessionOptions* options, _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container,
+                  _Outptr_ OrtSession** out);
+
+  /** \brief Create session from memory with prepacked weights container
+   *
+   * Same functionality offered by OrtApi::CreateSessionFromArray except that a container that contains
+   * pre-packed weights' buffers is written into/read from by the created session.
+   * This is useful when used in conjunction with OrtApi::AddInitializer which injects
+   * shared initializer info into sessions. Wherever possible, the pre-packed versions of these
+   * shared initializers are cached in this container so that multiple sessions can just re-use
+   * these instead of duplicating these in memory.
+   *
+   * \param[in] env
+   * \param[in] model_data Array of bytes holding the model
+   * \param[in] model_data_length Number of bytes in `model_data`
+   * \param[in] options
+   * \param[in] prepacked_weights_container
+   * \param[out] out Newly created ::OrtSession. Must be freed with OrtApi::ReleaseSession
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSessionFromArrayWithPrepackedWeightsContainer, _In_ const OrtEnv* env,
+                  _In_ const void* model_data, size_t model_data_length,
+                  _In_ const OrtSessionOptions* options, _Inout_ OrtPrepackedWeightsContainer* prepacked_weights_container,
+                  _Outptr_ OrtSession** out);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Append TensorRT execution provider to the session options
+   *
+   * If TensorRT is not available (due to a non TensorRT enabled build), this function will return failure.
+   *
+   * This is slightly different from OrtApi::SessionOptionsAppendExecutionProvider_TensorRT, it takes an
+   * ::OrtTensorRTProviderOptions which is publicly defined. This takes an opaque ::OrtTensorRTProviderOptionsV2
+   * which must be created with OrtApi::CreateTensorRTProviderOptions.
+   *
+   * For OrtApi::SessionOptionsAppendExecutionProvider_TensorRT, the user needs to instantiate ::OrtTensorRTProviderOptions
+   * as well as allocate/release buffers for some members of ::OrtTensorRTProviderOptions.
+   * Here, OrtApi::CreateTensorRTProviderOptions and OrtApi::ReleaseTensorRTProviderOptions will do the memory management for you.
+   *
+   * \param[in] options
+   * \param[in] tensorrt_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT_V2,
+                  _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptionsV2* tensorrt_options);
+
+  /// @}
+  /// \name OrtTensorRTProviderOptionsV2
+  /// @{
+
+  /** \brief Create an OrtTensorRTProviderOptionsV2
+   *
+   * \param[out] out Newly created ::OrtTensorRTProviderOptionsV2. Must be released with OrtApi::ReleaseTensorRTProviderOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateTensorRTProviderOptions, _Outptr_ OrtTensorRTProviderOptionsV2** out);
+
+  /** \brief Set options in a TensorRT Execution Provider.
+   *
+   * Please refer to https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#cc
+   * to know the available keys and values. Key should be in null terminated string format of the member of ::OrtTensorRTProviderOptionsV2
+   * and value should be its related range.
+   *
+   * For example, key="trt_max_workspace_size" and value="2147483648"
+   *
+   * \param[in] tensorrt_options
+   * \param[in] provider_options_keys Array of UTF-8 null-terminated string for provider options keys
+   * \param[in] provider_options_values Array of UTF-8 null-terminated string for provider options values
+   * \param[in] num_keys Number of elements in the `provider_options_keys` and `provider_options_values` arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(UpdateTensorRTProviderOptions, _Inout_ OrtTensorRTProviderOptionsV2* tensorrt_options,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /** \brief Get serialized TensorRT provider options string.
+   *
+   * For example, "trt_max_workspace_size=2147483648;trt_max_partition_iterations=10;trt_int8_enable=1;......"
+   *
+   * \param[in] tensorrt_options OrtTensorRTProviderOptionsV2 instance
+   * \param[in] allocator a ptr to an instance of OrtAllocator obtained with OrtApi::CreateAllocator or OrtApi::GetAllocatorWithDefaultOptions.
+   *                      The specified allocator will be used to allocate the buffer for the output string.
+   * \param[out] ptr Set to a UTF-8 null terminated string allocated using 'allocator'. The caller is responsible for using the same allocator to free it.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorRTProviderOptionsAsString, _In_ const OrtTensorRTProviderOptionsV2* tensorrt_options, _Inout_ OrtAllocator* allocator, _Outptr_ char** ptr);
+
+  /** \brief Release an ::OrtTensorRTProviderOptionsV2
+   *
+   * \note This is an exception in the naming convention of other Release* functions, as the name of the method does not have the V2 suffix, but the type does
+   */
+  void(ORT_API_CALL* ReleaseTensorRTProviderOptions)(_Frees_ptr_opt_ OrtTensorRTProviderOptionsV2* input);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Enable custom operators
+   *
+   * See onnxruntime-extensions: https://github.com/microsoft/onnxruntime-extensions.git
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(EnableOrtCustomOps, _Inout_ OrtSessionOptions* options);
+
+  /// @}
+  /// \name OrtAllocator
+  /// @{
+
+  /** \brief Register a custom allocator
+   *
+   * Enables sharing between multiple sessions that use the same env instance.
+   * Returns an error if an allocator with the same ::OrtMemoryInfo is already registered.
+   *
+   * The behavior of this is exactly the same as OrtApi::CreateAndRegisterAllocator except
+   * instead of ORT creating an allocator based on provided info, in this case
+   * ORT uses the user-provided custom allocator.
+   * See https://onnxruntime.ai/docs/get-started/with-c.html for details.
+   *
+   * \param[in] env
+   * \param[in] allocator User provided allocator
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(RegisterAllocator, _Inout_ OrtEnv* env, _In_ OrtAllocator* allocator);
+
+  /** \brief Unregister a custom allocator
+   *
+   * It is an error if you provide an ::OrtMemoryInfo not corresponding to any
+   * registered allocators for sharing.
+   *
+   * \param[in] env
+   * \param[in] mem_info
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(UnregisterAllocator, _Inout_ OrtEnv* env,
+                  _In_ const OrtMemoryInfo* mem_info);
+
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /** \brief Sets *out to 1 iff an ::OrtValue is a SparseTensor, and 0 otherwise
+   *
+   * \param[in] value existing ::OrtValue
+   * \param[out] out unless an error occurs, contains 1 iff the value contains an instance
+   *  of sparse tensor or 0 otherwise.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(IsSparseTensor, _In_ const OrtValue* value, _Out_ int* out);
+
+  /** \brief Create an ::OrtValue with a sparse tensor that is empty.
+   *
+   * Use FillSparseTensor<Format>() functions to populate sparse tensor with non-zero values and
+   * format specific indices data.
+   * Use ReleaseValue to destroy the sparse tensor, this will also release the buffer inside the output value
+   * if any was allocated.
+   * \param[in,out] allocator allocator to use when performing an allocation. Allocation will be performed
+   *   by FillSparseTensor<Format>() APIs. The lifespan of the allocator instance must exceed the lifespan
+   *   of this sparse tensor instance, as the same allocator will be used to free memory.
+   * \param[in] dense_shape shape of the original dense tensor
+   * \param[in] dense_shape_len number of shape dimensions being passed
+   * \param[in] type must be one of TENSOR_ELEMENT_DATA_TYPE_xxxx
+   * \param[out] out Should be freed by calling ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSparseTensorAsOrtValue, _Inout_ OrtAllocator* allocator, _In_ const int64_t* dense_shape,
+                  size_t dense_shape_len, ONNXTensorElementDataType type, _Outptr_ OrtValue** out);
+
+  /**
+   * This populates an empty tensor that was created using OrtApi::CreateSparseTensorAsOrtValue.
+   * This will allocate required memory and copy the supplied NNZ values and COO indices into that memory allocation.
+   * Memory allocation is performed using the allocator that was specified with OrtApi::CreateSparseTensorAsOrtValue.
+   *
+   * \param[in,out] ort_value ::OrtValue to populate with data
+   * \param[in] data_mem_info serves to identify the location of the data to be copied. If the allocator specified
+   *  at the creation time has memory info that is not the same as the data_mem_info argument to this function, a cross-device copy will be performed.
+   *  String data is assumed to be on CPU and will only be copied into a CPU allocated buffer.
+   * \param[in] values_shape pointer to values shape array
+   * \param[in] values_shape_len length of the values_shape
+   * \param[in] values pointer to an array of values. For strings, pass const char**.
+   * \param[in] indices_data pointer to a location of COO indices
+   * \param[in] indices_num number of COO indices
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(FillSparseTensorCoo, _Inout_ OrtValue* ort_value, _In_ const OrtMemoryInfo* data_mem_info,
+                  _In_ const int64_t* values_shape, size_t values_shape_len, _In_ const void* values,
+                  _In_ const int64_t* indices_data, size_t indices_num);
+
+  /**
+   * This populates an empty tensor that was created using OrtApi::CreateSparseTensorAsOrtValue.
+   * This will allocate required memory and copy the supplied NNZ values and CSR indices into that memory allocation.
+   * Memory allocation is performed using the allocator that was specified with OrtApi::CreateSparseTensorAsOrtValue.
+   *
+   * \param[in,out] ort_value ::OrtValue to populate with data
+   * \param[in] data_mem_info serves to identify the location of the data to be copied. If the allocator specified
+   *  at the creation time has memory info that is not the same as the data_mem_info argument to this function, a cross-device copy will be performed.
+   *  String data is assumed to be on CPU and will only be copied into a CPU allocated buffer.
+   * \param[in] values_shape pointer to values shape array
+   * \param[in] values_shape_len length of the values_shape
+   * \param[in] values - pointer to an array of values. For strings, pass const char**.
+   * \param[in] inner_indices_data pointer to a location of CSR inner indices
+   * \param[in] inner_indices_num number of CSR inner indices
+   * \param[in] outer_indices_data pointer to a location of CSR outer indices
+   * \param[in] outer_indices_num number of CSR outer indices
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(FillSparseTensorCsr, _Inout_ OrtValue* ort_value, _In_ const OrtMemoryInfo* data_mem_info,
+                  _In_ const int64_t* values_shape, size_t values_shape_len, _In_ const void* values,
+                  _In_ const int64_t* inner_indices_data, size_t inner_indices_num,
+                  _In_ const int64_t* outer_indices_data, size_t outer_indices_num);
+
+  /**
+   * This populates an empty sparse tensor that was created using OrtApi::CreateSparseTensorAsOrtValue.
+   * This will allocate required memory and copy the supplied NNZ values and BlockSparse indices into that memory allocation.
+   * Memory allocation is performed using the allocator that was specified with OrtApi::CreateSparseTensorAsOrtValue.
+   *
+   * \param[in,out] ort_value ::OrtValue to populate with data
+   * \param[in] data_mem_info serves to identify the location of the data to be copied. If the allocator specified
+   *  at the creation time has memory info that is not the same as the data_mem_info argument to this function, a cross-device copy will be performed.
+   *  String data is assumed to be on CPU and will only be copied into a CPU allocated buffer.
+   * \param[in] values_shape pointer to values shape array
+   * \param[in] values_shape_len length of the values_shape
+   * \param[in] values pointer to an array of values. For strings, pass const char**.
+   * \param[in] indices_shape_data pointer to a location of indices shape
+   * \param[in] indices_shape_len length of the block sparse indices shape
+   * \param[in] indices_data pointer to a location of indices data. Shape will determine the length of the indices data.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(FillSparseTensorBlockSparse, _Inout_ OrtValue* ort_value, _In_ const OrtMemoryInfo* data_mem_info,
+                  _In_ const int64_t* values_shape, size_t values_shape_len, _In_ const void* values,
+                  _In_ const int64_t* indices_shape_data, size_t indices_shape_len,
+                  _In_ const int32_t* indices_data);
+
+  /**
+   * Create an ::OrtValue with a sparse tensor. This is the first step.
+   * Next, use Use<Format>Indices() functions to supply sparse tensor with
+   * format specific indices data and set its sparse format to a specific enum value.
+   * This will not perform memory allocations. It will
+   * use supplied user buffer which should outlive the created sparse tensor.
+   * Use OrtApi::ReleaseValue to destroy the sparse tensor. It would not release the supplied values buffer.
+   * This function can not be used to map strings from the user allocated memory. Strings must always be copied
+   * and have UTF-8 encoding. Therefore, use OrtApi::CreateSparseTensorAsOrtValue above and then fill it with data
+   * using the appropriate FillSparseTensor*() function.
+   *
+   * \param[in] info memory info where sparse values reside.
+   * \param[in,out] p_data pointer to a user allocated buffer with values. To create a fully sparse tensor with no non-zero
+   *   values, pass nullptr
+   * \param[in] dense_shape shape of the original dense tensor
+   * \param[in] dense_shape_len number of shape dimensions being passed
+   * \param[in] values_shape shape of the values data. To create a fully sparse tensor with no non-zero values,
+   *   pass {0} shape.
+   * \param[in] values_shape_len number of values shape dimensions
+   * \param[in] type must be one of TENSOR_ELEMENT_DATA_TYPE_xxxx
+   * \param[out] out Should be freed by calling ReleaseValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(CreateSparseTensorWithValuesAsOrtValue, _In_ const OrtMemoryInfo* info, _Inout_ void* p_data,
+                  _In_ const int64_t* dense_shape, size_t dense_shape_len,
+                  _In_ const int64_t* values_shape, size_t values_shape_len,
+                  ONNXTensorElementDataType type, _Outptr_ OrtValue** out);
+
+  /**
+   * This assigns Coo format indices to the SparseTensor that was created by
+   * OrtApi::CreateSparseTensorWithValuesAsOrtValue above. It also sets OrtSparseFormat to
+   * ORT_SPARSE_COO. This will not allocate any additional memory for data. The life span of
+   * indices_data buffer should eclipse the life span of this ::OrtValue.
+   *
+   * \param[in,out] ort_value ::OrtValue instance constructed with OrtApi::CreateSparseTensorWithValuesAsOrtValue
+   * \param[in,out] indices_data pointer to a user pre-allocated buffer or nullptr for fully sparse tensors.
+   * \param[in] indices_num  number of COO indices. Should either be 0 for fully sparse tensors, be equal
+   *  to the number of nnz values specified to OrtApi::CreateSparseTensorWithValuesAsOrtValue for 1-D {nnz} indices or
+   *  be twice the number of nnz values for 2-D indices {nnz, 2}
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(UseCooIndices, _Inout_ OrtValue* ort_value, _Inout_ int64_t* indices_data, size_t indices_num);
+
+  /**
+   * This assigns CSR format indices to the SparseTensor that was created by
+   * OrtApi::CreateSparseTensorWithValuesAsOrtValue above. It also sets OrtSparseFormat to
+   * ORT_SPARSE_CSRC. This will not allocate any additional memory for data. The life spans of
+   * inner_data and outer_data buffers should eclipse the life span of this ::OrtValue.
+   *
+   * \param[in,out] ort_value ::OrtValue instance constructed with OrtApi::CreateSparseTensorWithValuesAsOrtValue
+   * \param[in,out] inner_data pointer to a user pre-allocated buffer or nullptr for fully sparse tensors.
+   * \param[in] inner_num  number of inner CSR indices. Should either be 0 for fully sparse tensors or be equal
+   * to the number of nnz values specified to OrtApi::CreateSparseTensorWithValuesAsOrtValue.
+   * \param[in,out] outer_data pointer to user pre-allocated buffer or nullptr for fully sparse tensors.
+   * \param[in] outer_num number of CSR outer indices. Should either be 0 for fully sparse tensors or
+   * equal to rows + 1 of the dense shape.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(UseCsrIndices, _Inout_ OrtValue* ort_value, _Inout_ int64_t* inner_data, size_t inner_num,
+                  _Inout_ int64_t* outer_data, size_t outer_num);
+
+  /**
+   * This assigns BlockSparse format indices to the SparseTensor that was created by
+   * OrtApi::CreateSparseTensorWithValuesAsOrtValue above. It also sets OrtSparseFormat to
+   * ORT_SPARSE_BLOCK_SPARSE. This will not allocate any additional memory for data. The life span of
+   * indices_data buffer must eclipse the lifespan of this ::OrtValue.
+   *
+   * \param[in,out] ort_value OrtValue instance constructed with OrtApi::CreateSparseTensorWithValuesAsOrtValue
+   * \param[in] indices_shape pointer to indices shape. Use {0} for fully sparse tensors
+   * \param[in] indices_shape_len length of the indices shape
+   * \param[in,out] indices_data pointer to user pre-allocated buffer or nullptr for fully sparse tensors.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(UseBlockSparseIndices, _Inout_ OrtValue* ort_value, const int64_t* indices_shape, size_t indices_shape_len, _Inout_ int32_t* indices_data);
+
+  /** \brief Returns sparse tensor format enum iff a given ort value contains an instance of sparse tensor.
+   *
+   * \param[in] ort_value ::OrtValue that contains an instance of sparse tensor
+   * \param[out] out pointer to out parameter
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSparseTensorFormat, _In_ const OrtValue* ort_value, _Out_ enum OrtSparseFormat* out);
+
+  /** \brief Returns data type and shape of sparse tensor values (nnz) iff ::OrtValue contains a SparseTensor.
+   *
+   * \param[in] ort_value An ::OrtValue that contains a fully constructed sparse tensor
+   * \param[out] out Must be freed by OrtApi::ReleaseTensorTypeAndShapeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSparseTensorValuesTypeAndShape, _In_ const OrtValue* ort_value, _Outptr_ OrtTensorTypeAndShapeInfo** out);
+
+  /** \brief Returns numeric data for sparse tensor values (nnz). For string values use GetStringTensor*().
+   *
+   * \param[in] ort_value an instance of ::OrtValue containing sparse tensor
+   * \param[out] out returns a pointer to values data.  Do not attempt to free this ptr.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSparseTensorValues, _In_ const OrtValue* ort_value, _Outptr_ const void** out);
+
+  /** \brief Returns data type, shape for the type of indices specified by indices_format.
+   *
+   * \param[in] ort_value ::OrtValue containing sparse tensor.
+   * \param[in] indices_format One of the indices formats. It is an error to request a format that the sparse
+   * tensor does not contain.
+   * \param[out] out an instance of ::OrtTensorTypeAndShapeInfo. Must be freed by OrtApi::ReleaseTensorTypeAndShapeInfo
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSparseTensorIndicesTypeShape, _In_ const OrtValue* ort_value, enum OrtSparseIndicesFormat indices_format, _Outptr_ OrtTensorTypeAndShapeInfo** out);
+
+  /** \brief Returns indices data for the type of the indices specified by indices_format
+   *
+   * \param[in] ort_value ::OrtValue containing sparse tensor.
+   * \param[in] indices_format One of the indices formats. It is an error to request a format that the sparse tensor does not contain.
+   * \param[out] num_indices Pointer to where the number of indices entries is returned
+   * \param[out] indices Returned pointer to the indices data. Do not free the returned pointer as it refers to internal data owned by the ::OrtValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetSparseTensorIndices, _In_ const OrtValue* ort_value, enum OrtSparseIndicesFormat indices_format, _Out_ size_t* num_indices, _Outptr_ const void** indices);
+  /// @}
+  /// \name OrtValue
+  /// @{
+
+  /**
+   * \brief Sets out to 1 iff an optional type OrtValue has an element, 0 otherwise (OrtValue is None)
+   * Use this API to find if the optional type OrtValue is None or not.
+   * If the optional type OrtValue is not None, use the OrtValue just like any other OrtValue.
+   * For example, if you get an OrtValue that corresponds to Optional(tensor) and
+   * if HasValue() returns true, use it as tensor and so on.
+   *
+   * \param[in] value Input OrtValue.
+   * \param[out] out indicating if the input OrtValue contains data (1) or if it is a None (0)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(HasValue, _In_ const OrtValue* value, _Out_ int* out);
+
+  /// @}
+  /// \name OrtKernelContext
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Used for custom operators, gets the GPU compute stream to use to launch the custom GPU kernel
+   *   \see ::OrtCustomOp
+   * \param[in]  context OrtKernelContext instance
+   * \param[out] out Returns pointer to a GPU compute stream that can be used to launch the custom GPU kernel.
+   *             If retrieving the GPU compute stream is not relevant (GPU not enabled in the build, kernel partitioned to
+   *             some other EP), then a nullptr is returned as the output param.
+   *             Do not free or mutate the returned pointer as it refers to internal data owned by the underlying session.
+   *             Only use it for custom kernel launching.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelContext_GetGPUComputeStream, _In_ const OrtKernelContext* context, _Outptr_ void** out);
+
+  /// @}
+  /// \name GetTensorMemoryInfo
+  /// @{
+  /** \brief Returns a pointer to the ::OrtMemoryInfo of a Tensor
+   * \param[in] value ::OrtValue containing tensor.
+   * \param[out] mem_info ::OrtMemoryInfo of the tensor. Do NOT free the returned pointer. It is valid for the lifetime of the ::OrtValue
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetTensorMemoryInfo, _In_ const OrtValue* value, _Out_ const OrtMemoryInfo** mem_info);
+
+  /// @}
+  /// \name GetExecutionProviderApi
+  /// @{
+  /** \brief Get a pointer to the requested version of the Execution Provider specific
+   * API extensions to the OrtApi
+   * \param[in] provider_name The name of the execution provider. Currently only the following
+   * values are supported: "DML".
+   * \param[in] version Must be ::ORT_API_VERSION.
+   * \param[out] provider_api A void pointer containing a reference to the execution provider versioned api structure.
+   * For example, the provider_api pointer can be cast to the OrtDmlApi* when the provider_name is "DML".
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetExecutionProviderApi, _In_ const char* provider_name, _In_ uint32_t version, _Outptr_ const void** provider_api);
+
+  /// @}
+
+  /// \name SessionOptions
+  /// @{
+  /** \brief Set custom thread creation function
+   *
+   * \param[in] options Session options
+   * \param[in] ort_custom_create_thread_fn Custom thread creation function
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsSetCustomCreateThreadFn, _Inout_ OrtSessionOptions* options, _In_ OrtCustomCreateThreadFn ort_custom_create_thread_fn);
+
+  /** \brief Set creation options for custom thread
+   *
+   * \param[in] options Session options
+   * \param[in] ort_custom_thread_creation_options Custom thread creation options (can be nullptr)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsSetCustomThreadCreationOptions, _Inout_ OrtSessionOptions* options, _In_ void* ort_custom_thread_creation_options);
+
+  /** \brief Set custom thread join function
+   *
+   * \param[in] options Session options
+   * \param[in] ort_custom_join_thread_fn Custom join thread function, must not be nullptr when ort_custom_create_thread_fn is set
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SessionOptionsSetCustomJoinThreadFn, _Inout_ OrtSessionOptions* options, _In_ OrtCustomJoinThreadFn ort_custom_join_thread_fn);
+  /// @}
+
+  /// \name OrtThreadingOptions
+  /// @{
+  /** \brief Set custom thread creation function for global thread pools
+   *
+   * \param[inout] tp_options
+   * \param[in] ort_custom_create_thread_fn Custom thread creation function
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalCustomCreateThreadFn, _Inout_ OrtThreadingOptions* tp_options, _In_ OrtCustomCreateThreadFn ort_custom_create_thread_fn);
+
+  /** \brief Set custom thread creation options for global thread pools
+   *
+   * \param[inout] tp_options
+   * \param[in] ort_custom_thread_creation_options Custom thread creation options (can be nullptr)
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalCustomThreadCreationOptions, _Inout_ OrtThreadingOptions* tp_options, _In_ void* ort_custom_thread_creation_options);
+
+  /** \brief Set custom thread join function for global thread pools
+   *
+   * \param[inout] tp_options
+   * \param[in] ort_custom_join_thread_fn Custom thread join function, must not be nullptr when global ort_custom_create_thread_fn is set
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SetGlobalCustomJoinThreadFn, _Inout_ OrtThreadingOptions* tp_options, _In_ OrtCustomJoinThreadFn ort_custom_join_thread_fn);
+  /// @}
+
+  /** \brief Synchronize bound inputs. The call may be necessary for some providers, such as cuda,
+   *   in case the system that allocated bound memory operated on a different stream. However, the
+   *   operation is provider specific and could be a no-op.
+   *
+   * \param[inout] binding_ptr
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SynchronizeBoundInputs, _Inout_ OrtIoBinding* binding_ptr);
+
+  /** \brief Synchronize bound outputs. The call may be necessary for some providers, such as cuda,
+   *   in case the system that allocated bound memory operated on a different stream. However, the
+   *   operation is provider specific and could be a no-op.
+   *
+   * \param[inout] binding_ptr
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(SynchronizeBoundOutputs, _Inout_ OrtIoBinding* binding_ptr);
+
+  /// \name OrtSessionOptions
+  /// @{
+
+  /** \brief Append CUDA execution provider to the session options
+   *
+   * If CUDA is not available (due to a non CUDA enabled build), this function will return failure.
+   *
+   * This is slightly different from OrtApi::SessionOptionsAppendExecutionProvider_CUDA, it takes an
+   * ::OrtCUDAProviderOptions which is publicly defined. This takes an opaque ::OrtCUDAProviderOptionsV2
+   * which must be created with OrtApi::CreateCUDAProviderOptions.
+   *
+   * For OrtApi::SessionOptionsAppendExecutionProvider_CUDA, the user needs to instantiate ::OrtCUDAProviderOptions
+   * as well as allocate/release buffers for some members of ::OrtCUDAProviderOptions.
+   * Here, OrtApi::CreateCUDAProviderOptions and OrtApi::ReleaseCUDAProviderOptions will do the memory management for you.
+   *
+   * \param[in] options
+   * \param[in] cuda_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.11.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_CUDA_V2,
+                  _In_ OrtSessionOptions* options, _In_ const OrtCUDAProviderOptionsV2* cuda_options);
+
+  /// @}
+  /// \name OrtCUDAProviderOptionsV2
+  /// @{
+
+  /** \brief Create an OrtCUDAProviderOptionsV2
+   *
+   * \param[out] out Newly created ::OrtCUDAProviderOptionsV2. Must be released with OrtApi::ReleaseCUDAProviderOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.11.
+   */
+  ORT_API2_STATUS(CreateCUDAProviderOptions, _Outptr_ OrtCUDAProviderOptionsV2** out);
+
+  /** \brief Set options in a CUDA Execution Provider.
+   *
+   * Please refer to https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#configuration-options
+   * to know the available keys and values. Key should be in null terminated string format of the member of ::OrtCUDAProviderOptionsV2
+   * and value should be its related range.
+   *
+   * For example, key="device_id" and value="0"
+   *
+   * \param[in] cuda_options
+   * \param[in] provider_options_keys Array of UTF-8 null-terminated string for provider options keys
+   * \param[in] provider_options_values Array of UTF-8 null-terminated string for provider options values
+   * \param[in] num_keys Number of elements in the `provider_options_keys` and `provider_options_values` arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.11.
+   */
+  ORT_API2_STATUS(UpdateCUDAProviderOptions, _Inout_ OrtCUDAProviderOptionsV2* cuda_options,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /**
+   * Get serialized CUDA provider options string.
+   *
+   * For example, "device_id=0;arena_extend_strategy=0;......"
+   *
+   * \param cuda_options - OrtCUDAProviderOptionsV2 instance
+   * \param allocator - a ptr to an instance of OrtAllocator obtained with CreateAllocator() or GetAllocatorWithDefaultOptions()
+   *                      the specified allocator will be used to allocate continuous buffers for output strings and lengths.
+   * \param ptr - is a UTF-8 null terminated string allocated using 'allocator'. The caller is responsible for using the same allocator to free it.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.11.
+   */
+  ORT_API2_STATUS(GetCUDAProviderOptionsAsString, _In_ const OrtCUDAProviderOptionsV2* cuda_options, _Inout_ OrtAllocator* allocator, _Outptr_ char** ptr);
+
+  /** \brief Release an ::OrtCUDAProviderOptionsV2
+   *
+   * \note This is an exception in the naming convention of other Release* functions, as the name of the method does not have the V2 suffix, but the type does
+   *
+   * \since Version 1.11.
+   */
+  void(ORT_API_CALL* ReleaseCUDAProviderOptions)(_Frees_ptr_opt_ OrtCUDAProviderOptionsV2* input);
+
+  /// @}
+
+  /** \brief Append MIGraphX provider to session options
+   *
+   * If MIGraphX is not available (due to a non MIGraphX enabled build, or if MIGraphX is not installed on the system), this function will return failure.
+   *
+   * \param[in] options
+   * \param[in] migraphx_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.11.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_MIGraphX,
+                  _In_ OrtSessionOptions* options, _In_ const OrtMIGraphXProviderOptions* migraphx_options);
+
+  /** \brief Replace initialized Tensors with external data with the data provided in initializers.
+   *
+   * The function will find the initialized TensorProtos with external data in the graph with the provided names and
+   * replace them with the provided tensors. The API verifies that the TensorProto being replaced
+   * has an external data reference and has the same name, dimensions and data type as its replacement. The replacement
+   * will occur before any of the optimizations take place. The data will be copied into the graph
+   * since TensorProto can't refer to the user provided buffers.
+   *
+   * Once the model has been loaded, the OrtValue(s) added to SessionOptions instance will be removed
+   * from the internal SessionOptions copy to save memory, the user provided buffers can then be deallocated
+   * and the SessionOptions instance that refers to them can be destroyed.
+   *
+   * \param[in] options
+   * \param[in] initializer_names Array of null terminated UTF-8 encoded strings of the initializers names.
+   * \param[in] initializers Array of ::OrtValue type
+   * \param[in] initializers_num Number of elements in the initializer_names and initializers arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(AddExternalInitializers, _In_ OrtSessionOptions* options,
+                  _In_reads_(initializers_num) const char* const* initializer_names,
+                  _In_reads_(initializers_num) const OrtValue* const* initializers, size_t initializers_num);
+
+  /** \brief: Create attribute of onnxruntime operator
+   *
+   * \param[in] name Name of the attribute
+   * \param[in] data Data content of the attribute
+   * \param[in] len Number of bytes stored in data
+   * \param[in] type Data type
+   * \param[out] op_attr Attribute that has been created, which must be released by OrtApi::ReleaseOpAttr
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(CreateOpAttr,
+                  _In_ const char* name,
+                  _In_ const void* data,
+                  _In_ int len,
+                  _In_ OrtOpAttrType type,
+                  _Outptr_ OrtOpAttr** op_attr);
+
+  /* \brief: Release op attribute
+   *
+   * \param[in] opAttr Attribute created by OrtApi::CreateOpAttr
+   *
+   * \since Version 1.12.
+   */
+  ORT_CLASS_RELEASE(OpAttr);
+
+  /** \brief: Create onnxruntime native operator
+   *
+   * \param[in] info Kernel info
+   * \param[in] op_name Operator name
+   * \param[in] domain Operator domain
+   * \param[in] version Operator opset version
+   * \param[in] type_constraint_names Name of the type constraints, such as "T" or "T1"
+   * \param[in] type_constraint_values Type of each constraint
+   * \param[in] type_constraint_count Number of constraints
+   * \param[in] attr_values Attributes used to initialize the operator
+   * \param[in] attr_count Number of the attributes
+   * \param[in] input_count Number of inputs
+   * \param[in] output_count Number of outputs
+   * \param[out] ort_op Operator that has been created
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(CreateOp,
+                  _In_ const OrtKernelInfo* info,
+                  _In_z_ const char* op_name,
+                  _In_z_ const char* domain,
+                  int version,
+                  _In_reads_(type_constraint_count) const char** type_constraint_names,
+                  _In_reads_(type_constraint_count) const ONNXTensorElementDataType* type_constraint_values,
+                  int type_constraint_count,
+                  _In_reads_(attr_count) const OrtOpAttr* const* attr_values,
+                  int attr_count,
+                  int input_count,
+                  int output_count,
+                  _Outptr_ OrtOp** ort_op);
+
+  /** \brief: Invoke the operator created by OrtApi::CreateOp
+   * The inputs must follow the order as specified in onnx specification
+   *
+   * \param[in] context Kernel context
+   * \param[in] ort_op Operator that has been created
+   * \param[in] input_values Array of inputs
+   * \param[in] input_count Number of inputs
+   * \param[in,out] output_values Array of outputs
+   * \param[in] output_count Number of outputs
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(InvokeOp,
+                  _In_ const OrtKernelContext* context,
+                  _In_ const OrtOp* ort_op,
+                  _In_ const OrtValue* const* input_values,
+                  _In_ int input_count,
+                  _Inout_ OrtValue* const* output_values,
+                  _In_ int output_count);
+
+  /* \brief: Release an onnxruntime operator
+   *
+   * \param[in] Op Operator created by OrtApi::CreateOp
+   *
+   * \since Version 1.12.
+   */
+  ORT_CLASS_RELEASE(Op);
+
+  /** \brief: Append execution provider to the session options.
+   * \param[in] options
+   * \param[in] provider_name - provider to add.
+   * \param[in] provider_options_keys - keys to configure the provider options
+   * \param[in] provider_options_values - values to configure the provider options
+   * \param[in] num_keys - number of keys passed in
+   *
+   * Currently supported providers:
+   *   QNN
+   *   SNPE
+   *   XNNPACK
+   *
+   * Note: If an execution provider has a dedicated SessionOptionsAppendExecutionProvider_<provider name> function
+   *       that should be used to add it.
+   *
+   * QNN supported keys:
+   *   "backend_path": file path to QNN backend library.
+   *   "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
+   *    load from the cached QNN context binary if it exists. It will generate a context binary file if it does not exist.
+   *   "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
+   *   "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
+   *   "rpc_control_latency": QNN RPC control latency.
+   *   "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
+   *   "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
+   *
+   * SNPE supported keys:
+   *   "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
+   *   "DSP", "DSP_FIXED8_TF", "AIP_FIXED_TF", "AIP_FIXED8_TF".
+   *   Mapping to SNPE Runtime_t definition: CPU, CPU_FLOAT32 => zdl::DlSystem::Runtime_t::CPU;
+   *   GPU, GPU_FLOAT32_16_HYBRID => zdl::DlSystem::Runtime_t::GPU;
+   *   GPU_FLOAT16 => zdl::DlSystem::Runtime_t::GPU_FLOAT16;
+   *   DSP, DSP_FIXED8_TF => zdl::DlSystem::Runtime_t::DSP.
+   *   AIP_FIXED_TF, AIP_FIXED8_TF => zdl::DlSystem::Runtime_t::AIP_FIXED_TF.
+   *   "priority": execution priority, options: "low", "normal".
+   *   "buffer_type": ITensor or user buffers, options: "ITENSOR", user buffer with different types - "TF8", "TF16", "UINT8", "FLOAT".
+   *   "ITENSOR" -- default, ITensor which is float only.
+   *   "TF8" -- quantized model required, "FLOAT" -- for both quantized or non-quantized model
+   *   "enable_init_cache": enable SNPE init caching feature, set to 1 to enable it. Disabled by default.
+   *   If SNPE is not available (due to a non Snpe enabled build or its dependencies not being installed), this function will fail.
+   *
+   * XNNPACK supported keys:
+   *   "intra_op_num_threads": thread-pool size to use for XNNPACK execution provider.
+   *      default value is 0, which means to use the session thread-pool size.
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider, _In_ OrtSessionOptions* options,
+                  _In_ const char* provider_name,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /* \brief: Get a copy of kernel info
+   *
+   * \param[in] info Kernel info
+   * \param[out] info_copy Copy of kernel info
+   *
+   * \since Version 1.12.
+   */
+  ORT_API2_STATUS(CopyKernelInfo,
+                  _In_ const OrtKernelInfo* info,
+                  _Outptr_ OrtKernelInfo** info_copy);
+
+  /* \brief: Release kernel info
+   *
+   * \param[in] KernelInfo A copy of kernel info returned by CopyKernelInfo
+   *
+   * \since Version 1.12.
+   */
+  ORT_CLASS_RELEASE(KernelInfo);
+
+  /// \name Ort Training
+  /// @{
+  /** \brief Gets the Training C Api struct
+   *
+   * Call this function to access the ::OrtTrainingApi structure that holds pointers to functions that enable
+   * training with onnxruntime.
+   * \note A NULL pointer will be returned and no error message will be printed if the training api
+   * is not supported with this build. A NULL pointer will be returned and an error message will be
+   * printed if the provided version is unsupported, for example when using a runtime older than the
+   * version created with this header file.
+   *
+   * \param[in] version Must be ::ORT_API_VERSION
+   * \return The ::OrtTrainingApi struct for the version requested.
+   *
+   * \since Version 1.13
+   */
+  const OrtTrainingApi*(ORT_API_CALL* GetTrainingApi)(uint32_t version)NO_EXCEPTION;
+
+  /// @}
+
+  /** \brief Append CANN provider to session options
+   *
+   * If CANN is not available (due to a non CANN enabled build, or if CANN is not installed on the system), this function will return failure.
+   *
+   * \param[in] options
+   * \param[in] cann_options
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.13.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_CANN,
+                  _In_ OrtSessionOptions* options, _In_ const OrtCANNProviderOptions* cann_options);
+
+  /** \brief Create an OrtCANNProviderOptions
+   *
+   * \param[out] out created ::OrtCANNProviderOptions. Must be released with OrtApi::ReleaseCANNProviderOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.13.
+   */
+  ORT_API2_STATUS(CreateCANNProviderOptions, _Outptr_ OrtCANNProviderOptions** out);
+
+  /** \brief Set options in a CANN Execution Provider.
+   *
+   * \param[in] cann_options
+   * \param[in] provider_options_keys Array of UTF-8 null-terminated string for provider options keys
+   * \param[in] provider_options_values Array of UTF-8 null-terminated string for provider options values
+   * \param[in] num_keys Number of elements in the `provider_options_keys` and `provider_options_values` arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.13.
+   */
+  ORT_API2_STATUS(UpdateCANNProviderOptions, _Inout_ OrtCANNProviderOptions* cann_options,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /** \brief Get serialized CANN provider options string.
+   *
+   * \param[in] cann_options OrtCANNProviderOptions instance
+   * \param[in] allocator a ptr to an instance of OrtAllocator obtained with CreateAllocator()
+   *                      or GetAllocatorWithDefaultOptions(), the specified allocator will be used to allocate
+   *                      continuous buffers for output strings and lengths.
+   * \param[out] ptr is a UTF-8 null terminated string allocated using 'allocator'.
+   *                 The caller is responsible for using the same allocator to free it.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.13.
+   */
+  ORT_API2_STATUS(GetCANNProviderOptionsAsString, _In_ const OrtCANNProviderOptions* cann_options,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ char** ptr);
+
+  /** \brief Release an OrtCANNProviderOptions
+   *
+   * \param[in] input The ::OrtCANNProviderOptions instance to release
+   *
+   * \since Version 1.13.
+   */
+  void(ORT_API_CALL* ReleaseCANNProviderOptions)(_Frees_ptr_opt_ OrtCANNProviderOptions* input);
+
+  /** \brief Get OrtDevice type from MemoryInfo
+   *
+   * \since Version 1.14
+   */
+  void(ORT_API_CALL* MemoryInfoGetDeviceType)(_In_ const OrtMemoryInfo* ptr, _Out_ OrtMemoryInfoDeviceType* out);
+
+  /** \brief Update the OrtEnv instance with custom log severity level
+   *
+   * \param[in] ort_env The OrtEnv instance being used
+   * \param[in] log_severity_level The log severity level.
+   *
+   * \since Version 1.14.
+   */
+  ORT_API2_STATUS(UpdateEnvWithCustomLogLevel, _In_ OrtEnv* ort_env, OrtLoggingLevel log_severity_level);
+
+  /** \brief Set affinities for intra op threads
+   *
+   * The affinity string follows the format:
+   * logical_processor_id,logical_processor_id;logical_processor_id,logical_processor_id
+   * Semicolons separate the configurations of individual threads, while commas separate the processors the i-th thread is expected to attach to.
+   * e.g. 1,2,3;4,5
+   * specifies affinities for two threads, with the 1st thread attached to the 1st, 2nd, and 3rd processor, and the 2nd thread to the 4th and 5th.
+   * To ease the configuration, an "interval" is also allowed:
+   * e.g. 1-8;9-16;17-24
+   * specifies that the 1st thread runs on the first eight processors, the 2nd thread runs on the next eight processors, and so forth.
+   * Note:
+   * 1. Once set, the number of thread affinities must equal intra_op_num_threads - 1,
+   *    ort does not set affinity on the main thread which is started and managed by the calling app;
+   * 2. For windows, ort will infer the group id from a logical processor id, for example, assuming there are two groups and each has 64 logical processors,
+   *    an id of 64 will be inferred as the last processor of the 1st group, while 65 will be interpreted as the 1st processor of the second group.
+   *    Hence 64-65 is an invalid configuration, because a windows thread cannot be attached to processors across a group boundary.
+   *
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(SetGlobalIntraOpThreadAffinity, _Inout_ OrtThreadingOptions* tp_options, const char* affinity_string);
+
+  /** \brief Register custom ops from a shared library.
+   *
+   * Loads a shared library (.dll on windows, .so on linux, etc) named 'library_name' and looks for this entry point:
+   *		OrtStatus* RegisterCustomOps(OrtSessionOptions * options, const OrtApiBase* api);
+   * It then passes in the provided session options to this function along with the api base.
+   *
+   * The handle to the loaded library is automatically released by ORT when the last OrtSession that references the
+   * library handle is released. If no OrtSession is created, then the library handle is released when the provided
+   * OrtSessionOptions is released.
+   *
+   * \param[in] options The session options.
+   * \param[in] library_name The name of the shared library to load and register. Refer to OS-specific dynamic library
+   *                         loading utilities (e.g., LoadLibraryEx on Windows or dlopen on Linux/MacOS) for information
+   *                         on the format of library names and search paths.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(RegisterCustomOpsLibrary_V2, _Inout_ OrtSessionOptions* options, _In_ const ORTCHAR_T* library_name);
+
+  /** \brief Register custom ops by calling a RegisterCustomOpsFn function.
+   *
+   * Searches for registration_func_name and if found calls it.
+   *
+   * The library containing the function must either be linked against or previously loaded by the executable.
+   *
+   * If you want ONNX Runtime to load the library and manage its lifetime, use RegisterCustomOpsLibrary_V2.
+   *
+   * RegisterCustomOpsUsingFunction can be used in scenarios where it may not be possible for ONNX Runtime to load
+   * the library from a path. e.g. mobile platforms where the library must be linked into the app.
+   *
+   * The registration function must have the signature of RegisterCustomOpsFn:
+   *    OrtStatus* (*fn)(OrtSessionOptions* options, const OrtApiBase* api);
+   *
+   * See https://onnxruntime.ai/docs/reference/operators/add-custom-op.html for details on how the registration
+   * function should be implemented.
+   *
+   * \param[in] options OrtSessionOptions that is passed through as the first argument in the call to the
+   *                    registration function.
+   * \param[in] registration_func_name Name of registration function to use.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(RegisterCustomOpsUsingFunction, _Inout_ OrtSessionOptions* options,
+                  _In_ const char* registration_func_name);
+
+  /// \name OrtKernelInfo
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Get the number of inputs from ::OrtKernelInfo.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query the number of inputs
+   * during kernel/session creation.
+   *
+   * \param[in] info Instance of ::OrtKernelInfo.
+   * \param[out] out Pointer to variable assigned with the result on success.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetInputCount, _In_ const OrtKernelInfo* info, _Out_ size_t* out);
+
+  /** \brief Get the number of outputs from ::OrtKernelInfo.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query the number of outputs
+   * during kernel/session creation.
+   *
+   * \param[in] info Instance of ::OrtKernelInfo.
+   * \param[out] out Pointer to variable assigned with the result on success.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetOutputCount, _In_ const OrtKernelInfo* info, _Out_ size_t* out);
+
+  /** \brief Get the name of a ::OrtKernelInfo's input.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query an input's name
+   * during kernel/session creation.
+   *
+   * If `out` is nullptr, the value of `size` is set to the size of the name
+   * string (including null-terminator), and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the name string's size,
+   * the value of `size` is set to the true size of the string (including null-terminator),
+   * the provided memory is filled with the string's contents, and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual string's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the string
+   * and a failure status is returned.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[in] index The index of the input name to get. Returns a failure status if out-of-bounds.
+   * \param[out] out Memory location into which to write the UTF-8 null-terminated string representing the input's name.
+   * \param[in,out] size Pointer to the size of the `out` buffer. See above comments for details.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetInputName, _In_ const OrtKernelInfo* info, size_t index, _Out_ char* out,
+                  _Inout_ size_t* size);
+
+  /** \brief Get the name of a ::OrtKernelInfo's output.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query an output's name
+   * during kernel/session creation.
+   *
+   * If `out` is nullptr, the value of `size` is set to the size of the name
+   * string (including null-terminator), and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the name string's size,
+   * the value of `size` is set to the true size of the string (including null-terminator),
+   * the provided memory is filled with the string's contents, and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual string's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the string
+   * and a failure status is returned.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[in] index The index of the output name to get. Returns a failure status if out-of-bounds.
+   * \param[out] out Memory location into which to write the UTF-8 null-terminated string representing the output's
+   *                 name.
+   * \param[in,out] size Pointer to the size of the `out` buffer. See above comments for details.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetOutputName, _In_ const OrtKernelInfo* info, size_t index, _Out_ char* out,
+                  _Inout_ size_t* size);
+
+  /** \brief Get the type information for a ::OrtKernelInfo's input.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query the shape and type information
+   * of an input during kernel/session creation.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[in] index Which input to get the type information for
+   * \param[out] type_info Pointer set to the resulting ::OrtTypeInfo. Must be freed with OrtApi::ReleaseTypeInfo.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetInputTypeInfo, _In_ const OrtKernelInfo* info, size_t index,
+                  _Outptr_ OrtTypeInfo** type_info);
+
+  /** \brief Get the type information for a ::OrtKernelInfo's output.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to query the shape and type information
+   * of an output during kernel/session creation.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[in] index Which output to get the type information for
+   * \param[out] type_info Pointer set to the resulting ::OrtTypeInfo. Must be freed with OrtApi::ReleaseTypeInfo.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(KernelInfo_GetOutputTypeInfo, _In_ const OrtKernelInfo* info, size_t index,
+                  _Outptr_ OrtTypeInfo** type_info);
+
+  /** \brief Get a ::OrtValue tensor stored as an attribute in the graph node.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to get a tensor attribute.
+   *
+   * \param[in] info ::OrtKernelInfo instance.
+   * \param[in] name UTF-8 null-terminated string representing the attribute's name.
+   * \param[in] allocator Allocator used to allocate the internal tensor state.
+   * \param[out] out Returns newly created ::OrtValue. Must be freed with OrtApi::ReleaseValue,
+   *                 which will also free internal tensor state allocated with the provided allocator.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(KernelInfoGetAttribute_tensor, _In_ const OrtKernelInfo* info, _In_z_ const char* name,
+                  _Inout_ OrtAllocator* allocator, _Outptr_ OrtValue** out);
+
+  /// @}
+  /// \name OrtSessionOptions
+  /// Custom operator APIs
+  /// @{
+
+  /** \brief Checks if the given session configuration entry exists.
+   *
+   * The config_key formats are defined in onnxruntime_session_options_config_keys.h
+   *
+   * Can be used in a custom operator library to check for session configuration entries
+   * that target one or more custom operators in the library. Example: The config entry
+   * custom_op.myop.some_key targets a custom op named "myop".
+   *
+   * \param[in] options The ::OrtSessionOptions instance.
+   * \param[in] config_key A null-terminated UTF-8 string representation of the configuration key.
+   * \param[out] out Pointer set to 1 if the entry exists and 0 otherwise.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(HasSessionConfigEntry, _In_ const OrtSessionOptions* options,
+                  _In_z_ const char* config_key, _Out_ int* out);
+
+  /** \brief Get a session configuration value.
+   *
+   * Returns a failure status if the configuration key does not exist.
+   * The config_key and the format of config_value are defined in onnxruntime_session_options_config_keys.h
+   *
+   * If `config_value` is nullptr, the value of `size` is set to the true size of the string
+   * value (including null-terminator), and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the actual string value's size,
+   * the value of `size` is set to the true size of the string value, the provided memory
+   * is filled with the value's contents, and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual string value's size and `config_value`
+   * is not nullptr, the value of `size` is set to the true size of the string value
+   * and a failure status is returned.
+   *
+   * Can be used in a custom operator library to get session configuration entries
+   * that target one or more custom operators in the library. Example: The config entry
+   * custom_op.myop.some_key targets a custom op named "myop".
+   *
+   * \param[in] options The session options.
+   * \param[in] config_key A null-terminated UTF-8 string representation of the config key.
+   * \param[out] config_value Pointer to memory where the null-terminated UTF-8 string value will be stored.
+   * \param[in,out] size Pointer to the size of the `config_value` buffer. See above comments for details.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.14
+   */
+  ORT_API2_STATUS(GetSessionConfigEntry, _In_ const OrtSessionOptions* options,
+                  _In_z_ const char* config_key, _Out_ char* config_value, _Inout_ size_t* size);
+
+  /// @}
+
+  /** \brief Append dnnl provider to session options
+   *
+   * If oneDNN is not available, this function will return failure.
+   *
+   * \param[in] options The session options to which the oneDNN execution provider is appended
+   * \param[in] dnnl_options The ::OrtDnnlProviderOptions describing the provider configuration
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_Dnnl,
+                  _In_ OrtSessionOptions* options, _In_ const OrtDnnlProviderOptions* dnnl_options);
+
+  /** \brief Create an OrtDnnlProviderOptions
+   *
+   * \param[out] out Newly created ::OrtDnnlProviderOptions. Must be released with OrtApi::ReleaseDnnlProviderOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(CreateDnnlProviderOptions, _Outptr_ OrtDnnlProviderOptions** out);
+
+  /** \brief Set options in a oneDNN Execution Provider.
+   *
+   * Key should be in null terminated string format of the member of ::OrtDnnlProviderOptions
+   * and value should be its related range.
+   *
+   * For example, key="use_arena" and value="1"
+   *
+   * \param[in,out] dnnl_options The ::OrtDnnlProviderOptions instance to update
+   * \param[in] provider_options_keys Array of UTF-8 null-terminated strings for provider options keys
+   * \param[in] provider_options_values Array of UTF-8 null-terminated strings for provider options values
+   * \param[in] num_keys Number of elements in the `provider_options_keys` and `provider_options_values` arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(UpdateDnnlProviderOptions, _Inout_ OrtDnnlProviderOptions* dnnl_options,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /**
+   * \brief Get serialized oneDNN provider options string.
+   *
+   * For example, "use_arena=1;......"
+   *
+   * \param[in] dnnl_options OrtDnnlProviderOptions instance
+   * \param[in] allocator a ptr to an instance of OrtAllocator obtained with CreateAllocator() or GetAllocatorWithDefaultOptions()
+   *                      the specified allocator will be used to allocate continuous buffers for output strings and lengths.
+   * \param[out] ptr is a UTF-8 null terminated string allocated using 'allocator'. The caller is responsible for using the same allocator to free it.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(GetDnnlProviderOptionsAsString, _In_ const OrtDnnlProviderOptions* dnnl_options, _Inout_ OrtAllocator* allocator, _Outptr_ char** ptr);
+
+  /** \brief Release an ::OrtDnnlProviderOptions
+   *
+   * \since Version 1.15.
+   */
+  void(ORT_API_CALL* ReleaseDnnlProviderOptions)(_Frees_ptr_opt_ OrtDnnlProviderOptions* input);
+
+  /// \name OrtKernelInfo
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Get the graph node name from ::OrtKernelInfo.
+   *
+   * If `out` is nullptr, the value of `size` is set to the size of the name
+   * string (including null-terminator), and a success status is returned.
+   *
+   * If the `size` parameter is greater than or equal to the name string's size,
+   * the value of `size` is set to the true size of the string (including null-terminator),
+   * the provided memory is filled with the string's contents, and a success status is returned.
+   *
+   * If the `size` parameter is less than the actual string's size and `out`
+   * is not nullptr, the value of `size` is set to the true size of the string
+   * and a failure status is returned.
+   *
+   * Can be used in a custom operator's CreateKernel callback to get the name of the operator's node name in the graph.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[out] out Memory location into which to write the UTF-8 null-terminated string representing the name.
+   * \param[in,out] size Pointer to the size of the `out` buffer. See above comments for details.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.15
+   */
+  ORT_API2_STATUS(KernelInfo_GetNodeName, _In_ const OrtKernelInfo* info, _Out_ char* out, _Inout_ size_t* size);
+
+  /** \brief Get the session logger from ::OrtKernelInfo.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to get a logger that can be used to log
+   * messages.
+   *
+   * \param[in] info An instance of ::OrtKernelInfo.
+   * \param[out] logger Pointer set to the session's ::OrtLogger. Owned by ONNX Runtime, so do not free.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.15
+   */
+  ORT_API2_STATUS(KernelInfo_GetLogger, _In_ const OrtKernelInfo* info, _Outptr_ const OrtLogger** logger);
+
+  /// @}
+  /// \name OrtKernelContext
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Get the runtime logger from ::OrtKernelContext.
+   *
+   * Used in the KernelCompute callback of an OrtCustomOp to get a logger that can be used to log
+   * messages during inference.
+   *
+   * \param[in] context An instance of ::OrtKernelContext.
+   * \param[out] logger Pointer set to the kernel context's ::OrtLogger. Owned by ONNX Runtime, so do not free.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.15
+   */
+  ORT_API2_STATUS(KernelContext_GetLogger, _In_ const OrtKernelContext* context, _Outptr_ const OrtLogger** logger);
+
+  /// @}
+  /// \name OrtLogger
+  /// Custom operator APIs.
+  /// @{
+
+  /** \brief Logs a message at the given severity level using the provided ::OrtLogger.
+   *
+   * Only messages with a severity level equal or greater than the ::OrtLogger's logging severity level
+   * are logged. Use OrtApi::Logger_GetLoggingSeverityLevel to get the ::OrtLogger's logging severity
+   * level.
+   *
+   * Can be used in custom operators to log messages with the logger retrieved via OrtApi::KernelInfo_GetLogger.
+   *
+   * \param[in] logger The ::OrtLogger instance.
+   * \param[in] log_severity_level The message's severity level.
+   * \param[in] message The message to log.
+   * \param[in] file_path The filepath of the file in which the message is logged. Usually the value of ORT_FILE.
+   * \param[in] line_number The file line number in which the message is logged. Usually the value of __LINE__.
+   * \param[in] func_name The name of the function in which the message is logged. Usually the value of __FUNCTION__.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.15
+   */
+  ORT_API2_STATUS(Logger_LogMessage, _In_ const OrtLogger* logger, OrtLoggingLevel log_severity_level,
+                  _In_z_ const char* message, _In_z_ const ORTCHAR_T* file_path, int line_number,
+                  _In_z_ const char* func_name);
+
+  /** \brief Get the logging severity level of the ::OrtLogger.
+   *
+   * Can be used in a custom operator to get the logging severity level of the ::OrtLogger associated with
+   * the ::OrtKernelInfo.
+   *
+   * \param[in] logger The ::OrtLogger instance.
+   * \param[out] out Pointer to variable assigned with the logging severity level on success.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   * \since Version 1.15
+   */
+  ORT_API2_STATUS(Logger_GetLoggingSeverityLevel, _In_ const OrtLogger* logger, _Out_ OrtLoggingLevel* out);
+
+  /// @}
+
+  /** \brief Get a ::OrtValue tensor stored as a constant initializer in the graph node.
+   *
+   * Used in the CreateKernel callback of an OrtCustomOp to get a tensor value.
+   *
+   * \param[in] info ::OrtKernelInfo instance.
+   * \param[in] index The node index.
+   * \param[out] is_constant Is it a constant node input or not.
+   * \param[out] out The OrtValue tensor value.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(KernelInfoGetConstantInput_tensor, _In_ const OrtKernelInfo* info, size_t index, _Out_ int* is_constant, _Outptr_ const OrtValue** out);
+
+  /** \brief Get Optional Type information from an ::OrtTypeInfo
+   *
+   * This augments ::OrtTypeInfo to return an ::OrtOptionalTypeInfo when the type is optional.
+   * The OrtOptionalTypeInfo also has a nested ::OrtTypeInfo that describes the type of the optional value.
+   * ::OrtOptionalTypeInfo type can only appear within model metadata to describe inputs/outputs.
+   * The actual OrtValues that are supplied in place of optional type inputs should contain
+   * specific type that is described by ::OrtOptionalTypeInfo.
+   *
+   * So the picture: ::OrtTypeInfo -> ::OrtOptionalTypeInfo -> ::OrtTypeInfo (describes the type that can be supplied
+   * in place of the optional type when creating the actual ::OrtValue).
+   *
+   * \param[in] type_info
+   * \param[out] out A pointer to the ::OrtOptionalTypeInfo. Do not free this value,
+   *                 it is owned by OrtTypeInfo instance. When the type_info does not represent
+   *                 optional type, nullptr is returned in out.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(CastTypeInfoToOptionalTypeInfo, _In_ const OrtTypeInfo* type_info,
+                  _Outptr_result_maybenull_ const OrtOptionalTypeInfo** out);
+
+  /** \brief Get OrtTypeInfo for the allowed contained type from an ::OrtOptionalTypeInfo.
+   *
+   * This augments ::OrtOptionalTypeInfo to return an ::OrtTypeInfo for the contained type.
+   * The OrtOptionalTypeInfo has a nested ::OrtTypeInfo that describes the type of the optional value.
+   * ::OrtOptionalTypeInfo type can only appear within model metadata to describe inputs/outputs.
+   * The actual OrtValues that are supplied in place of optional type inputs should contain
+   * specific type that is described by the returned ::OrtTypeInfo.
+   *
+   * \param[in] optional_type_info The ::OrtOptionalTypeInfo to query
+   * \param[out] out A pointer to the ::OrtTypeInfo for what the optional value could be. Do not free this value,
+   *                 it is owned by the OrtOptionalTypeInfo instance.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(GetOptionalContainedTypeInfo, _In_ const OrtOptionalTypeInfo* optional_type_info,
+                  _Outptr_ OrtTypeInfo** out);
+
+  /** \brief Set a single string in a string tensor
+   *  Do not zero terminate the string data.
+   *
+   * \param[in] value A string tensor
+   * \param[in] index flat index of the element
+   * \param[in] length_in_bytes length of the buffer in utf-8 bytes (without the null terminator)
+   * \param[in,out] buffer address of return value
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   */
+  ORT_API2_STATUS(GetResizedStringTensorElementBuffer, _Inout_ OrtValue* value, _In_ size_t index, _In_ size_t length_in_bytes, _Inout_ char** buffer);
+
+  /** \brief Get Allocator from KernelContext for a specific memoryInfo. Please use C API ReleaseAllocator to release out object
+   *
+   * \param[in] context OrtKernelContext instance
+   * \param[in] mem_info OrtMemoryInfo instance
+   * \param[out] out A pointer to OrtAllocator.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.15.
+   */
+  ORT_API2_STATUS(KernelContext_GetAllocator, _In_ const OrtKernelContext* context, _In_ const OrtMemoryInfo* mem_info, _Outptr_ OrtAllocator** out);
+
+  /** \brief Returns a null terminated string of the build info including git info and cxx flags
+   *
+   * \return UTF-8 encoded version string. Do not deallocate the returned buffer.
+   *
+   * \since Version 1.15.
+   */
+  const char*(ORT_API_CALL* GetBuildInfoString)(void);
+
+  /// \name OrtROCMProviderOptions
+  /// @{
+
+  /** \brief Create an OrtROCMProviderOptions
+   *
+   * \param[out] out Newly created ::OrtROCMProviderOptions. Must be released with OrtApi::ReleaseROCMProviderOptions
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(CreateROCMProviderOptions, _Outptr_ OrtROCMProviderOptions** out);
+
+  /** \brief Set options in a ROCm Execution Provider.
+   *
+   * Please refer to https://onnxruntime.ai/docs/execution-providers/ROCm-ExecutionProvider.html
+   * to know the available keys and values. Key should be in null terminated string format of the member of
+   * ::OrtROCMProviderOptions and value should be its related range.
+   *
+   * For example, key="device_id" and value="0"
+   *
+   * \param[in,out] rocm_options The ::OrtROCMProviderOptions instance to update
+   * \param[in] provider_options_keys Array of UTF-8 null-terminated strings for provider options keys
+   * \param[in] provider_options_values Array of UTF-8 null-terminated strings for provider options values
+   * \param[in] num_keys Number of elements in the `provider_options_keys` and `provider_options_values` arrays
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(UpdateROCMProviderOptions, _Inout_ OrtROCMProviderOptions* rocm_options,
+                  _In_reads_(num_keys) const char* const* provider_options_keys,
+                  _In_reads_(num_keys) const char* const* provider_options_values,
+                  _In_ size_t num_keys);
+
+  /**
+   * \brief Get serialized ROCm provider options string.
+   *
+   * For example, "device_id=0;arena_extend_strategy=0;......"
+   *
+   * \param[in] rocm_options OrtROCMProviderOptions instance
+   * \param[in] allocator a ptr to an instance of OrtAllocator obtained with CreateAllocator() or GetAllocatorWithDefaultOptions()
+   *                      the specified allocator will be used to allocate continuous buffers for output strings and lengths.
+   * \param[out] ptr is a UTF-8 null terminated string allocated using 'allocator'. The caller is responsible for using the same allocator to free it.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(GetROCMProviderOptionsAsString, _In_ const OrtROCMProviderOptions* rocm_options, _Inout_ OrtAllocator* allocator, _Outptr_ char** ptr);
+
+  /** \brief Release an ::OrtROCMProviderOptions
+   *
+   * \param[in] input The ::OrtROCMProviderOptions instance to release
+   *
+   * \since Version 1.16.
+   */
+  void(ORT_API_CALL* ReleaseROCMProviderOptions)(_Frees_ptr_opt_ OrtROCMProviderOptions* input);
+
+  /** \brief Create an allocator with specific type and register it with the ::OrtEnv
+   *  This API extends CreateAndRegisterAllocator so that it can create an allocator of a specific type, not just a CPU allocator.
+   *  Enables sharing the allocator between multiple sessions that use the same env instance.
+   *  Lifetime of the created allocator will be valid for the duration of the environment.
+   *  Returns an error if an allocator with the same ::OrtMemoryInfo is already registered.
+   *  \param[in] env OrtEnv instance
+   *  \param[in] provider_type ExecutionProvider type
+   *  \param[in] mem_info OrtMemoryInfo instance
+   *  \param[in] arena_cfg Arena configuration
+   *  \param[in] provider_options_keys Keys of the provider options map
+   *  \param[in] provider_options_values Values of the provider options map
+   *  \param[in] num_keys Length of the provider options map
+   */
+  ORT_API2_STATUS(CreateAndRegisterAllocatorV2, _Inout_ OrtEnv* env, _In_ const char* provider_type, _In_ const OrtMemoryInfo* mem_info, _In_ const OrtArenaCfg* arena_cfg,
+                  _In_reads_(num_keys) const char* const* provider_options_keys, _In_reads_(num_keys) const char* const* provider_options_values, _In_ size_t num_keys);
+
+  /** \brief Run the model asynchronously in a thread owned by intra op thread pool
+   *
+   * \param[in] session The ::OrtSession to run
+   * \param[in] run_options If nullptr, will use a default ::OrtRunOptions
+   * \param[in] input_names Array of null terminated UTF8 encoded strings of the input names
+   * \param[in] input Array of ::OrtValue%s of the input values
+   * \param[in] input_len Number of elements in the input_names and input arrays
+   * \param[in] output_names Array of null terminated UTF8 encoded strings of the output names
+   * \param[in] output_names_len Number of elements in the output_names and output arrays
+   * \param[in,out] output OrtValue* array of size output_names_len.
+   *             On calling RunAsync, output[i] could either be a null or a pointer to a preallocated OrtValue.
+   *             Later, the output array will be passed to run_async_callback with all null(s) filled with valid
+   *             OrtValue pointer(s) allocated by onnxruntime.
+   *             NOTE: it is the caller's duty to finally release the output array and each of its members,
+   *             regardless of whether the member (OrtValue*) is allocated by onnxruntime or preallocated by the caller.
+   * \param[in] run_async_callback Callback function on model run completion
+   * \param[in] user_data User data that is passed back to run_async_callback
+   */
+  ORT_API2_STATUS(RunAsync, _Inout_ OrtSession* session, _In_opt_ const OrtRunOptions* run_options,
+                  _In_reads_(input_len) const char* const* input_names,
+                  _In_reads_(input_len) const OrtValue* const* input, size_t input_len,
+                  _In_reads_(output_names_len) const char* const* output_names, size_t output_names_len,
+                  _Inout_updates_all_(output_names_len) OrtValue** output,
+                  _In_ RunAsyncCallbackFn run_async_callback, _In_opt_ void* user_data);
+
+  /**
+   * Update TensorRT EP provider option where its data type is pointer, for example 'user_compute_stream'.
+   * If the data type of the provider option can be represented by string please use UpdateTensorRTProviderOptions.
+   *
+   * Note: It's caller's responsibility to properly manage the lifetime of the instance pointed by this pointer.
+   *
+   * \param tensorrt_options - OrtTensorRTProviderOptionsV2 instance
+   * \param key - Name of the provider option
+   * \param value - A pointer to the instance that will be assigned to this provider option
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(UpdateTensorRTProviderOptionsWithValue, _Inout_ OrtTensorRTProviderOptionsV2* tensorrt_options, _In_ const char* key, _In_ void* value);
+
+  /**
+   * Get TensorRT EP provider option where its data type is pointer.
+   * If the data type of the provider option can be represented by string please use GetTensorRTProviderOptionsAsString.
+   *
+   * \param tensorrt_options - OrtTensorRTProviderOptionsV2 instance
+   * \param key - Name of the provider option
+   * \param ptr - A pointer to the instance that is kept by the provider option
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(GetTensorRTProviderOptionsByName, _In_ const OrtTensorRTProviderOptionsV2* tensorrt_options, _In_ const char* key, _Outptr_ void** ptr);
+
+  /**
+   * Update CUDA EP provider option where its data type is pointer, for example 'user_compute_stream'.
+   * If the data type of the provider option can be represented by string please use UpdateCUDAProviderOptions.
+   *
+   * Note: It's caller's responsibility to properly manage the lifetime of the instance pointed by this pointer.
+   *
+   * \param cuda_options - OrtCUDAProviderOptionsV2 instance
+   * \param key - Name of the provider option
+   * \param value - A pointer to the instance that will be assigned to this provider option
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(UpdateCUDAProviderOptionsWithValue, _Inout_ OrtCUDAProviderOptionsV2* cuda_options, _In_ const char* key, _In_ void* value);
+
+  /**
+   * Get CUDA EP provider option where its data type is pointer.
+   * If the data type of the provider option can be represented by string please use GetCUDAProviderOptionsAsString.
+   *
+   * \param cuda_options - OrtCUDAProviderOptionsV2 instance
+   * \param key - Name of the provider option
+   * \param ptr - A pointer to the instance that is kept by the provider option
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(GetCUDAProviderOptionsByName, _In_ const OrtCUDAProviderOptionsV2* cuda_options, _In_ const char* key, _Outptr_ void** ptr);
+
+  /**
+   * Get an EP resource.
+   * E.g. a cuda stream or a cublas handle
+   *
+   * \param context - Kernel context
+   * \param resouce_version - Version of the resource
+   * \param resource_id - Type of resource
+   * \param resource - A pointer to returned resource
+   *
+   * \since Version 1.16.
+   */
+  ORT_API2_STATUS(KernelContext_GetResource, _In_ const OrtKernelContext* context, _In_ int resouce_version, _In_ int resource_id, _Outptr_ void** resource);
+};
+
+/*
+ * Steps to use a custom op:
+ *   1 Create an OrtCustomOpDomain with the domain name used by the custom ops
+ *   2 Create an OrtCustomOp structure for each op and add them to the domain
+ *   3 Call OrtAddCustomOpDomain to add the custom domain of ops to the session options
+ */
+
+// Describes how a single input or output of a custom op may appear in a node.
+// Each input/output is exactly one of:
+// 1) Required: the input/output must be present in the node.
+// 2) Optional: the input/output may be absent in the node.
+// 3) Variadic: the input/output stands for N (i.e., the minimum arity) or more operands.
+//              Only the last input or output of a custom op may be marked as variadic.
+//              The homogeneity of the variadic input or output determines whether all operands must be of the same
+//              tensor element type.
+typedef enum OrtCustomOpInputOutputCharacteristic {
+  INPUT_OUTPUT_REQUIRED = 0,
+  INPUT_OUTPUT_OPTIONAL = 1,
+  INPUT_OUTPUT_VARIADIC = 2,
+} OrtCustomOpInputOutputCharacteristic;
+
+/*
+ * The OrtCustomOp structure defines a custom op's schema and its kernel callbacks. Every member after `version` is a
+ * function pointer that the implementor of the custom op fills in.
+ */
+struct OrtCustomOp {
+  uint32_t version;  // Must be initialized to ORT_API_VERSION
+
+  // This callback creates the kernel, which is a user defined
+  // parameter that is passed to the Kernel* callbacks below. It is
+  // recommended to use CreateKernelV2 instead, which allows safe error
+  // propagation by returning an OrtStatusPtr.
+  void*(ORT_API_CALL* CreateKernel)(_In_ const struct OrtCustomOp* op, _In_ const OrtApi* api,
+                                    _In_ const OrtKernelInfo* info);
+
+  // Returns the name of the op
+  const char*(ORT_API_CALL* GetName)(_In_ const struct OrtCustomOp* op);
+
+  // Returns the type of the execution provider; return nullptr to use the CPU execution provider
+  const char*(ORT_API_CALL* GetExecutionProviderType)(_In_ const struct OrtCustomOp* op);
+
+  // Return the count of the input & output tensors, and the element type of the tensor at a given index
+  ONNXTensorElementDataType(ORT_API_CALL* GetInputType)(_In_ const struct OrtCustomOp* op, _In_ size_t index);
+  size_t(ORT_API_CALL* GetInputTypeCount)(_In_ const struct OrtCustomOp* op);
+  ONNXTensorElementDataType(ORT_API_CALL* GetOutputType)(_In_ const struct OrtCustomOp* op, _In_ size_t index);
+  size_t(ORT_API_CALL* GetOutputTypeCount)(_In_ const struct OrtCustomOp* op);
+
+  // Perform a computation step.  It is recommended to use
+  // KernelComputeV2 instead, which allows safe error propagation by
+  // returning an OrtStatusPtr.
+  void(ORT_API_CALL* KernelCompute)(_In_ void* op_kernel, _In_ OrtKernelContext* context);
+  void(ORT_API_CALL* KernelDestroy)(_In_ void* op_kernel);
+
+  // Returns the characteristics of the input & output tensors
+  OrtCustomOpInputOutputCharacteristic(ORT_API_CALL* GetInputCharacteristic)(_In_ const struct OrtCustomOp* op, _In_ size_t index);
+  OrtCustomOpInputOutputCharacteristic(ORT_API_CALL* GetOutputCharacteristic)(_In_ const struct OrtCustomOp* op, _In_ size_t index);
+
+  // Returns the memory type of the input tensors. This API allows the custom op
+  // to place the inputs on specific devices. By default, it returns
+  // OrtMemTypeDefault, which means the input is placed on the default device for
+  // the execution provider. If the inputs need to be of different memory types,
+  // this function can be overridden to return the specific memory types.
+  OrtMemType(ORT_API_CALL* GetInputMemoryType)(_In_ const struct OrtCustomOp* op, _In_ size_t index);
+
+  // Returns the minimum number of input arguments expected for the variadic input.
+  // Applicable only for custom ops that have a variadic input.
+  int(ORT_API_CALL* GetVariadicInputMinArity)(_In_ const struct OrtCustomOp* op);
+
+  // Returns true (non-zero) if all arguments of a variadic input have to be of the same type (homogeneous),
+  // and false (zero) otherwise.
+  // Applicable only for custom ops that have a variadic input.
+  int(ORT_API_CALL* GetVariadicInputHomogeneity)(_In_ const struct OrtCustomOp* op);
+
+  // Returns the minimum number of output values expected for the variadic output.
+  // Applicable only for custom ops that have a variadic output.
+  int(ORT_API_CALL* GetVariadicOutputMinArity)(_In_ const struct OrtCustomOp* op);
+
+  // Returns true (non-zero) if all output values of a variadic output have to be of the same type (homogeneous),
+  // and false (zero) otherwise.
+  // Applicable only for custom ops that have a variadic output.
+  int(ORT_API_CALL* GetVariadicOutputHomogeneity)(_In_ const struct OrtCustomOp* op);
+
+  // Create the kernel state which is passed to each compute call. The kernel is returned through `kernel`.
+  OrtStatusPtr(ORT_API_CALL* CreateKernelV2)(_In_ const struct OrtCustomOp* op, _In_ const OrtApi* api,
+                                             _In_ const OrtKernelInfo* info,
+                                             _Out_ void** kernel);
+
+  // Perform the computation step, reporting failure through the returned OrtStatusPtr.
+  OrtStatusPtr(ORT_API_CALL* KernelComputeV2)(_In_ void* op_kernel, _In_ OrtKernelContext* context);
+};
+
+/*
+ * Legacy way to add the CUDA provider to the session; prefer SessionOptionsAppendExecutionProvider_CUDA above to access the latest functionality.
+ * This function always exists, but will only succeed if Onnxruntime was built with CUDA support and the CUDA provider shared library exists.
+ *
+ * \param device_id CUDA device id, starts from zero.
+ */
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessionOptions* options, int device_id);
+
+/*
+ * Legacy way to add the ROCm provider to the session; prefer
+ * SessionOptionsAppendExecutionProvider_ROCM above to access the latest functionality.
+ * This function always exists, but will only succeed if Onnxruntime was built with
+ * HIP support and the ROCm provider shared library exists.
+ *
+ * \param device_id HIP device id, starts from zero.
+ */
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_ROCM, _In_ OrtSessionOptions* options, int device_id);
+
+/*
+ * Legacy way to add the MIGraphX provider to the session; prefer
+ * SessionOptionsAppendExecutionProvider_MIGraphX above to access the latest functionality.
+ * This function always exists, but will only succeed if Onnxruntime was built with
+ * HIP support and the MIGraphX provider shared library exists.
+ *
+ * \param device_id HIP device id, starts from zero.
+ */
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_MIGraphX, _In_ OrtSessionOptions* options, int device_id);
+
+/*
+ * Legacy way to add the oneDNN provider to the session; prefer
+ * SessionOptionsAppendExecutionProvider_oneDNN above to access the latest functionality.
+ * This function always exists, but will only succeed if Onnxruntime was built with
+ * oneDNN support and the oneDNN provider shared library exists.
+ *
+ * \param use_arena zero: false. non-zero: true.
+ */
+ORT_API_STATUS(OrtSessionOptionsAppendExecutionProvider_Dnnl, _In_ OrtSessionOptions* options, int use_arena);
+
+#ifdef __cplusplus
+}
+#endif
+/// @}
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_api.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_api.h
new file mode 100644
index 0000000..709272d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_api.h
@@ -0,0 +1,2268 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Summary: The Ort C++ API is a header only wrapper around the Ort C API.
+//
+// The C++ API simplifies usage by returning values directly instead of error codes, throwing exceptions on errors
+// and automatically releasing resources in the destructors. The primary purpose of C++ API is exception safety so
+// all the resources follow RAII and do not leak memory.
+//
+// Each of the C++ wrapper classes holds only a pointer to the C internal object. Treat them like smart pointers.
+// To create an empty object, pass 'nullptr' to the constructor (for example, Env e{nullptr};). However, you can't use them
+// until you assign an instance that actually holds an underlying object.
+//
+// For Ort objects only move assignment between objects is allowed, there are no copy constructors.
+// Some objects have explicit 'Clone' methods for this purpose.
+//
+// ConstXXXX types are copyable since they do not own the underlying C object, so you can pass them to functions as arguments
+// by value or by reference. ConstXXXX types are restricted to const only interfaces.
+//
+// UnownedXXXX are similar to ConstXXXX but also allow non-const interfaces.
+//
+// The lifetime of the corresponding owning object must exceed the lifetimes of the ConstXXXX/UnownedXXXX types. They exist so you do not
+// have to fall back to C types and the API with the usual pitfalls. In general, do not use the C API from your C++ code.
+
+#pragma once
+#define ORT_NO_EXCEPTIONS TRUE
+#include "onnxruntime_c_api.h"
+#include "onnxruntime_float16.h"
+
+#include <cstddef>
+#include <cstdio>
+#include <array>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <utility>
+#include <type_traits>
+
+#ifdef ORT_NO_EXCEPTIONS
+#include <iostream>
+#endif
+
+/** \brief All C++ Onnxruntime APIs are defined inside this namespace
+ *
+ */
+namespace Ort {
+
+/** \brief All C++ methods that can fail will throw an exception of this type
+ *
+ * If <tt>ORT_NO_EXCEPTIONS</tt> is defined, then any error will result in a call to abort()
+ */
+struct Exception : std::exception {
+  Exception(std::string&& string, OrtErrorCode code) : message_{std::move(string)}, code_{code} {}
+
+  OrtErrorCode GetOrtErrorCode() const { return code_; }
+  const char* what() const noexcept override { return message_.c_str(); }
+
+ private:
+  std::string message_;
+  OrtErrorCode code_;
+};
+
+//#define ORT_NO_EXCEPTIONS
+#ifdef ORT_NO_EXCEPTIONS
+// The #ifndef is for the very special case where the user of this library wants to define their own way of handling errors.
+// NOTE: This header expects control flow to not continue after calling ORT_CXX_API_THROW
+#ifndef ORT_CXX_API_THROW
+#define ORT_CXX_API_THROW(string, code)       \
+  while(false) {                                        \
+    std::cerr << Ort::Exception(string, code) \
+                     .what()                  \
+              << std::endl;                   \
+    abort();                                  \
+  }
+#endif
+#else
+#define ORT_CXX_API_THROW(string, code) \
+  throw Ort::Exception(string, code)
+#endif
+
+// This is used internally by the C++ API. This class holds the global variable that points to the OrtApi.
+// It is a template so that the static member can be defined in a header, which keeps
+// it transparent to the users of the API.
+template <typename T>
+struct Global {
+  static const OrtApi* api_;
+};
+
+// If macro ORT_API_MANUAL_INIT is defined, api_ starts out null and the user must call InitApi() before using the API. The template header below belongs to whichever api_ definition the #ifdef selects.
+template <typename T>
+#ifdef ORT_API_MANUAL_INIT
+const OrtApi* Global<T>::api_{};
+inline void InitApi() noexcept { Global<void>::api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION); }
+
+// Used by custom operator libraries that are not linked to onnxruntime. Sets the global API object, which is
+// required by the C++ APIs, from a pointer supplied by the caller.
+//
+// Example mycustomop.cc:
+//
+// #define ORT_API_MANUAL_INIT
+// #include <onnxruntime_cxx_api.h>
+// #undef ORT_API_MANUAL_INIT
+//
+// OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api_base) {
+//   Ort::InitApi(api_base->GetApi(ORT_API_VERSION));
+//   // ...
+// }
+//
+inline void InitApi(const OrtApi* api) noexcept { Global<void>::api_ = api; }
+#else
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+// "Global initializer calls a non-constexpr function." Therefore you can't use ORT APIs in the other global initializers.
+// Please define ORT_API_MANUAL_INIT if it concerns you.
+#pragma warning(disable : 26426)
+#endif
+const OrtApi* Global<T>::api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION);
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
+#endif
+
+/// Returns a reference to the OrtApi interface in use, i.e. the object Global<void>::api_ points to
+inline const OrtApi& GetApi() noexcept { const OrtApi* const active = Global<void>::api_; return *active; }
+
+/// <summary>
+/// Returns the onnxruntime version string.
+/// </summary>
+/// <returns>version string major.minor.rev</returns>
+std::string GetVersionString();
+
+/// <summary>
+/// Returns the onnxruntime build information, including git branch,
+/// git commit id, build type (Debug/Release/RelWithDebInfo) and cmake cpp flags.
+/// </summary>
+/// <returns>string</returns>
+std::string GetBuildInfoString();
+
+/// <summary>
+/// C++ wrapper for OrtApi::GetAvailableProviders(); returns a vector of strings
+/// representing the available execution providers.
+/// </summary>
+/// <returns>vector of strings</returns>
+std::vector<std::string> GetAvailableProviders();
+
+/** \brief IEEE 754 half-precision floating point data type
+ *
+ * \details This struct is used for converting float to float16 and back
+ * so the user can feed inputs and fetch outputs using this type.
+ *
+ * The size of the structure should align with uint16_t and one can freely cast
+ * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data.
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to float16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::Float16_t> fp16_values;
+ * fp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values),
+ *     [](float value) { return Ort::Float16_t(value); });
+ *
+ * \endcode
+ */
+struct Float16_t : onnxruntime_float16::Float16Impl<Float16_t> {
+ private:
+  /// <summary>
+  /// Constructor from a 16-bit representation of a float16 value.
+  /// No conversion is done here. Private; FromBits() is the public way in.
+  /// </summary>
+  /// <param name="v">16-bit representation</param>
+  constexpr explicit Float16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = onnxruntime_float16::Float16Impl<Float16_t>;
+
+  /// <summary>
+  /// Default constructor
+  /// </summary>
+  Float16_t() = default;
+
+  /// <summary>
+  /// Creates a Float16_t directly from the uint16_t bit representation of a float16. No conversion is done.
+  /// </summary>
+  /// <param name="v">uint16_t bit representation of float16</param>
+  /// <returns>new instance of Float16_t</returns>
+  constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); }
+
+  /// <summary>
+  /// Constructor from float. Float is converted into float16 16-bit representation.
+  /// </summary>
+  /// <param name="v">float value</param>
+  explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  /// <summary>
+  /// Converts float16 to float
+  /// </summary>
+  /// <returns>float representation of float16 value</returns>
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  /// <summary>
+  /// Checks if the value is negative
+  /// </summary>
+  /// <returns>true if negative</returns>
+  using Base::IsNegative;
+
+  /// <summary>
+  /// Tests if the value is NaN
+  /// </summary>
+  /// <returns>true if NaN</returns>
+  using Base::IsNaN;
+
+  /// <summary>
+  /// Tests if the value is finite
+  /// </summary>
+  /// <returns>true if finite</returns>
+  using Base::IsFinite;
+
+  /// <summary>
+  /// Tests if the value represents positive infinity.
+  /// </summary>
+  /// <returns>true if positive infinity</returns>
+  using Base::IsPositiveInfinity;
+
+  /// <summary>
+  /// Tests if the value represents negative infinity
+  /// </summary>
+  /// <returns>true if negative infinity</returns>
+  using Base::IsNegativeInfinity;
+
+  /// <summary>
+  /// Tests if the value is either positive or negative infinity.
+  /// </summary>
+  /// <returns>True if absolute value is infinity</returns>
+  using Base::IsInfinity;
+
+  /// <summary>
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  /// </summary>
+  /// <returns>True if NaN or zero.</returns>
+  using Base::IsNaNOrZero;
+
+  /// <summary>
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  /// </summary>
+  /// <returns>True if so</returns>
+  using Base::IsNormal;
+
+  /// <summary>
+  /// Tests if the value is subnormal (denormal).
+  /// </summary>
+  /// <returns>True if so</returns>
+  using Base::IsSubnormal;
+
+  /// <summary>
+  /// Creates an instance that represents absolute value.
+  /// </summary>
+  /// <returns>Absolute value</returns>
+  using Base::Abs;
+
+  /// <summary>
+  /// Creates a new instance with the sign flipped.
+  /// </summary>
+  /// <returns>Flipped sign instance</returns>
+  using Base::Negate;
+
+  /// <summary>
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  /// </summary>
+  /// <param name="lhs">first value</param>
+  /// <param name="rhs">second value</param>
+  /// <returns>True if both arguments represent zero</returns>
+  using Base::AreZero;
+
+  /// <summary>
+  /// User defined conversion operator. Converts Float16_t to float. Explicit, so a cast is required.
+  /// </summary>
+  explicit operator float() const noexcept { return ToFloat(); }
+
+  using Base::operator==;
+  using Base::operator!=;
+  using Base::operator<;
+};
+
+static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match");
+
+/** \brief bfloat16 (Brain Floating Point) data type
+ *
+ * \details This struct is used for converting float to bfloat16 and back
+ * so the user can feed inputs and fetch outputs using this type.
+ *
+ * The size of the structure should align with uint16_t and one can freely cast
+ * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data.
+ *
+ * \code{.unparsed}
+ * // This example demonstrates conversion from float to bfloat16
+ * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f};
+ * std::vector<Ort::BFloat16_t> bfp16_values;
+ * bfp16_values.reserve(std::size(values));
+ * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values),
+ *     [](float value) { return Ort::BFloat16_t(value); });
+ *
+ * \endcode
+ */
+struct BFloat16_t : onnxruntime_float16::BFloat16Impl<BFloat16_t> {
+ private:
+  /// <summary>
+  /// Constructor from a uint16_t representation of bfloat16
+  /// used in FromBits() to escape overload resolution issue with
+  /// constructor from float.
+  /// No conversion is done.
+  /// </summary>
+  /// <param name="v">16-bit bfloat16 value</param>
+  constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; }
+
+ public:
+  using Base = onnxruntime_float16::BFloat16Impl<BFloat16_t>;
+
+  BFloat16_t() = default;
+
+  /// <summary>
+  /// Creates a BFloat16_t directly from the uint16_t bit representation of a bfloat16. No conversion is done.
+  /// </summary>
+  /// <param name="v">uint16_t bit representation of bfloat16</param>
+  /// <returns>new instance of BFloat16_t</returns>
+  static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); }
+
+  /// <summary>
+  /// Constructor from float. Float is converted into bfloat16 16-bit representation.
+  /// </summary>
+  /// <param name="v">float value</param>
+  explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); }
+
+  /// <summary>
+  /// Converts bfloat16 to float
+  /// </summary>
+  /// <returns>float representation of bfloat16 value</returns>
+  float ToFloat() const noexcept { return Base::ToFloatImpl(); }
+
+  /// <summary>
+  /// Checks if the value is negative
+  /// </summary>
+  /// <returns>true if negative</returns>
+  using Base::IsNegative;
+
+  /// <summary>
+  /// Tests if the value is NaN
+  /// </summary>
+  /// <returns>true if NaN</returns>
+  using Base::IsNaN;
+
+  /// <summary>
+  /// Tests if the value is finite
+  /// </summary>
+  /// <returns>true if finite</returns>
+  using Base::IsFinite;
+
+  /// <summary>
+  /// Tests if the value represents positive infinity.
+  /// </summary>
+  /// <returns>true if positive infinity</returns>
+  using Base::IsPositiveInfinity;
+
+  /// <summary>
+  /// Tests if the value represents negative infinity
+  /// </summary>
+  /// <returns>true if negative infinity</returns>
+  using Base::IsNegativeInfinity;
+
+  /// <summary>
+  /// Tests if the value is either positive or negative infinity.
+  /// </summary>
+  /// <returns>True if absolute value is infinity</returns>
+  using Base::IsInfinity;
+
+  /// <summary>
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  /// </summary>
+  /// <returns>True if NaN or zero.</returns>
+  using Base::IsNaNOrZero;
+
+  /// <summary>
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  /// </summary>
+  /// <returns>True if so</returns>
+  using Base::IsNormal;
+
+  /// <summary>
+  /// Tests if the value is subnormal (denormal).
+  /// </summary>
+  /// <returns>True if so</returns>
+  using Base::IsSubnormal;
+
+  /// <summary>
+  /// Creates an instance that represents absolute value.
+  /// </summary>
+  /// <returns>Absolute value</returns>
+  using Base::Abs;
+
+  /// <summary>
+  /// Creates a new instance with the sign flipped.
+  /// </summary>
+  /// <returns>Flipped sign instance</returns>
+  using Base::Negate;
+
+  /// <summary>
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  /// </summary>
+  /// <param name="lhs">first value</param>
+  /// <param name="rhs">second value</param>
+  /// <returns>True if both arguments represent zero</returns>
+  using Base::AreZero;
+
+  /// <summary>
+  /// User defined conversion operator. Converts BFloat16_t to float. Explicit, so a cast is required.
+  /// </summary>
+  explicit operator float() const noexcept { return ToFloat(); }
+
+  // We do not have an inherited impl for the below operators
+  // as the internal class implements them a little differently.
+  bool operator==(const BFloat16_t& rhs) const noexcept;
+  bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); }
+  bool operator<(const BFloat16_t& rhs) const noexcept;
+};
+
+static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
+
+/** \brief float8e4m3fn (Float8 Floating Point) data type
+ * \details Thin wrapper around one raw byte, needed so the C++ API can dispatch on this element type.
+ * Converts implicitly to and from uint8_t; the stored bits are never interpreted here.
+ * See https://onnx.ai/onnx/technical/float8.html for further details.
+ */
+struct Float8E4M3FN_t {
+  uint8_t value;
+  constexpr Float8E4M3FN_t() noexcept : value{0} {}
+  constexpr Float8E4M3FN_t(uint8_t v) noexcept : value{v} {}
+  constexpr operator uint8_t() const noexcept { return value; }
+  // Comparison is on the raw byte, so NaN encodings compare like any other value.
+  constexpr bool operator==(const Float8E4M3FN_t& rhs) const noexcept { return rhs.value == value; }
+  constexpr bool operator!=(const Float8E4M3FN_t& rhs) const noexcept { return !(*this == rhs); }
+};
+
+static_assert(sizeof(uint8_t) == sizeof(Float8E4M3FN_t), "Sizes must match");
+
+/** \brief float8e4m3fnuz (Float8 Floating Point) data type
+ * \details Thin wrapper around one raw byte, needed so the C++ API can dispatch on this element type.
+ * Converts implicitly to and from uint8_t; the stored bits are never interpreted here.
+ * See https://onnx.ai/onnx/technical/float8.html for further details.
+ */
+struct Float8E4M3FNUZ_t {
+  uint8_t value;
+  constexpr Float8E4M3FNUZ_t() noexcept : value{0} {}
+  constexpr Float8E4M3FNUZ_t(uint8_t v) noexcept : value{v} {}
+  constexpr operator uint8_t() const noexcept { return value; }
+  // Comparison is on the raw byte, so NaN encodings compare like any other value.
+  constexpr bool operator==(const Float8E4M3FNUZ_t& rhs) const noexcept { return rhs.value == value; }
+  constexpr bool operator!=(const Float8E4M3FNUZ_t& rhs) const noexcept { return !(*this == rhs); }
+};
+
+static_assert(sizeof(uint8_t) == sizeof(Float8E4M3FNUZ_t), "Sizes must match");
+
+/** \brief float8e5m2 (Float8 Floating Point) data type
+ * \details Thin wrapper around one raw byte, needed so the C++ API can dispatch on this element type.
+ * Converts implicitly to and from uint8_t; the stored bits are never interpreted here.
+ * See https://onnx.ai/onnx/technical/float8.html for further details.
+ */
+struct Float8E5M2_t {
+  uint8_t value;
+  constexpr Float8E5M2_t() noexcept : value{0} {}
+  constexpr Float8E5M2_t(uint8_t v) noexcept : value{v} {}
+  constexpr operator uint8_t() const noexcept { return value; }
+  // Comparison is on the raw byte, so NaN encodings compare like any other value.
+  constexpr bool operator==(const Float8E5M2_t& rhs) const noexcept { return rhs.value == value; }
+  constexpr bool operator!=(const Float8E5M2_t& rhs) const noexcept { return !(*this == rhs); }
+};
+
+static_assert(sizeof(uint8_t) == sizeof(Float8E5M2_t), "Sizes must match");
+
+/** \brief float8e5m2fnuz (Float8 Floating Point) data type
+ * \details Thin wrapper around one raw byte, needed so the C++ API can dispatch on this element type.
+ * Converts implicitly to and from uint8_t; the stored bits are never interpreted here.
+ * See https://onnx.ai/onnx/technical/float8.html for further details.
+ */
+struct Float8E5M2FNUZ_t {
+  uint8_t value;
+  constexpr Float8E5M2FNUZ_t() noexcept : value{0} {}
+  constexpr Float8E5M2FNUZ_t(uint8_t v) noexcept : value{v} {}
+  constexpr operator uint8_t() const noexcept { return value; }
+  // Comparison is on the raw byte, so NaN encodings compare like any other value.
+  constexpr bool operator==(const Float8E5M2FNUZ_t& rhs) const noexcept { return rhs.value == value; }
+  constexpr bool operator!=(const Float8E5M2FNUZ_t& rhs) const noexcept { return !(*this == rhs); }
+};
+
+static_assert(sizeof(uint8_t) == sizeof(Float8E5M2FNUZ_t), "Sizes must match");
+
+namespace detail {
+// Internal to the C++ API. Each expansion below adds one OrtRelease(OrtXXX*) overload that forwards to GetApi().ReleaseXXX, giving every Ort* type a release function under a single name.
+// The C API itself cannot offer this because C has no function overloading.
+#define ORT_DEFINE_RELEASE(KIND) \
+  inline void OrtRelease(Ort##KIND* obj) { GetApi().Release##KIND(obj); }
+
+ORT_DEFINE_RELEASE(Allocator);
+ORT_DEFINE_RELEASE(ArenaCfg);
+ORT_DEFINE_RELEASE(CustomOpDomain);
+ORT_DEFINE_RELEASE(Env);
+ORT_DEFINE_RELEASE(IoBinding);
+ORT_DEFINE_RELEASE(KernelInfo);
+ORT_DEFINE_RELEASE(MapTypeInfo);
+ORT_DEFINE_RELEASE(MemoryInfo);
+ORT_DEFINE_RELEASE(ModelMetadata);
+ORT_DEFINE_RELEASE(Op);
+ORT_DEFINE_RELEASE(OpAttr);
+ORT_DEFINE_RELEASE(RunOptions);
+ORT_DEFINE_RELEASE(SequenceTypeInfo);
+ORT_DEFINE_RELEASE(Session);
+ORT_DEFINE_RELEASE(SessionOptions);
+ORT_DEFINE_RELEASE(Status);
+ORT_DEFINE_RELEASE(TensorTypeAndShapeInfo);
+ORT_DEFINE_RELEASE(ThreadingOptions);
+ORT_DEFINE_RELEASE(TypeInfo);
+ORT_DEFINE_RELEASE(Value);
+
+#undef ORT_DEFINE_RELEASE
+
+/** \brief Tag type: use Base<Unowned<T>> to state that the C++ interface object
+ *   does not own the underlying C object. The wrapped type is exposed as Type.
+ */
+template <typename T>
+struct Unowned {
+  typedef T Type;
+};
+
+/** \brief Used internally by the C++ API. C++ wrapper types inherit from this.
+ *   This is a zero cost abstraction to wrap the C API objects and delete them on destruction.
+ *
+ * All of the C++ classes
+ *  a) serve as containers for pointers to objects that are created by the underlying C API.
+ *     Their size is just a pointer size, no need to dynamically allocate them. Use them by value.
+ *  b) Each of struct XXXX, XXX instances function as smart pointers to the underlying C API objects.
+ *     they would release objects owned automatically when going out of scope, they are move-only.
+ *  c) ConstXXXX and UnownedXXX structs function as non-owning, copyable containers for the above pointers.
+ *     ConstXXXX allow calling const interfaces only. They give access to objects that are owned by somebody else
+ *     such as Onnxruntime or instances of XXXX classes.
+ *  d) serve convenient interfaces that return C++ objects and further enhance exception and type safety so they can be used
+ *     in C++ code.
+ *
+ */
+
+/// <summary>
+/// Move-only owning holder of a non-const C API pointer. Calls OrtRelease on the pointer on destruction.
+/// </summary>
+template <typename T>
+struct Base {
+  using contained_type = T;
+
+  constexpr Base() = default;
+  constexpr explicit Base(contained_type* p) noexcept : p_{p} {}  ///< Takes ownership of p
+  ~Base() { OrtRelease(p_); }
+
+  Base(const Base&) = delete;
+  Base& operator=(const Base&) = delete;
+
+  Base(Base&& v) noexcept : p_{v.p_} { v.p_ = nullptr; }
+  Base& operator=(Base&& v) noexcept {
+    // Self-move guard: releasing p_ first would leave *this holding a dangling pointer that is released again later.
+    if (this != &v) { OrtRelease(p_); p_ = v.release(); }
+    return *this;
+  }
+
+  constexpr operator contained_type*() const noexcept { return p_; }
+
+  /// \brief Relinquishes ownership of the contained C object pointer
+  /// The underlying object is not destroyed
+  contained_type* release() {
+    T* p = p_;
+    p_ = nullptr;
+    return p;
+  }
+
+ protected:
+  contained_type* p_{};
+};
+
+// Declared but intentionally left undefined. For const types use Base<Unowned<const T>>
+template <typename T>
+struct Base<const T>;
+
+/// <summary>
+/// Covers unowned pointers owned by either ORT
+/// or some other instance of the C++ wrappers.
+/// Used for ConstXXX and UnownedXXXX types that are copyable; nothing is released on destruction.
+/// Also convenient to wrap raw OrtXX pointers.
+/// </summary>
+/// <typeparam name="T">The C API type pointed to (may be const-qualified)</typeparam>
+template <typename T>
+struct Base<Unowned<T>> {
+  using contained_type = typename Unowned<T>::Type;
+
+  constexpr Base() = default;
+  constexpr explicit Base(contained_type* p) noexcept : p_{p} {}
+
+  ~Base() = default;
+
+  Base(const Base&) = default;
+  Base& operator=(const Base&) = default;
+
+  Base(Base&& v) noexcept : p_{v.p_} { v.p_ = nullptr; }
+  Base& operator=(Base&& v) noexcept {
+    // Self-move guard: unconditionally nulling p_ first would make a self-move drop the pointer.
+    if (this != &v) { p_ = v.p_; v.p_ = nullptr; }
+    return *this;
+  }
+
+  constexpr operator contained_type*() const noexcept { return p_; }
+
+ protected:
+  contained_type* p_{};
+};
+
+// Deleter functor: hands a buffer back to the stored OrtAllocator through its Free callback
+struct AllocatedFree {
+  OrtAllocator* allocator_;
+  explicit AllocatedFree(OrtAllocator* allocator) : allocator_{allocator} {}
+  void operator()(void* ptr) const {
+    if (ptr == nullptr) return;  // null pointers are ignored
+    allocator_->Free(allocator_, ptr);
+  }
+};
+
+}  // namespace detail
+
+struct AllocatorWithDefaultOptions;
+struct Env;
+struct TypeInfo;
+struct Value;
+struct ModelMetadata;
+
+/** \brief unique_ptr typedef used to own strings allocated by OrtAllocators
+ *  and release them at the end of the scope. The given allocator
+ *  must outlive the AllocatedStringPtr instance.
+ */
+using AllocatedStringPtr = std::unique_ptr<char, detail::AllocatedFree>;
+
+/** \brief Owning wrapper for an OrtStatus received from the C API.
+ *  Use it to safely destroy an OrtStatus* returned from the C API. Use the appropriate
+ *  constructors to construct an instance of a Status object from exceptions.
+ */
+struct Status : detail::Base<OrtStatus> {
+  explicit Status(std::nullptr_t) noexcept {}               ///< Create an empty object, must be assigned a valid one to be used
+  explicit Status(OrtStatus* status) noexcept;              ///< Takes ownership of OrtStatus instance returned from the C API.
+  explicit Status(const Exception&) noexcept;               ///< Creates a status instance out of an Exception
+  explicit Status(const std::exception&) noexcept;          ///< Creates a status instance out of a std::exception
+  Status(const char* message, OrtErrorCode code) noexcept;  ///< Creates status instance out of null-terminated string message.
+  std::string GetErrorMessage() const;  ///< NOTE(review): result for an empty/OK status is decided by the out-of-line definition - confirm
+  OrtErrorCode GetErrorCode() const;    ///< NOTE(review): result for an empty/OK status is decided by the out-of-line definition - confirm
+  bool IsOK() const noexcept;  ///< Returns true if instance represents an OK (non-error) status.
+};
+
+/** \brief The ThreadingOptions
+ *
+ * Used to set the options of the global thread pools of the Env.
+ */
+struct ThreadingOptions : detail::Base<OrtThreadingOptions> {
+  /// \brief Wraps OrtApi::CreateThreadingOptions
+  ThreadingOptions();
+
+  /// \brief Wraps OrtApi::SetGlobalIntraOpNumThreads
+  ThreadingOptions& SetGlobalIntraOpNumThreads(int intra_op_num_threads);
+
+  /// \brief Wraps OrtApi::SetGlobalInterOpNumThreads
+  ThreadingOptions& SetGlobalInterOpNumThreads(int inter_op_num_threads);
+
+  /// \brief Wraps OrtApi::SetGlobalSpinControl
+  ThreadingOptions& SetGlobalSpinControl(int allow_spinning);
+
+  /// \brief Wraps OrtApi::SetGlobalDenormalAsZero
+  ThreadingOptions& SetGlobalDenormalAsZero();
+
+  /// \brief Wraps OrtApi::SetGlobalCustomCreateThreadFn
+  ThreadingOptions& SetGlobalCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn);
+
+  /// \brief Wraps OrtApi::SetGlobalCustomThreadCreationOptions
+  ThreadingOptions& SetGlobalCustomThreadCreationOptions(void* ort_custom_thread_creation_options);
+
+  /// \brief Wraps OrtApi::SetGlobalCustomJoinThreadFn
+  ThreadingOptions& SetGlobalCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn);
+};
+
+/** \brief The Env (Environment)
+ *
+ * The Env holds the logging state used by all other objects.
+ * <b>Note:</b> One Env must be created before using any other Onnxruntime functionality
+ */
+struct Env : detail::Base<OrtEnv> {
+  explicit Env(std::nullptr_t) {}  ///< Create an empty Env object, must be assigned a valid one to be used
+
+  /// \brief Wraps OrtApi::CreateEnv. All arguments are defaulted, so this also serves as the default constructor.
+  Env(OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_WARNING, _In_ const char* logid = "");
+
+  /// \brief Wraps OrtApi::CreateEnvWithCustomLogger
+  Env(OrtLoggingLevel logging_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param);
+
+  /// \brief Wraps OrtApi::CreateEnvWithGlobalThreadPools
+  Env(const OrtThreadingOptions* tp_options, OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_WARNING, _In_ const char* logid = "");
+
+  /// \brief Wraps OrtApi::CreateEnvWithCustomLoggerAndGlobalThreadPools
+  Env(const OrtThreadingOptions* tp_options, OrtLoggingFunction logging_function, void* logger_param,
+      OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_WARNING, _In_ const char* logid = "");
+
+  /// \brief C Interop Helper: takes ownership of an existing OrtEnv*
+  explicit Env(OrtEnv* p) : Base<OrtEnv>{p} {}
+
+  Env& EnableTelemetryEvents();   ///< Wraps OrtApi::EnableTelemetryEvents
+  Env& DisableTelemetryEvents();  ///< Wraps OrtApi::DisableTelemetryEvents
+
+  Env& UpdateEnvWithCustomLogLevel(OrtLoggingLevel log_severity_level);  ///< Wraps OrtApi::UpdateEnvWithCustomLogLevel
+
+  Env& CreateAndRegisterAllocator(const OrtMemoryInfo* mem_info, const OrtArenaCfg* arena_cfg);  ///< Wraps OrtApi::CreateAndRegisterAllocator
+
+  Env& CreateAndRegisterAllocatorV2(const std::string& provider_type, const OrtMemoryInfo* mem_info, const std::unordered_map<std::string, std::string>& options, const OrtArenaCfg* arena_cfg);  ///< Wraps OrtApi::CreateAndRegisterAllocatorV2
+};
+
+/** \brief Custom Op Domain
+ *
+ */
+struct CustomOpDomain : detail::Base<OrtCustomOpDomain> {
+  explicit CustomOpDomain(std::nullptr_t) {}  ///< Create an empty CustomOpDomain object, must be assigned a valid one to be used
+
+  /// \brief Wraps OrtApi::CreateCustomOpDomain
+  explicit CustomOpDomain(const char* domain);
+
+  // This does not take ownership of the op, simply registers it.
+  void Add(const OrtCustomOp* op);  ///< Wraps OrtApi::CustomOpDomain_Add
+};
+
+/** \brief RunOptions
+ *
+ */
+struct RunOptions : detail::Base<OrtRunOptions> {
+  explicit RunOptions(std::nullptr_t) {}  ///< Create an empty RunOptions object, must be assigned a valid one to be used
+  RunOptions();                           ///< Wraps OrtApi::CreateRunOptions
+
+  RunOptions& SetRunLogVerbosityLevel(int);  ///< Wraps OrtApi::RunOptionsSetRunLogVerbosityLevel
+  int GetRunLogVerbosityLevel() const;       ///< Wraps OrtApi::RunOptionsGetRunLogVerbosityLevel
+
+  RunOptions& SetRunLogSeverityLevel(int);  ///< Wraps OrtApi::RunOptionsSetRunLogSeverityLevel
+  int GetRunLogSeverityLevel() const;       ///< Wraps OrtApi::RunOptionsGetRunLogSeverityLevel
+
+  RunOptions& SetRunTag(const char* run_tag);  ///< Wraps OrtApi::RunOptionsSetRunTag
+  const char* GetRunTag() const;               ///< Wraps OrtApi::RunOptionsGetRunTag
+
+  RunOptions& AddConfigEntry(const char* config_key, const char* config_value);  ///< Wraps OrtApi::AddRunConfigEntry
+
+  /** \brief Terminates all currently executing Session::Run calls that were made using this RunOptions instance
+   *
+   * If a currently executing session needs to be force terminated, this can be called from another thread to force it to fail with an error.
+   * Wraps OrtApi::RunOptionsSetTerminate
+   */
+  RunOptions& SetTerminate();
+
+  /** \brief Clears the terminate flag so this RunOptions instance can be used in a new Session::Run call without it instantly terminating
+   *
+   * Wraps OrtApi::RunOptionsUnsetTerminate
+   */
+  RunOptions& UnsetTerminate();
+};
+
+namespace detail {
+// Utility function that returns a SessionOption config entry key for a specific custom operator.
+// Ex: custom_op.[custom_op_name].[config]
+std::string MakeCustomOpConfigEntryKey(const char* custom_op_name, const char* config);
+}  // namespace detail
+
+/// <summary>
+/// Class that represents session configuration entries for one or more custom operators.
+///
+/// Example:
+///   Ort::CustomOpConfigs op_configs;
+///   op_configs.AddConfig("my_custom_op", "device_type", "CPU");
+///
+/// Passed to Ort::SessionOptions::RegisterCustomOpsLibrary.
+/// </summary>
+struct CustomOpConfigs {
+  CustomOpConfigs() = default;
+  ~CustomOpConfigs() = default;
+  CustomOpConfigs(const CustomOpConfigs&) = default;
+  CustomOpConfigs& operator=(const CustomOpConfigs&) = default;
+  CustomOpConfigs(CustomOpConfigs&& o) = default;
+  CustomOpConfigs& operator=(CustomOpConfigs&& o) = default;
+
+  /** \brief Adds a session configuration entry/value for a specific custom operator.
+   *
+   * \param custom_op_name The name of the custom operator for which to add a configuration entry.
+   *                       Must match the name returned by the CustomOp's GetName() method.
+   * \param config_key The name of the configuration entry.
+   * \param config_value The value of the configuration entry.
+   * \return A reference to this object to enable call chaining.
+   */
+  CustomOpConfigs& AddConfig(const char* custom_op_name, const char* config_key, const char* config_value);
+
+  /** \brief Returns a flattened map of custom operator configuration entries and their values.
+   *
+   * The keys have been flattened to include both the custom operator name and the configuration entry key name.
+   * For example, a prior call to AddConfig("my_op", "key", "value") corresponds to the flattened key/value pair
+   * {"my_op.key", "value"}.
+   * NOTE(review): detail::MakeCustomOpConfigEntryKey documents a "custom_op.[custom_op_name].[config]" form - confirm the actual key prefix.
+   * \return An unordered map of flattened configurations.
+   */
+  const std::unordered_map<std::string, std::string>& GetFlattenedConfigs() const;
+
+ private:
+  std::unordered_map<std::string, std::string> flat_configs_;
+};
+
+/** \brief Options object used when creating a new Session object
+ *
+ * Wraps ::OrtSessionOptions object and methods
+ */
+
+struct SessionOptions;
+
+namespace detail {
+// We separate the const-only methods because passing a const pointer to non-const methods
+// is only discovered when the inline methods are compiled, which is counter-intuitive.
+template <typename T>
+struct ConstSessionOptionsImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+
+  SessionOptions Clone() const;  ///< Creates and returns a copy of this SessionOptions object. Wraps OrtApi::CloneSessionOptions
+
+  std::string GetConfigEntry(const char* config_key) const;  ///< Wraps OrtApi::GetSessionConfigEntry
+  bool HasConfigEntry(const char* config_key) const;         ///< Wraps OrtApi::HasSessionConfigEntry
+  std::string GetConfigEntryOrDefault(const char* config_key, const std::string& def);  ///< NOTE(review): not const-qualified unlike its siblings; presumably returns def when config_key is absent - confirm
+};
+
+template <typename T>
+struct SessionOptionsImpl : ConstSessionOptionsImpl<T> {
+  using B = ConstSessionOptionsImpl<T>;
+  using B::B;
+
+  SessionOptionsImpl& SetIntraOpNumThreads(int intra_op_num_threads);                              ///< Wraps OrtApi::SetIntraOpNumThreads
+  SessionOptionsImpl& SetInterOpNumThreads(int inter_op_num_threads);                              ///< Wraps OrtApi::SetInterOpNumThreads
+  SessionOptionsImpl& SetGraphOptimizationLevel(GraphOptimizationLevel graph_optimization_level);  ///< Wraps OrtApi::SetSessionGraphOptimizationLevel
+
+  SessionOptionsImpl& EnableCpuMemArena();   ///< Wraps OrtApi::EnableCpuMemArena
+  SessionOptionsImpl& DisableCpuMemArena();  ///< Wraps OrtApi::DisableCpuMemArena
+
+  SessionOptionsImpl& SetOptimizedModelFilePath(const ORTCHAR_T* optimized_model_file);  ///< Wraps OrtApi::SetOptimizedModelFilePath
+
+  SessionOptionsImpl& EnableProfiling(const ORTCHAR_T* profile_file_prefix);  ///< Wraps OrtApi::EnableProfiling
+  SessionOptionsImpl& DisableProfiling();                                     ///< Wraps OrtApi::DisableProfiling
+
+  SessionOptionsImpl& EnableOrtCustomOps();  ///< Wraps OrtApi::EnableOrtCustomOps
+
+  SessionOptionsImpl& EnableMemPattern();   ///< Wraps OrtApi::EnableMemPattern
+  SessionOptionsImpl& DisableMemPattern();  ///< Wraps OrtApi::DisableMemPattern
+
+  SessionOptionsImpl& SetExecutionMode(ExecutionMode execution_mode);  ///< Wraps OrtApi::SetSessionExecutionMode
+
+  SessionOptionsImpl& SetLogId(const char* logid);     ///< Wraps OrtApi::SetSessionLogId
+  SessionOptionsImpl& SetLogSeverityLevel(int level);  ///< Wraps OrtApi::SetSessionLogSeverityLevel
+
+  SessionOptionsImpl& Add(OrtCustomOpDomain* custom_op_domain);  ///< Wraps OrtApi::AddCustomOpDomain
+
+  SessionOptionsImpl& DisablePerSessionThreads();  ///< Wraps OrtApi::DisablePerSessionThreads
+
+  SessionOptionsImpl& AddConfigEntry(const char* config_key, const char* config_value);  ///< Wraps OrtApi::AddSessionConfigEntry
+
+  SessionOptionsImpl& AddInitializer(const char* name, const OrtValue* ort_val);                                             ///< Wraps OrtApi::AddInitializer
+  SessionOptionsImpl& AddExternalInitializers(const std::vector<std::string>& names, const std::vector<Value>& ort_values);  ///< Wraps OrtApi::AddExternalInitializers
+
+  SessionOptionsImpl& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options);               ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA
+  SessionOptionsImpl& AppendExecutionProvider_CUDA_V2(const OrtCUDAProviderOptionsV2& provider_options);          ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CUDA_V2
+  SessionOptionsImpl& AppendExecutionProvider_ROCM(const OrtROCMProviderOptions& provider_options);               ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_ROCM
+  SessionOptionsImpl& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options);       ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO
+  SessionOptionsImpl& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options);       ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
+  SessionOptionsImpl& AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options);  ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT_V2
+  SessionOptionsImpl& AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options);       ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX
+  /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_CANN
+  SessionOptionsImpl& AppendExecutionProvider_CANN(const OrtCANNProviderOptions& provider_options);
+  /// Wraps OrtApi::SessionOptionsAppendExecutionProvider_Dnnl
+  SessionOptionsImpl& AppendExecutionProvider_Dnnl(const OrtDnnlProviderOptions& provider_options);
+  /// Wraps OrtApi::SessionOptionsAppendExecutionProvider. Currently supports QNN, SNPE and XNNPACK.
+  SessionOptionsImpl& AppendExecutionProvider(const std::string& provider_name,
+                                              const std::unordered_map<std::string, std::string>& provider_options = {});
+
+  SessionOptionsImpl& SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn);  ///< Wraps OrtApi::SessionOptionsSetCustomCreateThreadFn
+  SessionOptionsImpl& SetCustomThreadCreationOptions(void* ort_custom_thread_creation_options);      ///< Wraps OrtApi::SessionOptionsSetCustomThreadCreationOptions
+  SessionOptionsImpl& SetCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn);        ///< Wraps OrtApi::SessionOptionsSetCustomJoinThreadFn
+
+  /// Registers the custom operator from the specified shared library via OrtApi::RegisterCustomOpsLibrary_V2.
+  /// The custom operator configurations are optional. If provided, custom operator configs are set via
+  /// OrtApi::AddSessionConfigEntry.
+  SessionOptionsImpl& RegisterCustomOpsLibrary(const ORTCHAR_T* library_name, const CustomOpConfigs& custom_op_configs = {});
+
+  SessionOptionsImpl& RegisterCustomOpsUsingFunction(const char* function_name);  ///< Wraps OrtApi::RegisterCustomOpsUsingFunction
+};
+}  // namespace detail
+
+using UnownedSessionOptions = detail::SessionOptionsImpl<detail::Unowned<OrtSessionOptions>>;
+using ConstSessionOptions = detail::ConstSessionOptionsImpl<detail::Unowned<const OrtSessionOptions>>;
+
+/** \brief Owning wrapper around ::OrtSessionOptions
+ *
+ */
+struct SessionOptions : detail::SessionOptionsImpl<OrtSessionOptions> {
+  explicit SessionOptions(std::nullptr_t) {}                                                   ///< Create an empty SessionOptions object, must be assigned a valid one to be used
+  SessionOptions();                                                                            ///< Wraps OrtApi::CreateSessionOptions
+  explicit SessionOptions(OrtSessionOptions* p) : SessionOptionsImpl<OrtSessionOptions>{p} {}  ///< Used for interop with the C API
+  UnownedSessionOptions GetUnowned() const { return UnownedSessionOptions{this->p_}; }  ///< Non-owning handle to the same OrtSessionOptions
+  ConstSessionOptions GetConst() const { return ConstSessionOptions{this->p_}; }        ///< Non-owning, const-only handle to the same OrtSessionOptions
+};
+
+/** \brief Wrapper around ::OrtModelMetadata
+ *
+ */
+struct ModelMetadata : detail::Base<OrtModelMetadata> {
+  explicit ModelMetadata(std::nullptr_t) {}                                   ///< Create an empty ModelMetadata object, must be assigned a valid one to be used
+  explicit ModelMetadata(OrtModelMetadata* p) : Base<OrtModelMetadata>{p} {}  ///< Used for interop with the C API
+
+  /** \brief Returns a copy of the producer name.
+   *
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetProducerNameAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetProducerName
+
+  /** \brief Returns a copy of the graph name.
+   *
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetGraphNameAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetGraphName
+
+  /** \brief Returns a copy of the domain name.
+   *
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetDomainAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetDomain
+
+  /** \brief Returns a copy of the description.
+   *
+   * \param allocator to allocate memory for the copy of the string returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetDescriptionAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetDescription
+
+  /** \brief Returns a copy of the graph description.
+   *
+   * \param allocator to allocate memory for the copy of the string returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetGraphDescriptionAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetGraphDescription
+
+  /** \brief Returns a vector of copies of the custom metadata keys.
+   *
+   * \param allocator to allocate memory for the copy of the string returned
+   * \return a std::vector of smart pointers that deallocate the buffers when they go out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  std::vector<AllocatedStringPtr> GetCustomMetadataMapKeysAllocated(OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataGetCustomMetadataMapKeys
+
+  /** \brief Looks up a value by a key in the Custom Metadata map
+   *
+   * \param key zero terminated string key to lookup
+   * \param allocator to allocate memory for the copy of the string returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope;
+   *  may be nullptr if the key is not found.
+   *
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr LookupCustomMetadataMapAllocated(const char* key, OrtAllocator* allocator) const;  ///< Wraps OrtApi::ModelMetadataLookupCustomMetadataMap
+
+  int64_t GetVersion() const;  ///< Wraps OrtApi::ModelMetadataGetVersion
+};
+
+struct IoBinding;
+
+namespace detail {
+
+// We separate the const-only methods because passing a const pointer to non-const methods
+// is only discovered when the inline methods are compiled, which is counter-intuitive.
+template <typename T>
+struct ConstSessionImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;
+
+  size_t GetInputCount() const;                   ///< Returns the number of model inputs
+  size_t GetOutputCount() const;                  ///< Returns the number of model outputs
+  size_t GetOverridableInitializerCount() const;  ///< Returns the number of inputs that have defaults that can be overridden
+
+  /** \brief Returns a copy of the input name at the specified index.
+   *
+   * \param index must be less than the value returned by GetInputCount()
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetInputNameAllocated(size_t index, OrtAllocator* allocator) const;
+
+  /** \brief Returns a copy of the output name at the specified index.
+   *
+   * \param index must be less than the value returned by GetOutputCount()
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetOutputNameAllocated(size_t index, OrtAllocator* allocator) const;
+
+  /** \brief Returns a copy of the overridable initializer name at the specified index.
+   *
+   * \param index must be less than the value returned by GetOverridableInitializerCount()
+   * \param allocator to allocate memory for the copy of the name returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr GetOverridableInitializerNameAllocated(size_t index, OrtAllocator* allocator) const;  ///< Wraps OrtApi::SessionGetOverridableInitializerName
+
+  uint64_t GetProfilingStartTimeNs() const;  ///< Wraps OrtApi::SessionGetProfilingStartTimeNs
+  ModelMetadata GetModelMetadata() const;    ///< Wraps OrtApi::SessionGetModelMetadata
+
+  TypeInfo GetInputTypeInfo(size_t index) const;                   ///< Wraps OrtApi::SessionGetInputTypeInfo
+  TypeInfo GetOutputTypeInfo(size_t index) const;                  ///< Wraps OrtApi::SessionGetOutputTypeInfo
+  TypeInfo GetOverridableInitializerTypeInfo(size_t index) const;  ///< Wraps OrtApi::SessionGetOverridableInitializerTypeInfo
+};
+
+template <typename T>
+struct SessionImpl : ConstSessionImpl<T> {
+  using B = ConstSessionImpl<T>;
+  using B::B;
+
+  /** \brief Run the model returning results in an Ort allocated vector.
+   *
+   * Wraps OrtApi::Run
+   *
+   * The caller provides a list of inputs and a list of the desired outputs to return.
+   *
+   * See the output logs for more information on warnings/errors that occur while processing the model.
+   * Common errors are.. (TODO)
+   *
+   * \param[in] run_options
+   * \param[in] input_names Array of null terminated strings of length input_count that is the list of input names
+   * \param[in] input_values Array of Value objects of length input_count that is the list of input values
+   * \param[in] input_count Number of inputs (the size of the input_names & input_values arrays)
+   * \param[in] output_names Array of C style strings of length output_count that is the list of output names
+   * \param[in] output_count Number of outputs (the size of the output_names array)
+   * \return A std::vector of Value objects that directly maps to the output_names array (e.g. output_names[0] is the first entry of the returned vector)
+   */
+  std::vector<Value> Run(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+                         const char* const* output_names, size_t output_count);
+
+  /** \brief Run the model returning results in user provided outputs
+   * Same as Run(const RunOptions&, const char* const*, const Value*, size_t, const char* const*, size_t)
+   */
+  void Run(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+           const char* const* output_names, Value* output_values, size_t output_count);
+
+  void Run(const RunOptions& run_options, const IoBinding&);  ///< Wraps OrtApi::RunWithBinding
+
+  /** \brief Run the model asynchronously in a thread owned by the intra op thread pool
+   *
+   * Wraps OrtApi::RunAsync
+   *
+   * \param[in] run_options
+   * \param[in] input_names Array of null terminated UTF8 encoded strings of the input names
+   * \param[in] input_values Array of Value objects of length input_count
+   * \param[in] input_count Number of elements in the input_names and inputs arrays
+   * \param[in] output_names Array of null terminated UTF8 encoded strings of the output names
+   * \param[out] output_values Array of provided Values to be filled with outputs.
+   *             On calling RunAsync, output_values[i] could either be initialized by a null pointer or a preallocated OrtValue*.
+   *             Later, on invoking the callback, each null output_values[i] will be filled with an OrtValue* allocated by onnxruntime.
+   *             Then, an OrtValue** pointer will be cast from output_values and passed to the callback.
+   *             NOTE: it is the caller's duty to finally release output_values and each of its members,
+   *             regardless of whether the member (Ort::Value) is allocated by onnxruntime or preallocated by the caller.
+   * \param[in] output_count Number of elements in the output_names and outputs array
+   * \param[in] callback Callback function on model run completion
+   * \param[in] user_data User data that is passed back to the callback
+   */
+  void RunAsync(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+                const char* const* output_names, Value* output_values, size_t output_count, RunAsyncCallbackFn callback, void* user_data);
+
+  /** \brief End profiling and return a copy of the profiling file name.
+   *
+   * \param allocator to allocate memory for the copy of the string returned
+   * \return an instance of a smart pointer that deallocates the buffer when it goes out of scope.
+   *  The OrtAllocator instance must be valid at the point of memory release.
+   */
+  AllocatedStringPtr EndProfilingAllocated(OrtAllocator* allocator);  ///< Wraps OrtApi::SessionEndProfiling
+};
+
+}  // namespace detail
+
+using ConstSession = detail::ConstSessionImpl<detail::Unowned<const OrtSession>>;
+using UnownedSession = detail::SessionImpl<detail::Unowned<OrtSession>>;
+
+/** \brief Owning wrapper around ::OrtSession
+ *
+ */
+struct Session : detail::SessionImpl<OrtSession> {
+  explicit Session(std::nullptr_t) {}                                                   ///< Create an empty Session object, must be assigned a valid one to be used
+  Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options);  ///< Wraps OrtApi::CreateSession
+  Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options,
+          OrtPrepackedWeightsContainer* prepacked_weights_container);                                        ///< Wraps OrtApi::CreateSessionWithPrepackedWeightsContainer
+  Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options);  ///< Wraps OrtApi::CreateSessionFromArray
+  Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options,
+          OrtPrepackedWeightsContainer* prepacked_weights_container);  ///< Wraps OrtApi::CreateSessionFromArrayWithPrepackedWeightsContainer
+
+  ConstSession GetConst() const { return ConstSession{this->p_}; }        ///< Non-owning, const-only handle to the same OrtSession
+  UnownedSession GetUnowned() const { return UnownedSession{this->p_}; }  ///< Non-owning handle to the same OrtSession
+};
+
+namespace detail {
+template <typename T>
+struct MemoryInfoImpl : Base<T> {  // const accessors shared by MemoryInfo and ConstMemoryInfo
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+
+  std::string GetAllocatorName() const;
+  OrtAllocatorType GetAllocatorType() const;
+  int GetDeviceId() const;
+  OrtMemoryInfoDeviceType GetDeviceType() const;
+  OrtMemType GetMemoryType() const;
+
+  template <typename U>
+  bool operator==(const MemoryInfoImpl<U>& o) const;  // comparable across owning and non-owning instantiations
+};
+}  // namespace detail
+
+// Const object holder that does not own the underlying object
+using ConstMemoryInfo = detail::MemoryInfoImpl<detail::Unowned<const OrtMemoryInfo>>;
+
+/** \brief Owning wrapper around ::OrtMemoryInfo
+ *
+ */
+struct MemoryInfo : detail::MemoryInfoImpl<OrtMemoryInfo> {
+  static MemoryInfo CreateCpu(OrtAllocatorType type, OrtMemType mem_type1);  ///< NOTE(review): presumably a factory for CPU memory info - confirm against the inline definition
+  explicit MemoryInfo(std::nullptr_t) {}                                       ///< No instance is created
+  explicit MemoryInfo(OrtMemoryInfo* p) : MemoryInfoImpl<OrtMemoryInfo>{p} {}  ///< Take ownership of a pointer created by C Api
+  MemoryInfo(const char* name, OrtAllocatorType type, int id, OrtMemType mem_type);
+  ConstMemoryInfo GetConst() const { return ConstMemoryInfo{this->p_}; }  ///< Non-owning, const-only handle to the same OrtMemoryInfo
+};
+
+namespace detail {
+template <typename T>
+struct TensorTypeAndShapeInfoImpl : Base<T> {  // const accessors shared by TensorTypeAndShapeInfo and ConstTensorTypeAndShapeInfo
+  using B = Base<T>;
+  using B::B;
+
+  ONNXTensorElementDataType GetElementType() const;  ///< Wraps OrtApi::GetTensorElementType
+  size_t GetElementCount() const;                    ///< Wraps OrtApi::GetTensorShapeElementCount
+
+  size_t GetDimensionsCount() const;  ///< Wraps OrtApi::GetDimensionsCount
+
+  /** \deprecated use GetShape() returning std::vector
+   * [[deprecated]]
+   * This interface is unsafe to use
+   */
+  [[deprecated("use GetShape()")]] void GetDimensions(int64_t* values, size_t values_count) const;  ///< Wraps OrtApi::GetDimensions
+
+  void GetSymbolicDimensions(const char** values, size_t values_count) const;  ///< Wraps OrtApi::GetSymbolicDimensions
+
+  std::vector<int64_t> GetShape() const;  ///< Uses GetDimensionsCount & GetDimensions to return a std::vector of the shape
+};
+
+}  // namespace detail
+
+using ConstTensorTypeAndShapeInfo = detail::TensorTypeAndShapeInfoImpl<detail::Unowned<const OrtTensorTypeAndShapeInfo>>;  ///< Unowned const holder type returned by TensorTypeAndShapeInfo::GetConst()
+
+/** \brief Wrapper around ::OrtTensorTypeAndShapeInfo
+ *
+ */
+struct TensorTypeAndShapeInfo : detail::TensorTypeAndShapeInfoImpl<OrtTensorTypeAndShapeInfo> {
+  explicit TensorTypeAndShapeInfo(std::nullptr_t) {}                                                ///< Create an empty TensorTypeAndShapeInfo object, must be assigned a valid one to be used
+  explicit TensorTypeAndShapeInfo(OrtTensorTypeAndShapeInfo* p) : TensorTypeAndShapeInfoImpl{p} {}  ///< Used for interop with the C API
+  ConstTensorTypeAndShapeInfo GetConst() const { return ConstTensorTypeAndShapeInfo{this->p_}; }  ///< Unowned const view of the same pointer
+};
+
+namespace detail {
+template <typename T>
+struct SequenceTypeInfoImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+  TypeInfo GetSequenceElementType() const;  ///< Wraps OrtApi::GetSequenceElementType
+};
+
+}  // namespace detail
+
+using ConstSequenceTypeInfo = detail::SequenceTypeInfoImpl<detail::Unowned<const OrtSequenceTypeInfo>>;  ///< Unowned const holder type returned by SequenceTypeInfo::GetConst()
+
+/** \brief Wrapper around ::OrtSequenceTypeInfo
+ *
+ */
+struct SequenceTypeInfo : detail::SequenceTypeInfoImpl<OrtSequenceTypeInfo> {
+  explicit SequenceTypeInfo(std::nullptr_t) {}                                                         ///< Create an empty SequenceTypeInfo object, must be assigned a valid one to be used
+  explicit SequenceTypeInfo(OrtSequenceTypeInfo* p) : SequenceTypeInfoImpl<OrtSequenceTypeInfo>{p} {}  ///< Used for interop with the C API
+  ConstSequenceTypeInfo GetConst() const { return ConstSequenceTypeInfo{this->p_}; }  ///< Unowned const view of the same pointer
+};
+
+namespace detail {
+template <typename T>
+struct OptionalTypeInfoImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+  TypeInfo GetOptionalElementType() const;  ///< Wraps OrtApi::CastOptionalTypeToContainedTypeInfo
+};
+
+}  // namespace detail
+
+// The underlying OrtOptionalTypeInfo is always owned by the TypeInfo and can only be obtained from it.
+using ConstOptionalTypeInfo = detail::OptionalTypeInfoImpl<detail::Unowned<const OrtOptionalTypeInfo>>;
+
+namespace detail {
+template <typename T>
+struct MapTypeInfoImpl : detail::Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+  ONNXTensorElementDataType GetMapKeyType() const;  ///< Wraps OrtApi::GetMapKeyType
+  TypeInfo GetMapValueType() const;                 ///< Wraps OrtApi::GetMapValueType
+};
+
+}  // namespace detail
+
+using ConstMapTypeInfo = detail::MapTypeInfoImpl<detail::Unowned<const OrtMapTypeInfo>>;  ///< Unowned const holder type returned by MapTypeInfo::GetConst()
+
+/** \brief Wrapper around ::OrtMapTypeInfo
+ *
+ */
+struct MapTypeInfo : detail::MapTypeInfoImpl<OrtMapTypeInfo> {
+  explicit MapTypeInfo(std::nullptr_t) {}                                          ///< Create an empty MapTypeInfo object, must be assigned a valid one to be used
+  explicit MapTypeInfo(OrtMapTypeInfo* p) : MapTypeInfoImpl<OrtMapTypeInfo>{p} {}  ///< Used for interop with the C API
+  ConstMapTypeInfo GetConst() const { return ConstMapTypeInfo{this->p_}; }  ///< Unowned const view of the same pointer
+};
+
+namespace detail {
+template <typename T>
+struct TypeInfoImpl : detail::Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+
+  ConstTensorTypeAndShapeInfo GetTensorTypeAndShapeInfo() const;  ///< Wraps OrtApi::CastTypeInfoToTensorInfo
+  ConstSequenceTypeInfo GetSequenceTypeInfo() const;              ///< Wraps OrtApi::CastTypeInfoToSequenceTypeInfo
+  ConstMapTypeInfo GetMapTypeInfo() const;                        ///< Wraps OrtApi::CastTypeInfoToMapTypeInfo
+  ConstOptionalTypeInfo GetOptionalTypeInfo() const;              ///< Wraps OrtApi::CastTypeInfoToOptionalTypeInfo
+
+  ONNXType GetONNXType() const;
+};
+}  // namespace detail
+
+/// <summary>
+/// Contains a constant, unowned OrtTypeInfo that can be copied and passed around by value.
+/// Provides access to const OrtTypeInfo APIs.
+/// </summary>
+using ConstTypeInfo = detail::TypeInfoImpl<detail::Unowned<const OrtTypeInfo>>;
+
+/// <summary>
+/// Type information that may contain either TensorTypeAndShapeInfo or
+/// the information about a contained sequence or map, depending on the ONNXType.
+/// </summary>
+struct TypeInfo : detail::TypeInfoImpl<OrtTypeInfo> {
+  explicit TypeInfo(std::nullptr_t) {}                                 ///< Create an empty TypeInfo object, must be assigned a valid one to be used
+  explicit TypeInfo(OrtTypeInfo* p) : TypeInfoImpl<OrtTypeInfo>{p} {}  ///< C API Interop
+
+  ConstTypeInfo GetConst() const { return ConstTypeInfo{this->p_}; }  ///< Unowned const view of the same pointer
+};
+
+namespace detail {
+// This structure is used to feed sparse tensor values
+// information to the FillSparseTensor<Format>() API.
+// If the data type for the sparse tensor values is numeric,
+// use data.p_data; otherwise, use the data.str pointer to feed
+// values. data.str is an array of zero-terminated const char* strings;
+// the number of strings in the array must match the shape size.
+// For fully sparse tensors use shape {0} and set p_data/str
+// to nullptr.
+struct OrtSparseValuesParam {
+  const int64_t* values_shape;
+  size_t values_shape_len;
+  union {
+    const void* p_data;
+    const char** str;
+  } data;
+};
+
+// Provides a way to pass a shape (pointer plus length) in a single
+// argument
+struct Shape {
+  const int64_t* shape;
+  size_t shape_len;
+};
+
+template <typename T>
+struct ConstValueImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+
+  /// <summary>
+  /// Obtains a pointer to a user defined data for experimental purposes
+  /// </summary>
+  template <typename R>
+  void GetOpaqueData(const char* domain, const char* type_name, R&) const;  ///< Wraps OrtApi::GetOpaqueValue
+
+  bool IsTensor() const;  ///< Returns true if Value is a tensor, false for other types like map/sequence/etc
+  bool HasValue() const;  ///< Returns true if the OrtValue contains data and false if the OrtValue is a None
+
+  size_t GetCount() const;  ///< For a non-tensor, returns 2 for a map and N for a sequence, where N is the number of elements
+  Value GetValue(int index, OrtAllocator* allocator) const;
+
+  /// <summary>
+  /// This API returns a full length of string data contained within either a tensor or a sparse Tensor.
+  /// For sparse tensor it returns a full length of stored non-empty strings (values). The API is useful
+  /// for allocating necessary memory and calling GetStringTensorContent().
+  /// </summary>
+  /// <returns>total length of UTF-8 encoded bytes contained. No zero terminators counted.</returns>
+  size_t GetStringTensorDataLength() const;
+
+  /// <summary>
+  /// The API copies all of the UTF-8 encoded string data contained within a tensor or a sparse tensor
+  /// into a supplied buffer. Use GetStringTensorDataLength() to find out the length of the buffer to allocate.
+  /// The user must also allocate offsets buffer with the number of entries equal to that of the contained
+  /// strings.
+  ///
+  /// Strings are always assumed to be on CPU, no X-device copy.
+  /// </summary>
+  /// <param name="buffer">user allocated buffer</param>
+  /// <param name="buffer_length">length in bytes of the allocated buffer</param>
+  /// <param name="offsets">a pointer to the offsets user allocated buffer</param>
+  /// <param name="offsets_count">count of offsets, must be equal to the number of strings contained.
+  ///   that can be obtained from the shape of the tensor or from GetSparseTensorValuesTypeAndShapeInfo()
+  ///   for sparse tensors</param>
+  void GetStringTensorContent(void* buffer, size_t buffer_length, size_t* offsets, size_t offsets_count) const;
+
+  /// <summary>
+  /// Returns a const typed pointer to the tensor contained data.
+  /// No type checking is performed, the caller must ensure the type matches the tensor type.
+  /// </summary>
+  /// <typeparam name="R">type to cast the data pointer to</typeparam>
+  /// <returns>const pointer to data, no copies made</returns>
+  template <typename R>
+  const R* GetTensorData() const;  ///< Wraps OrtApi::GetTensorMutableData
+
+  /// <summary>
+  /// Returns a non-typed pointer to a tensor contained data.
+  /// </summary>
+  /// <returns>const pointer to data, no copies made</returns>
+  const void* GetTensorRawData() const;
+
+  /// <summary>
+  /// The API returns type information for data contained in a tensor. For sparse
+  /// tensors it returns type information for contained non-zero values.
+  /// It returns dense shape for sparse tensors.
+  /// </summary>
+  /// <returns>TypeInfo</returns>
+  TypeInfo GetTypeInfo() const;
+
+  /// <summary>
+  /// The API returns type information for data contained in a tensor. For sparse
+  /// tensors it returns type information for contained non-zero values.
+  /// It returns dense shape for sparse tensors.
+  /// </summary>
+  /// <returns>TensorTypeAndShapeInfo</returns>
+  TensorTypeAndShapeInfo GetTensorTypeAndShapeInfo() const;
+
+  /// <summary>
+  /// This API returns information about the memory allocation used to hold data.
+  /// </summary>
+  /// <returns>Non owning instance of MemoryInfo</returns>
+  ConstMemoryInfo GetTensorMemoryInfo() const;
+
+  /// <summary>
+  /// The API copies UTF-8 encoded bytes for the requested string element
+  /// contained within a tensor or a sparse tensor into a provided buffer.
+  /// Use GetStringTensorElementLength() to obtain the length of the buffer to allocate.
+  /// </summary>
+  /// <param name="buffer_length">length in bytes of the provided buffer</param>
+  /// <param name="element_index">index of the requested string element</param>
+  /// <param name="buffer">provided buffer that receives the copied bytes</param>
+  void GetStringTensorElement(size_t buffer_length, size_t element_index, void* buffer) const;
+
+  /// <summary>
+  /// Returns string tensor UTF-8 encoded string element.
+  /// Use of this API is recommended over GetStringTensorElement() that takes void* buffer pointer.
+  /// </summary>
+  /// <param name="element_index">index of the requested string element</param>
+  /// <returns>std::string</returns>
+  std::string GetStringTensorElement(size_t element_index) const;
+
+  /// <summary>
+  /// The API returns a byte length of UTF-8 encoded string element
+  /// contained in either a tensor or a sparse tensor values.
+  /// </summary>
+  /// <param name="element_index">index of the string element to measure</param>
+  /// <returns>byte length for the specified string element</returns>
+  size_t GetStringTensorElementLength(size_t element_index) const;
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+  /// <summary>
+  /// The API returns the sparse data format this OrtValue holds in a sparse tensor.
+  /// If the sparse tensor was not fully constructed, i.e. Use*() or Fill*() API were not used
+  /// the value returned is ORT_SPARSE_UNDEFINED.
+  /// </summary>
+  /// <returns>Format enum</returns>
+  OrtSparseFormat GetSparseFormat() const;
+
+  /// <summary>
+  /// The API returns type and shape information for stored non-zero values of the
+  /// sparse tensor. Use GetSparseTensorValues() to obtain values buffer pointer.
+  /// </summary>
+  /// <returns>TensorTypeAndShapeInfo values information</returns>
+  TensorTypeAndShapeInfo GetSparseTensorValuesTypeAndShapeInfo() const;
+
+  /// <summary>
+  /// The API returns type and shape information for the specified indices. Each supported
+  /// indices have their own enum values even if a given format has more than one kind of indices.
+  /// Use GetSparseTensorIndicesData() to obtain pointer to indices buffer.
+  /// </summary>
+  /// <param name="format">enum requested</param>
+  /// <returns>type and shape information</returns>
+  TensorTypeAndShapeInfo GetSparseTensorIndicesTypeShapeInfo(OrtSparseIndicesFormat format) const;
+
+  /// <summary>
+  /// The API retrieves a pointer to the internal indices buffer. The API merely performs
+  /// a convenience data type casting on the return type pointer. Make sure you are requesting
+  /// the right type, use GetSparseTensorIndicesTypeShapeInfo();
+  /// </summary>
+  /// <typeparam name="R">type to cast to</typeparam>
+  /// <param name="indices_format">requested indices kind</param>
+  /// <param name="num_indices">number of indices entries</param>
+  /// <returns>Pointer to the internal sparse tensor buffer containing indices. Do not free this pointer.</returns>
+  template <typename R>
+  const R* GetSparseTensorIndicesData(OrtSparseIndicesFormat indices_format, size_t& num_indices) const;
+
+  /// <summary>
+  /// Returns true if the OrtValue contains a sparse tensor
+  /// </summary>
+  /// <returns>true if the OrtValue contains a sparse tensor</returns>
+  bool IsSparseTensor() const;
+
+  /// <summary>
+  /// The API returns a pointer to an internal buffer of the sparse tensor
+  /// containing non-zero values. The API merely does casting. Make sure you
+  /// are requesting the right data type by calling GetSparseTensorValuesTypeAndShapeInfo()
+  /// first.
+  /// </summary>
+  /// <typeparam name="R">numeric data types only. Use GetStringTensor*() to retrieve strings.</typeparam>
+  /// <returns>a pointer to the internal values buffer. Do not free this pointer.</returns>
+  template <typename R>
+  const R* GetSparseTensorValues() const;
+
+#endif
+};
+
+template <typename T>
+struct ValueImpl : ConstValueImpl<T> {
+  using B = ConstValueImpl<T>;
+  using B::B;  // inherit the ConstValueImpl<T> constructors
+
+  /// <summary>
+  /// Returns a non-const typed pointer to an OrtValue/Tensor contained buffer
+  /// No type checking is performed, the caller must ensure the type matches the tensor type.
+  /// </summary>
+  /// <returns>non-const pointer to data, no copies made</returns>
+  template <typename R>
+  R* GetTensorMutableData();
+
+  /// <summary>
+  /// Returns a non-typed non-const pointer to a tensor contained data.
+  /// </summary>
+  /// <returns>pointer to data, no copies made</returns>
+  void* GetTensorMutableRawData();
+
+  /// <summary>
+  /// Obtain a reference to an element of data at the location specified
+  /// by the vector of dims.
+  /// </summary>
+  /// <typeparam name="R">type of the referenced element</typeparam>
+  /// <param name="location">[in] expressed by a vector of dimensions offsets</param>
+  /// <returns>reference to the element at the specified location</returns>
+  template <typename R>
+  R& At(const std::vector<int64_t>& location);
+
+  /// <summary>
+  /// Set all strings at once in a string tensor
+  /// </summary>
+  /// <param name="s">[in] An array of strings. Each string in this array must be null terminated.</param>
+  /// <param name="s_len">[in] Count of strings in s (must match the size of this value's tensor shape)</param>
+  void FillStringTensor(const char* const* s, size_t s_len);
+
+  /// <summary>
+  /// Set a single string in a string tensor
+  /// </summary>
+  /// <param name="s">[in] A null terminated UTF-8 encoded string</param>
+  /// <param name="index">[in] Index of the string in the tensor to set</param>
+  void FillStringTensorElement(const char* s, size_t index);
+
+  /// <summary>
+  /// Allocate if necessary and obtain a pointer to a UTF-8
+  /// encoded string element buffer indexed by the flat element index,
+  /// of the specified length.
+  ///
+  /// This API is for advanced usage. It avoids a need to construct
+  /// an auxiliary array of string pointers, and allows to write data directly
+  /// (do not zero terminate).
+  /// </summary>
+  /// <param name="index">flat element index of the string element</param>
+  /// <param name="buffer_length">requested length of the element buffer</param>
+  /// <returns>a pointer to a writable buffer</returns>
+  char* GetResizedStringTensorElementBuffer(size_t index, size_t buffer_length);
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+  /// <summary>
+  /// Supplies COO format specific indices and marks the contained sparse tensor as being a COO format tensor.
+  /// Values are supplied with a CreateSparseTensor() API. The supplied indices are not copied and the user
+  /// allocated buffers lifespan must eclipse that of the OrtValue.
+  /// The location of the indices is assumed to be the same as specified by OrtMemoryInfo argument at the creation time.
+  /// </summary>
+  /// <param name="indices_data">pointer to the user allocated buffer with indices. Use nullptr for fully sparse tensors.</param>
+  /// <param name="indices_num">number of indices entries. Use 0 for fully sparse tensors</param>
+  void UseCooIndices(int64_t* indices_data, size_t indices_num);
+
+  /// <summary>
+  /// Supplies CSR format specific indices and marks the contained sparse tensor as being a CSR format tensor.
+  /// Values are supplied with a CreateSparseTensor() API. The supplied indices are not copied and the user
+  /// allocated buffers lifespan must eclipse that of the OrtValue.
+  /// The location of the indices is assumed to be the same as specified by OrtMemoryInfo argument at the creation time.
+  /// </summary>
+  /// <param name="inner_data">pointer to the user allocated buffer with inner indices or nullptr for fully sparse tensors</param>
+  /// <param name="inner_num">number of csr inner indices or 0 for fully sparse tensors</param>
+  /// <param name="outer_data">pointer to the user allocated buffer with outer indices or nullptr for fully sparse tensors</param>
+  /// <param name="outer_num">number of csr outer indices or 0 for fully sparse tensors</param>
+  void UseCsrIndices(int64_t* inner_data, size_t inner_num, int64_t* outer_data, size_t outer_num);
+
+  /// <summary>
+  /// Supplies BlockSparse format specific indices and marks the contained sparse tensor as being a BlockSparse format tensor.
+  /// Values are supplied with a CreateSparseTensor() API. The supplied indices are not copied and the user
+  /// allocated buffers lifespan must eclipse that of the OrtValue.
+  /// The location of the indices is assumed to be the same as specified by OrtMemoryInfo argument at the creation time.
+  /// </summary>
+  /// <param name="indices_shape">indices shape or a {0} for fully sparse</param>
+  /// <param name="indices_data">user allocated buffer with indices or nullptr for fully sparse tensors</param>
+  void UseBlockSparseIndices(const Shape& indices_shape, int32_t* indices_data);
+
+  /// <summary>
+  /// The API will allocate memory using the allocator instance supplied to the CreateSparseTensor() API
+  /// and copy the values and COO indices into it. If data_mem_info specifies that the data is located
+  /// on a different device than the allocator, a X-device copy will be performed if possible.
+  /// </summary>
+  /// <param name="data_mem_info">specified buffer memory description</param>
+  /// <param name="values_param">values buffer information.</param>
+  /// <param name="indices_data">coo indices buffer or nullptr for fully sparse data</param>
+  /// <param name="indices_num">number of COO indices or 0 for fully sparse data</param>
+  void FillSparseTensorCoo(const OrtMemoryInfo* data_mem_info, const OrtSparseValuesParam& values_param,
+                           const int64_t* indices_data, size_t indices_num);
+
+  /// <summary>
+  /// The API will allocate memory using the allocator instance supplied to the CreateSparseTensor() API
+  /// and copy the values and CSR indices into it. If data_mem_info specifies that the data is located
+  /// on a different device than the allocator, a X-device copy will be performed if possible.
+  /// </summary>
+  /// <param name="data_mem_info">specified buffer memory description</param>
+  /// <param name="values">values buffer information</param>
+  /// <param name="inner_indices_data">csr inner indices pointer or nullptr for fully sparse tensors</param>
+  /// <param name="inner_indices_num">number of csr inner indices or 0 for fully sparse tensors</param>
+  /// <param name="outer_indices_data">pointer to csr indices data or nullptr for fully sparse tensors</param>
+  /// <param name="outer_indices_num">number of csr outer indices or 0</param>
+  void FillSparseTensorCsr(const OrtMemoryInfo* data_mem_info,
+                           const OrtSparseValuesParam& values,
+                           const int64_t* inner_indices_data, size_t inner_indices_num,
+                           const int64_t* outer_indices_data, size_t outer_indices_num);
+
+  /// <summary>
+  /// The API will allocate memory using the allocator instance supplied to the CreateSparseTensor() API
+  /// and copy the values and BlockSparse indices into it. If data_mem_info specifies that the data is located
+  /// on a different device than the allocator, a X-device copy will be performed if possible.
+  /// </summary>
+  /// <param name="data_mem_info">specified buffer memory description</param>
+  /// <param name="values">values buffer information</param>
+  /// <param name="indices_shape">indices shape. use {0} for fully sparse tensors</param>
+  /// <param name="indices_data">pointer to indices data or nullptr for fully sparse tensors</param>
+  void FillSparseTensorBlockSparse(const OrtMemoryInfo* data_mem_info,
+                                   const OrtSparseValuesParam& values,
+                                   const Shape& indices_shape,
+                                   const int32_t* indices_data);
+
+#endif
+};
+
+}  // namespace detail
+
+using ConstValue = detail::ConstValueImpl<detail::Unowned<const OrtValue>>;  ///< Unowned const holder type returned by Value::GetConst()
+using UnownedValue = detail::ValueImpl<detail::Unowned<OrtValue>>;  ///< Unowned mutable holder type returned by Value::GetUnowned()
+
+/** \brief Wrapper around ::OrtValue
+ *
+ */
+struct Value : detail::ValueImpl<OrtValue> {
+  using Base = detail::ValueImpl<OrtValue>;
+  using OrtSparseValuesParam = detail::OrtSparseValuesParam;
+  using Shape = detail::Shape;
+
+  explicit Value(std::nullptr_t) {}         ///< Create an empty Value object, must be assigned a valid one to be used
+  explicit Value(OrtValue* p) : Base{p} {}  ///< Used for interop with the C API
+  Value(Value&&) = default;
+  Value& operator=(Value&&) = default;
+
+  ConstValue GetConst() const { return ConstValue{this->p_}; }  ///< Unowned const view of the same pointer
+  UnownedValue GetUnowned() const { return UnownedValue{this->p_}; }  ///< Unowned mutable view of the same pointer
+
+  /** \brief Creates a tensor with a user supplied buffer. Wraps OrtApi::CreateTensorWithDataAsOrtValue.
+   * \tparam T The numeric datatype. This API is not suitable for strings.
+   * \param info Memory description of where the p_data buffer resides (CPU vs GPU etc).
+   * \param p_data Pointer to the data buffer.
+   * \param p_data_element_count The number of elements in the data buffer.
+   * \param shape Pointer to the tensor shape dimensions.
+   * \param shape_len The number of tensor shape dimensions.
+   */
+  template <typename T>
+  static Value CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len);
+
+  /** \brief Creates a tensor with a user supplied buffer. Wraps OrtApi::CreateTensorWithDataAsOrtValue.
+   *
+   * \param info Memory description of where the p_data buffer resides (CPU vs GPU etc).
+   * \param p_data Pointer to the data buffer.
+   * \param p_data_byte_count The number of bytes in the data buffer.
+   * \param shape Pointer to the tensor shape dimensions.
+   * \param shape_len The number of tensor shape dimensions.
+   * \param type The data type.
+   */
+  static Value CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count, const int64_t* shape, size_t shape_len,
+                            ONNXTensorElementDataType type);
+
+  /** \brief Creates an OrtValue with a tensor using a supplied OrtAllocator. Wraps OrtApi::CreateTensorAsOrtValue.
+   *         This overload will allocate the buffer for the tensor according to the supplied shape and data type.
+   *         The allocated buffer will be owned by the returned OrtValue and will be freed when the OrtValue is released.
+   *         The input data would need to be copied into the allocated buffer.
+   *         This API is not suitable for strings.
+   *
+   * \tparam T The numeric datatype. This API is not suitable for strings.
+   * \param allocator The allocator to use.
+   * \param shape Pointer to the tensor shape dimensions.
+   * \param shape_len The number of tensor shape dimensions.
+   */
+  template <typename T>
+  static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len);
+
+  /** \brief Creates an OrtValue with a tensor using the supplied OrtAllocator.
+   *   Wraps OrtApi::CreateTensorAsOrtValue.
+   *   The allocated buffer will be owned by the returned OrtValue and will be freed when the OrtValue is released.
+   *   The input data would need to be copied into the allocated buffer.
+   *   This API is not suitable for strings.
+   *
+   * \param allocator The allocator to use.
+   * \param shape Pointer to the tensor shape dimensions.
+   * \param shape_len The number of tensor shape dimensions.
+   * \param type The data type.
+   */
+  static Value CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type);
+
+  /** \brief Creates an OrtValue with a Map Onnx type representation.
+   *  The API would ref-count the supplied OrtValues and they will be released
+   *  when the returned OrtValue is released. The caller may release keys and values after the call
+   *  returns.
+   *
+   * \param keys an OrtValue containing a tensor with primitive data type keys.
+   * \param values an OrtValue that may contain a tensor. Ort currently supports only primitive data type values.
+   */
+  static Value CreateMap(const Value& keys, const Value& values);  ///< Wraps OrtApi::CreateValue
+
+  /** \brief Creates an OrtValue with a Sequence Onnx type representation.
+   *  The API would ref-count the supplied OrtValues and they will be released
+   *  when the returned OrtValue is released. The caller may release the values after the call
+   *  returns.
+   *
+   * \param values a vector of OrtValues that must have the same Onnx value type.
+   */
+  static Value CreateSequence(const std::vector<Value>& values);  ///< Wraps OrtApi::CreateValue
+
+  /** \brief Creates an OrtValue wrapping an Opaque type.
+   *  This is used for experimental support of non-tensor types.
+   *
+   * \tparam T - the type of the value.
+   * \param domain - zero terminated utf-8 string. Domain of the type.
+   * \param type_name - zero terminated utf-8 string. Name of the type.
+   * \param value - the value to be wrapped.
+   */
+  template <typename T>
+  static Value CreateOpaque(const char* domain, const char* type_name, const T& value);  ///< Wraps OrtApi::CreateOpaqueValue
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+  /// <summary>
+  /// This is a simple forwarding method to the other overload that helps deducing
+  /// data type enum value from the type of the buffer.
+  /// </summary>
+  /// <typeparam name="T">numeric datatype. This API is not suitable for strings.</typeparam>
+  /// <param name="info">Memory description where the user buffers reside (CPU vs GPU etc)</param>
+  /// <param name="p_data">pointer to the user supplied buffer, use nullptr for fully sparse tensors</param>
+  /// <param name="dense_shape">a would be dense shape of the tensor</param>
+  /// <param name="values_shape">non zero values shape. Use a single 0 shape for fully sparse tensors.</param>
+  /// <returns>Ort::Value instance containing SparseTensor</returns>
+  template <typename T>
+  static Value CreateSparseTensor(const OrtMemoryInfo* info, T* p_data, const Shape& dense_shape,
+                                  const Shape& values_shape);
+
+  /// <summary>
+  /// Creates an OrtValue instance containing SparseTensor. This constructs
+  /// a sparse tensor that makes use of user allocated buffers. It does not make copies
+  /// of the user provided data and does not modify it. The lifespan of user provided buffers should
+  /// eclipse the life span of the resulting OrtValue. This call constructs an instance that only contains
+  /// a pointer to non-zero values. To fully populate the sparse tensor call Use<Format>Indices() API below
+  /// to supply a sparse format specific indices.
+  /// This API is not suitable for string data. Use CreateSparseTensor() with allocator specified so strings
+  /// can be properly copied into the allocated buffer.
+  /// </summary>
+  /// <param name="info">Memory description where the user buffers reside (CPU vs GPU etc)</param>
+  /// <param name="p_data">pointer to the user supplied buffer, use nullptr for fully sparse tensors</param>
+  /// <param name="dense_shape">a would be dense shape of the tensor</param>
+  /// <param name="values_shape">non zero values shape. Use a single 0 shape for fully sparse tensors.</param>
+  /// <param name="type">data type</param>
+  /// <returns>Ort::Value instance containing SparseTensor</returns>
+  static Value CreateSparseTensor(const OrtMemoryInfo* info, void* p_data, const Shape& dense_shape,
+                                  const Shape& values_shape, ONNXTensorElementDataType type);
+
+  /// <summary>
+  /// This is a simple forwarding method to the below CreateSparseTensor.
+  /// This helps to specify data type enum in terms of C++ data type.
+  /// Use it as CreateSparseTensor<T>(allocator, dense_shape).
+  /// </summary>
+  /// <typeparam name="T">numeric data type only. String data enum must be specified explicitly.</typeparam>
+  /// <param name="allocator">allocator to use</param>
+  /// <param name="dense_shape">a would be dense shape of the tensor</param>
+  /// <returns>Ort::Value</returns>
+  template <typename T>
+  static Value CreateSparseTensor(OrtAllocator* allocator, const Shape& dense_shape);
+
+  /// <summary>
+  /// Creates an instance of OrtValue containing sparse tensor. The created instance has no data.
+  /// The data must be supplied by one of the FillSparseTensor<Format>() methods that take both non-zero values
+  /// and indices. The data will be copied into a buffer that would be allocated using the supplied allocator.
+  /// Use this API to create OrtValues that contain sparse tensors with all supported data types including
+  /// strings.
+  /// </summary>
+  /// <param name="allocator">allocator to use. The allocator lifespan must eclipse that of the resulting OrtValue</param>
+  /// <param name="dense_shape">a would be dense shape of the tensor</param>
+  /// <param name="type">data type</param>
+  /// <returns>an instance of Ort::Value</returns>
+  static Value CreateSparseTensor(OrtAllocator* allocator, const Shape& dense_shape, ONNXTensorElementDataType type);
+
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
+};
+
+/// <summary>
+/// Represents a native memory allocation coming from one of the
+/// OrtAllocators registered with OnnxRuntime.
+/// Use it to wrap an allocation made by an allocator
+/// so it can be automatically released when no longer needed.
+/// </summary>
+struct MemoryAllocation {
+  MemoryAllocation(OrtAllocator* allocator, void* p, size_t size);
+  ~MemoryAllocation();
+  MemoryAllocation(const MemoryAllocation&) = delete;  // not copyable
+  MemoryAllocation& operator=(const MemoryAllocation&) = delete;
+  MemoryAllocation(MemoryAllocation&&) noexcept;  // movable
+  MemoryAllocation& operator=(MemoryAllocation&&) noexcept;
+
+  void* get() { return p_; }  ///< Returns the stored pointer p_
+  size_t size() const { return size_; }  ///< Returns the stored size_
+
+ private:
+  OrtAllocator* allocator_;
+  void* p_;
+  size_t size_;
+};
+
+namespace detail {
+template <typename T>
+struct AllocatorImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;  // inherit the Base<T> constructors
+
+  void* Alloc(size_t size);
+  MemoryAllocation GetAllocation(size_t size);  ///< Returns a MemoryAllocation (declared above) instead of a raw pointer
+  void Free(void* p);
+  ConstMemoryInfo GetInfo() const;
+};
+
+}  // namespace detail
+
+/** \brief Wrapper around the ::OrtAllocator default instance that is owned by OnnxRuntime
+ *
+ */
+struct AllocatorWithDefaultOptions : detail::AllocatorImpl<detail::Unowned<OrtAllocator>> {
+  explicit AllocatorWithDefaultOptions(std::nullptr_t) {}  ///< Convenience to create a class member and then replace with an instance
+  AllocatorWithDefaultOptions();
+};
+
+/** \brief Wrapper around ::OrtAllocator
+ *
+ */
+struct Allocator : detail::AllocatorImpl<OrtAllocator> {
+  explicit Allocator(std::nullptr_t) {}  ///< Convenience to create a class member and then replace with an instance
+  Allocator(const Session& session, const OrtMemoryInfo*);
+};
+
+using UnownedAllocator = detail::AllocatorImpl<detail::Unowned<OrtAllocator>>;  ///< AllocatorImpl over an Unowned OrtAllocator
+
+namespace detail {
+namespace binding_utils {
+// Non-template helper functions, kept out of the class templates
+std::vector<std::string> GetOutputNamesHelper(const OrtIoBinding* binding, OrtAllocator*);
+std::vector<Value> GetOutputValuesHelper(const OrtIoBinding* binding, OrtAllocator*);
+}  // namespace binding_utils
+
+template <typename T>
+struct ConstIoBindingImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;
+
+  std::vector<std::string> GetOutputNames() const;
+  std::vector<std::string> GetOutputNames(OrtAllocator*) const;
+  std::vector<Value> GetOutputValues() const;
+  std::vector<Value> GetOutputValues(OrtAllocator*) const;
+};
+
+template <typename T>
+struct IoBindingImpl : ConstIoBindingImpl<T> {
+  using B = ConstIoBindingImpl<T>;
+  using B::B;
+
+  void BindInput(const char* name, const Value&);
+  void BindOutput(const char* name, const Value&);
+  void BindOutput(const char* name, const OrtMemoryInfo*);
+  void ClearBoundInputs();
+  void ClearBoundOutputs();
+  void SynchronizeInputs();
+  void SynchronizeOutputs();
+};
+
+}  // namespace detail
+
+using ConstIoBinding = detail::ConstIoBindingImpl<detail::Unowned<const OrtIoBinding>>;
+using UnownedIoBinding = detail::IoBindingImpl<detail::Unowned<OrtIoBinding>>;
+
+/** \brief Owning wrapper around ::OrtIoBinding
+ *
+ */
+struct IoBinding : detail::IoBindingImpl<OrtIoBinding> {
+  explicit IoBinding(std::nullptr_t) {}  ///< Create an empty object for convenience. Sometimes, we want to initialize members later.
+  explicit IoBinding(Session& session);  ///< Creates a binding for \p session (implementation not shown here; presumably OrtApi::CreateIoBinding — confirm)
+  ConstIoBinding GetConst() const { return ConstIoBinding{this->p_}; }        ///< Read-only view of the same ::OrtIoBinding
+  UnownedIoBinding GetUnowned() const { return UnownedIoBinding{this->p_}; }  ///< Mutable view of the same ::OrtIoBinding
+};
+
+/*! \struct Ort::ArenaCfg
+ * \brief Represents the configuration of an arena-based allocator
+ * \details Please see docs/C_API.md for details
+ */
+struct ArenaCfg : detail::Base<OrtArenaCfg> {
+  explicit ArenaCfg(std::nullptr_t) {}  ///< Create an empty ArenaCfg object, must be assigned a valid one to be used
+  /**
+   * Wraps OrtApi::CreateArenaCfg
+   * \param max_mem - use 0 to allow ORT to choose the default
+   * \param arena_extend_strategy -  use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested
+   * \param initial_chunk_size_bytes - use -1 to allow ORT to choose the default
+   * \param max_dead_bytes_per_chunk - use -1 to allow ORT to choose the default
+   * See docs/C_API.md for details on what the parameters above mean and how to choose these values
+   */
+  ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk);
+};
+
+//
+// Custom OPs (only needed to implement custom OPs)
+//
+
+/// <summary>
+/// This struct provides lifetime management for a custom op attribute
+/// </summary>
+struct OpAttr : detail::Base<OrtOpAttr> {
+  OpAttr(const char* name, const void* data, int len, OrtOpAttrType type);  ///< Implementation not shown here; presumably wraps OrtApi::CreateOpAttr — confirm
+};
+
+/**
+ * Macro that logs a message using the provided logger. Throws an exception if OrtApi::Logger_LogMessage fails.
+ * Example: ORT_CXX_LOG(logger, ORT_LOGGING_LEVEL_INFO, "Log a message");
+ * Note: \p logger and \p message_severity are each evaluated twice; avoid arguments with side effects.
+ * \param logger The Ort::Logger instance to use. Must be a value or reference.
+ * \param message_severity The logging severity level of the message.
+ * \param message A null-terminated UTF-8 message to log.
+ */
+#define ORT_CXX_LOG(logger, message_severity, message)                                         \
+  do {                                                                                         \
+    if ((message_severity) >= (logger).GetLoggingSeverityLevel()) {                            \
+      Ort::ThrowOnError((logger).LogMessage(message_severity, ORT_FILE, __LINE__,              \
+                                            static_cast<const char*>(__FUNCTION__), message)); \
+    }                                                                                          \
+  } while (false)
+
+/**
+ * Macro that logs a message using the provided logger. Can be used in noexcept code since errors are silently ignored.
+ * Example: ORT_CXX_LOG_NOEXCEPT(logger, ORT_LOGGING_LEVEL_INFO, "Log a message");
+ * Note: \p logger and \p message_severity are each evaluated twice; avoid arguments with side effects.
+ * \param logger The Ort::Logger instance to use. Must be a value or reference.
+ * \param message_severity The logging severity level of the message.
+ * \param message A null-terminated UTF-8 message to log.
+ */
+#define ORT_CXX_LOG_NOEXCEPT(logger, message_severity, message)                                \
+  do {                                                                                         \
+    if ((message_severity) >= (logger).GetLoggingSeverityLevel()) {                            \
+      static_cast<void>((logger).LogMessage(message_severity, ORT_FILE, __LINE__,              \
+                                            static_cast<const char*>(__FUNCTION__), message)); \
+    }                                                                                          \
+  } while (false)
+
+/**
+ * Macro that logs a printf-like formatted message using the provided logger. Throws an exception if
+ * OrtApi::Logger_LogMessage fails or if a formatting error occurs.
+ * Example: ORT_CXX_LOGF(logger, ORT_LOGGING_LEVEL_INFO, "Log an int: %d", 12);
+ * Note: \p logger and \p message_severity are each evaluated twice; avoid arguments with side effects.
+ * \param logger The Ort::Logger instance to use. Must be a value or reference.
+ * \param message_severity The logging severity level of the message.
+ * \param format A null-terminated UTF-8 format string forwarded to a printf-like function.
+ *               Refer to https://en.cppreference.com/w/cpp/io/c/fprintf for information on valid formats.
+ * \param ... Zero or more variadic arguments referenced by the format string.
+ */
+#define ORT_CXX_LOGF(logger, message_severity, /*format,*/...)                                              \
+  do {                                                                                                      \
+    if ((message_severity) >= (logger).GetLoggingSeverityLevel()) {                                         \
+      Ort::ThrowOnError((logger).LogFormattedMessage(message_severity, ORT_FILE, __LINE__,                  \
+                                                     static_cast<const char*>(__FUNCTION__), __VA_ARGS__)); \
+    }                                                                                                       \
+  } while (false)
+
+/**
+ * Macro that logs a printf-like formatted message using the provided logger. Can be used in noexcept code since errors
+ * are silently ignored.
+ * Example: ORT_CXX_LOGF_NOEXCEPT(logger, ORT_LOGGING_LEVEL_INFO, "Log an int: %d", 12);
+ * Note: \p logger and \p message_severity are each evaluated twice; avoid arguments with side effects.
+ * \param logger The Ort::Logger instance to use. Must be a value or reference.
+ * \param message_severity The logging severity level of the message.
+ * \param format A null-terminated UTF-8 format string forwarded to a printf-like function.
+ *               Refer to https://en.cppreference.com/w/cpp/io/c/fprintf for information on valid formats.
+ * \param ... Zero or more variadic arguments referenced by the format string.
+ */
+#define ORT_CXX_LOGF_NOEXCEPT(logger, message_severity, /*format,*/...)                                     \
+  do {                                                                                                      \
+    if ((message_severity) >= (logger).GetLoggingSeverityLevel()) {                                         \
+      static_cast<void>((logger).LogFormattedMessage(message_severity, ORT_FILE, __LINE__,                  \
+                                                     static_cast<const char*>(__FUNCTION__), __VA_ARGS__)); \
+    }                                                                                                       \
+  } while (false)
+
+/// <summary>
+/// This class represents an ONNX Runtime logger that can be used to log information with an
+/// associated severity level and source code location (file path, line number, function name).
+///
+/// A Logger can be obtained from within custom operators by calling Ort::KernelInfo::GetLogger().
+/// Instances of Ort::Logger are the size of two pointers and can be passed by value.
+///
+/// Use the ORT_CXX_LOG macros to ensure the source code location is set properly from the callsite
+/// and to take advantage of a cached logging severity level that can bypass calls to the underlying C API.
+/// </summary>
+struct Logger {
+  /**
+   * Creates an empty Ort::Logger. Must be initialized from a valid Ort::Logger before use.
+   */
+  Logger() = default;
+
+  /**
+   * Creates an empty Ort::Logger from nullptr. Must be initialized from a valid Ort::Logger before use.
+   */
+  explicit Logger(std::nullptr_t) {}
+
+  /**
+   * Creates a logger from an ::OrtLogger instance. Caches the logger's current severity level by calling
+   * OrtApi::Logger_GetLoggingSeverityLevel. Throws an exception if OrtApi::Logger_GetLoggingSeverityLevel fails.
+   *
+   * \param logger The ::OrtLogger to wrap.
+   */
+  explicit Logger(const OrtLogger* logger);
+
+  ~Logger() = default;
+
+  Logger(const Logger&) = default;
+  Logger& operator=(const Logger&) = default;
+
+  Logger(Logger&& v) noexcept = default;
+  Logger& operator=(Logger&& v) noexcept = default;
+
+  /**
+   * Returns the logger's current severity level from the cached member.
+   *
+   * \return The current ::OrtLoggingLevel.
+   */
+  OrtLoggingLevel GetLoggingSeverityLevel() const noexcept;
+
+  /**
+   * Logs the provided message via OrtApi::Logger_LogMessage. Use the ORT_CXX_LOG or ORT_CXX_LOG_NOEXCEPT
+   * macros to properly set the source code location and to use the cached severity level to potentially bypass
+   * calls to the underlying C API.
+   *
+   * \param log_severity_level The message's logging severity level.
+   * \param file_path The filepath of the file in which the message is logged. Usually the value of ORT_FILE.
+   * \param line_number The file line number in which the message is logged. Usually the value of __LINE__.
+   * \param func_name The name of the function in which the message is logged. Usually the value of __FUNCTION__.
+   * \param message The message to log.
+   * \return An Ort::Status value to indicate error or success.
+   */
+  Status LogMessage(OrtLoggingLevel log_severity_level, const ORTCHAR_T* file_path, int line_number,
+                    const char* func_name, const char* message) const noexcept;
+
+  /**
+   * Logs a printf-like formatted message via OrtApi::Logger_LogMessage. Use the ORT_CXX_LOGF or ORT_CXX_LOGF_NOEXCEPT
+   * macros to properly set the source code location and to use the cached severity level to potentially bypass
+   * calls to the underlying C API. Returns an error status if a formatting error occurs.
+   *
+   * \param log_severity_level The message's logging severity level.
+   * \param file_path The filepath of the file in which the message is logged. Usually the value of ORT_FILE.
+   * \param line_number The file line number in which the message is logged. Usually the value of __LINE__.
+   * \param func_name The name of the function in which the message is logged. Usually the value of __FUNCTION__.
+   * \param format A null-terminated UTF-8 format string forwarded to a printf-like function.
+   *               Refer to https://en.cppreference.com/w/cpp/io/c/fprintf for information on valid formats.
+   * \param args Zero or more variadic arguments referenced by the format string.
+   * \return An Ort::Status value to indicate error or success.
+   */
+  template <typename... Args>
+  Status LogFormattedMessage(OrtLoggingLevel log_severity_level, const ORTCHAR_T* file_path, int line_number,
+                             const char* func_name, const char* format, Args&&... args) const noexcept;
+
+ private:
+  const OrtLogger* logger_{};                ///< Wrapped ::OrtLogger; never released by this class (the destructor is defaulted)
+  OrtLoggingLevel cached_severity_level_{};  ///< Severity level cached when constructed from an ::OrtLogger
+};
+
+/// <summary>
+/// This class wraps a raw pointer OrtKernelContext* that is being passed
+/// to the custom kernel Compute() method. Use it to safely access context
+/// attributes, input and output parameters with exception safety guarantees.
+/// See usage example in onnxruntime/test/testdata/custom_op_library/custom_op_library.cc
+/// </summary>
+struct KernelContext {
+  explicit KernelContext(OrtKernelContext* context);  ///< Wraps \p context
+  size_t GetInputCount() const;
+  size_t GetOutputCount() const;
+  ConstValue GetInput(size_t index) const;
+  UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const;  ///< Output shape given as pointer + count
+  UnownedValue GetOutput(size_t index, const std::vector<int64_t>& dims) const;             ///< Output shape given as a vector
+  void* GetGPUComputeStream() const;
+  Logger GetLogger() const;
+  OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const;
+
+ private:
+  OrtKernelContext* ctx_;  ///< Raw context pointer (presumably not owned, since no destructor is declared — confirm)
+};
+
+struct KernelInfo;
+
+namespace detail {
+namespace attr_utils {
+void GetAttr(const OrtKernelInfo* p, const char* name, float&);
+void GetAttr(const OrtKernelInfo* p, const char* name, int64_t&);
+void GetAttr(const OrtKernelInfo* p, const char* name, std::string&);
+void GetAttrs(const OrtKernelInfo* p, const char* name, std::vector<float>&);
+void GetAttrs(const OrtKernelInfo* p, const char* name, std::vector<int64_t>&);
+}  // namespace attr_utils
+
+template <typename T>
+struct KernelInfoImpl : Base<T> {
+  using B = Base<T>;
+  using B::B;
+
+  KernelInfo Copy() const;
+
+  template <typename R>  // Scalar attribute lookup; overloads exist only for float, int64_t and std::string
+  R GetAttribute(const char* name) const {
+    R attr_value;
+    attr_utils::GetAttr(this->p_, name, attr_value);
+    return attr_value;
+  }
+
+  template <typename R>  // R is the element type of the returned vector; overloads exist only for float and int64_t
+  std::vector<R> GetAttributes(const char* name) const {
+    std::vector<R> attr_values;
+    attr_utils::GetAttrs(this->p_, name, attr_values);
+    return attr_values;
+  }
+
+  Value GetTensorAttribute(const char* name, OrtAllocator* allocator) const;
+
+  size_t GetInputCount() const;
+  size_t GetOutputCount() const;
+
+  std::string GetInputName(size_t index) const;
+  std::string GetOutputName(size_t index) const;
+
+  TypeInfo GetInputTypeInfo(size_t index) const;
+  TypeInfo GetOutputTypeInfo(size_t index) const;
+
+  ConstValue GetTensorConstantInput(size_t index, int* is_constant) const;
+
+  std::string GetNodeName() const;
+  Logger GetLogger() const;
+};
+
+}  // namespace detail
+
+using ConstKernelInfo = detail::KernelInfoImpl<detail::Unowned<const OrtKernelInfo>>;
+
+/// <summary>
+/// This struct owns the OrtKernelInfo* pointer when a copy is made.
+/// For convenient wrapping of an OrtKernelInfo* passed to the kernel constructor
+/// and for querying attributes, wrap the pointer in an Ort::Unowned<KernelInfo> instance
+/// so it does not destroy the pointer the kernel does not own.
+/// </summary>
+struct KernelInfo : detail::KernelInfoImpl<OrtKernelInfo> {
+  explicit KernelInfo(std::nullptr_t) {}     ///< Create an empty instance to initialize later
+  explicit KernelInfo(OrtKernelInfo* info);  ///< Take ownership of the instance
+  ConstKernelInfo GetConst() const { return ConstKernelInfo{this->p_}; }  ///< Read-only view of the same ::OrtKernelInfo
+};
+
+/// <summary>
+/// Creates and owns a custom-defined operation.
+/// </summary>
+struct Op : detail::Base<OrtOp> {
+  explicit Op(std::nullptr_t) {}  ///< Create an empty Operator object, must be assigned a valid one to be used
+
+  explicit Op(OrtOp*);  ///< Take ownership of the OrtOp
+
+  static Op Create(const OrtKernelInfo* info, const char* op_name, const char* domain,
+                   int version, const char** type_constraint_names,
+                   const ONNXTensorElementDataType* type_constraint_values,
+                   size_t type_constraint_count,
+                   const OpAttr* attr_values,
+                   size_t attr_count,
+                   size_t input_count, size_t output_count);
+
+  void Invoke(const OrtKernelContext* context,  // Overload taking Ort::Value wrappers
+              const Value* input_values,
+              size_t input_count,
+              Value* output_values,
+              size_t output_count);
+
+  // For easier refactoring: overload taking raw ::OrtValue pointers instead of Ort::Value wrappers
+  void Invoke(const OrtKernelContext* context,
+              const OrtValue* const* input_values,
+              size_t input_count,
+              OrtValue* const* output_values,
+              size_t output_count);
+};
+
+template <typename TOp, typename TKernel, bool WithStatus = false>
+struct CustomOpBase : OrtCustomOp {  // Fills the OrtCustomOp callback table with lambdas that forward to TOp / TKernel
+  CustomOpBase() {
+    OrtCustomOp::version = ORT_API_VERSION;
+    OrtCustomOp::GetName = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetName(); };
+    // Every callback downcasts the OrtCustomOp* it receives back to TOp before forwarding.
+    OrtCustomOp::GetExecutionProviderType = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetExecutionProviderType(); };
+
+    OrtCustomOp::GetInputTypeCount = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetInputTypeCount(); };
+    OrtCustomOp::GetInputType = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetInputType(index); };
+    OrtCustomOp::GetInputMemoryType = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetInputMemoryType(index); };
+
+    OrtCustomOp::GetOutputTypeCount = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetOutputTypeCount(); };
+    OrtCustomOp::GetOutputType = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetOutputType(index); };
+    // KernelDestroy deletes the TKernel object; MSVC warning 26409 is suppressed around the explicit delete.
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(push)
+#pragma warning(disable : 26409)
+#endif
+    OrtCustomOp::KernelDestroy = [](void* op_kernel) { delete static_cast<TKernel*>(op_kernel); };
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
+    OrtCustomOp::GetInputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetInputCharacteristic(index); };
+    OrtCustomOp::GetOutputCharacteristic = [](const OrtCustomOp* this_, size_t index) { return static_cast<const TOp*>(this_)->GetOutputCharacteristic(index); };
+
+    OrtCustomOp::GetVariadicInputMinArity = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetVariadicInputMinArity(); };
+    OrtCustomOp::GetVariadicInputHomogeneity = [](const OrtCustomOp* this_) { return static_cast<int>(static_cast<const TOp*>(this_)->GetVariadicInputHomogeneity()); };
+    OrtCustomOp::GetVariadicOutputMinArity = [](const OrtCustomOp* this_) { return static_cast<const TOp*>(this_)->GetVariadicOutputMinArity(); };
+    OrtCustomOp::GetVariadicOutputHomogeneity = [](const OrtCustomOp* this_) { return static_cast<int>(static_cast<const TOp*>(this_)->GetVariadicOutputHomogeneity()); };
+#ifdef __cpp_if_constexpr
+    if constexpr (WithStatus) {
+#else
+    if (WithStatus) {
+#endif
+      OrtCustomOp::CreateKernelV2 = [](const OrtCustomOp* this_, const OrtApi* api, const OrtKernelInfo* info, void** op_kernel) -> OrtStatusPtr {
+        return static_cast<const TOp*>(this_)->CreateKernelV2(*api, info, op_kernel);
+      };
+      OrtCustomOp::KernelComputeV2 = [](void* op_kernel, OrtKernelContext* context) -> OrtStatusPtr { return static_cast<TKernel*>(op_kernel)->ComputeV2(context); };
+      OrtCustomOp::CreateKernel = nullptr;   // The non-status entry points are not provided in this mode; null them
+      OrtCustomOp::KernelCompute = nullptr;  // (mirroring the else-branch) so no callback slot is left uninitialized.
+    } else {
+      OrtCustomOp::CreateKernelV2 = nullptr;
+      OrtCustomOp::KernelComputeV2 = nullptr;
+
+      OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* api, const OrtKernelInfo* info) { return static_cast<const TOp*>(this_)->CreateKernel(*api, info); };
+      OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) {
+        static_cast<TKernel*>(op_kernel)->Compute(context);
+      };
+    }
+  }
+
+  // Default implementation of GetExecutionProviderType that returns nullptr to default to the CPU provider
+  const char* GetExecutionProviderType() const { return nullptr; }
+
+  // Default implementations of GetInputCharacteristic() and GetOutputCharacteristic() below
+  // (inputs and outputs are required by default)
+  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t /*index*/) const {
+    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
+  }
+
+  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t /*index*/) const {
+    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
+  }
+
+  // Default implementation of GetInputMemoryType() that returns OrtMemTypeDefault
+  OrtMemType GetInputMemoryType(size_t /*index*/) const {
+    return OrtMemTypeDefault;
+  }
+
+  // Default implementation of GetVariadicInputMinArity() returns 1 to specify that a variadic input
+  // should expect at least 1 argument.
+  int GetVariadicInputMinArity() const {
+    return 1;
+  }
+
+  // Default implementation of GetVariadicInputHomogeneity() returns true to specify that all arguments
+  // to a variadic input should be of the same type.
+  bool GetVariadicInputHomogeneity() const {
+    return true;
+  }
+
+  // Default implementation of GetVariadicOutputMinArity() returns 1 to specify that a variadic output
+  // should produce at least 1 output value.
+  int GetVariadicOutputMinArity() const {
+    return 1;
+  }
+
+  // Default implementation of GetVariadicOutputHomogeneity() returns true to specify that all output values
+  // produced by a variadic output should be of the same type.
+  bool GetVariadicOutputHomogeneity() const {
+    return true;
+  }
+
+  // Declare list of session config entries used by this Custom Op.
+  // Implement this function in order to get configs from CustomOpBase::GetSessionConfigs().
+  // This default implementation returns an empty vector of config entries.
+  std::vector<std::string> GetSessionConfigKeys() const {
+    return std::vector<std::string>{};
+  }
+
+ protected:
+  // Helper function that returns a map of session config entries specified by CustomOpBase::GetSessionConfigKeys.
+  void GetSessionConfigs(std::unordered_map<std::string, std::string>& out, ConstSessionOptions options) const;
+};
+
+}  // namespace Ort
+
+#include "onnxruntime_cxx_inline.h"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_inline.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_inline.h
new file mode 100644
index 0000000..2217283
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_cxx_inline.h
@@ -0,0 +1,1886 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Do not include this file directly. Please include "onnxruntime_cxx_api.h" instead.
+// If interested in trying out features of the new experimental C++ API, include "experimental_onnxruntime_cxx_api.h" instead.
+//
+// These are the inline implementations of the C++ header APIs. They're in this separate file as to not clutter
+// the main C++ file with implementation details.
+
+#include <cstring>
+
+namespace Ort {
+
+namespace detail {
+inline void ThrowStatus(const Status& st) {
+  std::string message = st.GetErrorMessage();  // read the message first, then the code
+  OrtErrorCode code = st.GetErrorCode();
+  ORT_CXX_API_THROW(std::move(message), code);
+}
+}  // namespace detail
+
+inline void ThrowOnError(OrtStatus* ort_status) {
+  if (ort_status == nullptr) return;  // null status: nothing to report
+
+  Ort::Status failure(ort_status);
+  detail::ThrowStatus(failure);
+}
+
+inline void ThrowOnError(const Status& st) {
+  if (st.IsOK()) return;  // no underlying OrtStatus: nothing to throw
+
+  detail::ThrowStatus(st);
+}
+
+inline Status::Status(OrtStatus* status) noexcept : Base<OrtStatus>{status} {
+}  // the pointer is handed straight to the Base<OrtStatus> wrapper
+
+inline Status::Status(const std::exception& e) noexcept {
+  p_ = GetApi().CreateStatus(ORT_FAIL, e.what());  // generic exceptions map to ORT_FAIL with the exception text
+}
+
+inline Status::Status(const Exception& e) noexcept {
+  p_ = GetApi().CreateStatus(e.GetOrtErrorCode(), e.what());  // keeps the error code carried by the Ort::Exception
+}
+
+inline Status::Status(const char* message, OrtErrorCode code) noexcept {
+  p_ = GetApi().CreateStatus(code, message);  // note the argument order: the C API takes the code first
+}
+
+inline std::string Status::GetErrorMessage() const {
+  // Copy the C string returned by the API into an owning std::string
+  return std::string(GetApi().GetErrorMessage(p_));
+}
+
+inline OrtErrorCode Status::GetErrorCode() const {
+  return GetApi().GetErrorCode(p_);  // forwards the wrapped pointer as-is; no null check is made here
+}
+
+inline bool Status::IsOK() const noexcept {
+  return (p_ == nullptr);  // success is represented by the absence of an OrtStatus
+}
+
+// This template converts a C++ type into its ONNXTensorElementDataType
+template <typename T>
+struct TypeToTensorType;
+template <>
+struct TypeToTensorType<float> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+};
+template <>
+struct TypeToTensorType<Float16_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
+};
+template <>
+struct TypeToTensorType<BFloat16_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16;
+};
+template <>
+struct TypeToTensorType<double> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;
+};
+template <>
+struct TypeToTensorType<int8_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
+};
+template <>
+struct TypeToTensorType<int16_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16;
+};
+template <>
+struct TypeToTensorType<int32_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
+};
+template <>
+struct TypeToTensorType<int64_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+};
+template <>
+struct TypeToTensorType<uint8_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8;
+};
+template <>
+struct TypeToTensorType<uint16_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16;
+};
+template <>
+struct TypeToTensorType<uint32_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32;
+};
+template <>
+struct TypeToTensorType<uint64_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64;
+};
+template <>
+struct TypeToTensorType<bool> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL;
+};
+// Float8 variants
+template <>
+struct TypeToTensorType<Float8E4M3FN_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN;
+};
+template <>
+struct TypeToTensorType<Float8E4M3FNUZ_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ;
+};
+template <>
+struct TypeToTensorType<Float8E5M2_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2;
+};
+template <>
+struct TypeToTensorType<Float8E5M2FNUZ_t> {
+  static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ;
+};
+
+inline bool BFloat16_t::operator==(const BFloat16_t& rhs) const noexcept {
+  if (IsNaN() || rhs.IsNaN()) {
+    // IEEE defines that NaN is not equal to anything, including itself.
+    return false;
+  }
+  return val == rhs.val || AreZero(*this, rhs);  // +0 and -0 are equal under IEEE, matching what operator< assumes
+}
+
+inline bool BFloat16_t::operator<(const BFloat16_t& rhs) const noexcept {
+  // Any comparison involving NaN is unordered under IEEE, so it is never "less than".
+  if (IsNaN() || rhs.IsNaN()) {
+    return false;
+  }
+
+  const bool lhs_negative = IsNegative();
+  const bool signs_differ = (lhs_negative != rhs.IsNegative());
+  if (signs_differ) {
+    // With opposite signs the negative operand is the smaller one, unless both
+    // operands are zero: IEEE treats +0 and -0 as equal.
+    return lhs_negative && !AreZero(*this, rhs);
+  }
+  return (val != rhs.val) && ((val < rhs.val) != lhs_negative);
+}
+
+inline MemoryAllocation::MemoryAllocation(OrtAllocator* allocator, void* p, size_t size)
+    : allocator_(allocator), p_(p), size_(size) {
+}  // only records the three values; nothing is allocated here
+
+inline MemoryAllocation::~MemoryAllocation() {
+  if (p_ != nullptr) {
+    // We do not throw out of destructor. The result is wrapped in Ort::Status so that
+    // an error status returned by the API is released instead of being leaked.
+    Status ignored(GetApi().AllocatorFree(allocator_, p_));
+  }
+}
+
+inline MemoryAllocation::MemoryAllocation(MemoryAllocation&& o) noexcept : allocator_(nullptr), p_(nullptr), size_(0) {
+  *this = std::move(o);  // start empty, then reuse move assignment to take over o's contents
+}
+
+inline MemoryAllocation& MemoryAllocation::operator=(MemoryAllocation&& o) noexcept {
+  // Hand whatever this instance currently holds to a scoped owner; its
+  // destructor frees it (if there is anything) when this function returns.
+  MemoryAllocation previous(allocator_, p_, size_);
+
+  allocator_ = nullptr;
+  p_ = nullptr;
+  size_ = 0;
+
+  // Exchange the now-empty state with the incoming object, which therefore
+  // ends up empty while this instance holds its former contents.
+  std::swap(allocator_, o.allocator_);
+  std::swap(p_, o.p_);
+  std::swap(size_, o.size_);
+
+  // Self-assignment leaves the object empty: the swaps are no-ops and
+  // `previous` releases the old allocation.
+  return *this;
+}
+
+namespace detail {
+
+template <typename T>
+inline void* AllocatorImpl<T>::Alloc(size_t size) {
+  void* memory;
+  ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &memory));
+  return memory;
+}
+
+template <typename T>
+inline MemoryAllocation AllocatorImpl<T>::GetAllocation(size_t size) {
+  void* memory;
+  ThrowOnError(GetApi().AllocatorAlloc(this->p_, size, &memory));
+  // Pair the memory with the allocator that produced it so it is freed automatically
+  return MemoryAllocation(this->p_, memory, size);
+}
+
+template <typename T>
+inline void AllocatorImpl<T>::Free(void* p) {
+  ThrowOnError(GetApi().AllocatorFree(this->p_, p));
+}
+
+template <typename T>
+inline ConstMemoryInfo AllocatorImpl<T>::GetInfo() const {
+  const OrtMemoryInfo* info;
+  ThrowOnError(GetApi().AllocatorGetInfo(this->p_, &info));
+  return ConstMemoryInfo{info};
+}
+
+}  // namespace detail
+
+inline AllocatorWithDefaultOptions::AllocatorWithDefaultOptions() {
+  ThrowOnError(GetApi().GetAllocatorWithDefaultOptions(&this->p_));  // the API writes the allocator pointer into p_
+}
+
+inline Allocator::Allocator(const Session& sess, const OrtMemoryInfo* mem_info) {
+  ThrowOnError(GetApi().CreateAllocator(sess, mem_info, &this->p_));  // throws when the API reports an error
+}
+
+namespace detail {
+
+template <typename T>
+inline std::string MemoryInfoImpl<T>::GetAllocatorName() const {
+  const char* allocator_name = nullptr;
+  ThrowOnError(GetApi().MemoryInfoGetName(this->p_, &allocator_name));
+  return std::string(allocator_name);
+}
+
+template <typename T>
+inline OrtAllocatorType MemoryInfoImpl<T>::GetAllocatorType() const {
+  OrtAllocatorType allocator_type;
+  ThrowOnError(GetApi().MemoryInfoGetType(this->p_, &allocator_type));
+  return allocator_type;
+}
+
+template <typename T>
+inline int MemoryInfoImpl<T>::GetDeviceId() const {
+  int device_id = 0;
+  ThrowOnError(GetApi().MemoryInfoGetId(this->p_, &device_id));
+  return device_id;
+}
+
+template <typename T>
+inline OrtMemoryInfoDeviceType MemoryInfoImpl<T>::GetDeviceType() const {
+  OrtMemoryInfoDeviceType device_type;
+  GetApi().MemoryInfoGetDeviceType(this->p_, &device_type);  // unlike its siblings, this call is not passed to ThrowOnError
+  return device_type;
+}
+
+template <typename T>
+inline OrtMemType MemoryInfoImpl<T>::GetMemoryType() const {
+  OrtMemType mem_type;
+  ThrowOnError(GetApi().MemoryInfoGetMemType(this->p_, &mem_type));
+  return mem_type;
+}
+
+template <typename T>
+template <typename U>
+inline bool MemoryInfoImpl<T>::operator==(const MemoryInfoImpl<U>& o) const {
+  int comparison = 0;
+  ThrowOnError(Ort::GetApi().CompareMemoryInfo(this->p_, o, &comparison));
+  return comparison == 0;
+}
+
+}  // namespace detail
+
+inline MemoryInfo MemoryInfo::CreateCpu(OrtAllocatorType type, OrtMemType mem_type) {
+  OrtMemoryInfo* cpu_info;
+  ThrowOnError(GetApi().CreateCpuMemoryInfo(type, mem_type, &cpu_info));
+  return MemoryInfo(cpu_info);
+}
+
+inline MemoryInfo::MemoryInfo(const char* name, OrtAllocatorType type, int id, OrtMemType mem_type) {
+  ThrowOnError(GetApi().CreateMemoryInfo(name, type, id, mem_type, &this->p_));  // throws when the API reports an error
+}
+
+namespace detail {
+template <typename T>
+inline std::vector<std::string> ConstIoBindingImpl<T>::GetOutputNames() const {
+  AllocatorWithDefaultOptions default_allocator;  // no allocator supplied by the caller
+  return binding_utils::GetOutputNamesHelper(this->p_, default_allocator);
+}
+
+template <typename T>
+inline std::vector<std::string> ConstIoBindingImpl<T>::GetOutputNames(OrtAllocator* allocator) const {
+  return binding_utils::GetOutputNamesHelper(this->p_, allocator);  // the shared helper does the actual work
+}
+
+template <typename T>
+inline std::vector<Value> ConstIoBindingImpl<T>::GetOutputValues() const {
+  AllocatorWithDefaultOptions default_allocator;  // no allocator supplied by the caller
+  return binding_utils::GetOutputValuesHelper(this->p_, default_allocator);
+}
+
+template <typename T>
+inline std::vector<Value> ConstIoBindingImpl<T>::GetOutputValues(OrtAllocator* allocator) const {
+  return binding_utils::GetOutputValuesHelper(this->p_, allocator);  // the shared helper does the actual work
+}
+
+template <typename T>
+inline void IoBindingImpl<T>::BindInput(const char* name, const Value& value) {
+  ThrowOnError(GetApi().BindInput(this->p_, name, value));  // throws when the API reports an error
+}
+
+// Binds `value` to the output called `name` through the C API BindOutput.
+// The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void IoBindingImpl<T>::BindOutput(const char* name, const Value& value) {
+  auto* status = GetApi().BindOutput(this->p_, name, value);
+  ThrowOnError(status);
+}
+
+// Binds the output called `name` to the device described by `mem_info`, using the
+// C API BindOutputToDevice. The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void IoBindingImpl<T>::BindOutput(const char* name, const OrtMemoryInfo* mem_info) {
+  auto* status = GetApi().BindOutputToDevice(this->p_, name, mem_info);
+  ThrowOnError(status);
+}
+
+// Forwards to the C API ClearBoundInputs for the wrapped binding handle.
+template <typename T>
+inline void IoBindingImpl<T>::ClearBoundInputs() {
+  const auto& api = GetApi();
+  api.ClearBoundInputs(this->p_);
+}
+
+// Forwards to the C API ClearBoundOutputs for the wrapped binding handle.
+template <typename T>
+inline void IoBindingImpl<T>::ClearBoundOutputs() {
+  const auto& api = GetApi();
+  api.ClearBoundOutputs(this->p_);
+}
+
+// Calls the C API SynchronizeBoundInputs on the wrapped binding handle.
+// The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void IoBindingImpl<T>::SynchronizeInputs() {
+  auto* status = GetApi().SynchronizeBoundInputs(this->p_);
+  ThrowOnError(status);
+}
+
+// Calls the C API SynchronizeBoundOutputs on the wrapped binding handle.
+// The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void IoBindingImpl<T>::SynchronizeOutputs() {
+  auto* status = GetApi().SynchronizeBoundOutputs(this->p_);
+  ThrowOnError(status);
+}
+
+namespace binding_utils {
+// Collects the names of all outputs bound to `binding`.
+// GetBoundOutputNames hands back one contiguous character buffer plus an array of
+// per-name lengths; the names are copied into std::string objects and both buffers are
+// released through detail::AllocatedFree(allocator).
+inline std::vector<std::string> GetOutputNamesHelper(const OrtIoBinding* binding, OrtAllocator* allocator) {
+  std::vector<std::string> names;
+  auto releaser = detail::AllocatedFree(allocator);
+  using Guard = std::unique_ptr<void, decltype(releaser)>;
+
+  char* packed_names = nullptr;
+  size_t* name_lengths = nullptr;
+  size_t name_count = 0;
+  ThrowOnError(GetApi().GetBoundOutputNames(binding, allocator, &packed_names, &name_lengths, &name_count));
+
+  if (name_count == 0) {
+    return names;
+  }
+
+  // From here on the guards free both buffers on every exit path.
+  Guard packed_names_guard(packed_names, releaser);
+  Guard name_lengths_guard(name_lengths, releaser);
+
+  names.reserve(name_count);
+  const char* cursor = packed_names;
+  for (size_t idx = 0; idx < name_count; ++idx) {
+    const size_t len = name_lengths[idx];
+    // Names are stored back to back; each one is `len` characters long.
+    names.emplace_back(cursor, len);
+    cursor += len;
+  }
+  return names;
+}
+
+// Retrieves the OrtValue handles bound as outputs on `binding` and wraps each one in a Value.
+// `allocator` is passed to GetBoundOutputValues and is also used to free the returned
+// array of pointers.
+inline std::vector<Value> GetOutputValuesHelper(const OrtIoBinding* binding, OrtAllocator* allocator) {
+  std::vector<Value> result;
+  // Number of leading array entries that have already been handed over to `result`.
+  size_t owned = 0;
+  size_t output_count = 0;
+  // Lambda to release the buffer when no longer needed and
+  // make sure that we destroy all instances on exception
+  auto free_fn = [&owned, &output_count, allocator](OrtValue** buffer) {
+    if (buffer) {
+      // Entries in [owned, output_count) were not moved into `result`; release them here.
+      while (owned < output_count) {
+        auto* p = buffer + owned++;
+        GetApi().ReleaseValue(*p);
+      }
+      // The pointer array itself is always returned to the allocator.
+      allocator->Free(allocator, buffer);
+    }
+  };
+  using Ptr = std::unique_ptr<OrtValue*, decltype(free_fn)>;
+
+  OrtValue** output_buffer = nullptr;
+  ThrowOnError(GetApi().GetBoundOutputValues(binding, allocator, &output_buffer, &output_count));
+  if (output_count == 0) {
+    return result;
+  }
+
+  // Guard the array (and any not-yet-wrapped values) for the rest of the function.
+  Ptr buffer_g(output_buffer, free_fn);
+
+  result.reserve(output_count);
+  for (size_t i = 0; i < output_count; ++i) {
+    result.emplace_back(output_buffer[i]);
+    // Bump the counter only after the Value exists, so free_fn skips this entry.
+    ++owned;
+  }
+  return result;
+}
+
+}  // namespace binding_utils
+}  // namespace detail
+
+// Creates an IO binding for `session` through the C API CreateIoBinding; the new handle
+// is written into p_ and the returned status is forwarded to ThrowOnError.
+inline IoBinding::IoBinding(Session& session) {
+  auto* status = GetApi().CreateIoBinding(session, &this->p_);
+  ThrowOnError(status);
+}
+
+// Creates an arena configuration through the C API CreateArenaCfg, passing the four
+// arguments through unchanged; the new handle is written into p_.
+inline ArenaCfg::ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk) {
+  auto* status = GetApi().CreateArenaCfg(max_mem, arena_extend_strategy, initial_chunk_size_bytes,
+                                         max_dead_bytes_per_chunk, &p_);
+  ThrowOnError(status);
+}
+
+// Creates the underlying threading-options handle through the C API CreateThreadingOptions.
+inline ThreadingOptions::ThreadingOptions() {
+  auto* status = GetApi().CreateThreadingOptions(&p_);
+  ThrowOnError(status);
+}
+
+// Forwards `intra_op_num_threads` to the C API SetGlobalIntraOpNumThreads.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalIntraOpNumThreads(int intra_op_num_threads) {
+  auto* status = GetApi().SetGlobalIntraOpNumThreads(p_, intra_op_num_threads);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `inter_op_num_threads` to the C API SetGlobalInterOpNumThreads.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalInterOpNumThreads(int inter_op_num_threads) {
+  auto* status = GetApi().SetGlobalInterOpNumThreads(p_, inter_op_num_threads);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `allow_spinning` to the C API SetGlobalSpinControl.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalSpinControl(int allow_spinning) {
+  auto* status = GetApi().SetGlobalSpinControl(p_, allow_spinning);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API SetGlobalDenormalAsZero on the wrapped handle.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalDenormalAsZero() {
+  auto* status = GetApi().SetGlobalDenormalAsZero(p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Registers `ort_custom_create_thread_fn` via the C API SetGlobalCustomCreateThreadFn.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn) {
+  auto* status = GetApi().SetGlobalCustomCreateThreadFn(p_, ort_custom_create_thread_fn);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the opaque `ort_custom_thread_creation_options` pointer to the C API
+// SetGlobalCustomThreadCreationOptions. Returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalCustomThreadCreationOptions(void* ort_custom_thread_creation_options) {
+  auto* status = GetApi().SetGlobalCustomThreadCreationOptions(p_, ort_custom_thread_creation_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Registers `ort_custom_join_thread_fn` via the C API SetGlobalCustomJoinThreadFn.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline ThreadingOptions& ThreadingOptions::SetGlobalCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn) {
+  auto* status = GetApi().SetGlobalCustomJoinThreadFn(p_, ort_custom_join_thread_fn);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Creates the environment through the C API CreateEnv, then records the language
+// projection: NODEJS when `logid` equals "onnxruntime-node", CPLUSPLUS otherwise.
+inline Env::Env(OrtLoggingLevel logging_level, _In_ const char* logid) {
+  ThrowOnError(GetApi().CreateEnv(logging_level, logid, &p_));
+  const auto projection = (strcmp(logid, "onnxruntime-node") == 0)
+                              ? OrtLanguageProjection::ORT_PROJECTION_NODEJS
+                              : OrtLanguageProjection::ORT_PROJECTION_CPLUSPLUS;
+  ThrowOnError(GetApi().SetLanguageProjection(p_, projection));
+}
+
+// Creates the environment through the C API CreateEnvWithCustomLogger, then records the
+// language projection: NODEJS when `logid` equals "onnxruntime-node", CPLUSPLUS otherwise.
+inline Env::Env(OrtLoggingLevel logging_level, const char* logid, OrtLoggingFunction logging_function, void* logger_param) {
+  ThrowOnError(GetApi().CreateEnvWithCustomLogger(logging_function, logger_param, logging_level, logid, &p_));
+  const auto projection = (strcmp(logid, "onnxruntime-node") == 0)
+                              ? OrtLanguageProjection::ORT_PROJECTION_NODEJS
+                              : OrtLanguageProjection::ORT_PROJECTION_CPLUSPLUS;
+  ThrowOnError(GetApi().SetLanguageProjection(p_, projection));
+}
+
+// Creates the environment through the C API CreateEnvWithGlobalThreadPools, then records the
+// language projection: NODEJS when `logid` equals "onnxruntime-node", CPLUSPLUS otherwise.
+inline Env::Env(const OrtThreadingOptions* tp_options, OrtLoggingLevel logging_level, _In_ const char* logid) {
+  ThrowOnError(GetApi().CreateEnvWithGlobalThreadPools(logging_level, logid, tp_options, &p_));
+  const auto projection = (strcmp(logid, "onnxruntime-node") == 0)
+                              ? OrtLanguageProjection::ORT_PROJECTION_NODEJS
+                              : OrtLanguageProjection::ORT_PROJECTION_CPLUSPLUS;
+  ThrowOnError(GetApi().SetLanguageProjection(p_, projection));
+}
+
+// Creates the environment through the C API CreateEnvWithCustomLoggerAndGlobalThreadPools,
+// then records the language projection: NODEJS when `logid` equals "onnxruntime-node",
+// CPLUSPLUS otherwise.
+inline Env::Env(const OrtThreadingOptions* tp_options, OrtLoggingFunction logging_function, void* logger_param,
+                OrtLoggingLevel logging_level, _In_ const char* logid) {
+  ThrowOnError(GetApi().CreateEnvWithCustomLoggerAndGlobalThreadPools(logging_function, logger_param, logging_level, logid, tp_options, &p_));
+  const auto projection = (strcmp(logid, "onnxruntime-node") == 0)
+                              ? OrtLanguageProjection::ORT_PROJECTION_NODEJS
+                              : OrtLanguageProjection::ORT_PROJECTION_CPLUSPLUS;
+  ThrowOnError(GetApi().SetLanguageProjection(p_, projection));
+}
+
+// Calls the C API EnableTelemetryEvents on the wrapped environment.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline Env& Env::EnableTelemetryEvents() {
+  auto* status = GetApi().EnableTelemetryEvents(p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API DisableTelemetryEvents on the wrapped environment.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline Env& Env::DisableTelemetryEvents() {
+  auto* status = GetApi().DisableTelemetryEvents(p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `log_severity_level` to the C API UpdateEnvWithCustomLogLevel.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline Env& Env::UpdateEnvWithCustomLogLevel(OrtLoggingLevel log_severity_level) {
+  auto* status = GetApi().UpdateEnvWithCustomLogLevel(p_, log_severity_level);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `mem_info` and `arena_cfg` to the C API CreateAndRegisterAllocator for this
+// environment. The status goes through ThrowOnError; returns *this for chaining.
+inline Env& Env::CreateAndRegisterAllocator(const OrtMemoryInfo* mem_info, const OrtArenaCfg* arena_cfg) {
+  auto* status = GetApi().CreateAndRegisterAllocator(p_, mem_info, arena_cfg);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Flattens `options` into parallel arrays of C-string keys and values and passes them,
+// together with the provider type, memory info and arena config, to the C API
+// CreateAndRegisterAllocatorV2. The pointers refer to the strings owned by `options`.
+// Returns *this so calls can be chained.
+inline Env& Env::CreateAndRegisterAllocatorV2(const std::string& provider_type, const OrtMemoryInfo* mem_info, const std::unordered_map<std::string, std::string>& options, const OrtArenaCfg* arena_cfg) {
+  std::vector<const char*> option_keys, option_values;
+  const auto option_count = options.size();
+  if (option_count > 0) {
+    option_keys.reserve(option_count);
+    option_values.reserve(option_count);
+    for (auto it = options.begin(); it != options.end(); ++it) {
+      option_keys.push_back(it->first.c_str());
+      option_values.push_back(it->second.c_str());
+    }
+  }
+  auto* status = GetApi().CreateAndRegisterAllocatorV2(p_, provider_type.c_str(), mem_info, arena_cfg,
+                                                       option_keys.data(), option_values.data(), option_count);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Creates the custom-op domain handle for `domain` through the C API CreateCustomOpDomain.
+inline CustomOpDomain::CustomOpDomain(const char* domain) {
+  auto* status = GetApi().CreateCustomOpDomain(domain, &p_);
+  ThrowOnError(status);
+}
+
+// Adds `op` to this domain through the C API CustomOpDomain_Add.
+// The returned status is forwarded to ThrowOnError.
+inline void CustomOpDomain::Add(const OrtCustomOp* op) {
+  auto* status = GetApi().CustomOpDomain_Add(p_, op);
+  ThrowOnError(status);
+}
+
+// Creates the underlying run-options handle through the C API CreateRunOptions.
+inline RunOptions::RunOptions() {
+  auto* status = GetApi().CreateRunOptions(&p_);
+  ThrowOnError(status);
+}
+
+// Forwards `level` to the C API RunOptionsSetRunLogVerbosityLevel.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::SetRunLogVerbosityLevel(int level) {
+  auto* status = GetApi().RunOptionsSetRunLogVerbosityLevel(p_, level);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `level` to the C API RunOptionsSetRunLogSeverityLevel.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::SetRunLogSeverityLevel(int level) {
+  auto* status = GetApi().RunOptionsSetRunLogSeverityLevel(p_, level);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Returns the value reported by the C API RunOptionsGetRunLogVerbosityLevel.
+// The call's status is forwarded to ThrowOnError.
+inline int RunOptions::GetRunLogVerbosityLevel() const {
+  int verbosity;
+  auto* status = GetApi().RunOptionsGetRunLogVerbosityLevel(p_, &verbosity);
+  ThrowOnError(status);
+  return verbosity;
+}
+
+// Returns the value reported by the C API RunOptionsGetRunLogSeverityLevel.
+// The call's status is forwarded to ThrowOnError.
+inline int RunOptions::GetRunLogSeverityLevel() const {
+  int severity;
+  auto* status = GetApi().RunOptionsGetRunLogSeverityLevel(p_, &severity);
+  ThrowOnError(status);
+  return severity;
+}
+
+// Forwards `run_tag` to the C API RunOptionsSetRunTag.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::SetRunTag(const char* run_tag) {
+  auto* status = GetApi().RunOptionsSetRunTag(p_, run_tag);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Returns the pointer reported by the C API RunOptionsGetRunTag.
+// The call's status is forwarded to ThrowOnError.
+inline const char* RunOptions::GetRunTag() const {
+  const char* tag;
+  auto* status = GetApi().RunOptionsGetRunTag(p_, &tag);
+  ThrowOnError(status);
+  return tag;
+}
+
+// Forwards the `config_key` / `config_value` pair to the C API AddRunConfigEntry.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::AddConfigEntry(const char* config_key, const char* config_value) {
+  auto* status = GetApi().AddRunConfigEntry(p_, config_key, config_value);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API RunOptionsSetTerminate on the wrapped handle.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::SetTerminate() {
+  auto* status = GetApi().RunOptionsSetTerminate(p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API RunOptionsUnsetTerminate on the wrapped handle.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+inline RunOptions& RunOptions::UnsetTerminate() {
+  auto* status = GetApi().RunOptionsUnsetTerminate(p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+namespace detail {
+
+// Produces a new SessionOptions from the handle returned by the C API CloneSessionOptions.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline Ort::SessionOptions ConstSessionOptionsImpl<T>::Clone() const {
+  OrtSessionOptions* cloned;
+  auto* status = GetApi().CloneSessionOptions(this->p_, &cloned);
+  ThrowOnError(status);
+  return SessionOptions{cloned};
+}
+
+// Fetches the string stored under `config_key` with two calls to the C API
+// GetSessionConfigEntry: the first (null buffer) obtains the size, the second fills a
+// string of that size. The trailing '\0' is then trimmed from the result.
+template <typename T>
+inline std::string ConstSessionOptionsImpl<T>::GetConfigEntry(const char* config_key) const {
+  size_t required = 0;
+  // Feed nullptr for the data buffer to query the true size of the string value
+  Ort::ThrowOnError(GetApi().GetSessionConfigEntry(this->p_, config_key, nullptr, &required));
+
+  std::string value(required, '\0');
+  Ort::ThrowOnError(GetApi().GetSessionConfigEntry(this->p_, config_key, &value[0], &required));
+  value.resize(required - 1);  // remove the terminating character '\0'
+
+  return value;
+}
+
+// Returns true when the C API HasSessionConfigEntry reports a non-zero flag for `config_key`.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline bool ConstSessionOptionsImpl<T>::HasConfigEntry(const char* config_key) const {
+  int present = 0;
+  Ort::ThrowOnError(GetApi().HasSessionConfigEntry(this->p_, config_key, &present));
+  return present != 0;
+}
+
+// Returns the entry stored under `config_key` when HasConfigEntry reports it exists,
+// and a copy of `def` otherwise.
+template <typename T>
+inline std::string ConstSessionOptionsImpl<T>::GetConfigEntryOrDefault(const char* config_key, const std::string& def) {
+  return this->HasConfigEntry(config_key) ? this->GetConfigEntry(config_key) : def;
+}
+
+// Forwards `intra_op_num_threads` to the C API SetIntraOpNumThreads.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetIntraOpNumThreads(int intra_op_num_threads) {
+  auto* status = GetApi().SetIntraOpNumThreads(this->p_, intra_op_num_threads);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `inter_op_num_threads` to the C API SetInterOpNumThreads.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetInterOpNumThreads(int inter_op_num_threads) {
+  auto* status = GetApi().SetInterOpNumThreads(this->p_, inter_op_num_threads);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `graph_optimization_level` to the C API SetSessionGraphOptimizationLevel.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetGraphOptimizationLevel(GraphOptimizationLevel graph_optimization_level) {
+  auto* status = GetApi().SetSessionGraphOptimizationLevel(this->p_, graph_optimization_level);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `optimized_model_filepath` to the C API SetOptimizedModelFilePath.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetOptimizedModelFilePath(const ORTCHAR_T* optimized_model_filepath) {
+  auto* status = GetApi().SetOptimizedModelFilePath(this->p_, optimized_model_filepath);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `profile_file_prefix` to the C API EnableProfiling.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::EnableProfiling(const ORTCHAR_T* profile_file_prefix) {
+  auto* status = GetApi().EnableProfiling(this->p_, profile_file_prefix);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API DisableProfiling on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::DisableProfiling() {
+  auto* status = GetApi().DisableProfiling(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API EnableOrtCustomOps on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::EnableOrtCustomOps() {
+  auto* status = GetApi().EnableOrtCustomOps(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API EnableMemPattern on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::EnableMemPattern() {
+  auto* status = GetApi().EnableMemPattern(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API DisableMemPattern on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::DisableMemPattern() {
+  auto* status = GetApi().DisableMemPattern(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API EnableCpuMemArena on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::EnableCpuMemArena() {
+  auto* status = GetApi().EnableCpuMemArena(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API DisableCpuMemArena on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::DisableCpuMemArena() {
+  auto* status = GetApi().DisableCpuMemArena(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `execution_mode` to the C API SetSessionExecutionMode.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetExecutionMode(ExecutionMode execution_mode) {
+  auto* status = GetApi().SetSessionExecutionMode(this->p_, execution_mode);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `logid` to the C API SetSessionLogId.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetLogId(const char* logid) {
+  auto* status = GetApi().SetSessionLogId(this->p_, logid);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `level` to the C API SetSessionLogSeverityLevel.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetLogSeverityLevel(int level) {
+  auto* status = GetApi().SetSessionLogSeverityLevel(this->p_, level);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Attaches `custom_op_domain` to these session options via the C API AddCustomOpDomain.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::Add(OrtCustomOpDomain* custom_op_domain) {
+  auto* status = GetApi().AddCustomOpDomain(this->p_, custom_op_domain);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards the `config_key` / `config_value` pair to the C API AddSessionConfigEntry.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AddConfigEntry(const char* config_key, const char* config_value) {
+  auto* status = GetApi().AddSessionConfigEntry(this->p_, config_key, config_value);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `name` and `ort_val` to the C API AddInitializer.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AddInitializer(const char* name, const OrtValue* ort_val) {
+  auto* status = GetApi().AddInitializer(this->p_, name, ort_val);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Calls the C API DisablePerSessionThreads on the wrapped session options.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::DisablePerSessionThreads() {
+  auto* status = GetApi().DisablePerSessionThreads(this->p_);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Registers external initializers given as parallel `names` / `ort_values` vectors.
+// Raises ORT_INVALID_ARGUMENT via ORT_CXX_API_THROW when the vectors differ in length;
+// otherwise builds raw pointer arrays and passes them to the C API AddExternalInitializers.
+// Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AddExternalInitializers(const std::vector<std::string>& names,
+                                                                             const std::vector<Value>& ort_values) {
+  const size_t initializer_count = names.size();
+  if (ort_values.size() != initializer_count) {
+    ORT_CXX_API_THROW("Expecting names and ort_values to have the same length", ORT_INVALID_ARGUMENT);
+  }
+
+  std::vector<const char*> raw_names;
+  raw_names.reserve(initializer_count);
+  std::vector<const OrtValue*> raw_values;
+  raw_values.reserve(initializer_count);
+
+  for (size_t idx = 0; idx < initializer_count; ++idx) {
+    raw_names.push_back(names[idx].c_str());
+    raw_values.push_back(ort_values[idx]);
+  }
+
+  auto* status = GetApi().AddExternalInitializers(this->p_, raw_names.data(), raw_values.data(), initializer_count);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_CUDA. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_CUDA(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_CUDA_V2. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_CUDA_V2(const OrtCUDAProviderOptionsV2& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_CUDA_V2(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_ROCM. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_ROCM(const OrtROCMProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_ROCM(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_TensorRT. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_TensorRT(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_TensorRT_V2. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_TensorRT_V2(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_MIGraphX. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_MIGraphX(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_CANN. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_CANN(const OrtCANNProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_CANN(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_Dnnl. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_Dnnl(const OrtDnnlProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_Dnnl(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Flattens `provider_options` into parallel arrays of C-string keys and values and passes
+// them with `provider_name` to the C API SessionOptionsAppendExecutionProvider.
+// The pointers refer to the strings owned by `provider_options`.
+// Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider(
+    const std::string& provider_name,
+    const std::unordered_map<std::string, std::string>& provider_options) {
+  const auto option_count = provider_options.size();
+  std::vector<const char*> option_keys, option_values;
+  if (option_count > 0) {
+    option_keys.reserve(option_count);
+    option_values.reserve(option_count);
+
+    for (auto it = provider_options.begin(); it != provider_options.end(); ++it) {
+      option_keys.push_back(it->first.c_str());
+      option_values.push_back(it->second.c_str());
+    }
+  }
+
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider(this->p_, provider_name.c_str(),
+                                                                option_keys.data(), option_values.data(),
+                                                                option_count);
+  ThrowOnError(status);
+
+  return *this;
+}
+
+// Registers `ort_custom_create_thread_fn` via the C API SessionOptionsSetCustomCreateThreadFn.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetCustomCreateThreadFn(OrtCustomCreateThreadFn ort_custom_create_thread_fn) {
+  auto* status = GetApi().SessionOptionsSetCustomCreateThreadFn(this->p_, ort_custom_create_thread_fn);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the opaque `ort_custom_thread_creation_options` pointer to the C API
+// SessionOptionsSetCustomThreadCreationOptions. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetCustomThreadCreationOptions(void* ort_custom_thread_creation_options) {
+  auto* status = GetApi().SessionOptionsSetCustomThreadCreationOptions(this->p_, ort_custom_thread_creation_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Registers `ort_custom_join_thread_fn` via the C API SessionOptionsSetCustomJoinThreadFn.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::SetCustomJoinThreadFn(OrtCustomJoinThreadFn ort_custom_join_thread_fn) {
+  auto* status = GetApi().SessionOptionsSetCustomJoinThreadFn(this->p_, ort_custom_join_thread_fn);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Passes the address of `provider_options` to the C API
+// SessionOptionsAppendExecutionProvider_OpenVINO. Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options) {
+  auto* status = GetApi().SessionOptionsAppendExecutionProvider_OpenVINO(this->p_, &provider_options);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Adds every flattened entry of `custom_op_configs` as a session config entry and then
+// registers the library `library_name` through the C API RegisterCustomOpsLibrary_V2.
+// Returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::RegisterCustomOpsLibrary(const ORTCHAR_T* library_name,
+                                                                              const CustomOpConfigs& custom_op_configs) {
+  // Add custom op config entries before registering the custom op library. Otherwise, the config entries _may_ be ignored by
+  // the custom op library.
+  const auto& flattened = custom_op_configs.GetFlattenedConfigs();
+  for (auto it = flattened.begin(); it != flattened.end(); ++it) {
+    AddConfigEntry(it->first.c_str(), it->second.c_str());
+  }
+
+  auto* status = GetApi().RegisterCustomOpsLibrary_V2(this->p_, library_name);
+  ThrowOnError(status);
+  return *this;
+}
+
+// Forwards `registration_function_name` to the C API RegisterCustomOpsUsingFunction.
+// The status goes through ThrowOnError; returns *this so calls can be chained.
+template <typename T>
+inline SessionOptionsImpl<T>& SessionOptionsImpl<T>::RegisterCustomOpsUsingFunction(const char* registration_function_name) {
+  auto* status = GetApi().RegisterCustomOpsUsingFunction(this->p_, registration_function_name);
+  ThrowOnError(status);
+  return *this;
+}
+
+/// Session
+// Returns the count reported by the C API SessionGetInputCount.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline size_t ConstSessionImpl<T>::GetInputCount() const {
+  size_t input_count;
+  auto* status = GetApi().SessionGetInputCount(this->p_, &input_count);
+  ThrowOnError(status);
+  return input_count;
+}
+
+// Returns the count reported by the C API SessionGetOutputCount.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline size_t ConstSessionImpl<T>::GetOutputCount() const {
+  size_t output_count;
+  auto* status = GetApi().SessionGetOutputCount(this->p_, &output_count);
+  ThrowOnError(status);
+  return output_count;
+}
+
+// Returns the count reported by the C API SessionGetOverridableInitializerCount.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline size_t ConstSessionImpl<T>::GetOverridableInitializerCount() const {
+  size_t initializer_count;
+  auto* status = GetApi().SessionGetOverridableInitializerCount(this->p_, &initializer_count);
+  ThrowOnError(status);
+  return initializer_count;
+}
+
+// Obtains the name of input `index` from the C API SessionGetInputName using `allocator`
+// and returns it in an AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+template <typename T>
+inline AllocatedStringPtr ConstSessionImpl<T>::GetInputNameAllocated(size_t index, OrtAllocator* allocator) const {
+  char* raw_name;
+  auto* status = GetApi().SessionGetInputName(this->p_, index, allocator, &raw_name);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_name, detail::AllocatedFree(allocator));
+}
+
+// Obtains the name of output `index` from the C API SessionGetOutputName using `allocator`
+// and returns it in an AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+template <typename T>
+inline AllocatedStringPtr ConstSessionImpl<T>::GetOutputNameAllocated(size_t index, OrtAllocator* allocator) const {
+  char* raw_name;
+  auto* status = GetApi().SessionGetOutputName(this->p_, index, allocator, &raw_name);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_name, detail::AllocatedFree(allocator));
+}
+
+// Obtains the name at `index` from the C API SessionGetOverridableInitializerName using
+// `allocator` and returns it in an AllocatedStringPtr whose deleter is
+// detail::AllocatedFree(allocator).
+template <typename T>
+inline AllocatedStringPtr ConstSessionImpl<T>::GetOverridableInitializerNameAllocated(size_t index, OrtAllocator* allocator) const {
+  char* raw_name;
+  auto* status = GetApi().SessionGetOverridableInitializerName(this->p_, index, allocator, &raw_name);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_name, detail::AllocatedFree(allocator));
+}
+
+// Returns the value reported by the C API SessionGetProfilingStartTimeNs.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline uint64_t ConstSessionImpl<T>::GetProfilingStartTimeNs() const {
+  uint64_t start_time_ns;
+  auto* status = GetApi().SessionGetProfilingStartTimeNs(this->p_, &start_time_ns);
+  ThrowOnError(status);
+  return start_time_ns;
+}
+
+// Wraps the handle returned by the C API SessionGetModelMetadata in a ModelMetadata.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline ModelMetadata ConstSessionImpl<T>::GetModelMetadata() const {
+  OrtModelMetadata* raw_metadata;
+  auto* status = GetApi().SessionGetModelMetadata(this->p_, &raw_metadata);
+  ThrowOnError(status);
+  return ModelMetadata{raw_metadata};
+}
+
+// Wraps the handle returned by the C API SessionGetInputTypeInfo for input `index`
+// in a TypeInfo. The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline TypeInfo ConstSessionImpl<T>::GetInputTypeInfo(size_t index) const {
+  OrtTypeInfo* raw_info;
+  auto* status = GetApi().SessionGetInputTypeInfo(this->p_, index, &raw_info);
+  ThrowOnError(status);
+  return TypeInfo{raw_info};
+}
+
+// Wraps the handle returned by the C API SessionGetOutputTypeInfo for output `index`
+// in a TypeInfo. The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline TypeInfo ConstSessionImpl<T>::GetOutputTypeInfo(size_t index) const {
+  OrtTypeInfo* raw_info;
+  auto* status = GetApi().SessionGetOutputTypeInfo(this->p_, index, &raw_info);
+  ThrowOnError(status);
+  return TypeInfo{raw_info};
+}
+
+// Wraps the handle returned by the C API SessionGetOverridableInitializerTypeInfo for
+// `index` in a TypeInfo. The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline TypeInfo ConstSessionImpl<T>::GetOverridableInitializerTypeInfo(size_t index) const {
+  OrtTypeInfo* raw_info;
+  auto* status = GetApi().SessionGetOverridableInitializerTypeInfo(this->p_, index, &raw_info);
+  ThrowOnError(status);
+  return TypeInfo{raw_info};
+}
+
+// Convenience overload: prepares `output_count` Value objects constructed from nullptr,
+// lets the pointer-based Run overload fill them, and returns them as a vector.
+template <typename T>
+inline std::vector<Value> SessionImpl<T>::Run(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+                                              const char* const* output_names, size_t output_count) {
+  std::vector<Value> outputs;
+  outputs.reserve(output_count);
+  while (outputs.size() < output_count) {
+    outputs.emplace_back(nullptr);
+  }
+  Run(run_options, input_names, input_values, input_count, output_names, outputs.data(), output_count);
+  return outputs;
+}
+
+// Runs the session through the C API Run. The Value arrays are reinterpreted as arrays of
+// OrtValue pointers (guarded by the static_assert on their size); the returned status is
+// forwarded to ThrowOnError.
+template <typename T>
+inline void SessionImpl<T>::Run(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+                                const char* const* output_names, Value* output_values, size_t output_count) {
+  static_assert(sizeof(Value) == sizeof(OrtValue*), "Value is really just an array of OrtValue* in memory, so we can reinterpret_cast safely");
+  auto raw_inputs = reinterpret_cast<const OrtValue* const*>(input_values);
+  auto raw_outputs = reinterpret_cast<OrtValue**>(output_values);
+  auto* status = GetApi().Run(this->p_, run_options, input_names, raw_inputs, input_count,
+                              output_names, output_count, raw_outputs);
+  ThrowOnError(status);
+}
+
+// Runs the session with inputs/outputs taken from `io_binding`, via the C API RunWithBinding.
+// The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void SessionImpl<T>::Run(const RunOptions& run_options, const IoBinding& io_binding) {
+  auto* status = GetApi().RunWithBinding(this->p_, run_options, io_binding);
+  ThrowOnError(status);
+}
+
+// Starts an asynchronous run through the C API RunAsync.
+// `input_values` / `output_values` are arrays of Value that are reinterpreted as arrays of
+// OrtValue pointers; `callback` and `user_data` are handed to the C API unchanged.
+// The status returned by the RunAsync call itself is forwarded to ThrowOnError.
+template <typename T>
+inline void SessionImpl<T>::RunAsync(const RunOptions& run_options, const char* const* input_names, const Value* input_values, size_t input_count,
+                                     const char* const* output_names, Value* output_values, size_t output_count, RunAsyncCallbackFn callback, void* user_data) {
+  // The casts below are only valid while Value has the size of a single OrtValue pointer;
+  // the synchronous Run overload asserts the same invariant before casting.
+  static_assert(sizeof(Value) == sizeof(OrtValue*), "Value is really just an array of OrtValue* in memory, so we can reinterpret_cast safely");
+  auto ort_input_values = reinterpret_cast<const OrtValue* const*>(input_values);
+  auto ort_output_values = reinterpret_cast<OrtValue**>(output_values);
+  ThrowOnError(GetApi().RunAsync(this->p_, run_options, input_names,
+                                 ort_input_values, input_count, output_names, output_count,
+                                 ort_output_values, callback, user_data));
+}
+
+// Calls the C API SessionEndProfiling with `allocator` and returns the resulting string
+// in an AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+template <typename T>
+inline AllocatedStringPtr SessionImpl<T>::EndProfilingAllocated(OrtAllocator* allocator) {
+  char* profile_name = nullptr;
+  auto* status = GetApi().SessionEndProfiling(this->p_, allocator, &profile_name);
+  ThrowOnError(status);
+  return AllocatedStringPtr(profile_name, detail::AllocatedFree(allocator));
+}
+
+}  // namespace detail
+
+// Creates the underlying session-options handle through the C API CreateSessionOptions.
+inline SessionOptions::SessionOptions() {
+  auto* status = GetApi().CreateSessionOptions(&this->p_);
+  ThrowOnError(status);
+}
+
+/// CustomOpConfigs
+// Builds the flattened config key "custom_op.<custom_op_name>.<config>".
+inline std::string detail::MakeCustomOpConfigEntryKey(const char* custom_op_name, const char* config) {
+  std::string flat_key("custom_op.");
+  flat_key.append(custom_op_name).append(".").append(config);
+  return flat_key;
+}
+
+// Stores `config_value` in flat_configs_ under the key produced by
+// detail::MakeCustomOpConfigEntryKey(custom_op_name, config_key), replacing any existing
+// value for that key. Returns *this so calls can be chained.
+inline CustomOpConfigs& CustomOpConfigs::AddConfig(const char* custom_op_name, const char* config_key, const char* config_value) {
+  flat_configs_[detail::MakeCustomOpConfigEntryKey(custom_op_name, config_key)] = config_value;
+  return *this;
+}
+
+// Read-only access to the flattened key/value map held in flat_configs_.
+inline const std::unordered_map<std::string, std::string>& CustomOpConfigs::GetFlattenedConfigs() const {
+  return this->flat_configs_;
+}
+
+// Creates the session from `model_path` through the C API CreateSession; the new handle is
+// written into p_ and the returned status is forwarded to ThrowOnError.
+inline Session::Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options) {
+  auto* status = GetApi().CreateSession(env, model_path, options, &this->p_);
+  ThrowOnError(status);
+}
+
+// Creates the session from `model_path` through the C API
+// CreateSessionWithPrepackedWeightsContainer, passing `prepacked_weights_container` along.
+inline Session::Session(const Env& env, const ORTCHAR_T* model_path, const SessionOptions& options,
+                        OrtPrepackedWeightsContainer* prepacked_weights_container) {
+  auto* status = GetApi().CreateSessionWithPrepackedWeightsContainer(env, model_path, options,
+                                                                     prepacked_weights_container, &this->p_);
+  ThrowOnError(status);
+}
+
+// Creates the session from an in-memory buffer (`model_data`, `model_data_length`) through
+// the C API CreateSessionFromArray; the new handle is written into p_.
+inline Session::Session(const Env& env, const void* model_data, size_t model_data_length, const SessionOptions& options) {
+  auto* status = GetApi().CreateSessionFromArray(env, model_data, model_data_length, options, &this->p_);
+  ThrowOnError(status);
+}
+
+// Creates the session from an in-memory buffer through the C API
+// CreateSessionFromArrayWithPrepackedWeightsContainer, passing
+// `prepacked_weights_container` along.
+inline Session::Session(const Env& env, const void* model_data, size_t model_data_length,
+                        const SessionOptions& options, OrtPrepackedWeightsContainer* prepacked_weights_container) {
+  auto* status = GetApi().CreateSessionFromArrayWithPrepackedWeightsContainer(
+      env, model_data, model_data_length, options, prepacked_weights_container, &this->p_);
+  ThrowOnError(status);
+}
+
+// Returns the string produced by the C API ModelMetadataGetProducerName in an
+// AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr ModelMetadata::GetProducerNameAllocated(OrtAllocator* allocator) const {
+  char* raw_text;
+  auto* status = GetApi().ModelMetadataGetProducerName(p_, allocator, &raw_text);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_text, detail::AllocatedFree(allocator));
+}
+
+// Returns the string produced by the C API ModelMetadataGetGraphName in an
+// AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr ModelMetadata::GetGraphNameAllocated(OrtAllocator* allocator) const {
+  char* raw_text;
+  auto* status = GetApi().ModelMetadataGetGraphName(p_, allocator, &raw_text);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_text, detail::AllocatedFree(allocator));
+}
+
+// Returns the string produced by the C API ModelMetadataGetDomain in an
+// AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr ModelMetadata::GetDomainAllocated(OrtAllocator* allocator) const {
+  char* raw_text;
+  auto* status = GetApi().ModelMetadataGetDomain(p_, allocator, &raw_text);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_text, detail::AllocatedFree(allocator));
+}
+
+// Returns the string produced by the C API ModelMetadataGetDescription in an
+// AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr Ort::ModelMetadata::GetDescriptionAllocated(OrtAllocator* allocator) const {
+  char* raw_text;
+  auto* status = GetApi().ModelMetadataGetDescription(p_, allocator, &raw_text);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_text, detail::AllocatedFree(allocator));
+}
+
+// Returns the string produced by the C API ModelMetadataGetGraphDescription in an
+// AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr ModelMetadata::GetGraphDescriptionAllocated(OrtAllocator* allocator) const {
+  char* raw_text;
+  auto* status = GetApi().ModelMetadataGetGraphDescription(p_, allocator, &raw_text);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_text, detail::AllocatedFree(allocator));
+}
+
+// Looks up `key` through the C API ModelMetadataLookupCustomMetadataMap and returns the
+// resulting pointer in an AllocatedStringPtr whose deleter is detail::AllocatedFree(allocator).
+inline AllocatedStringPtr ModelMetadata::LookupCustomMetadataMapAllocated(const char* key, OrtAllocator* allocator) const {
+  char* raw_value;
+  auto* status = GetApi().ModelMetadataLookupCustomMetadataMap(p_, allocator, key, &raw_value);
+  ThrowOnError(status);
+  return AllocatedStringPtr(raw_value, detail::AllocatedFree(allocator));
+}
+
+// Returns the keys reported by the C API ModelMetadataGetCustomMetadataMapKeys, each wrapped
+// in an AllocatedStringPtr that frees it through detail::AllocatedFree(allocator).
+// An empty vector is returned when the reported key count is zero or negative.
+inline std::vector<AllocatedStringPtr> ModelMetadata::GetCustomMetadataMapKeysAllocated(OrtAllocator* allocator) const {
+  auto deletor = detail::AllocatedFree(allocator);
+  std::vector<AllocatedStringPtr> result;
+
+  char** out = nullptr;
+  int64_t num_keys = 0;
+  ThrowOnError(GetApi().ModelMetadataGetCustomMetadataMapKeys(p_, allocator, &out, &num_keys));
+  if (num_keys <= 0) {
+    return result;
+  }
+
+  // array of pointers will be freed
+  std::unique_ptr<void, decltype(deletor)> array_guard(out, deletor);
+  // reserve may throw
+  // Until reserve() has succeeded, this second guard frees each individual key string.
+  auto strings_deletor = [&deletor, num_keys](char** out) { for(int64_t i = 0; i < num_keys; ++i) deletor(out[i]); };
+  std::unique_ptr<char*, decltype(strings_deletor)> strings_guard(out, strings_deletor);
+  result.reserve(static_cast<size_t>(num_keys));
+  // Capacity is in place; disarm the string guard and hand each string to `result` below.
+  strings_guard.release();
+  for (int64_t i = 0; i < num_keys; ++i) {
+    result.push_back(AllocatedStringPtr(out[i], deletor));
+  }
+
+  return result;
+}
+
+// Returns the value reported by the C API ModelMetadataGetVersion.
+// The call's status is forwarded to ThrowOnError.
+inline int64_t ModelMetadata::GetVersion() const {
+  int64_t version;
+  auto* status = GetApi().ModelMetadataGetVersion(p_, &version);
+  ThrowOnError(status);
+  return version;
+}
+
+namespace detail {
+
+// Returns the element data type reported by the C API GetTensorElementType.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline ONNXTensorElementDataType TensorTypeAndShapeInfoImpl<T>::GetElementType() const {
+  ONNXTensorElementDataType element_type;
+  auto* status = GetApi().GetTensorElementType(this->p_, &element_type);
+  ThrowOnError(status);
+  return element_type;
+}
+
+// Returns the element count reported by the C API GetTensorShapeElementCount.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline size_t TensorTypeAndShapeInfoImpl<T>::GetElementCount() const {
+  size_t element_count;
+  auto* status = GetApi().GetTensorShapeElementCount(this->p_, &element_count);
+  ThrowOnError(status);
+  return element_count;
+}
+
+// Returns the number of dimensions reported by the C API GetDimensionsCount.
+// The call's status is forwarded to ThrowOnError.
+template <typename T>
+inline size_t TensorTypeAndShapeInfoImpl<T>::GetDimensionsCount() const {
+  size_t dim_count;
+  auto* status = GetApi().GetDimensionsCount(this->p_, &dim_count);
+  ThrowOnError(status);
+  return dim_count;
+}
+
+// Forwards the caller's `values` buffer and `values_count` to the C API GetDimensions.
+// The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void TensorTypeAndShapeInfoImpl<T>::GetDimensions(int64_t* values, size_t values_count) const {
+  auto* status = GetApi().GetDimensions(this->p_, values, values_count);
+  ThrowOnError(status);
+}
+
+// Forwards the caller's `values` buffer and `values_count` to the C API
+// GetSymbolicDimensions. The returned status is forwarded to ThrowOnError.
+template <typename T>
+inline void TensorTypeAndShapeInfoImpl<T>::GetSymbolicDimensions(const char** values, size_t values_count) const {
+  auto* status = GetApi().GetSymbolicDimensions(this->p_, values, values_count);
+  ThrowOnError(status);
+}
+
+template <typename T>
+inline std::vector<int64_t> TensorTypeAndShapeInfoImpl<T>::GetShape() const {
+  std::vector<int64_t> out(GetDimensionsCount(), 0);
+  ThrowOnError(GetApi().GetDimensions(this->p_, out.data(), out.size()));
+  return out;
+}
+
+template <typename T>
+inline ConstTensorTypeAndShapeInfo TypeInfoImpl<T>::GetTensorTypeAndShapeInfo() const {
+  const OrtTensorTypeAndShapeInfo* out;
+  ThrowOnError(GetApi().CastTypeInfoToTensorInfo(this->p_, &out));
+  return ConstTensorTypeAndShapeInfo{out};
+}
+
+template <typename T>
+inline ConstSequenceTypeInfo TypeInfoImpl<T>::GetSequenceTypeInfo() const {
+  const OrtSequenceTypeInfo* out;
+  ThrowOnError(GetApi().CastTypeInfoToSequenceTypeInfo(this->p_, &out));
+  return ConstSequenceTypeInfo{out};
+}
+
+template <typename T>
+inline ConstMapTypeInfo TypeInfoImpl<T>::GetMapTypeInfo() const {
+  const OrtMapTypeInfo* out;
+  ThrowOnError(GetApi().CastTypeInfoToMapTypeInfo(this->p_, &out));
+  return ConstMapTypeInfo{out};
+}
+
+template <typename T>
+inline ONNXType TypeInfoImpl<T>::GetONNXType() const {
+  ONNXType out;
+  ThrowOnError(GetApi().GetOnnxTypeFromTypeInfo(this->p_, &out));
+  return out;
+}
+
+template <typename T>
+inline TypeInfo SequenceTypeInfoImpl<T>::GetSequenceElementType() const {
+  OrtTypeInfo* output;
+  ThrowOnError(GetApi().GetSequenceElementType(this->p_, &output));
+  return TypeInfo{output};
+}
+
+template <typename T>
+inline TypeInfo OptionalTypeInfoImpl<T>::GetOptionalElementType() const {
+  OrtTypeInfo* info;
+  ThrowOnError(GetApi().GetOptionalContainedTypeInfo(this->p_, &info));
+  return TypeInfo{info};
+}
+
+template <typename T>
+inline ONNXTensorElementDataType MapTypeInfoImpl<T>::GetMapKeyType() const {
+  ONNXTensorElementDataType out;
+  ThrowOnError(GetApi().GetMapKeyType(this->p_, &out));
+  return out;
+}
+
+template <typename T>
+inline TypeInfo MapTypeInfoImpl<T>::GetMapValueType() const {
+  OrtTypeInfo* output;
+  ThrowOnError(GetApi().GetMapValueType(this->p_, &output));
+  return TypeInfo{output};
+}
+
+template <typename T>
+inline ConstOptionalTypeInfo TypeInfoImpl<T>::GetOptionalTypeInfo() const {
+  const OrtOptionalTypeInfo* info;
+  ThrowOnError(GetApi().CastTypeInfoToOptionalTypeInfo(this->p_, &info));
+  return ConstOptionalTypeInfo{info};
+}
+
+}  // namespace detail
+
+namespace detail {
+
+template <typename T>
+template <typename R>
+inline void ConstValueImpl<T>::GetOpaqueData(const char* domain, const char* type_name, R& out) const {  // Hands `out` and sizeof(R) to GetOpaqueValue as the destination.
+  ThrowOnError(GetApi().GetOpaqueValue(domain, type_name, this->p_, &out, sizeof(R)));
+}
+
+template <typename T>
+inline bool ConstValueImpl<T>::IsTensor() const {  // True when the IsTensor C API reports a non-zero flag.
+  int out;
+  ThrowOnError(GetApi().IsTensor(this->p_, &out));
+  return out != 0;
+}
+
+template <typename T>
+inline bool ConstValueImpl<T>::HasValue() const {  // True when the HasValue C API reports a non-zero flag.
+  int out;
+  ThrowOnError(GetApi().HasValue(this->p_, &out));
+  return out != 0;
+}
+
+template <typename T>
+inline size_t ConstValueImpl<T>::GetCount() const {  // Returns the count written by GetValueCount.
+  size_t out;
+  ThrowOnError(GetApi().GetValueCount(this->p_, &out));
+  return out;
+}
+
+template <typename T>
+inline Value ConstValueImpl<T>::GetValue(int index, OrtAllocator* allocator) const {  // Wraps the OrtValue produced by GetValue for `index`.
+  OrtValue* out;
+  ThrowOnError(GetApi().GetValue(this->p_, index, allocator, &out));
+  return Value{out};
+}
+
+template <typename T>
+inline size_t ConstValueImpl<T>::GetStringTensorDataLength() const {  // Returns the length written by GetStringTensorDataLength.
+  size_t out;
+  ThrowOnError(GetApi().GetStringTensorDataLength(this->p_, &out));
+  return out;
+}
+
+template <typename T>
+inline size_t ConstValueImpl<T>::GetStringTensorElementLength(size_t element_index) const {  // Length of one string element, per the C API.
+  size_t out;
+  ThrowOnError(GetApi().GetStringTensorElementLength(this->p_, element_index, &out));
+  return out;
+}
+
+template <typename T>
+template <typename R>
+inline const R* ConstValueImpl<T>::GetTensorData() const {  // Read-only typed pointer; fetched via GetTensorMutableData after casting away const on the handle.
+  R* out;
+  ThrowOnError(GetApi().GetTensorMutableData(const_cast<OrtValue*>(this->p_), (void**)&out));
+  return out;
+}
+
+template <typename T>
+inline const void* ConstValueImpl<T>::GetTensorRawData() const {  // Untyped counterpart of GetTensorData().
+  void* out;
+  ThrowOnError(GetApi().GetTensorMutableData(const_cast<OrtValue*>(this->p_), &out));
+  return out;
+}
+
+template <typename T>
+inline TypeInfo ConstValueImpl<T>::GetTypeInfo() const {  // Wraps the OrtTypeInfo produced by GetTypeInfo.
+  OrtTypeInfo* output;
+  ThrowOnError(GetApi().GetTypeInfo(this->p_, &output));
+  return TypeInfo{output};
+}
+
+template <typename T>
+inline TensorTypeAndShapeInfo ConstValueImpl<T>::GetTensorTypeAndShapeInfo() const {  // Wraps the info object produced by GetTensorTypeAndShape.
+  OrtTensorTypeAndShapeInfo* output;
+  ThrowOnError(GetApi().GetTensorTypeAndShape(this->p_, &output));
+  return TensorTypeAndShapeInfo{output};
+}
+
+template <typename T>
+inline ConstMemoryInfo ConstValueImpl<T>::GetTensorMemoryInfo() const {  // Wraps the const OrtMemoryInfo pointer reported by GetTensorMemoryInfo.
+  const OrtMemoryInfo* mem_info;
+  ThrowOnError(GetApi().GetTensorMemoryInfo(this->p_, &mem_info));
+  return ConstMemoryInfo(mem_info);
+}
+
+template <typename T>
+inline void ConstValueImpl<T>::GetStringTensorElement(size_t buffer_length, size_t element_index, void* buffer) const {  // Caller-provided buffer overload.
+  ThrowOnError(GetApi().GetStringTensorElement(this->p_, buffer_length, element_index, buffer));
+}
+
+template <typename T>
+inline std::string ConstValueImpl<T>::GetStringTensorElement(size_t element_index) const {  // Returns one string element as a std::string.
+  size_t buffer_length;
+  ThrowOnError(GetApi().GetStringTensorElementLength(this->p_, element_index, &buffer_length));  // 1st call: ask for the element length
+
+  std::string s;
+  s.resize(buffer_length);
+  ThrowOnError(GetApi().GetStringTensorElement(this->p_, buffer_length, element_index, &s[0]));  // 2nd call: copy into the sized string
+  return s;
+}
+
+template <typename T>
+inline void ConstValueImpl<T>::GetStringTensorContent(void* buffer, size_t buffer_length, size_t* offsets, size_t offsets_count) const {  // Caller supplies both output arrays.
+  ThrowOnError(GetApi().GetStringTensorContent(this->p_, buffer, buffer_length, offsets, offsets_count));
+}
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+template <typename T>
+inline OrtSparseFormat ConstValueImpl<T>::GetSparseFormat() const {  // Returns the format written by GetSparseTensorFormat.
+  OrtSparseFormat format;
+  ThrowOnError(GetApi().GetSparseTensorFormat(this->p_, &format));
+  return format;
+}
+
+template <typename T>
+inline TensorTypeAndShapeInfo ConstValueImpl<T>::GetSparseTensorValuesTypeAndShapeInfo() const {  // Wraps the info object from GetSparseTensorValuesTypeAndShape.
+  OrtTensorTypeAndShapeInfo* output;
+  ThrowOnError(GetApi().GetSparseTensorValuesTypeAndShape(this->p_, &output));
+  return TensorTypeAndShapeInfo{output};
+}
+
+template <typename T>
+inline TensorTypeAndShapeInfo ConstValueImpl<T>::GetSparseTensorIndicesTypeShapeInfo(OrtSparseIndicesFormat indices_format) const {  // Same, for the requested indices format.
+  OrtTensorTypeAndShapeInfo* output;
+  ThrowOnError(GetApi().GetSparseTensorIndicesTypeShape(this->p_, indices_format, &output));
+  return TensorTypeAndShapeInfo{output};
+}
+
+template <typename T>
+template <typename R>
+inline const R* ConstValueImpl<T>::GetSparseTensorIndicesData(OrtSparseIndicesFormat indices_format, size_t& num_indices) const {  // num_indices is an output parameter.
+  const void* out;
+  ThrowOnError(GetApi().GetSparseTensorIndices(this->p_, indices_format, &num_indices, &out));
+  return reinterpret_cast<const R*>(out);
+}
+
+template <typename T>
+inline bool ConstValueImpl<T>::IsSparseTensor() const {  // True when the IsSparseTensor C API reports a non-zero flag.
+  int out;
+  ThrowOnError(GetApi().IsSparseTensor(this->p_, &out));
+  return out != 0;
+}
+
+template <typename T>
+template <typename R>
+inline const R* ConstValueImpl<T>::GetSparseTensorValues() const {  // Reinterprets the pointer from GetSparseTensorValues as const R*.
+  const void* out;
+  ThrowOnError(GetApi().GetSparseTensorValues(this->p_, &out));
+  return reinterpret_cast<const R*>(out);
+}
+
+#endif
+
+template <typename T>
+void ValueImpl<T>::FillStringTensor(const char* const* s, size_t s_len) {  // Forwards an array of s_len C strings to FillStringTensor.
+  ThrowOnError(GetApi().FillStringTensor(this->p_, s, s_len));
+}
+
+template <typename T>
+void ValueImpl<T>::FillStringTensorElement(const char* s, size_t index) {  // Forwards one C string for element `index`.
+  ThrowOnError(GetApi().FillStringTensorElement(this->p_, s, index));
+}
+
+template <typename T>
+inline char* ValueImpl<T>::GetResizedStringTensorElementBuffer(size_t index, size_t buffer_length) {  // Returns the buffer pointer written by the C API.
+  char* result;
+  ThrowOnError(GetApi().GetResizedStringTensorElementBuffer(this->p_, index, buffer_length, &result));
+  return result;
+}
+
+template <typename T>
+void* ValueImpl<T>::GetTensorMutableRawData() {  // Untyped pointer written by GetTensorMutableData.
+  void* out;
+  ThrowOnError(GetApi().GetTensorMutableData(this->p_, &out));
+  return out;
+}
+
+template <typename T>
+template <typename R>
+R* ValueImpl<T>::GetTensorMutableData() {  // Typed pointer written by GetTensorMutableData; R is chosen by the caller and not checked here.
+  R* out;
+  ThrowOnError(GetApi().GetTensorMutableData(this->p_, (void**)&out));
+  return out;
+}
+
+template <typename T>
+template <typename R>
+R& ValueImpl<T>::At(const std::vector<int64_t>& location) {  // Returns a reference to the element that TensorAt locates for `location`.
+  static_assert(!std::is_same<R, std::string>::value, "this api does not support std::string");  // check the element type R; T is the handle type
+  R* out;
+  ThrowOnError(GetApi().TensorAt(this->p_, location.data(), location.size(), (void**)&out));
+  return *out;
+}
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+template <typename T>
+void ValueImpl<T>::UseCooIndices(int64_t* indices_data, size_t indices_num) {  // Forwards the caller's index buffer to UseCooIndices.
+  ThrowOnError(GetApi().UseCooIndices(this->p_, indices_data, indices_num));
+}
+
+template <typename T>
+void ValueImpl<T>::UseCsrIndices(int64_t* inner_data, size_t inner_num, int64_t* outer_data, size_t outer_num) {  // Forwards both index buffers to UseCsrIndices.
+  ThrowOnError(GetApi().UseCsrIndices(this->p_, inner_data, inner_num, outer_data, outer_num));
+}
+
+template <typename T>
+void ValueImpl<T>::UseBlockSparseIndices(const Shape& indices_shape, int32_t* indices_data) {  // Shape is unpacked into pointer + length for the C API.
+  ThrowOnError(GetApi().UseBlockSparseIndices(this->p_, indices_shape.shape, indices_shape.shape_len, indices_data));
+}
+
+template <typename T>
+void ValueImpl<T>::FillSparseTensorCoo(const OrtMemoryInfo* mem_info, const OrtSparseValuesParam& values_param,
+                                       const int64_t* indices_data, size_t indices_num) {  // Unpacks values_param and forwards to FillSparseTensorCoo.
+  ThrowOnError(GetApi().FillSparseTensorCoo(this->p_, mem_info, values_param.values_shape,
+                                            values_param.values_shape_len, values_param.data.p_data,
+                                            indices_data, indices_num));
+}
+
+template <typename T>
+void ValueImpl<T>::FillSparseTensorCsr(const OrtMemoryInfo* data_mem_info,
+                                       const OrtSparseValuesParam& values,
+                                       const int64_t* inner_indices_data, size_t inner_indices_num,
+                                       const int64_t* outer_indices_data, size_t outer_indices_num) {  // Unpacks `values` and forwards to FillSparseTensorCsr.
+  ThrowOnError(GetApi().FillSparseTensorCsr(this->p_, data_mem_info, values.values_shape, values.values_shape_len, values.data.p_data,
+                                            inner_indices_data, inner_indices_num,
+                                            outer_indices_data, outer_indices_num));
+}
+
+template <typename T>
+void ValueImpl<T>::FillSparseTensorBlockSparse(const OrtMemoryInfo* data_mem_info,
+                                               const OrtSparseValuesParam& values,
+                                               const Shape& indices_shape,
+                                               const int32_t* indices_data) {  // Unpacks `values` and `indices_shape` for the C API.
+  ThrowOnError(GetApi().FillSparseTensorBlockSparse(this->p_, data_mem_info, values.values_shape, values.values_shape_len, values.data.p_data,
+                                                    indices_shape.shape, indices_shape.shape_len,
+                                                    indices_data));
+}
+
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
+
+}  // namespace detail
+
+template <typename T>
+inline Value Value::CreateTensor(const OrtMemoryInfo* info, T* p_data, size_t p_data_element_count, const int64_t* shape, size_t shape_len) {
+  return Value::CreateTensor(info, p_data, sizeof(T) * p_data_element_count, shape, shape_len, TypeToTensorType<T>::type);  // element count -> byte count
+}
+
+inline Value Value::CreateTensor(const OrtMemoryInfo* info, void* p_data, size_t p_data_byte_count,
+                                 const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type) {
+  OrtValue* tensor;  // written by CreateTensorWithDataAsOrtValue, which is given the caller's p_data buffer
+  Ort::ThrowOnError(GetApi().CreateTensorWithDataAsOrtValue(info, p_data, p_data_byte_count, shape, shape_len, type, &tensor));
+  return Value{tensor};
+}
+
+template <typename T>
+inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len) {
+  return Value::CreateTensor(allocator, shape, shape_len, TypeToTensorType<T>::type);  // element type deduced from T
+}
+
+inline Value Value::CreateTensor(OrtAllocator* allocator, const int64_t* shape, size_t shape_len, ONNXTensorElementDataType type) {
+  OrtValue* tensor;  // written by CreateTensorAsOrtValue
+  Ort::ThrowOnError(GetApi().CreateTensorAsOrtValue(allocator, shape, shape_len, type, &tensor));
+  return Value{tensor};
+}
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+
+template <typename T>
+inline Value Value::CreateSparseTensor(const OrtMemoryInfo* info, T* p_data,
+                                       const Shape& dense_shape, const Shape& values_shape) {
+  return Value::CreateSparseTensor(info, p_data, dense_shape, values_shape, TypeToTensorType<T>::type);  // element type deduced from T
+}
+
+inline Value Value::CreateSparseTensor(const OrtMemoryInfo* info, void* p_data,
+                                       const Shape& dense_shape, const Shape& values_shape, ONNXTensorElementDataType type) {
+  OrtValue* sparse_tensor;  // written by CreateSparseTensorWithValuesAsOrtValue; each Shape is passed as pointer + length
+  Ort::ThrowOnError(GetApi().CreateSparseTensorWithValuesAsOrtValue(info, p_data,
+                                                                    dense_shape.shape, dense_shape.shape_len, values_shape.shape, values_shape.shape_len, type, &sparse_tensor));
+  return Value{sparse_tensor};
+}
+
+template <typename T>
+inline Value Value::CreateSparseTensor(OrtAllocator* allocator, const Shape& dense_shape) {
+  return Value::CreateSparseTensor(allocator, dense_shape, TypeToTensorType<T>::type);  // element type deduced from T
+}
+
+inline Value Value::CreateSparseTensor(OrtAllocator* allocator,
+                                       const Shape& dense_shape, ONNXTensorElementDataType type) {
+  OrtValue* sparse_tensor;  // written by CreateSparseTensorAsOrtValue
+  Ort::ThrowOnError(GetApi().CreateSparseTensorAsOrtValue(allocator, dense_shape.shape, dense_shape.shape_len, type, &sparse_tensor));
+  return Value{sparse_tensor};
+}
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
+
+inline Value Value::CreateMap(const Value& keys, const Value& values) {  // Builds an ONNX_TYPE_MAP value from a keys value and a values value.
+  const OrtValue* key_value_pair[2] = {keys, values};  // keys first, values second
+  OrtValue* map_value;
+  Ort::ThrowOnError(GetApi().CreateValue(key_value_pair, 2, ONNX_TYPE_MAP, &map_value));
+  return Value{map_value};
+}
+
+inline Value Value::CreateSequence(const std::vector<Value>& values) {  // Builds an ONNX_TYPE_SEQUENCE value from the given elements.
+  std::vector<const OrtValue*> handles(values.data(), values.data() + values.size());  // range copy: each Value converts to its raw handle
+  OrtValue* sequence;
+  Ort::ThrowOnError(GetApi().CreateValue(handles.data(), handles.size(), ONNX_TYPE_SEQUENCE, &sequence));
+  return Value{sequence};
+}
+
+template <typename T>
+inline Value Value::CreateOpaque(const char* domain, const char* type_name, const T& data_container) {
+  OrtValue* opaque_value;  // written by CreateOpaqueValue, which receives the container's address and sizeof(T)
+  Ort::ThrowOnError(GetApi().CreateOpaqueValue(domain, type_name, &data_container, sizeof(T), &opaque_value));
+  return Value{opaque_value};
+}
+
+//
+// Custom OP Inlines
+//
+inline Logger::Logger(const OrtLogger* logger) : logger_(logger) {  // Stores the handle and caches its severity level once.
+  ThrowOnError(GetApi().Logger_GetLoggingSeverityLevel(logger_, &cached_severity_level_));
+}
+
+inline OrtLoggingLevel Logger::GetLoggingSeverityLevel() const noexcept {
+  return this->cached_severity_level_;  // value captured by the constructor; no C API call here
+}
+
+inline Status Logger::LogMessage(OrtLoggingLevel log_severity_level, const ORTCHAR_T* file_path,
+                                 int line_number, const char* func_name, const char* message) const noexcept {
+  // The C API takes `message` before the source location, unlike this wrapper's parameter order.
+  return Status{GetApi().Logger_LogMessage(this->logger_, log_severity_level, message,
+                                           file_path, line_number, func_name)};
+}
+
+// Disable warnings about the format string not being a literal (-Wformat-nonliteral and -Wformat-security)
+// for gcc and clang. The alternative is to use actual C-style variadic parameters and apply
+// __attribute__(format(printf...)), which does not work with variadic templates.
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wformat-nonliteral"
+#pragma GCC diagnostic ignored "-Wformat-security"
+#elif defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wformat-nonliteral"
+#pragma clang diagnostic ignored "-Wformat-security"
+#endif
+template <typename... Args>
+inline Status Logger::LogFormattedMessage(OrtLoggingLevel log_severity_level, const ORTCHAR_T* file_path,
+                                          int line_number, const char* func_name, const char* format,
+                                          Args&&... args) const noexcept {  // printf-style formatting, then Logger_LogMessage
+  // Dry run with a null buffer: std::snprintf returns the formatted length (terminator excluded).
+  int msg_len = std::snprintf(nullptr, 0U, format, std::forward<Args>(args)...);
+  if (msg_len < 0) {  // Formatting error
+    return Status("Failed to log message due to formatting error", OrtErrorCode::ORT_FAIL);
+  }
+
+  OrtStatus* status = nullptr;
+  const size_t buffer_size = static_cast<size_t>(msg_len) + 1U;  // +1 for the terminating '\0'
+
+  constexpr size_t kStackBufferSize = 1024;  // messages needing fewer bytes than this are formatted on the stack
+
+  if (buffer_size < kStackBufferSize) {
+    char buffer[kStackBufferSize];
+    std::snprintf(buffer, kStackBufferSize, format, std::forward<Args>(args)...);  // std:: qualified, matching the other calls
+    status = GetApi().Logger_LogMessage(logger_, log_severity_level, buffer, file_path, line_number, func_name);
+  } else {
+    // std::make_unique is only supported starting at C++14.
+#if (__cplusplus >= 201402L) || (_MSC_VER >= 1900)
+    auto buffer = std::make_unique<char[]>(buffer_size);
+#else
+    std::unique_ptr<char[]> buffer(new char[buffer_size]);
+#endif
+    std::snprintf(buffer.get(), buffer_size, format, std::forward<Args>(args)...);
+    status = GetApi().Logger_LogMessage(logger_, log_severity_level, buffer.get(), file_path, line_number, func_name);
+  }
+
+  return Status{status};  // wraps whatever status Logger_LogMessage returned
+}
+// Re-enable -Wformat-nonliteral and -Wformat-security
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#elif defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+inline KernelContext::KernelContext(OrtKernelContext* context) : ctx_(context) {
+}
+
+inline size_t KernelContext::GetInputCount() const {
+  size_t input_count = 0;  // overwritten by KernelContext_GetInputCount
+  ThrowOnError(GetApi().KernelContext_GetInputCount(this->ctx_, &input_count));
+  return input_count;
+}
+
+inline size_t KernelContext::GetOutputCount() const {
+  size_t output_count = 0;  // overwritten by KernelContext_GetOutputCount
+  ThrowOnError(GetApi().KernelContext_GetOutputCount(this->ctx_, &output_count));
+  return output_count;
+}
+
+inline ConstValue KernelContext::GetInput(size_t index) const {
+  const OrtValue* input = nullptr;  // overwritten by KernelContext_GetInput
+  ThrowOnError(GetApi().KernelContext_GetInput(this->ctx_, index, &input));
+  return ConstValue{input};
+}
+
+inline UnownedValue KernelContext::GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const {
+  OrtValue* output = nullptr;  // overwritten by KernelContext_GetOutput for the requested dimensions
+  ThrowOnError(GetApi().KernelContext_GetOutput(this->ctx_, index, dim_values, dim_count, &output));
+  return UnownedValue(output);
+}
+
+inline UnownedValue KernelContext::GetOutput(size_t index, const std::vector<int64_t>& dims) const {
+  OrtValue* output = nullptr;  // overwritten by KernelContext_GetOutput; dims is passed as pointer + size
+  ThrowOnError(GetApi().KernelContext_GetOutput(this->ctx_, index, dims.data(), dims.size(), &output));
+  return UnownedValue(output);
+}
+
+inline void* KernelContext::GetGPUComputeStream() const {
+  void* stream = nullptr;  // overwritten by KernelContext_GetGPUComputeStream; returned as an opaque pointer
+  ThrowOnError(GetApi().KernelContext_GetGPUComputeStream(this->ctx_, &stream));
+  return stream;
+}
+
+inline OrtAllocator* KernelContext::GetAllocator(const OrtMemoryInfo& memory_info) const {
+  OrtAllocator* allocator = nullptr;  // overwritten by KernelContext_GetAllocator; returned as a raw pointer
+  ThrowOnError(GetApi().KernelContext_GetAllocator(this->ctx_, &memory_info, &allocator));
+  return allocator;
+}
+
+inline Logger KernelContext::GetLogger() const {
+  const OrtLogger* raw_logger = nullptr;  // overwritten by KernelContext_GetLogger
+  Ort::ThrowOnError(GetApi().KernelContext_GetLogger(ctx_, &raw_logger));
+  return Logger{raw_logger};
+}
+
+inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) {  // CreateOpAttr writes the new handle into p_.
+  ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &this->p_));
+}
+
+namespace detail {
+template <typename T>
+inline KernelInfo KernelInfoImpl<T>::Copy() const {
+  OrtKernelInfo* duplicate = nullptr;  // overwritten by CopyKernelInfo
+  ThrowOnError(GetApi().CopyKernelInfo(this->p_, &duplicate));
+  return KernelInfo{duplicate};
+}
+
+template <typename T>
+inline size_t KernelInfoImpl<T>::GetInputCount() const {
+  size_t input_count = 0;  // overwritten by KernelInfo_GetInputCount
+  Ort::ThrowOnError(GetApi().KernelInfo_GetInputCount(this->p_, &input_count));
+  return input_count;
+}
+
+template <typename T>
+inline size_t KernelInfoImpl<T>::GetOutputCount() const {
+  size_t output_count = 0;  // overwritten by KernelInfo_GetOutputCount
+  Ort::ThrowOnError(GetApi().KernelInfo_GetOutputCount(this->p_, &output_count));
+  return output_count;
+}
+
+template <typename T>
+inline std::string KernelInfoImpl<T>::GetInputName(size_t index) const {
+  size_t name_size = 0;
+
+  // First call: a null buffer asks the API for the true size of the string value.
+  ThrowOnError(GetApi().KernelInfo_GetInputName(this->p_, index, nullptr, &name_size));
+
+  std::string name;
+  name.resize(name_size);
+  ThrowOnError(GetApi().KernelInfo_GetInputName(this->p_, index, &name[0], &name_size));  // second call: fill the buffer
+  name.resize(name_size - 1);  // remove the terminating character '\0'
+
+  return name;
+}
+
+template <typename T>
+inline std::string KernelInfoImpl<T>::GetOutputName(size_t index) const {
+  size_t name_size = 0;
+
+  // First call: a null buffer asks the API for the true size of the string value.
+  ThrowOnError(GetApi().KernelInfo_GetOutputName(this->p_, index, nullptr, &name_size));
+
+  std::string name;
+  name.resize(name_size);
+  ThrowOnError(GetApi().KernelInfo_GetOutputName(this->p_, index, &name[0], &name_size));  // second call: fill the buffer
+  name.resize(name_size - 1);  // remove the terminating character '\0'
+
+  return name;
+}
+
+template <typename T>
+inline TypeInfo KernelInfoImpl<T>::GetInputTypeInfo(size_t index) const {
+  OrtTypeInfo* type_info = nullptr;  // overwritten by KernelInfo_GetInputTypeInfo
+  Ort::ThrowOnError(GetApi().KernelInfo_GetInputTypeInfo(this->p_, index, &type_info));
+  return TypeInfo{type_info};
+}
+
+template <typename T>
+inline TypeInfo KernelInfoImpl<T>::GetOutputTypeInfo(size_t index) const {
+  OrtTypeInfo* type_info = nullptr;  // overwritten by KernelInfo_GetOutputTypeInfo
+  Ort::ThrowOnError(GetApi().KernelInfo_GetOutputTypeInfo(this->p_, index, &type_info));
+  return TypeInfo{type_info};
+}
+
+template <typename T>
+inline Value KernelInfoImpl<T>::GetTensorAttribute(const char* name, OrtAllocator* allocator) const {
+  OrtValue* attribute = nullptr;  // overwritten by KernelInfoGetAttribute_tensor
+  Ort::ThrowOnError(GetApi().KernelInfoGetAttribute_tensor(this->p_, name, allocator, &attribute));
+  return Value{attribute};
+}
+
+template <typename T>
+inline ConstValue KernelInfoImpl<T>::GetTensorConstantInput(size_t index, int* is_constant) const {
+  const OrtValue* constant_input = nullptr;  // overwritten by the C API; is_constant is forwarded as an output too
+  Ort::ThrowOnError(GetApi().KernelInfoGetConstantInput_tensor(this->p_, index, is_constant, &constant_input));
+  return ConstValue{constant_input};
+}
+
+template <typename T>
+inline std::string KernelInfoImpl<T>::GetNodeName() const {
+  size_t name_size = 0;
+
+  // First call: a null buffer asks the API for the true size of the string value.
+  ThrowOnError(GetApi().KernelInfo_GetNodeName(this->p_, nullptr, &name_size));
+
+  std::string name;
+  name.resize(name_size);
+  ThrowOnError(GetApi().KernelInfo_GetNodeName(this->p_, &name[0], &name_size));  // second call: fill the buffer
+  name.resize(name_size - 1);  // remove the terminating character '\0'
+
+  return name;
+}
+
+template <typename T>
+inline Logger KernelInfoImpl<T>::GetLogger() const {
+  const OrtLogger* raw_logger = nullptr;  // overwritten by KernelInfo_GetLogger
+  Ort::ThrowOnError(GetApi().KernelInfo_GetLogger(this->p_, &raw_logger));
+  return Logger{raw_logger};
+}
+
+inline void attr_utils::GetAttr(const OrtKernelInfo* p, const char* name, float& out) {
+  ThrowOnError(GetApi().KernelInfoGetAttribute_float(p, name, &out));  // writes straight into the caller's float
+}
+
+inline void attr_utils::GetAttr(const OrtKernelInfo* p, const char* name, int64_t& out) {
+  ThrowOnError(GetApi().KernelInfoGetAttribute_int64(p, name, &out));  // writes straight into the caller's int64_t
+}
+
+inline void attr_utils::GetAttr(const OrtKernelInfo* p, const char* name, std::string& result) {
+  size_t attr_size = 0;
+  // First call: a null buffer asks the API for the true size of the string attribute.
+  ThrowOnError(GetApi().KernelInfoGetAttribute_string(p, name, nullptr, &attr_size));
+
+  std::string value;
+  value.resize(attr_size);
+  ThrowOnError(GetApi().KernelInfoGetAttribute_string(p, name, &value[0], &attr_size));
+  value.resize(attr_size - 1);  // remove the terminating character '\0'
+  result.swap(value);  // `result` is only touched after both calls succeeded
+}
+
+inline void attr_utils::GetAttrs(const OrtKernelInfo* p, const char* name, std::vector<float>& result) {
+  size_t attr_count = 0;
+  // First call: a null buffer asks the API for the true size of the attribute.
+  ThrowOnError(GetApi().KernelInfoGetAttributeArray_float(p, name, nullptr, &attr_count));
+
+  std::vector<float> values;
+  values.resize(attr_count);
+  ThrowOnError(GetApi().KernelInfoGetAttributeArray_float(p, name, values.data(), &attr_count));
+  result.swap(values);  // `result` is only touched after both calls succeeded
+}
+
+inline void attr_utils::GetAttrs(const OrtKernelInfo* p, const char* name, std::vector<int64_t>& result) {
+  size_t attr_count = 0;
+
+  // First call: a null buffer asks the API for the true size of the attribute.
+  ThrowOnError(GetApi().KernelInfoGetAttributeArray_int64(p, name, nullptr, &attr_count));
+
+  std::vector<int64_t> values;
+  values.resize(attr_count);
+  ThrowOnError(GetApi().KernelInfoGetAttributeArray_int64(p, name, values.data(), &attr_count));
+  result.swap(values);  // `result` is only touched after both calls succeeded
+}
+}  // namespace detail
+
+inline KernelInfo::KernelInfo(OrtKernelInfo* info) : detail::KernelInfoImpl<OrtKernelInfo>(info) {}  // forwards the raw handle to the base
+
+inline Op::Op(OrtOp* p) : Base<OrtOp>(p) {}
+
+inline Op Op::Create(const OrtKernelInfo* info, const char* op_name,
+                     const char* domain, int version,
+                     const char** type_constraint_names,
+                     const ONNXTensorElementDataType* type_constraint_values, size_t type_constraint_count,
+                     const OpAttr* attr_values, size_t attr_count,
+                     size_t input_count, size_t output_count) {
+  // The array of OpAttr wrappers is reinterpreted as an array of raw handles; the assert guards the size assumption.
+  static_assert(sizeof(OpAttr) == sizeof(OrtOpAttr*),
+                "OpAttr's is expected to be just an array of OrtOpAttr in memory so we can reinterpret safely");
+  const OrtOpAttr* const* raw_attrs = reinterpret_cast<const OrtOpAttr* const*>(attr_values);
+  OrtOp* created_op;
+  ThrowOnError(GetApi().CreateOp(info, op_name, domain, version,
+                                 type_constraint_names, type_constraint_values,
+                                 static_cast<int>(type_constraint_count),  // the C API takes int counts
+                                 raw_attrs, static_cast<int>(attr_count),
+                                 static_cast<int>(input_count), static_cast<int>(output_count), &created_op));
+  return Op{created_op};
+}
+
+inline void Op::Invoke(const OrtKernelContext* context,
+                       const Value* input_values, size_t input_count,
+                       Value* output_values, size_t output_count) {
+  // Value wrappers are reinterpreted as raw OrtValue pointers; the assert guards the size assumption.
+  static_assert(sizeof(Value) == sizeof(OrtValue*),
+                "Value is really just an array of OrtValue* in memory, so we can reinterpret_cast safely");
+  const OrtValue* const* raw_inputs = reinterpret_cast<const OrtValue* const*>(input_values);
+  OrtValue** raw_outputs = reinterpret_cast<OrtValue**>(output_values);
+  ThrowOnError(GetApi().InvokeOp(context, this->p_,
+                                 raw_inputs, static_cast<int>(input_count),  // the C API takes int counts
+                                 raw_outputs, static_cast<int>(output_count)));
+}
+
+inline void Op::Invoke(const OrtKernelContext* context,
+                       const OrtValue* const* input_values, size_t input_count,
+                       OrtValue* const* output_values, size_t output_count) {
+  // Raw-handle overload: arguments go to InvokeOp as-is, apart from narrowing the counts to int.
+  const int num_inputs = static_cast<int>(input_count);
+  const int num_outputs = static_cast<int>(output_count);
+  ThrowOnError(GetApi().InvokeOp(context, this->p_, input_values, num_inputs, output_values, num_outputs));
+}
+
+inline std::string GetVersionString() {  // Copies the C string returned by OrtApiBase::GetVersionString.
+  return std::string(OrtGetApiBase()->GetVersionString());
+}
+
+inline std::string GetBuildInfoString() {  // Copies the C string returned by OrtApi::GetBuildInfoString.
+  return std::string(GetApi().GetBuildInfoString());
+}
+
+inline std::vector<std::string> GetAvailableProviders() {  // Copies the provider names reported by the C API into std::strings.
+  char** provider_names;
+  int provider_count;
+
+  // Deleter that hands the C array back to ORT; it reads provider_count by reference when it runs.
+  auto release_names = [&provider_count](char** names) {
+    ThrowOnError(GetApi().ReleaseAvailableProviders(names, provider_count));  // This should always return nullptr.
+  };
+
+  ThrowOnError(GetApi().GetAvailableProviders(&provider_names, &provider_count));
+  std::unique_ptr<char*, decltype(release_names)> names_guard(provider_names, release_names);
+  std::vector<std::string> result;
+  result.reserve(static_cast<size_t>(provider_count));
+  for (int idx = 0; idx < provider_count; ++idx) {
+    result.emplace_back(provider_names[idx]);  // deep copy, so the strings outlive the guarded C array
+  }
+  return result;
+}
+
+template <typename TOp, typename TKernel, bool WithStatus>
+void CustomOpBase<TOp, TKernel, WithStatus>::GetSessionConfigs(std::unordered_map<std::string, std::string>& out,
+                                                               ConstSessionOptions options) const {
+  const TOp* self = static_cast<const TOp*>(this);  // CRTP downcast to reach the derived op's hooks
+  std::vector<std::string> requested_keys = self->GetSessionConfigKeys();
+
+  out.reserve(requested_keys.size());
+
+  // Build the per-op key prefix once; each iteration truncates back to it and appends the next key.
+  std::string full_key = detail::MakeCustomOpConfigEntryKey(self->GetName(), "");
+  const size_t prefix_length = full_key.length();
+  for (const std::string& short_key : requested_keys) {
+    full_key.resize(prefix_length);
+    full_key.append(short_key);
+    out[short_key] = options.GetConfigEntryOrDefault(full_key.c_str(), "");  // keys without an entry map to ""
+  }
+}
+
+}  // namespace Ort
diff --git a/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_float16.h b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_float16.h
new file mode 100644
index 0000000..0b066a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/onnx/onnxruntime_float16.h
@@ -0,0 +1,540 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <stdint.h>
+#include <cmath>
+#include <cstring>
+#include <limits>
+
+namespace onnxruntime_float16 {
+
+namespace detail {
+
+enum class endian {  // Compile-time byte-order tag: compare endian::native against little/big.
+#if defined(_WIN32)
+  little = 0,
+  big = 1,
+  native = little,  // _WIN32 builds are treated as little-endian
+#elif defined(__GNUC__) || defined(__clang__)
+  little = __ORDER_LITTLE_ENDIAN__,
+  big = __ORDER_BIG_ENDIAN__,
+  native = __BYTE_ORDER__,  // taken from the compiler's predefined byte-order macros
+#else
+#error onnxruntime_float16::detail::endian is not implemented in this environment.
+#endif
+};
+
+static_assert(
+    endian::native == endian::little || endian::native == endian::big,
+    "Only little-endian or big-endian native byte orders are supported.");
+
+}  // namespace detail
+
+/// <summary>
+/// Shared implementation between public and internal classes. CRTP pattern.
+/// </summary>
+template <class Derived>
+struct Float16Impl {
+ protected:
+  /// <summary>
+  /// Converts from float to uint16_t float16 representation
+  /// </summary>
+  /// <param name="v">float value to convert</param>
+  /// <returns>float16 bit pattern for v</returns>
+  constexpr static uint16_t ToUint16Impl(float v) noexcept;
+
+  /// <summary>
+  /// Converts float16 to float
+  /// </summary>
+  /// <returns>float representation of float16 value</returns>
+  float ToFloatImpl() const noexcept;
+
+  /// <summary>
+  /// Computes the bit pattern of the absolute value by clearing the sign bit.
+  /// </summary>
+  /// <returns>Bits of the absolute value</returns>
+  uint16_t AbsImpl() const noexcept {
+    return static_cast<uint16_t>(val & ~kSignMask);
+  }
+
+  /// <summary>
+  /// Computes the bit pattern with the sign bit flipped; a NaN is returned unchanged.
+  /// </summary>
+  /// <returns>Bits of the negated value</returns>
+  uint16_t NegateImpl() const noexcept {
+    return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
+  }
+
+ public:
+  // uint16_t special values
+  static constexpr uint16_t kSignMask = 0x8000U;
+  static constexpr uint16_t kBiasedExponentMask = 0x7C00U;
+  static constexpr uint16_t kPositiveInfinityBits = 0x7C00U;
+  static constexpr uint16_t kNegativeInfinityBits = 0xFC00U;
+  static constexpr uint16_t kPositiveQNaNBits = 0x7E00U;
+  static constexpr uint16_t kNegativeQNaNBits = 0xFE00U;
+  static constexpr uint16_t kEpsilonBits = 0x4170U;  // NOTE(review): 0x4170 decodes to 2.71875 (close to Euler's e), not machine epsilon 2^-10 (0x1400) - confirm intent
+  static constexpr uint16_t kMinValueBits = 0xFBFFU;  // Lowest (most negative) finite value, -65504
+  static constexpr uint16_t kMaxValueBits = 0x7BFFU;  // Largest finite value, 65504
+  static constexpr uint16_t kOneBits = 0x3C00U;
+  static constexpr uint16_t kMinusOneBits = 0xBC00U;
+
+  uint16_t val{0};
+
+  Float16Impl() = default;
+
+  /// <summary>
+  /// Checks if the sign bit is set
+  /// </summary>
+  /// <returns>true if the sign bit is set</returns>
+  bool IsNegative() const noexcept {
+    return static_cast<int16_t>(val) < 0;
+  }
+
+  /// <summary>
+  /// Tests if the value is NaN
+  /// </summary>
+  /// <returns>true if NaN</returns>
+  bool IsNaN() const noexcept {
+    return AbsImpl() > kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is finite
+  /// </summary>
+  /// <returns>true if finite</returns>
+  bool IsFinite() const noexcept {
+    return AbsImpl() < kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value represents positive infinity.
+  /// </summary>
+  /// <returns>true if positive infinity</returns>
+  bool IsPositiveInfinity() const noexcept {
+    return val == kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value represents negative infinity
+  /// </summary>
+  /// <returns>true if negative infinity</returns>
+  bool IsNegativeInfinity() const noexcept {
+    return val == kNegativeInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is either positive or negative infinity.
+  /// </summary>
+  /// <returns>True if absolute value is infinity</returns>
+  bool IsInfinity() const noexcept {
+    return AbsImpl() == kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  /// </summary>
+  /// <returns>True if NaN or zero.</returns>
+  bool IsNaNOrZero() const noexcept {
+    auto abs = AbsImpl();
+    return (abs == 0 || abs > kPositiveInfinityBits);
+  }
+
+  /// <summary>
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  /// </summary>
+  /// <returns>True if so</returns>
+  bool IsNormal() const noexcept {
+    auto abs = AbsImpl();
+    return (abs < kPositiveInfinityBits)           // is finite
+           && (abs != 0)                           // is not zero
+           && ((abs & kBiasedExponentMask) != 0);  // is not subnormal (has a non-zero exponent)
+  }
+
+  /// <summary>
+  /// Tests if the value is subnormal (denormal).
+  /// </summary>
+  /// <returns>True if so</returns>
+  bool IsSubnormal() const noexcept {
+    auto abs = AbsImpl();
+    return (abs < kPositiveInfinityBits)           // is finite
+           && (abs != 0)                           // is not zero
+           && ((abs & kBiasedExponentMask) == 0);  // is subnormal (has a zero exponent)
+  }
+
+  /// <summary>
+  /// Creates an instance that represents absolute value.
+  /// </summary>
+  /// <returns>Absolute value</returns>
+  Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
+
+  /// <summary>
+  /// Creates a new instance with the sign flipped.
+  /// </summary>
+  /// <returns>Flipped sign instance</returns>
+  Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
+
+  /// <summary>
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  /// </summary>
+  /// <param name="lhs">first value</param>
+  /// <param name="rhs">second value</param>
+  /// <returns>True if both arguments represent zero</returns>
+  static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept {
+    return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
+  }
+
+  bool operator==(const Float16Impl& rhs) const noexcept {
+    if (IsNaN() || rhs.IsNaN()) {
+      // IEEE defines that NaN is not equal to anything, including itself.
+      return false;
+    }
+    return val == rhs.val;  // plain bit comparison, so +0 and -0 compare unequal here
+  }
+
+  bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); }
+
+  bool operator<(const Float16Impl& rhs) const noexcept {
+    if (IsNaN() || rhs.IsNaN()) {
+      // IEEE defines that NaN is unordered with respect to everything, including itself.
+      return false;
+    }
+
+    const bool left_is_negative = IsNegative();
+    if (left_is_negative != rhs.IsNegative()) {
+      // When the signs of left and right differ, we know that left is less than right if it is
+      // the negative value. The exception to this is if both values are zero, in which case IEEE
+      // says they should be equal, even if the signs differ.
+      return left_is_negative && !AreZero(*this, rhs);
+    }
+    return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative);  // same sign: for negatives the bit-pattern order is reversed
+  }
+};
+
+// The following Float16_t conversions are based on the code from
+// Eigen library.
+
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+namespace detail {
+union float32_bits {  // Overlays an unsigned int and a float so the conversions can work on the float's raw bits.
+  unsigned int u;
+  float f;
+};
+}  // namespace detail
+
+template <class Derived>
+inline constexpr uint16_t Float16Impl<Derived>::ToUint16Impl(float v) noexcept {
+  detail::float32_bits f{};
+  f.f = v;  // NOTE(review): f.u is read after writing f.f (union type punning); this likely rules out real compile-time evaluation - confirm
+
+  constexpr detail::float32_bits f32infty = {255 << 23};         // float bits of +Inf
+  constexpr detail::float32_bits f16max = {(127 + 16) << 23};    // float bits of 2^16; magnitudes at or above this become Inf/NaN
+  constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};  // float bits of 0.5f
+  constexpr unsigned int sign_mask = 0x80000000u;
+  uint16_t val = static_cast<uint16_t>(0x0u);
+
+  unsigned int sign = f.u & sign_mask;
+  f.u ^= sign;  // work on the magnitude only; the sign is re-attached at the end
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f.u >= f16max.u) {                         // result is Inf or NaN (all exponent bits set)
+    val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
+  } else {                                       // (De)normalized number or zero
+    if (f.u < (113 << 23)) {                     // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f.f += denorm_magic.f;
+
+      // and one integer subtract of the bias later, we have our final float16 bits!
+      val = static_cast<uint16_t>(f.u - denorm_magic.u);
+    } else {
+      unsigned int mant_odd = (f.u >> 13) & 1;  // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
+      // without arithmetic overflow.
+      f.u += 0xc8000fffU;
+      // rounding bias part 2
+      f.u += mant_odd;
+      // take the bits!
+      val = static_cast<uint16_t>(f.u >> 13);
+    }
+  }
+
+  val |= static_cast<uint16_t>(sign >> 16);  // move the float sign bit (bit 31) to the float16 sign bit (bit 15)
+  return val;
+}
+
+template <class Derived>
+inline float Float16Impl<Derived>::ToFloatImpl() const noexcept {
+  constexpr detail::float32_bits magic = {113 << 23};  // float bits of 2^-14
+  constexpr unsigned int shifted_exp = 0x7c00 << 13;  // exponent mask after shift
+  detail::float32_bits o{};
+
+  o.u = (val & 0x7fff) << 13;            // exponent/mantissa bits
+  unsigned int exp = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;               // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {   // Inf/NaN?
+    o.u += (128 - 16) << 23;  // extra exp adjust
+  } else if (exp == 0) {      // Zero/Denormal?
+    o.u += 1 << 23;           // extra exp adjust
+    o.f -= magic.f;           // re-normalize
+  }
+
+  // Works around an internal compiler error seen with MSVC on ARM/ARM64
+  // for the bitwise | operator (std::bitset is affected as well): negate the float instead.
+#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC)
+  if (IsNegative()) {
+    return -o.f;
+  }
+#else
+  // original code:
+  o.u |= (val & 0x8000U) << 16U;  // sign bit
+#endif
+  return o.f;
+}
+
+/// Shared implementation between public and internal classes. CRTP pattern.
+template <class Derived>
+struct BFloat16Impl {
+ protected:
+  /// <summary>
+  /// Converts from float to uint16_t bfloat16 representation
+  /// </summary>
+  /// <param name="v">float value to convert</param>
+  /// <returns>bfloat16 bit pattern for v</returns>
+  static uint16_t ToUint16Impl(float v) noexcept;
+
+  /// <summary>
+  /// Converts bfloat16 to float
+  /// </summary>
+  /// <returns>float representation of bfloat16 value</returns>
+  float ToFloatImpl() const noexcept;
+
+  /// <summary>
+  /// Computes the bit pattern of the absolute value by clearing the sign bit.
+  /// </summary>
+  /// <returns>Bits of the absolute value</returns>
+  uint16_t AbsImpl() const noexcept {
+    return static_cast<uint16_t>(val & ~kSignMask);
+  }
+
+  /// <summary>
+  /// Computes the bit pattern with the sign bit flipped; a NaN is returned unchanged.
+  /// </summary>
+  /// <returns>Bits of the negated value</returns>
+  uint16_t NegateImpl() const noexcept {
+    return IsNaN() ? val : static_cast<uint16_t>(val ^ kSignMask);
+  }
+
+ public:
+  // uint16_t special values
+  static constexpr uint16_t kSignMask = 0x8000U;
+  static constexpr uint16_t kBiasedExponentMask = 0x7F80U;
+  static constexpr uint16_t kPositiveInfinityBits = 0x7F80U;
+  static constexpr uint16_t kNegativeInfinityBits = 0xFF80U;
+  static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U;
+  static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U;
+  static constexpr uint16_t kSignaling_NaNBits = 0x7F80U;  // NOTE(review): same bits as kPositiveInfinityBits, so IsNaN() is false for this pattern - confirm
+  static constexpr uint16_t kEpsilonBits = 0x0080U;  // NOTE(review): 0x0080 decodes to 2^-126 (smallest positive normal), not machine epsilon 2^-7 - confirm intent
+  static constexpr uint16_t kMinValueBits = 0xFF7FU;  // Lowest (most negative) finite value
+  static constexpr uint16_t kMaxValueBits = 0x7F7FU;  // Largest finite value
+  static constexpr uint16_t kRoundToNearest = 0x7FFFU;  // rounding bias added to the float's bits in ToUint16Impl
+  static constexpr uint16_t kOneBits = 0x3F80U;
+  static constexpr uint16_t kMinusOneBits = 0xBF80U;
+
+  uint16_t val{0};
+
+  BFloat16Impl() = default;
+
+  /// <summary>
+  /// Checks if the sign bit is set
+  /// </summary>
+  /// <returns>true if the sign bit is set</returns>
+  bool IsNegative() const noexcept {
+    return static_cast<int16_t>(val) < 0;
+  }
+
+  /// <summary>
+  /// Tests if the value is NaN
+  /// </summary>
+  /// <returns>true if NaN</returns>
+  bool IsNaN() const noexcept {
+    return AbsImpl() > kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is finite
+  /// </summary>
+  /// <returns>true if finite</returns>
+  bool IsFinite() const noexcept {
+    return AbsImpl() < kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value represents positive infinity.
+  /// </summary>
+  /// <returns>true if positive infinity</returns>
+  bool IsPositiveInfinity() const noexcept {
+    return val == kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value represents negative infinity
+  /// </summary>
+  /// <returns>true if negative infinity</returns>
+  bool IsNegativeInfinity() const noexcept {
+    return val == kNegativeInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is either positive or negative infinity.
+  /// </summary>
+  /// <returns>True if absolute value is infinity</returns>
+  bool IsInfinity() const noexcept {
+    return AbsImpl() == kPositiveInfinityBits;
+  }
+
+  /// <summary>
+  /// Tests if the value is NaN or zero. Useful for comparisons.
+  /// </summary>
+  /// <returns>True if NaN or zero.</returns>
+  bool IsNaNOrZero() const noexcept {
+    auto abs = AbsImpl();
+    return (abs == 0 || abs > kPositiveInfinityBits);
+  }
+
+  /// <summary>
+  /// Tests if the value is normal (not zero, subnormal, infinite, or NaN).
+  /// </summary>
+  /// <returns>True if so</returns>
+  bool IsNormal() const noexcept {
+    auto abs = AbsImpl();
+    return (abs < kPositiveInfinityBits)           // is finite
+           && (abs != 0)                           // is not zero
+           && ((abs & kBiasedExponentMask) != 0);  // is not subnormal (has a non-zero exponent)
+  }
+
+  /// <summary>
+  /// Tests if the value is subnormal (denormal).
+  /// </summary>
+  /// <returns>True if so</returns>
+  bool IsSubnormal() const noexcept {
+    auto abs = AbsImpl();
+    return (abs < kPositiveInfinityBits)           // is finite
+           && (abs != 0)                           // is not zero
+           && ((abs & kBiasedExponentMask) == 0);  // is subnormal (has a zero exponent)
+  }
+
+  /// <summary>
+  /// Creates an instance that represents absolute value.
+  /// </summary>
+  /// <returns>Absolute value</returns>
+  Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); }
+
+  /// <summary>
+  /// Creates a new instance with the sign flipped.
+  /// </summary>
+  /// <returns>Flipped sign instance</returns>
+  Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); }
+
+  /// <summary>
+  /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+  /// for two values by or'ing the private bits together and stripping the sign. They are both zero,
+  /// and therefore equivalent, if the resulting value is still zero.
+  /// </summary>
+  /// <param name="lhs">first value</param>
+  /// <param name="rhs">second value</param>
+  /// <returns>True if both arguments represent zero</returns>
+  static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept {
+    // IEEE defines that positive and negative zero are equal, this gives us a quick equality check
+    // for two values by or'ing the private bits together and stripping the sign. They are both zero,
+    // and therefore equivalent, if the resulting value is still zero.
+    return static_cast<uint16_t>((lhs.val | rhs.val) & ~kSignMask) == 0;
+  }
+};
+
+template <class Derived>
+inline uint16_t BFloat16Impl<Derived>::ToUint16Impl(float v) noexcept {
+  uint16_t result;
+  if (std::isnan(v)) {  // every NaN input maps to the single positive quiet-NaN pattern
+    result = kPositiveQNaNBits;
+  } else {
+    auto get_msb_half = [](float fl) {  // copies out the 16 most significant bits of the float
+      uint16_t result;
+#ifdef __cpp_if_constexpr
+      if constexpr (detail::endian::native == detail::endian::little) {
+#else
+      if (detail::endian::native == detail::endian::little) {
+#endif
+        std::memcpy(&result, reinterpret_cast<char*>(&fl) + sizeof(uint16_t), sizeof(uint16_t));
+      } else {
+        std::memcpy(&result, &fl, sizeof(uint16_t));
+      }
+      return result;
+    };
+
+    uint16_t upper_bits = get_msb_half(v);
+    union {  // anonymous union: U32 and F32 share the same storage
+      uint32_t U32;
+      float F32;
+    };
+    F32 = v;
+    U32 += (upper_bits & 1) + kRoundToNearest;  // round to nearest, ties to even: bias 0x7FFF plus the LSB of the kept half
+    result = get_msb_half(F32);                 // keep the upper 16 bits of the rounded value
+  }
+  return result;
+}
+
+template <class Derived>
+inline float BFloat16Impl<Derived>::ToFloatImpl() const noexcept {
+  if (IsNaN()) {  // any NaN pattern becomes the float quiet NaN; the stored bits are not carried over
+    return std::numeric_limits<float>::quiet_NaN();
+  }
+  float result;
+  char* const first = reinterpret_cast<char*>(&result);  // first two bytes of the float in memory
+  char* const second = first + sizeof(uint16_t);         // last two bytes of the float in memory
+#ifdef __cpp_if_constexpr
+  if constexpr (detail::endian::native == detail::endian::little) {
+#else
+  if (detail::endian::native == detail::endian::little) {
+#endif
+    std::memset(first, 0, sizeof(uint16_t));      // little-endian: low half comes first and is zeroed
+    std::memcpy(second, &val, sizeof(uint16_t));  // high half receives the bfloat16 bits
+  } else {
+    std::memcpy(first, &val, sizeof(uint16_t));   // big-endian: high half comes first
+    std::memset(second, 0, sizeof(uint16_t));     // low half is zeroed
+  }
+  return result;
+}
+
+}  // namespace onnxruntime_float16
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core.hpp
new file mode 100644
index 0000000..f7807e3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core.hpp
@@ -0,0 +1,3354 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2015, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2015, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_HPP
+#define OPENCV_CORE_HPP
+
+#ifndef __cplusplus
+#  error core.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/core/matx.hpp"
+#include "opencv2/core/types.hpp"
+#include "opencv2/core/mat.hpp"
+#include "opencv2/core/persistence.hpp"
+
+/**
+@defgroup core Core functionality
+@{
+    @defgroup core_basic Basic structures
+    @defgroup core_c C structures and operations
+    @{
+        @defgroup core_c_glue Connections with C++
+    @}
+    @defgroup core_array Operations on arrays
+    @defgroup core_async Asynchronous API
+    @defgroup core_xml XML/YAML Persistence
+    @defgroup core_cluster Clustering
+    @defgroup core_utils Utility and system functions and macros
+    @{
+        @defgroup core_logging Logging facilities
+        @defgroup core_utils_sse SSE utilities
+        @defgroup core_utils_neon NEON utilities
+        @defgroup core_utils_vsx VSX utilities
+        @defgroup core_utils_softfloat Softfloat support
+        @defgroup core_utils_samples Utility functions for OpenCV samples
+    @}
+    @defgroup core_opengl OpenGL interoperability
+    @defgroup core_ipp Intel IPP Asynchronous C/C++ Converters
+    @defgroup core_optim Optimization Algorithms
+    @defgroup core_directx DirectX interoperability
+    @defgroup core_eigen Eigen support
+    @defgroup core_opencl OpenCL support
+    @defgroup core_va_intel Intel VA-API/OpenCL (CL-VA) interoperability
+    @defgroup core_hal Hardware Acceleration Layer
+    @{
+        @defgroup core_hal_functions Functions
+        @defgroup core_hal_interface Interface
+        @defgroup core_hal_intrin Universal intrinsics
+        @{
+            @defgroup core_hal_intrin_impl Private implementation helpers
+        @}
+        @defgroup core_lowlevel_api Low-level API for external libraries / plugins
+    @}
+    @defgroup core_parallel Parallel Processing
+    @{
+        @defgroup core_parallel_backend Parallel backends API
+    @}
+@}
+ */
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+/*! @brief Class passed to an error.
+
+This class encapsulates all or almost all necessary
+information about an error that happened in the program. The exception is
+usually constructed and thrown implicitly via the CV_Error and CV_Error_ macros.
+@see error
+ */
+class CV_EXPORTS Exception : public std::exception
+{
+public:
+    /*!
+     Default constructor
+     */
+    Exception();
+    /*!
+     Full constructor. Normally the constructor is not called explicitly.
+     Instead, the macros CV_Error(), CV_Error_() and CV_Assert() are used.
+    */
+    Exception(int _code, const String& _err, const String& _func, const String& _file, int _line);
+    virtual ~Exception() throw();
+
+    /*!
+     \return the error description and the context as a text string.
+    */
+    virtual const char *what() const throw() CV_OVERRIDE;
+    void formatMessage();
+
+    String msg; ///< the formatted error message
+
+    int code; ///< error code @see CVStatus
+    String err; ///< error description
+    String func; ///< function name. Available only when the compiler supports getting it
+    String file; ///< source file name where the error has occurred
+    int line; ///< line number in the source file where the error has occurred
+};
+
+/*! @brief Signals an error and raises the exception.
+
+By default the function prints information about the error to stderr,
+then it either stops if cv::setBreakOnError() had been called before or raises the exception.
+It is possible to alternate error processing by using #redirectError().
+@param exc the exception raised.
+@deprecated drop this version
+ */
+CV_EXPORTS CV_NORETURN void error(const Exception& exc);
+
+enum SortFlags { SORT_EVERY_ROW    = 0, //!< each matrix row is sorted independently
+                 SORT_EVERY_COLUMN = 1, //!< each matrix column is sorted
+                                        //!< independently; this flag and the previous one are
+                                        //!< mutually exclusive.
+                 SORT_ASCENDING    = 0, //!< each matrix row is sorted in the ascending
+                                        //!< order.
+                 SORT_DESCENDING   = 16 //!< each matrix row is sorted in the
+                                        //!< descending order; this flag and the previous one are also
+                                        //!< mutually exclusive.
+               };
+
+//! @} core_utils
+
+//! @addtogroup core
+//! @{
+
+//! Covariation flags
+enum CovarFlags {
+    /** The output covariance matrix is calculated as:
+       \f[\texttt{scale}   \cdot  [  \texttt{vects}  [0]-  \texttt{mean}  , \texttt{vects}  [1]-  \texttt{mean}  ,...]^T  \cdot  [ \texttt{vects}  [0]- \texttt{mean}  , \texttt{vects}  [1]- \texttt{mean}  ,...],\f]
+       The covariance matrix will be nsamples x nsamples. Such an unusual covariance matrix is used
+       for fast PCA of a set of very large vectors (see, for example, the EigenFaces technique for
+       face recognition). Eigenvalues of this "scrambled" matrix match the eigenvalues of the true
+       covariance matrix. The "true" eigenvectors can be easily calculated from the eigenvectors of
+       the "scrambled" covariance matrix. */
+    COVAR_SCRAMBLED = 0,
+    /**The output covariance matrix is calculated as:
+        \f[\texttt{scale}   \cdot  [  \texttt{vects}  [0]-  \texttt{mean}  , \texttt{vects}  [1]-  \texttt{mean}  ,...]  \cdot  [ \texttt{vects}  [0]- \texttt{mean}  , \texttt{vects}  [1]- \texttt{mean}  ,...]^T,\f]
+        covar will be a square matrix of the same size as the total number of elements in each input
+        vector. One and only one of #COVAR_SCRAMBLED and #COVAR_NORMAL must be specified.*/
+    COVAR_NORMAL    = 1,
+    /** If the flag is specified, the function does not calculate mean from
+        the input vectors but, instead, uses the passed mean vector. This is useful if mean has been
+        pre-calculated or known in advance, or if the covariance matrix is calculated by parts. In
+        this case, mean is not a mean vector of the input sub-set of vectors but rather the mean
+        vector of the whole set.*/
+    COVAR_USE_AVG   = 2,
+    /** If the flag is specified, the covariance matrix is scaled. In the
+        "normal" mode, scale is 1./nsamples . In the "scrambled" mode, scale is the reciprocal of the
+        total number of elements in each input vector. By default (if the flag is not specified), the
+        covariance matrix is not scaled ( scale=1 ).*/
+    COVAR_SCALE     = 4,
+    /** If the flag is
+        specified, all the input vectors are stored as rows of the samples matrix. mean should be a
+        single-row vector in this case.*/
+    COVAR_ROWS      = 8,
+    /** If the flag is
+        specified, all the input vectors are stored as columns of the samples matrix. mean should be a
+        single-column vector in this case.*/
+    COVAR_COLS      = 16
+};
+
+//! @addtogroup core_cluster
+//!  @{
+
+//! k-Means flags
+enum KmeansFlags {
+    /** Select random initial centers in each attempt.*/
+    KMEANS_RANDOM_CENTERS     = 0,
+    /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007].*/
+    KMEANS_PP_CENTERS         = 2,
+    /** During the first (and possibly the only) attempt, use the
+        user-supplied labels instead of computing them from the initial centers. For the second and
+        further attempts, use the random or semi-random centers. Use one of KMEANS_\*_CENTERS flag
+        to specify the exact method.*/
+    KMEANS_USE_INITIAL_LABELS = 1
+};
+
+//! @} core_cluster
+
+//! @addtogroup core_array
+//! @{
+
+enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
+                   REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
+                   REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_MIN = 3  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                 };
+
+//! @} core_array
+
+/** @brief Swaps two matrices
+*/
+CV_EXPORTS void swap(Mat& a, Mat& b);
+/** @overload */
+CV_EXPORTS void swap( UMat& a, UMat& b );
+
+//! @} core
+
+//! @addtogroup core_array
+//! @{
+
+/** @brief Computes the source location of an extrapolated pixel.
+
+The function computes and returns the coordinate of a donor pixel corresponding to the specified
+extrapolated pixel when using the specified extrapolation border mode. For example, if you use
+cv::BORDER_WRAP mode in the horizontal direction, cv::BORDER_REFLECT_101 in the vertical direction and
+want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img , it
+looks like:
+@code{.cpp}
+    float val = img.at<float>(borderInterpolate(100, img.rows, cv::BORDER_REFLECT_101),
+                              borderInterpolate(-5, img.cols, cv::BORDER_WRAP));
+@endcode
+Normally, the function is not called directly. It is used inside filtering functions and also in
+copyMakeBorder.
+@param p 0-based coordinate of the extrapolated pixel along one of the axes, likely \<0 or \>= len
+@param len Length of the array along the corresponding axis.
+@param borderType Border type, one of the #BorderTypes, except for #BORDER_TRANSPARENT and
+#BORDER_ISOLATED . When borderType==#BORDER_CONSTANT , the function always returns -1, regardless
+of p and len.
+
+@sa copyMakeBorder
+*/
+CV_EXPORTS_W int borderInterpolate(int p, int len, int borderType);
+
+/** @example samples/cpp/tutorial_code/ImgTrans/copyMakeBorder_demo.cpp
+An example using copyMakeBorder function.
+Check @ref tutorial_copyMakeBorder "the corresponding tutorial" for more details
+*/
+
+/** @brief Forms a border around an image.
+
+The function copies the source image into the middle of the destination image. The areas to the
+left, to the right, above and below the copied source image will be filled with extrapolated
+pixels. This is not what filtering functions based on it do (they extrapolate pixels on the fly), but
+what other more complex functions, including your own, may do to simplify image boundary handling.
+
+The function supports the mode when src is already in the middle of dst . In this case, the
+function does not copy src itself but simply constructs the border, for example:
+
+@code{.cpp}
+    // let border be the same in all directions
+    int border=2;
+    // constructs a larger image to fit both the image and the border
+    Mat gray_buf(rgb.rows + border*2, rgb.cols + border*2, rgb.depth());
+    // select the middle part of it w/o copying data
+    Mat gray(gray_buf, Rect(border, border, rgb.cols, rgb.rows));
+    // convert image from RGB to grayscale
+    cvtColor(rgb, gray, COLOR_RGB2GRAY);
+    // form a border in-place
+    copyMakeBorder(gray, gray_buf, border, border,
+                   border, border, BORDER_REPLICATE);
+    // now do some custom filtering ...
+    ...
+@endcode
+@note When the source image is a part (ROI) of a bigger image, the function will try to use the
+pixels outside of the ROI to form a border. To disable this feature and always do extrapolation, as
+if src was not a ROI, use borderType | #BORDER_ISOLATED.
+
+@param src Source image.
+@param dst Destination image of the same type as src and the size Size(src.cols+left+right,
+src.rows+top+bottom) .
+@param top Number of pixels to extrapolate above the source image rectangle.
+@param bottom Number of pixels to extrapolate below the source image rectangle.
+@param left Number of pixels to extrapolate to the left of the source image rectangle.
+@param right Number of pixels to extrapolate to the right of the source image rectangle.
+For example, top=1, bottom=1, left=1, right=1 mean that 1 pixel-wide border needs
+to be built.
+@param borderType Border type. See borderInterpolate for details.
+@param value Border value if borderType==BORDER_CONSTANT .
+
+@sa  borderInterpolate
+*/
+CV_EXPORTS_W void copyMakeBorder(InputArray src, OutputArray dst,
+                                 int top, int bottom, int left, int right,
+                                 int borderType, const Scalar& value = Scalar() );
+
+/** @brief Calculates the per-element sum of two arrays or an array and a scalar.
+
+The function add calculates:
+- Sum of two arrays when both input arrays have the same size and the same number of channels:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2}(I)) \quad \texttt{if mask}(I) \ne0\f]
+- Sum of an array and a scalar when src2 is constructed from Scalar or has the same number of
+elements as `src1.channels()`:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2} ) \quad \texttt{if mask}(I) \ne0\f]
+- Sum of a scalar and an array when src1 is constructed from Scalar or has the same number of
+elements as `src2.channels()`:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1} +  \texttt{src2}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+where `I` is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+The first function in the list above can be replaced with matrix expressions:
+@code{.cpp}
+    dst = src1 + src2;
+    dst += src1; // equivalent to add(dst, src1, dst);
+@endcode
+The input arrays and the output array can all have the same or different depths. For example, you
+can add a 16-bit unsigned array to a 8-bit signed array and store the sum as a 32-bit
+floating-point array. Depth of the output array is determined by the dtype parameter. In the second
+and third cases above, as well as in the first case, when src1.depth() == src2.depth(), dtype can
+be set to the default -1. In this case, the output array will have the same depth as the input
+array, be it src1, src2 or both.
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and number of channels as the input array(s); the
+depth is defined by dtype or src1/src2.
+@param mask optional operation mask - 8-bit single channel array, that specifies elements of the
+output array to be changed.
+@param dtype optional depth of the output array (see the discussion above).
+@sa subtract, addWeighted, scaleAdd, Mat::convertTo
+*/
+CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst,
+                      InputArray mask = noArray(), int dtype = -1);
+
+/** @brief Calculates the per-element difference between two arrays or array and a scalar.
+
+The function subtract calculates:
+- Difference between two arrays, when both input arrays have the same size and the same number of
+channels:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) -  \texttt{src2}(I)) \quad \texttt{if mask}(I) \ne0\f]
+- Difference between an array and a scalar, when src2 is constructed from Scalar or has the same
+number of elements as `src1.channels()`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) -  \texttt{src2} ) \quad \texttt{if mask}(I) \ne0\f]
+- Difference between a scalar and an array, when src1 is constructed from Scalar or has the same
+number of elements as `src2.channels()`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1} -  \texttt{src2}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+- The reverse difference between a scalar and an array in the case of `SubRS`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src2} -  \texttt{src1}(I) ) \quad \texttt{if mask}(I) \ne0\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+The first function in the list above can be replaced with matrix expressions:
+@code{.cpp}
+    dst = src1 - src2;
+    dst -= src1; // equivalent to subtract(dst, src1, dst);
+@endcode
+The input arrays and the output array can all have the same or different depths. For example, you
+can subtract two 8-bit unsigned arrays and store the difference in a 16-bit signed array. Depth of
+the output array is determined by dtype parameter. In the second and third cases above, as well as
+in the first case, when src1.depth() == src2.depth(), dtype can be set to the default -1. In this
+case the output array will have the same depth as the input array, be it src1, src2 or both.
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array of the same size and the same number of channels as the input array.
+@param mask optional operation mask; this is an 8-bit single channel array that specifies elements
+of the output array to be changed.
+@param dtype optional depth of the output array
+@sa  add, addWeighted, scaleAdd, Mat::convertTo
+  */
+CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1);
+
+
+/** @brief Calculates the per-element scaled product of two arrays.
+
+The function multiply calculates the per-element product of two arrays:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{scale} \cdot \texttt{src1} (I)  \cdot \texttt{src2} (I))\f]
+
+There is also a @ref MatrixExpressions -friendly variant of the first function. See Mat::mul .
+
+For a not-per-element matrix product, see gemm .
+
+@note Saturation is not applied when the output array has the depth
+CV_32S. You may even get result of an incorrect sign in the case of
+overflow.
+@param src1 first input array.
+@param src2 second input array of the same size and the same type as src1.
+@param dst output array of the same size and type as src1.
+@param scale optional scale factor.
+@param dtype optional depth of the output array
+@sa add, subtract, divide, scaleAdd, addWeighted, accumulate, accumulateProduct, accumulateSquare,
+Mat::convertTo
+*/
+CV_EXPORTS_W void multiply(InputArray src1, InputArray src2,
+                           OutputArray dst, double scale = 1, int dtype = -1);
+
+/** @brief Performs per-element division of two arrays or a scalar by an array.
+
+The function cv::divide divides one array by another:
+\f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
+or a scalar by an array when there is no src1 :
+\f[\texttt{dst(I) = saturate(scale/src2(I))}\f]
+
+Different channels of multi-channel arrays are processed independently.
+
+For integer types when src2(I) is zero, dst(I) will also be zero.
+
+@note In case of floating point data there is no special defined behavior for zero src2(I) values.
+Regular floating-point division is used.
+Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result values).
+
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1.
+@param scale scalar factor.
+@param dst output array of the same size and type as src2.
+@param dtype optional depth of the output array; if -1, dst will have depth src2.depth(), but in
+case of an array-by-array division, you can only pass -1 when src1.depth()==src2.depth().
+@sa  multiply, add, subtract
+*/
+CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst,
+                         double scale = 1, int dtype = -1);
+
+/** @overload */
+CV_EXPORTS_W void divide(double scale, InputArray src2,
+                         OutputArray dst, int dtype = -1);
+
+/** @brief Calculates the sum of a scaled array and another array.
+
+The function scaleAdd is one of the classical primitive linear algebra operations, known as DAXPY
+or SAXPY in [BLAS](http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms). It calculates
+the sum of a scaled array and another array:
+\f[\texttt{dst} (I)= \texttt{scale} \cdot \texttt{src1} (I) +  \texttt{src2} (I)\f]
+The function can also be emulated with a matrix expression, for example:
+@code{.cpp}
+    Mat A(3, 3, CV_64F);
+    ...
+    A.row(0) = A.row(1)*2 + A.row(2);
+@endcode
+@param src1 first input array.
+@param alpha scale factor for the first array.
+@param src2 second input array of the same size and type as src1.
+@param dst output array of the same size and type as src1.
+@sa add, addWeighted, subtract, Mat::dot, Mat::convertTo
+*/
+CV_EXPORTS_W void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst);
+
+/** @example samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp
+Check @ref tutorial_trackbar "the corresponding tutorial" for more details
+*/
+
+/** @brief Calculates the weighted sum of two arrays.
+
+The function addWeighted calculates the weighted sum of two arrays as follows:
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+The function can be replaced with a matrix expression:
+@code{.cpp}
+    dst = src1*alpha + src2*beta + gamma;
+@endcode
+@note Saturation is not applied when the output array has the depth CV_32S. You may even get
+result of an incorrect sign in the case of overflow.
+@param src1 first input array.
+@param alpha weight of the first array elements.
+@param src2 second input array of the same size and channel number as src1.
+@param beta weight of the second array elements.
+@param gamma scalar added to each sum.
+@param dst output array that has the same size and number of channels as the input arrays.
+@param dtype optional depth of the output array; when both input arrays have the same depth, dtype
+can be set to -1, which will be equivalent to src1.depth().
+@sa  add, subtract, scaleAdd, Mat::convertTo
+*/
+CV_EXPORTS_W void addWeighted(InputArray src1, double alpha, InputArray src2,
+                              double beta, double gamma, OutputArray dst, int dtype = -1);
+
+/** @brief Scales, calculates absolute values, and converts the result to 8-bit.
+
+On each element of the input array, the function convertScaleAbs
+performs three operations sequentially: scaling, taking an absolute
+value, conversion to an unsigned 8-bit type:
+\f[\texttt{dst} (I)= \texttt{saturate\_cast<uchar>} (| \texttt{src} (I)* \texttt{alpha} +  \texttt{beta} |)\f]
+In case of multi-channel arrays, the function processes each channel
+independently. When the output is not 8-bit, the operation can be
+emulated by calling the Mat::convertTo method (or by using matrix
+expressions) and then by calculating an absolute value of the result.
+For example:
+@code{.cpp}
+    Mat_<float> A(30,30);
+    randu(A, Scalar(-100), Scalar(100));
+    Mat_<float> B = A*5 + 3;
+    B = abs(B);
+    // Mat_<float> B = abs(A*5+3) will also do the job,
+    // but it will allocate a temporary matrix
+@endcode
+@param src input array.
+@param dst output array.
+@param alpha optional scale factor.
+@param beta optional delta added to the scaled values.
+@sa  Mat::convertTo, cv::abs(const Mat&)
+*/
+CV_EXPORTS_W void convertScaleAbs(InputArray src, OutputArray dst,
+                                  double alpha = 1, double beta = 0);
+
+/** @brief Converts an array to half precision floating number.
+
+This function converts FP32 (single precision floating point) from/to FP16 (half precision floating point). CV_16S format is used to represent FP16 data.
+There are two use modes (src -> dst): CV_32F -> CV_16S and CV_16S -> CV_32F. The input array has to have type of CV_32F or
+CV_16S to represent the bit depth. If the input array is neither of them, the function will raise an error.
+The format of half precision floating point is defined in IEEE 754-2008.
+
+@param src input array.
+@param dst output array.
+*/
+CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst);
+
+/** @brief Performs a look-up table transform of an array.
+
+The function LUT fills the output array with values from the look-up table. Indices of the entries
+are taken from the input array. That is, the function processes each element of src as follows:
+\f[\texttt{dst} (I)  \leftarrow \texttt{lut(src(I) + d)}\f]
+where
+\f[d =  \fork{0}{if \(\texttt{src}\) has depth \(\texttt{CV_8U}\)}{128}{if \(\texttt{src}\) has depth \(\texttt{CV_8S}\)}\f]
+@param src input array of 8-bit elements.
+@param lut look-up table of 256 elements; in case of multi-channel input array, the table should
+either have a single channel (in this case the same table is used for all channels) or the same
+number of channels as in the input array.
+@param dst output array of the same size and number of channels as src, and the same depth as lut.
+@sa  convertScaleAbs, Mat::convertTo
+*/
+CV_EXPORTS_W void LUT(InputArray src, InputArray lut, OutputArray dst);
+
+/** @brief Calculates the sum of array elements.
+
+The function cv::sum calculates and returns the sum of array elements,
+independently for each channel.
+@param src input array that must have from 1 to 4 channels.
+@sa  countNonZero, mean, meanStdDev, norm, minMaxLoc, reduce
+*/
+CV_EXPORTS_AS(sumElems) Scalar sum(InputArray src);
+
+/** @brief Counts non-zero array elements.
+
+The function returns the number of non-zero elements in src :
+\f[\sum _{I: \; \texttt{src} (I) \ne0 } 1\f]
+@param src single-channel array.
+@sa  mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix
+*/
+CV_EXPORTS_W int countNonZero( InputArray src );
+
+/** @brief Returns the list of locations of non-zero pixels
+
+Given a binary matrix (likely returned from an operation such
+as threshold(), compare(), >, ==, etc.), return all of
+the non-zero indices as a cv::Mat or std::vector<cv::Point> (x,y).
+For example:
+@code{.cpp}
+    cv::Mat binaryImage; // input, binary image
+    cv::Mat locations;   // output, locations of non-zero pixels
+    cv::findNonZero(binaryImage, locations);
+
+    // access pixel coordinates
+    Point pnt = locations.at<Point>(i);
+@endcode
+or
+@code{.cpp}
+    cv::Mat binaryImage; // input, binary image
+    vector<Point> locations;   // output, locations of non-zero pixels
+    cv::findNonZero(binaryImage, locations);
+
+    // access pixel coordinates
+    Point pnt = locations[i];
+@endcode
+@param src single-channel array
+@param idx the output array, type of cv::Mat or std::vector<Point>, corresponding to non-zero indices in the input
+*/
+CV_EXPORTS_W void findNonZero( InputArray src, OutputArray idx );
+
+/** @brief Calculates an average (mean) of array elements.
+
+The function cv::mean calculates the mean value M of array elements,
+independently for each channel, and returns it:
+\f[\begin{array}{l} N =  \sum _{I: \; \texttt{mask} (I) \ne 0} 1 \\ M_c =  \left ( \sum _{I: \; \texttt{mask} (I) \ne 0}{ \texttt{src} (I)_c} \right )/N \end{array}\f]
+When all the mask elements are 0's, the function returns Scalar::all(0).
+@param src input array that should have from 1 to 4 channels so that the result can be stored in
+Scalar_ .
+@param mask optional operation mask.
+@sa  countNonZero, meanStdDev, norm, minMaxLoc
+*/
+CV_EXPORTS_W Scalar mean(InputArray src, InputArray mask = noArray());
+
+/** @brief Calculates a mean and standard deviation of array elements.
+
+The function cv::meanStdDev calculates the mean and the standard deviation
+of array elements independently for each channel and returns them via the
+output parameters:
+\f[\begin{array}{l} N =  \sum _{I, \texttt{mask} (I)  \ne 0} 1 \\ \texttt{mean} _c =  \frac{\sum_{ I: \; \texttt{mask}(I) \ne 0} \texttt{src} (I)_c}{N} \\ \texttt{stddev} _c =  \sqrt{\frac{\sum_{ I: \; \texttt{mask}(I) \ne 0} \left ( \texttt{src} (I)_c -  \texttt{mean} _c \right )^2}{N}} \end{array}\f]
+When all the mask elements are 0's, the function returns
+mean=stddev=Scalar::all(0).
+@note The calculated standard deviation is only the diagonal of the
+complete normalized covariance matrix. If the full matrix is needed, you
+can reshape the multi-channel array M x N to the single-channel array
+M\*N x mtx.channels() (only possible when the matrix is continuous) and
+then pass the matrix to calcCovarMatrix .
+@param src input array that should have from 1 to 4 channels so that the results can be stored in
+Scalar_ 's.
+@param mean output parameter: calculated mean value.
+@param stddev output parameter: calculated standard deviation.
+@param mask optional operation mask.
+@sa  countNonZero, mean, norm, minMaxLoc, calcCovarMatrix
+*/
+CV_EXPORTS_W void meanStdDev(InputArray src, OutputArray mean, OutputArray stddev,
+                             InputArray mask=noArray());
+
+/** @brief Calculates the absolute norm of an array.
+
+This version of #norm calculates the absolute norm of src1. The type of norm to calculate is specified using #NormTypes.
+
+As an example for one array, consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{1}, L_{2} \f$ and \f$ L_{\infty} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_1} &= |-1| + |2| = 3 \\
+    \| r(-1) \|_{L_2} &= \sqrt{(-1)^{2} + (2)^{2}} = \sqrt{5} \\
+    \| r(-1) \|_{L_\infty} &= \max(|-1|,|2|) = 2
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_1} &= |0.5| + |0.5| = 1 \\
+    \| r(0.5) \|_{L_2} &= \sqrt{(0.5)^{2} + (0.5)^{2}} = \sqrt{0.5} \\
+    \| r(0.5) \|_{L_\infty} &= \max(|0.5|,|0.5|) = 0.5.
+\f}
+The following graphic shows all values for the three norm functions \f$\| r(x) \|_{L_1}, \| r(x) \|_{L_2}\f$ and \f$\| r(x) \|_{L_\infty}\f$.
+It is notable that the \f$ L_{1} \f$ norm forms the upper and the \f$ L_{\infty} \f$ norm forms the lower border for the example function \f$ r(x) \f$.
+![Graphs for the different norm functions from the above example](pics/NormTypes_OneArray_1-2-INF.png)
+
+When the mask parameter is specified and it is not empty, the norm is
+calculated only over the region specified by the mask.
+
+If normType is not specified, #NORM_L2 is used.
+
+Multi-channel input arrays are treated as single-channel arrays, that is,
+the results for all channels are combined.
+
+Hamming norms can only be calculated with CV_8U depth arrays.
+
+@param src1 first input array.
+@param normType type of the norm (see #NormTypes).
+@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
+*/
+CV_EXPORTS_W double norm(InputArray src1, int normType = NORM_L2, InputArray mask = noArray());
+
+/** @brief Calculates an absolute difference norm or a relative difference norm.
+
+This version of cv::norm calculates the absolute difference norm
+or the relative difference norm of arrays src1 and src2.
+The type of norm to calculate is specified using #NormTypes.
+
+@param src1 first input array.
+@param src2 second input array of the same size and the same type as src1.
+@param normType type of the norm (see #NormTypes).
+@param mask optional operation mask; it must have the same size as src1 and CV_8UC1 type.
+*/
+CV_EXPORTS_W double norm(InputArray src1, InputArray src2,
+                         int normType = NORM_L2, InputArray mask = noArray());
+/** @overload
+@param src first input array.
+@param normType type of the norm (see #NormTypes).
+*/
+CV_EXPORTS double norm( const SparseMat& src, int normType );
+
+/** @brief Computes the Peak Signal-to-Noise Ratio (PSNR) image quality metric.
+
+This function calculates the Peak Signal-to-Noise Ratio (PSNR) image quality metric in decibels (dB),
+between two input arrays src1 and src2. The arrays must have the same type.
+
+The PSNR is calculated as follows:
+
+\f[
+\texttt{PSNR} = 10 \cdot \log_{10}{\left( \frac{R^2}{MSE} \right) }
+\f]
+
+where R is the maximum integer value of depth (e.g. 255 in the case of CV_8U data)
+and MSE is the mean squared error between the two arrays.
+
+@param src1 first input array.
+@param src2 second input array of the same size as src1.
+@param R the maximum pixel value (255 by default)
+
+  */
+CV_EXPORTS_W double PSNR(InputArray src1, InputArray src2, double R=255.);
+
+/** @brief naive nearest neighbor finder
+
+see http://en.wikipedia.org/wiki/Nearest_neighbor_search
+@todo document
+  */
+CV_EXPORTS_W void batchDistance(InputArray src1, InputArray src2,
+                                OutputArray dist, int dtype, OutputArray nidx,
+                                int normType = NORM_L2, int K = 0,
+                                InputArray mask = noArray(), int update = 0,
+                                bool crosscheck = false);
+
+/** @brief Normalizes the norm or value range of an array.
+
+The function cv::normalize scales and shifts the input array elements so that
+\f[\| \texttt{dst} \| _{L_p}= \texttt{alpha}\f]
+(where p=Inf, 1 or 2) when normType=NORM_INF, NORM_L1, or NORM_L2, respectively; or so that
+\f[\min _I  \texttt{dst} (I)= \texttt{alpha} , \, \, \max _I  \texttt{dst} (I)= \texttt{beta}\f]
+
+when normType=NORM_MINMAX (for dense arrays only). The optional mask specifies a sub-array to be
+normalized. This means that the norm or min-n-max are calculated over the sub-array, and then this
+sub-array is modified to be normalized. If you want to only use the mask to calculate the norm or
+min-max but modify the whole array, you can use norm and Mat::convertTo.
+
+In case of sparse matrices, only the non-zero values are analyzed and transformed. Because of this,
+the range transformation for sparse matrices is not allowed since it can shift the zero level.
+
+Possible usage with some positive example data:
+@code{.cpp}
+    vector<double> positiveData = { 2.0, 8.0, 10.0 };
+    vector<double> normalizedData_l1, normalizedData_l2, normalizedData_inf, normalizedData_minmax;
+
+    // Norm to probability (total count)
+    // sum(numbers) = 20.0
+    // 2.0      0.1     (2.0/20.0)
+    // 8.0      0.4     (8.0/20.0)
+    // 10.0     0.5     (10.0/20.0)
+    normalize(positiveData, normalizedData_l1, 1.0, 0.0, NORM_L1);
+
+    // Norm to unit vector: ||positiveData|| = 1.0
+    // 2.0      0.15
+    // 8.0      0.62
+    // 10.0     0.77
+    normalize(positiveData, normalizedData_l2, 1.0, 0.0, NORM_L2);
+
+    // Norm to max element
+    // 2.0      0.2     (2.0/10.0)
+    // 8.0      0.8     (8.0/10.0)
+    // 10.0     1.0     (10.0/10.0)
+    normalize(positiveData, normalizedData_inf, 1.0, 0.0, NORM_INF);
+
+    // Norm to range [0.0;1.0]
+    // 2.0      0.0     (shift to left border)
+    // 8.0      0.75    (6.0/8.0)
+    // 10.0     1.0     (shift to right border)
+    normalize(positiveData, normalizedData_minmax, 1.0, 0.0, NORM_MINMAX);
+@endcode
+
+@param src input array.
+@param dst output array of the same size as src .
+@param alpha norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param beta upper range boundary in case of the range normalization; it is not used for the norm
+normalization.
+@param norm_type normalization type (see cv::NormTypes).
+@param dtype when negative, the output array has the same type as src; otherwise, it has the same
+number of channels as src and the depth =CV_MAT_DEPTH(dtype).
+@param mask optional operation mask.
+@sa norm, Mat::convertTo, SparseMat::convertTo
+*/
+CV_EXPORTS_W void normalize( InputArray src, InputOutputArray dst, double alpha = 1, double beta = 0,
+                             int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray());
+
+/** @overload
+@param src input array.
+@param dst output array of the same size as src .
+@param alpha norm value to normalize to or the lower range boundary in case of the range
+normalization.
+@param normType normalization type (see cv::NormTypes).
+*/
+CV_EXPORTS void normalize( const SparseMat& src, SparseMat& dst, double alpha, int normType );
+
+/** @brief Finds the global minimum and maximum in an array.
+
+The function cv::minMaxLoc finds the minimum and maximum element values and their positions. The
+extremums are searched across the whole array or, if mask is not an empty array, in the specified
+array region.
+
+The function does not work with multi-channel arrays. If you need to find minimum or maximum
+elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI , or
+mixChannels , or split .
+@param src input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minLoc pointer to the returned minimum location (in 2D case); NULL is used if not required.
+@param maxLoc pointer to the returned maximum location (in 2D case); NULL is used if not required.
+@param mask optional mask used to select a sub-array.
+@sa max, min, reduceArgMin, reduceArgMax, compare, inRange, extractImageCOI, mixChannels, split, Mat::reshape
+*/
+CV_EXPORTS_W void minMaxLoc(InputArray src, CV_OUT double* minVal,
+                            CV_OUT double* maxVal = 0, CV_OUT Point* minLoc = 0,
+                            CV_OUT Point* maxLoc = 0, InputArray mask = noArray());
+
+/**
+ * @brief Finds indices of min elements along provided axis
+ *
+ * @note
+ *      - If input or output array is not continuous, this function will create an internal copy.
+ *      - NaN handling is left unspecified, see patchNaNs().
+ *      - The returned index is always in bounds of input matrix.
+ *
+ * @param src input single-channel array.
+ * @param dst output array of type CV_32SC1 with the same dimensionality as src,
+ * except for axis being reduced - it should be set to 1.
+ * @param lastIndex whether to get the index of first or last occurrence of min.
+ * @param axis axis to reduce along.
+ * @sa reduceArgMax, minMaxLoc, min, max, compare, reduce
+ */
+CV_EXPORTS_W void reduceArgMin(InputArray src, OutputArray dst, int axis, bool lastIndex = false);
+
+/**
+ * @brief Finds indices of max elements along provided axis
+ *
+ * @note
+ *      - If input or output array is not continuous, this function will create an internal copy.
+ *      - NaN handling is left unspecified, see patchNaNs().
+ *      - The returned index is always in bounds of input matrix.
+ *
+ * @param src input single-channel array.
+ * @param dst output array of type CV_32SC1 with the same dimensionality as src,
+ * except for axis being reduced - it should be set to 1.
+ * @param lastIndex whether to get the index of first or last occurrence of max.
+ * @param axis axis to reduce along.
+ * @sa reduceArgMin, minMaxLoc, min, max, compare, reduce
+ */
+CV_EXPORTS_W void reduceArgMax(InputArray src, OutputArray dst, int axis, bool lastIndex = false);
+
+/** @brief Finds the global minimum and maximum in an array
+
+The function cv::minMaxIdx finds the minimum and maximum element values and their positions. The
+extremums are searched across the whole array or, if mask is not an empty array, in the specified
+array region. The function does not work with multi-channel arrays. If you need to find minimum or
+maximum elements across all the channels, use Mat::reshape first to reinterpret the array as
+single-channel. Or you may extract the particular channel using either extractImageCOI , or
+mixChannels , or split . In case of a sparse matrix, the minimum is found among non-zero elements
+only.
+@note When minIdx is not NULL, it must have at least 2 elements (as well as maxIdx), even if src is
+a single-row or single-column matrix. In OpenCV (following MATLAB) each array has at least 2
+dimensions, i.e. single-column matrix is Mx1 matrix (and therefore minIdx/maxIdx will be
+(i1,0)/(i2,0)) and single-row matrix is 1xN matrix (and therefore minIdx/maxIdx will be
+(0,j1)/(0,j2)).
+@param src input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minIdx pointer to the returned minimum location (in nD case); NULL is used if not required;
+Otherwise, it must point to an array of src.dims elements, the coordinates of the minimum element
+in each dimension are stored there sequentially.
+@param maxIdx pointer to the returned maximum location (in nD case). NULL is used if not required.
+@param mask specified array region
+*/
+CV_EXPORTS void minMaxIdx(InputArray src, double* minVal, double* maxVal = 0,
+                          int* minIdx = 0, int* maxIdx = 0, InputArray mask = noArray());
+
+/** @overload
+@param a input single-channel array.
+@param minVal pointer to the returned minimum value; NULL is used if not required.
+@param maxVal pointer to the returned maximum value; NULL is used if not required.
+@param minIdx pointer to the returned minimum location (in nD case); NULL is used if not required;
+Otherwise, it must point to an array of src.dims elements, the coordinates of the minimum element
+in each dimension are stored there sequentially.
+@param maxIdx pointer to the returned maximum location (in nD case). NULL is used if not required.
+*/
+CV_EXPORTS void minMaxLoc(const SparseMat& a, double* minVal,
+                          double* maxVal, int* minIdx = 0, int* maxIdx = 0);
+
+/** @brief Reduces a matrix to a vector.
+
+The function #reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of
+1D vectors and performing the specified operation on the vectors until a single row/column is
+obtained. For example, the function can be used to compute horizontal and vertical projections of a
+raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one.
+In case of #REDUCE_SUM and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
+And multi-channel arrays are also supported in these two reduction modes.
+
+The following code demonstrates its usage for a single channel matrix.
+@snippet snippets/core_reduce.cpp example
+
+And the following code demonstrates its usage for a two-channel matrix.
+@snippet snippets/core_reduce.cpp example2
+
+@param src input 2D matrix.
+@param dst output vector. Its size and type is defined by dim and dtype parameters.
+@param dim dimension index along which the matrix is reduced. 0 means that the matrix is reduced to
+a single row. 1 means that the matrix is reduced to a single column.
+@param rtype reduction operation that could be one of #ReduceTypes
+@param dtype when negative, the output vector will have the same type as the input matrix,
+otherwise, its type will be CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()).
+@sa repeat, reduceArgMin, reduceArgMax
+*/
+CV_EXPORTS_W void reduce(InputArray src, OutputArray dst, int dim, int rtype, int dtype = -1);
+
+/** @brief Creates one multi-channel array out of several single-channel ones.
+
+The function cv::merge merges several arrays to make a single multi-channel array. That is, each
+element of the output array will be a concatenation of the elements of the input arrays, where
+elements of i-th input array are treated as mv[i].channels()-element vectors.
+
+The function cv::split does the reverse operation. If you need to shuffle channels in some other
+advanced way, use cv::mixChannels.
+
+The following example shows how to merge 3 single channel matrices into a single 3-channel matrix.
+@snippet snippets/core_merge.cpp example
+
+@param mv input array of matrices to be merged; all the matrices in mv must have the same
+size and the same depth.
+@param count number of input matrices when mv is a plain C array; it must be greater than zero.
+@param dst output array of the same size and the same depth as mv[0]; The number of channels will
+be equal to the parameter count.
+@sa  mixChannels, split, Mat::reshape
+*/
+CV_EXPORTS void merge(const Mat* mv, size_t count, OutputArray dst);
+
+/** @overload
+@param mv input vector of matrices to be merged; all the matrices in mv must have the same
+size and the same depth.
+@param dst output array of the same size and the same depth as mv[0]; The number of channels will
+be the total number of channels in the matrix array.
+  */
+CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst);
+
+/** @brief Divides a multi-channel array into several single-channel arrays.
+
+The function cv::split splits a multi-channel array into separate single-channel arrays:
+\f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
+If you need to extract a single channel or do some other sophisticated channel permutation, use
+mixChannels .
+
+The following example demonstrates how to split a 3-channel matrix into 3 single channel matrices.
+@snippet snippets/core_split.cpp example
+
+@param src input multi-channel array.
+@param mvbegin output array; the number of arrays must match src.channels(); the arrays themselves are
+reallocated, if needed.
+@sa merge, mixChannels, cvtColor
+*/
+CV_EXPORTS void split(const Mat& src, Mat* mvbegin);
+
+/** @overload
+@param m input multi-channel array.
+@param mv output vector of arrays; the arrays themselves are reallocated, if needed.
+*/
+CV_EXPORTS_W void split(InputArray m, OutputArrayOfArrays mv);
+
+/** @brief Copies specified channels from input arrays to the specified channels of
+output arrays.
+
+The function cv::mixChannels provides an advanced mechanism for shuffling image channels.
+
+cv::split, cv::merge, cv::extractChannel, cv::insertChannel and some forms of cv::cvtColor are partial cases of cv::mixChannels.
+
+In the example below, the code splits a 4-channel BGRA image into a 3-channel BGR (with B and R
+channels swapped) and a separate alpha-channel image:
+@code{.cpp}
+    Mat bgra( 100, 100, CV_8UC4, Scalar(255,0,0,255) );
+    Mat bgr( bgra.rows, bgra.cols, CV_8UC3 );
+    Mat alpha( bgra.rows, bgra.cols, CV_8UC1 );
+
+    // forming an array of matrices is a quite efficient operation,
+    // because the matrix data is not copied, only the headers
+    Mat out[] = { bgr, alpha };
+    // bgra[0] -> bgr[2], bgra[1] -> bgr[1],
+    // bgra[2] -> bgr[0], bgra[3] -> alpha[0]
+    int from_to[] = { 0,2, 1,1, 2,0, 3,3 };
+    mixChannels( &bgra, 1, out, 2, from_to, 4 );
+@endcode
+@note Unlike many other new-style C++ functions in OpenCV (see the introduction section and
+Mat::create ), cv::mixChannels requires the output arrays to be pre-allocated before calling the
+function.
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param nsrcs number of matrices in `src`.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in `src[0]`.
+@param ndsts number of matrices in `dst`.
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+@param npairs number of index pairs in `fromTo`.
+@sa split, merge, extractChannel, insertChannel, cvtColor
+*/
+CV_EXPORTS void mixChannels(const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts,
+                            const int* fromTo, size_t npairs);
+
+/** @overload
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in src[0].
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+@param npairs number of index pairs in fromTo.
+*/
+CV_EXPORTS void mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
+                            const int* fromTo, size_t npairs);
+
+/** @overload
+@param src input array or vector of matrices; all of the matrices must have the same size and the
+same depth.
+@param dst output array or vector of matrices; all the matrices **must be allocated**; their size and
+depth must be the same as in src[0].
+@param fromTo array of index pairs specifying which channels are copied and where; fromTo[k\*2] is
+a 0-based index of the input channel in src, fromTo[k\*2+1] is an index of the output channel in
+dst; the continuous channel numbering is used: the first input image channels are indexed from 0 to
+src[0].channels()-1, the second input image channels are indexed from src[0].channels() to
+src[0].channels() + src[1].channels()-1, and so on, the same scheme is used for the output image
+channels; as a special case, when fromTo[k\*2] is negative, the corresponding output channel is
+filled with zero .
+*/
+CV_EXPORTS_W void mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
+                              const std::vector<int>& fromTo);
+
+/** @brief Extracts a single channel from src (coi is 0-based index)
+@param src input array
+@param dst output array
+@param coi index of channel to extract
+@sa mixChannels, split
+*/
+CV_EXPORTS_W void extractChannel(InputArray src, OutputArray dst, int coi);
+
+/** @brief Inserts a single channel to dst (coi is 0-based index)
+@param src input array
+@param dst output array
+@param coi index of channel for insertion
+@sa mixChannels, merge
+*/
+CV_EXPORTS_W void insertChannel(InputArray src, InputOutputArray dst, int coi);
+
+/** @brief Flips a 2D array around vertical, horizontal, or both axes.
+
+The function cv::flip flips the array in one of three different ways (row
+and column indices are 0-based):
+\f[\texttt{dst} _{ij} =
+\left\{
+\begin{array}{l l}
+\texttt{src} _{\texttt{src.rows}-i-1,j} & if\;  \texttt{flipCode} = 0 \\
+\texttt{src} _{i, \texttt{src.cols} -j-1} & if\;  \texttt{flipCode} > 0 \\
+\texttt{src} _{ \texttt{src.rows} -i-1, \texttt{src.cols} -j-1} & if\; \texttt{flipCode} < 0 \\
+\end{array}
+\right.\f]
+The example scenarios of using the function are the following:
+*   Vertical flipping of the image (flipCode == 0) to switch between
+    top-left and bottom-left image origin. This is a typical operation
+    in video processing on Microsoft Windows\* OS.
+*   Horizontal flipping of the image with the subsequent horizontal
+    shift and absolute difference calculation to check for a
+    vertical-axis symmetry (flipCode \> 0).
+*   Simultaneous horizontal and vertical flipping of the image with
+    the subsequent shift and absolute difference calculation to check
+    for a central symmetry (flipCode \< 0).
+*   Reversing the order of point arrays (flipCode \> 0 or
+    flipCode == 0).
+@param src input array.
+@param dst output array of the same size and type as src.
+@param flipCode a flag to specify how to flip the array; 0 means
+flipping around the x-axis and positive value (for example, 1) means
+flipping around y-axis. Negative value (for example, -1) means flipping
+around both axes.
+@sa transpose , repeat , completeSymm
+*/
+CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
+
+enum RotateFlags {
+    ROTATE_90_CLOCKWISE = 0, //!< Rotate 90 degrees clockwise
+    ROTATE_180 = 1, //!< Rotate 180 degrees
+    ROTATE_90_COUNTERCLOCKWISE = 2, //!< Rotate 90 degrees counterclockwise (i.e. 270 degrees clockwise)
+};
+/** @brief Rotates a 2D array in multiples of 90 degrees.
+The function cv::rotate rotates the array in one of three different ways:
+*   Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE).
+*   Rotate by 180 degrees clockwise (rotateCode = ROTATE_180).
+*   Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE).
+@param src input array.
+@param dst output array of the same type as src.  The size is the same with ROTATE_180,
+and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
+@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
+@sa transpose , repeat , completeSymm, flip, RotateFlags
+*/
+CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode);
+
+/** @brief Fills the output array with repeated copies of the input array.
+
+The function cv::repeat duplicates the input array one or more times along each of the two axes:
+\f[\texttt{dst} _{ij}= \texttt{src} _{i\mod src.rows, \; j\mod src.cols }\f]
+The second variant of the function is more convenient to use with @ref MatrixExpressions.
+@param src input array to replicate.
+@param ny Number of times the `src` is repeated along the
+vertical axis.
+@param nx Number of times the `src` is repeated along the
+horizontal axis.
+@param dst output array of the same type as `src`.
+@sa cv::reduce
+*/
+CV_EXPORTS_W void repeat(InputArray src, int ny, int nx, OutputArray dst);
+
+/** @overload
+@param src input array to replicate.
+@param ny Number of times the `src` is repeated along the
+vertical axis.
+@param nx Number of times the `src` is repeated along the
+horizontal axis.
+  */
+CV_EXPORTS Mat repeat(const Mat& src, int ny, int nx);
+
+/** @brief Applies horizontal concatenation to given matrices.
+
+The function horizontally concatenates two or more cv::Mat matrices (with the same number of rows).
+@code{.cpp}
+    cv::Mat matArray[] = { cv::Mat(4, 1, CV_8UC1, cv::Scalar(1)),
+                           cv::Mat(4, 1, CV_8UC1, cv::Scalar(2)),
+                           cv::Mat(4, 1, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::hconcat( matArray, 3, out );
+    //out:
+    //[1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3]
+@endcode
+@param src input array or vector of matrices. all of the matrices must have the same number of rows and the same depth.
+@param nsrc number of matrices in src.
+@param dst output array. It has the same number of rows and depth as the src, and the sum of cols of the src.
+@sa cv::vconcat(const Mat*, size_t, OutputArray), @sa cv::vconcat(InputArrayOfArrays, OutputArray) and @sa cv::vconcat(InputArray, InputArray, OutputArray)
+*/
+CV_EXPORTS void hconcat(const Mat* src, size_t nsrc, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    cv::Mat_<float> A = (cv::Mat_<float>(3, 2) << 1, 4,
+                                                  2, 5,
+                                                  3, 6);
+    cv::Mat_<float> B = (cv::Mat_<float>(3, 2) << 7, 10,
+                                                  8, 11,
+                                                  9, 12);
+
+    cv::Mat C;
+    cv::hconcat(A, B, C);
+    //C:
+    //[1, 4, 7, 10;
+    // 2, 5, 8, 11;
+    // 3, 6, 9, 12]
+ @endcode
+ @param src1 first input array to be considered for horizontal concatenation.
+ @param src2 second input array to be considered for horizontal concatenation.
+ @param dst output array. It has the same number of rows and depth as the src1 and src2, and the sum of cols of the src1 and src2.
+ */
+CV_EXPORTS void hconcat(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    std::vector<cv::Mat> matrices = { cv::Mat(4, 1, CV_8UC1, cv::Scalar(1)),
+                                      cv::Mat(4, 1, CV_8UC1, cv::Scalar(2)),
+                                      cv::Mat(4, 1, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::hconcat( matrices, out );
+    //out:
+    //[1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3;
+    // 1, 2, 3]
+ @endcode
+ @param src input array or vector of matrices. All of the matrices must have the same number of rows
+ and the same depth.
+ @param dst output array. It has the same number of rows and depth as the src, and the sum of cols of the src.
+ */
+CV_EXPORTS_W void hconcat(InputArrayOfArrays src, OutputArray dst);
+
+/** @brief Applies vertical concatenation to given matrices.
+
+The function vertically concatenates two or more cv::Mat matrices (with the same number of cols).
+@code{.cpp}
+    cv::Mat matArray[] = { cv::Mat(1, 4, CV_8UC1, cv::Scalar(1)),
+                           cv::Mat(1, 4, CV_8UC1, cv::Scalar(2)),
+                           cv::Mat(1, 4, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::vconcat( matArray, 3, out );
+    //out:
+    //[1,   1,   1,   1;
+    // 2,   2,   2,   2;
+    // 3,   3,   3,   3]
+@endcode
+@param src input array or vector of matrices. all of the matrices must have the same number of cols and the same depth.
+@param nsrc number of matrices in src.
+@param dst output array. It has the same number of cols and depth as the src, and the sum of rows of the src.
+@sa cv::hconcat(const Mat*, size_t, OutputArray), @sa cv::hconcat(InputArrayOfArrays, OutputArray) and @sa cv::hconcat(InputArray, InputArray, OutputArray)
+*/
+CV_EXPORTS void vconcat(const Mat* src, size_t nsrc, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    cv::Mat_<float> A = (cv::Mat_<float>(3, 2) << 1, 7,
+                                                  2, 8,
+                                                  3, 9);
+    cv::Mat_<float> B = (cv::Mat_<float>(3, 2) << 4, 10,
+                                                  5, 11,
+                                                  6, 12);
+
+    cv::Mat C;
+    cv::vconcat(A, B, C);
+    //C:
+    //[1, 7;
+    // 2, 8;
+    // 3, 9;
+    // 4, 10;
+    // 5, 11;
+    // 6, 12]
+ @endcode
+ @param src1 first input array to be considered for vertical concatenation.
+ @param src2 second input array to be considered for vertical concatenation.
+ @param dst output array. It has the same number of cols and depth as the src1 and src2, and the sum of rows of the src1 and src2.
+ */
+CV_EXPORTS void vconcat(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+ @code{.cpp}
+    std::vector<cv::Mat> matrices = { cv::Mat(1, 4, CV_8UC1, cv::Scalar(1)),
+                                      cv::Mat(1, 4, CV_8UC1, cv::Scalar(2)),
+                                      cv::Mat(1, 4, CV_8UC1, cv::Scalar(3)),};
+
+    cv::Mat out;
+    cv::vconcat( matrices, out );
+    //out:
+    //[1,   1,   1,   1;
+    // 2,   2,   2,   2;
+    // 3,   3,   3,   3]
+ @endcode
+ @param src input array or vector of matrices. All of the matrices must have the same number of cols
+ and the same depth.
+ @param dst output array. It has the same number of cols and depth as the src, and the sum of rows of the src.
+ */
+CV_EXPORTS_W void vconcat(InputArrayOfArrays src, OutputArray dst);
+
+/** @brief Computes bitwise conjunction of the two arrays (dst = src1 & src2).
+Calculates the per-element bit-wise conjunction of two arrays or an
+array and a scalar.
+
+The function cv::bitwise_and calculates the per-element bit-wise logical conjunction for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \wedge \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \wedge \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \wedge \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the second and third cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2,
+                              OutputArray dst, InputArray mask = noArray());
+
+/** @brief Calculates the per-element bit-wise disjunction of two arrays or an
+array and a scalar.
+
+The function cv::bitwise_or calculates the per-element bit-wise logical disjunction for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \vee \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \vee \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \vee \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the second and third cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2,
+                             OutputArray dst, InputArray mask = noArray());
+
+/** @brief Calculates the per-element bit-wise "exclusive or" operation on two
+arrays or an array and a scalar.
+
+The function cv::bitwise_xor calculates the per-element bit-wise logical "exclusive-or"
+operation for:
+*   Two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \oplus \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+*   An array and a scalar when src2 is constructed from Scalar or has
+    the same number of elements as `src1.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \oplus \texttt{src2} \quad \texttt{if mask} (I) \ne0\f]
+*   A scalar and an array when src1 is constructed from Scalar or has
+    the same number of elements as `src2.channels()`:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \oplus \texttt{src2} (I) \quad \texttt{if mask} (I) \ne0\f]
+In case of floating-point arrays, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel arrays, each channel is processed
+independently. In the 2nd and 3rd cases above, the scalar is first
+converted to the array type.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as the input
+arrays.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2,
+                              OutputArray dst, InputArray mask = noArray());
+
+/** @brief  Inverts every bit of an array.
+
+The function cv::bitwise_not calculates per-element bit-wise inversion of the input
+array:
+\f[\texttt{dst} (I) =  \neg \texttt{src} (I)\f]
+In case of a floating-point input array, its machine-specific bit
+representation (usually IEEE754-compliant) is used for the operation. In
+case of multi-channel arrays, each channel is processed independently.
+@param src input array.
+@param dst output array that has the same size and type as the input
+array.
+@param mask optional operation mask, 8-bit single channel array, that
+specifies elements of the output array to be changed.
+*/
+CV_EXPORTS_W void bitwise_not(InputArray src, OutputArray dst,
+                              InputArray mask = noArray());
+
+/** @brief Calculates the per-element absolute difference between two arrays or between an array and a scalar.
+
+The function cv::absdiff calculates:
+*   Absolute difference between two arrays when they have the same
+    size and type:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2}(I)|)\f]
+*   Absolute difference between an array and a scalar when the second
+    array is constructed from Scalar or has as many elements as the
+    number of channels in `src1`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2} |)\f]
+*   Absolute difference between a scalar and an array when the first
+    array is constructed from Scalar or has as many elements as the
+    number of channels in `src2`:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1} -  \texttt{src2}(I) |)\f]
+    where I is a multi-dimensional index of array elements. In case of
+    multi-channel arrays, each channel is processed independently.
+@note Saturation is not applied when the arrays have the depth CV_32S.
+You may even get a negative value in the case of overflow.
+@param src1 first input array or a scalar.
+@param src2 second input array or a scalar.
+@param dst output array that has the same size and type as input arrays.
+@sa cv::abs(const Mat&)
+*/
+CV_EXPORTS_W void absdiff(InputArray src1, InputArray src2, OutputArray dst);
+
+/** @brief  This is an overloaded member function, provided for convenience (python)
+Copies the matrix to another one.
+When the operation mask is specified and dst has to be reallocated (see Mat::create), the newly allocated matrix is initialized with all zeros before copying the data.
+@param src source matrix.
+@param dst Destination matrix. If it does not have a proper size or type before the operation, it is
+reallocated.
+@param mask Operation mask of the same size as src. Its non-zero elements indicate which matrix
+elements need to be copied. The mask has to be of type CV_8U and can have 1 or multiple channels.
+*/
+
+void CV_EXPORTS_W copyTo(InputArray src, OutputArray dst, InputArray mask);
+/** @brief  Checks if array elements lie between the elements of two other arrays.
+
+The function checks the range as follows:
+-   For every element of a single-channel input array:
+    \f[\texttt{dst} (I)= \texttt{lowerb} (I)_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb} (I)_0\f]
+-   For two-channel arrays:
+    \f[\texttt{dst} (I)= \texttt{lowerb} (I)_0  \leq \texttt{src} (I)_0 \leq  \texttt{upperb} (I)_0  \land \texttt{lowerb} (I)_1  \leq \texttt{src} (I)_1 \leq  \texttt{upperb} (I)_1\f]
+-   and so forth.
+
+That is, dst (I) is set to 255 (all 1-bits) if src (I) is within the
+specified 1D, 2D, 3D, ... box and 0 otherwise.
+
+When the lower and/or upper boundary parameters are scalars, the indexes
+(I) at lowerb and upperb in the above formulas should be omitted.
+@param src first input array.
+@param lowerb inclusive lower boundary array or a scalar.
+@param upperb inclusive upper boundary array or a scalar.
+@param dst output array of the same size as src and CV_8U type.
+*/
+CV_EXPORTS_W void inRange(InputArray src, InputArray lowerb,
+                          InputArray upperb, OutputArray dst);
+
+/** @brief Performs the per-element comparison of two arrays or an array and scalar value.
+
+The function compares:
+*   Elements of two arrays when src1 and src2 have the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  \,\texttt{cmpop}\, \texttt{src2} (I)\f]
+*   Elements of src1 with a scalar src2 when src2 is constructed from
+    Scalar or has a single element:
+    \f[\texttt{dst} (I) =  \texttt{src1}(I) \,\texttt{cmpop}\,  \texttt{src2}\f]
+*   src1 with elements of src2 when src1 is constructed from Scalar or
+    has a single element:
+    \f[\texttt{dst} (I) =  \texttt{src1}  \,\texttt{cmpop}\, \texttt{src2} (I)\f]
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+@code{.cpp}
+    Mat dst1 = src1 >= src2;
+    Mat dst2 = src1 < 8;
+    ...
+@endcode
+@param src1 first input array or a scalar; when it is an array, it must have a single channel.
+@param src2 second input array or a scalar; when it is an array, it must have a single channel.
+@param dst output array of type CV_8U that has the same size and the same number of channels as
+    the input arrays.
+@param cmpop a flag, that specifies correspondence between the arrays (cv::CmpTypes)
+@sa checkRange, min, max, threshold
+*/
+CV_EXPORTS_W void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop);
+
+/** @brief Calculates per-element minimum of two arrays or an array and a scalar.
+
+The function cv::min calculates the per-element minimum of two arrays:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{src2} (I))\f]
+or array and a scalar:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{value} )\f]
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1.
+@param dst output array of the same size and type as src1.
+@sa max, compare, inRange, minMaxLoc
+*/
+CV_EXPORTS_W void min(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::min(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::min(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
+
+/** @brief Calculates per-element maximum of two arrays or an array and a scalar.
+
+The function cv::max calculates the per-element maximum of two arrays:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{src2} (I))\f]
+or array and a scalar:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{value} )\f]
+@param src1 first input array.
+@param src2 second input array of the same size and type as src1 .
+@param dst output array of the same size and type as src1.
+@sa  min, compare, inRange, minMaxLoc, @ref MatrixExpressions
+*/
+CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::max(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
+/** @overload
+needed to avoid conflicts with const _Tp& std::max(const _Tp&, const _Tp&, _Compare)
+*/
+CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);
+
+/** @brief Calculates a square root of array elements.
+
+The function cv::sqrt calculates a square root of each input array element.
+In case of multi-channel arrays, each channel is processed
+independently. The accuracy is approximately the same as of the built-in
+std::sqrt .
+@param src input floating-point array.
+@param dst output array of the same size and type as src.
+*/
+CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);
+
+/** @brief Raises every array element to a power.
+
+The function cv::pow raises every element of the input array to power :
+\f[\texttt{dst} (I) =  \fork{\texttt{src}(I)^{power}}{if \(\texttt{power}\) is integer}{|\texttt{src}(I)|^{power}}{otherwise}\f]
+
+So, for a non-integer power exponent, the absolute values of input array
+elements are used. However, it is possible to get true values for
+negative values using some extra operations. In the example below,
+computing the 5th root of array src shows:
+@code{.cpp}
+    Mat mask = src < 0;
+    pow(src, 1./5, dst);
+    subtract(Scalar::all(0), dst, dst, mask);
+@endcode
+For some values of power, such as integer values, 0.5 and -0.5,
+specialized faster algorithms are used.
+
+Special values (NaN, Inf) are not handled.
+@param src input array.
+@param power exponent of power.
+@param dst output array of the same size and type as src.
+@sa sqrt, exp, log, cartToPolar, polarToCart
+*/
+CV_EXPORTS_W void pow(InputArray src, double power, OutputArray dst);
+
+/** @brief Calculates the exponent of every array element.
+
+The function cv::exp calculates the exponent of every element of the input
+array:
+\f[\texttt{dst} (I) = e^{ \texttt{src} (I) }\f]
+
+The maximum relative error is about 7e-6 for single-precision input and
+less than 1e-10 for double-precision input. Currently, the function
+converts denormalized values to zeros on output. Special values (NaN,
+Inf) are not handled.
+@param src input array.
+@param dst output array of the same size and type as src.
+@sa log , cartToPolar , polarToCart , phase , pow , sqrt , magnitude
+*/
+CV_EXPORTS_W void exp(InputArray src, OutputArray dst);
+
+/** @brief Calculates the natural logarithm of every array element.
+
+The function cv::log calculates the natural logarithm of every element of the input array:
+\f[\texttt{dst} (I) =  \log (\texttt{src}(I)) \f]
+
+Output on zero, negative and special (NaN, Inf) values is undefined.
+
+@param src input array.
+@param dst output array of the same size and type as src .
+@sa exp, cartToPolar, polarToCart, phase, pow, sqrt, magnitude
+*/
+CV_EXPORTS_W void log(InputArray src, OutputArray dst);
+
+/** @brief Calculates x and y coordinates of 2D vectors from their magnitude and angle.
+
+The function cv::polarToCart calculates the Cartesian coordinates of each 2D
+vector represented by the corresponding elements of magnitude and angle:
+\f[\begin{array}{l} \texttt{x} (I) =  \texttt{magnitude} (I) \cos ( \texttt{angle} (I)) \\ \texttt{y} (I) =  \texttt{magnitude} (I) \sin ( \texttt{angle} (I)) \\ \end{array}\f]
+
+The relative accuracy of the estimated coordinates is about 1e-6.
+@param magnitude input floating-point array of magnitudes of 2D vectors;
+it can be an empty matrix (=Mat()), in this case, the function assumes
+that all the magnitudes are =1; if it is not empty, it must have the
+same size and type as angle.
+@param angle input floating-point array of angles of 2D vectors.
+@param x output array of x-coordinates of 2D vectors; it has the same
+size and type as angle.
+@param y output array of y-coordinates of 2D vectors; it has the same
+size and type as angle.
+@param angleInDegrees when true, the input angles are measured in
+degrees, otherwise, they are measured in radians.
+@sa cartToPolar, magnitude, phase, exp, log, pow, sqrt
+*/
+CV_EXPORTS_W void polarToCart(InputArray magnitude, InputArray angle,
+                              OutputArray x, OutputArray y, bool angleInDegrees = false);
+
+/** @brief Calculates the magnitude and angle of 2D vectors.
+
+The function cv::cartToPolar calculates either the magnitude, angle, or both
+for every 2D vector (x(I),y(I)):
+\f[\begin{array}{l} \texttt{magnitude} (I)= \sqrt{\texttt{x}(I)^2+\texttt{y}(I)^2} , \\ \texttt{angle} (I)= \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))[ \cdot180 / \pi ] \end{array}\f]
+
+The angles are calculated with accuracy about 0.3 degrees. For the point
+(0,0), the angle is set to 0.
+@param x array of x-coordinates; this must be a single-precision or
+double-precision floating-point array.
+@param y array of y-coordinates, that must have the same size and same type as x.
+@param magnitude output array of magnitudes of the same size and type as x.
+@param angle output array of angles that has the same size and type as
+x; the angles are measured in radians (from 0 to 2\*Pi) or in degrees (0 to 360 degrees).
+@param angleInDegrees a flag, indicating whether the angles are measured
+in radians (which is by default), or in degrees.
+@sa Sobel, Scharr
+*/
+CV_EXPORTS_W void cartToPolar(InputArray x, InputArray y,
+                              OutputArray magnitude, OutputArray angle,
+                              bool angleInDegrees = false);
+
+/** @brief Calculates the rotation angle of 2D vectors.
+
+The function cv::phase calculates the rotation angle of each 2D vector that
+is formed from the corresponding elements of x and y :
+\f[\texttt{angle} (I) =  \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))\f]
+
+The angle estimation accuracy is about 0.3 degrees. When x(I)=y(I)=0 ,
+the corresponding angle(I) is set to 0.
+@param x input floating-point array of x-coordinates of 2D vectors.
+@param y input array of y-coordinates of 2D vectors; it must have the
+same size and the same type as x.
+@param angle output array of vector angles; it has the same size and
+same type as x .
+@param angleInDegrees when true, the function calculates the angle in
+degrees, otherwise, they are measured in radians.
+*/
+CV_EXPORTS_W void phase(InputArray x, InputArray y, OutputArray angle,
+                        bool angleInDegrees = false);
+
+/** @brief Calculates the magnitude of 2D vectors.
+
+The function cv::magnitude calculates the magnitude of 2D vectors formed
+from the corresponding elements of x and y arrays:
+\f[\texttt{dst} (I) =  \sqrt{\texttt{x}(I)^2 + \texttt{y}(I)^2}\f]
+@param x floating-point array of x-coordinates of the vectors.
+@param y floating-point array of y-coordinates of the vectors; it must
+have the same size as x.
+@param magnitude output array of the same size and type as x.
+@sa cartToPolar, polarToCart, phase, sqrt
+*/
+CV_EXPORTS_W void magnitude(InputArray x, InputArray y, OutputArray magnitude);
+
+/** @brief Checks every element of an input array for invalid values.
+
+The function cv::checkRange checks that every array element is neither NaN nor infinite. When minVal \>
+-DBL_MAX and maxVal \< DBL_MAX, the function also checks that each value is between minVal and
+maxVal. In case of multi-channel arrays, each channel is processed independently. If some values
+are out of range, position of the first outlier is stored in pos (when pos != NULL). Then, the
+function either returns false (when quiet=true) or throws an exception.
+@param a input array.
+@param quiet a flag, indicating whether the functions quietly return false when the array elements
+are out of range or they throw an exception.
+@param pos optional output parameter, when not NULL, must be a pointer to array of src.dims
+elements.
+@param minVal inclusive lower boundary of valid values range.
+@param maxVal exclusive upper boundary of valid values range.
+*/
+CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
+                            double minVal = -DBL_MAX, double maxVal = DBL_MAX);
+
+/** @brief Replaces NaNs with the given number.
+@param a input/output matrix (CV_32F type).
+@param val value that the NaNs are replaced with.
+*/
+CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0);
+
+/** @brief Performs generalized matrix multiplication.
+
+The function cv::gemm performs generalized matrix multiplication similar to the
+gemm functions in BLAS level 3. For example,
+`gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T)`
+corresponds to
+\f[\texttt{dst} =  \texttt{alpha} \cdot \texttt{src1} ^T  \cdot \texttt{src2} +  \texttt{beta} \cdot \texttt{src3} ^T\f]
+
+In case of complex (two-channel) data, a complex matrix multiplication
+is performed.
+
+The function can be replaced with a matrix expression. For example, the
+above call can be replaced with:
+@code{.cpp}
+    dst = alpha*src1.t()*src2 + beta*src3.t();
+@endcode
+@param src1 first multiplied input matrix that could be real(CV_32FC1,
+CV_64FC1) or complex(CV_32FC2, CV_64FC2).
+@param src2 second multiplied input matrix of the same type as src1.
+@param alpha weight of the matrix product.
+@param src3 third optional delta matrix added to the matrix product; it
+should have the same type as src1 and src2.
+@param beta weight of src3.
+@param dst output matrix; it has the proper size and the same type as
+input matrices.
+@param flags operation flags (cv::GemmFlags)
+@sa mulTransposed , transform
+*/
+CV_EXPORTS_W void gemm(InputArray src1, InputArray src2, double alpha,
+                       InputArray src3, double beta, OutputArray dst, int flags = 0);
+
+/** @brief Calculates the product of a matrix and its transposition.
+
+The function cv::mulTransposed calculates the product of src and its
+transposition:
+\f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} )^T ( \texttt{src} - \texttt{delta} )\f]
+if aTa=true , and
+\f[\texttt{dst} = \texttt{scale} ( \texttt{src} - \texttt{delta} ) ( \texttt{src} - \texttt{delta} )^T\f]
+otherwise. The function is used to calculate the covariance matrix. With
+zero delta, it can be used as a faster substitute for general matrix
+product A\*B when B=A'.
+@param src input single-channel matrix. Note that unlike gemm, the
+function can multiply not only floating-point matrices.
+@param dst output square matrix.
+@param aTa Flag specifying the multiplication ordering. See the
+description above.
+@param delta Optional delta matrix subtracted from src before the
+multiplication. When the matrix is empty ( delta=noArray() ), it is
+assumed to be zero, that is, nothing is subtracted. If it has the same
+size as src , it is simply subtracted. Otherwise, it is "repeated" (see
+repeat ) to cover the full src and then subtracted. Type of the delta
+matrix, when it is not empty, must be the same as the type of created
+output matrix. See the dtype parameter description below.
+@param scale Optional scale factor for the matrix product.
+@param dtype Optional type of the output matrix. When it is negative,
+the output matrix will have the same type as src . Otherwise, it will be
+type=CV_MAT_DEPTH(dtype) that should be either CV_32F or CV_64F .
+@sa calcCovarMatrix, gemm, repeat, reduce
+*/
+CV_EXPORTS_W void mulTransposed( InputArray src, OutputArray dst, bool aTa,
+                                 InputArray delta = noArray(),
+                                 double scale = 1, int dtype = -1 );
+
+/** @brief Transposes a matrix.
+
+The function cv::transpose transposes the matrix src :
+\f[\texttt{dst} (i,j) =  \texttt{src} (j,i)\f]
+@note No complex conjugation is done in case of a complex matrix. It
+should be done separately if needed.
+@param src input array.
+@param dst output array of the same type as src.
+*/
+CV_EXPORTS_W void transpose(InputArray src, OutputArray dst);
+
+/** @brief Transpose for n-dimensional matrices.
+ *
+ * @note Input should be continuous single-channel matrix.
+ * @param src input array.
+ * @param order a permutation of [0,1,..,N-1] where N is the number of axes of src.
+ * The i’th axis of dst will correspond to the axis numbered order[i] of the input.
+ * @param dst output array of the same type as src.
+ */
+CV_EXPORTS_W void transposeND(InputArray src, const std::vector<int>& order, OutputArray dst);
+
+/** @brief Performs the matrix transformation of every array element.
+
+The function cv::transform performs the matrix transformation of every
+element of the array src and stores the results in dst :
+\f[\texttt{dst} (I) =  \texttt{m} \cdot \texttt{src} (I)\f]
+(when m.cols=src.channels() ), or
+\f[\texttt{dst} (I) =  \texttt{m} \cdot [ \texttt{src} (I); 1]\f]
+(when m.cols=src.channels()+1 )
+
+Every element of the N -channel array src is interpreted as N -element
+vector that is transformed using the M x N or M x (N+1) matrix m to
+M-element vector - the corresponding element of the output array dst .
+
+The function may be used for geometrical transformation of
+N -dimensional points, arbitrary linear color space transformation (such
+as various kinds of RGB to YUV transforms), shuffling the image
+channels, and so forth.
+@param src input array that must have as many channels (1 to 4) as
+m.cols or m.cols-1.
+@param dst output array of the same size and depth as src; it has as
+many channels as m.rows.
+@param m transformation M x N or M x (N+1) floating-point matrix, where
+N is the number of channels of src (for example, 2x2 or 2x3 for
+two-channel input).
+@sa perspectiveTransform, getAffineTransform, estimateAffine2D, warpAffine, warpPerspective
+*/
+CV_EXPORTS_W void transform(InputArray src, OutputArray dst, InputArray m );
+
+/** @brief Performs the perspective matrix transformation of vectors.
+
+The function cv::perspectiveTransform transforms every element of src by
+treating it as a 2D or 3D vector, in the following way:
+\f[(x, y, z)  \rightarrow (x'/w, y'/w, z'/w)\f]
+where
+\f[(x', y', z', w') =  \texttt{mat} \cdot \begin{bmatrix} x & y & z & 1  \end{bmatrix}\f]
+and
+\f[w =  \fork{w'}{if \(w' \ne 0\)}{\infty}{otherwise}\f]
+
+Here a 3D vector transformation is shown. In case of a 2D vector
+transformation, the z component is omitted.
+
+@note The function transforms a sparse set of 2D or 3D vectors. If you
+want to transform an image using perspective transformation, use
+warpPerspective . If you have an inverse problem, that is, you want to
+compute the most probable perspective transformation out of several
+pairs of corresponding points, you can use getPerspectiveTransform or
+findHomography .
+@param src input two-channel or three-channel floating-point array; each
+element is a 2D/3D vector to be transformed.
+@param dst output array of the same size and type as src.
+@param m 3x3 or 4x4 floating-point transformation matrix.
+@sa  transform, warpPerspective, getPerspectiveTransform, findHomography
+*/
+CV_EXPORTS_W void perspectiveTransform(InputArray src, OutputArray dst, InputArray m );
+
+/** @brief Copies the lower or the upper half of a square matrix to its other half.
+
+The function cv::completeSymm copies the lower or the upper half of a square matrix to
+its other half. The matrix diagonal remains unchanged:
+ - \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i > j\f$ if
+    lowerToUpper=false
+ - \f$\texttt{m}_{ij}=\texttt{m}_{ji}\f$ for \f$i < j\f$ if
+    lowerToUpper=true
+
+@param m input-output floating-point square matrix.
+@param lowerToUpper operation flag; if true, the lower half is copied to
+the upper half. Otherwise, the upper half is copied to the lower half.
+@sa flip, transpose
+*/
+CV_EXPORTS_W void completeSymm(InputOutputArray m, bool lowerToUpper = false);
+
+/** @brief Initializes a scaled identity matrix.
+
+The function cv::setIdentity initializes a scaled identity matrix:
+\f[\texttt{mtx} (i,j)= \fork{\texttt{s}}{ if \(i=j\)}{0}{otherwise}\f]
+
+The function can also be emulated using the matrix initializers and the
+matrix expressions:
+@code
+    Mat A = Mat::eye(4, 3, CV_32F)*5;
+    // A will be set to [[5, 0, 0], [0, 5, 0], [0, 0, 5], [0, 0, 0]]
+@endcode
+@param mtx matrix to initialize (not necessarily square).
+@param s value to assign to diagonal elements.
+@sa Mat::zeros, Mat::ones, Mat::setTo, Mat::operator=
+*/
+CV_EXPORTS_W void setIdentity(InputOutputArray mtx, const Scalar& s = Scalar(1));
+
+/** @brief Returns the determinant of a square floating-point matrix.
+
+The function cv::determinant calculates and returns the determinant of the
+specified matrix. For small matrices ( mtx.cols=mtx.rows\<=3 ), the
+direct method is used. For larger matrices, the function uses LU
+factorization with partial pivoting.
+
+For symmetric positive-definite matrices, it is also possible to use
+eigen decomposition to calculate the determinant.
+@param mtx input matrix that must have CV_32FC1 or CV_64FC1 type and
+square size.
+@sa trace, invert, solve, eigen, @ref MatrixExpressions
+*/
+CV_EXPORTS_W double determinant(InputArray mtx);
+
+/** @brief Returns the trace of a matrix.
+
+The function cv::trace returns the sum of the diagonal elements of the
+matrix mtx .
+\f[\mathrm{tr} ( \texttt{mtx} ) =  \sum _i  \texttt{mtx} (i,i)\f]
+@param mtx input matrix.
+*/
+CV_EXPORTS_W Scalar trace(InputArray mtx);
+
+/** @brief Finds the inverse or pseudo-inverse of a matrix.
+
+The function cv::invert inverts the matrix src and stores the result in
+dst. When the matrix src is singular or non-square, the function calculates
+the pseudo-inverse matrix (the dst matrix) so that norm(src\*dst - I) is
+minimal, where I is an identity matrix.
+
+In case of the #DECOMP_LU method, the function returns non-zero value if
+the inverse has been successfully calculated and 0 if src is singular.
+
+In case of the #DECOMP_SVD method, the function returns the inverse
+condition number of src (the ratio of the smallest singular value to the
+largest singular value) and 0 if src is singular. The SVD method
+calculates a pseudo-inverse matrix if src is singular.
+
+Similarly to #DECOMP_LU, the method #DECOMP_CHOLESKY works only with
+non-singular square matrices that should also be symmetric and
+positive definite. In this case, the function stores the inverted
+matrix in dst and returns non-zero. Otherwise, it returns 0.
+
+@param src input floating-point M x N matrix.
+@param dst output matrix of N x M size and the same type as src.
+@param flags inversion method (cv::DecompTypes)
+@sa solve, SVD
+*/
+CV_EXPORTS_W double invert(InputArray src, OutputArray dst, int flags = DECOMP_LU);
+
+/** @brief Solves one or more linear systems or least-squares problems.
+
+The function cv::solve solves a linear system or least-squares problem (the
+latter is possible with SVD or QR methods, or by specifying the flag
+#DECOMP_NORMAL ):
+\f[\texttt{dst} =  \arg \min _X \| \texttt{src1} \cdot \texttt{X} -  \texttt{src2} \|\f]
+
+If #DECOMP_LU or #DECOMP_CHOLESKY method is used, the function returns 1
+if src1 (or \f$\texttt{src1}^T\texttt{src1}\f$ ) is non-singular. Otherwise,
+it returns 0. In the latter case, dst is not valid. Other methods find a
+pseudo-solution in case of a singular left-hand side part.
+
+@note If you want to find a unity-norm solution of an under-defined
+singular system \f$\texttt{src1}\cdot\texttt{dst}=0\f$ , the function solve
+will not do the work. Use SVD::solveZ instead.
+
+@param src1 input matrix on the left-hand side of the system.
+@param src2 input matrix on the right-hand side of the system.
+@param dst output solution.
+@param flags solution (matrix inversion) method (#DecompTypes)
+@sa invert, SVD, eigen
+*/
+CV_EXPORTS_W bool solve(InputArray src1, InputArray src2,
+                        OutputArray dst, int flags = DECOMP_LU);
+
+/** @brief Sorts each row or each column of a matrix.
+
+The function cv::sort sorts each matrix row or each matrix column in
+ascending or descending order. So you should pass two operation flags to
+get desired behaviour. If you want to sort matrix rows or columns
+lexicographically, you can use STL std::sort generic function with the
+proper comparison predicate.
+
+@param src input single-channel array.
+@param dst output array of the same size and type as src.
+@param flags operation flags, a combination of #SortFlags
+@sa sortIdx, randShuffle
+*/
+CV_EXPORTS_W void sort(InputArray src, OutputArray dst, int flags);
+
+/** @brief Sorts each row or each column of a matrix.
+
+The function cv::sortIdx sorts each matrix row or each matrix column in the
+ascending or descending order. So you should pass two operation flags to
+get desired behaviour. Instead of reordering the elements themselves, it
+stores the indices of sorted elements in the output array. For example:
+@code
+    Mat A = Mat::eye(3,3,CV_32F), B;
+    sortIdx(A, B, SORT_EVERY_ROW + SORT_ASCENDING);
+    // B will probably contain
+    // (because of equal elements in A some permutations are possible):
+    // [[1, 2, 0], [0, 2, 1], [0, 1, 2]]
+@endcode
+@param src input single-channel array.
+@param dst output integer array of the same size as src.
+@param flags operation flags that could be a combination of cv::SortFlags
+@sa sort, randShuffle
+*/
+CV_EXPORTS_W void sortIdx(InputArray src, OutputArray dst, int flags);
+
+/** @brief Finds the real roots of a cubic equation.
+
+The function solveCubic finds the real roots of a cubic equation:
+-   if coeffs is a 4-element vector:
+\f[\texttt{coeffs} [0] x^3 +  \texttt{coeffs} [1] x^2 +  \texttt{coeffs} [2] x +  \texttt{coeffs} [3] = 0\f]
+-   if coeffs is a 3-element vector:
+\f[x^3 +  \texttt{coeffs} [0] x^2 +  \texttt{coeffs} [1] x +  \texttt{coeffs} [2] = 0\f]
+
+The roots are stored in the roots array.
+@param coeffs equation coefficients, an array of 3 or 4 elements.
+@param roots output array of real roots that has 1 or 3 elements.
+@return number of real roots. It can be 0, 1 or 2.
+
+@note NOTE(review): a cubic with a non-zero leading coefficient always has
+between 1 and 3 real roots, so the listed return values (0, 1 or 2) look
+incomplete and disagree with the "1 or 3 elements" statement for roots.
+Confirm the exact set of return values against the implementation before
+relying on it.
+*/
+CV_EXPORTS_W int solveCubic(InputArray coeffs, OutputArray roots);
+
+/** @brief Finds the real or complex roots of a polynomial equation.
+
+The function cv::solvePoly finds real and complex roots of a polynomial equation:
+\f[\texttt{coeffs} [n] x^{n} +  \texttt{coeffs} [n-1] x^{n-1} + ... +  \texttt{coeffs} [1] x +  \texttt{coeffs} [0] = 0\f]
+@param coeffs array of polynomial coefficients.
+@param roots output (complex) array of roots.
+@param maxIters maximum number of iterations the algorithm does.
+*/
+CV_EXPORTS_W double solvePoly(InputArray coeffs, OutputArray roots, int maxIters = 300);
+
+/** @brief Calculates eigenvalues and eigenvectors of a symmetric matrix.
+
+The function cv::eigen calculates just eigenvalues, or eigenvalues and eigenvectors of the symmetric
+matrix src:
+@code
+    src*eigenvectors.row(i).t() = eigenvalues.at<srcType>(i)*eigenvectors.row(i).t()
+@endcode
+
+@note Use cv::eigenNonSymmetric for calculation of real eigenvalues and eigenvectors of non-symmetric matrix.
+
+@param src input matrix that must have CV_32FC1 or CV_64FC1 type, square size and be symmetrical
+(src ^T^ == src).
+@param eigenvalues output vector of eigenvalues of the same type as src; the eigenvalues are stored
+in the descending order.
+@param eigenvectors output matrix of eigenvectors; it has the same size and type as src; the
+eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding
+eigenvalues.
+@sa eigenNonSymmetric, completeSymm , PCA
+*/
+CV_EXPORTS_W bool eigen(InputArray src, OutputArray eigenvalues,
+                        OutputArray eigenvectors = noArray());
+
+/** @brief Calculates eigenvalues and eigenvectors of a non-symmetric matrix (real eigenvalues only).
+
+@note Assumes real eigenvalues.
+
+The function calculates eigenvalues and eigenvectors (optional) of the square matrix src:
+@code
+    src*eigenvectors.row(i).t() = eigenvalues.at<srcType>(i)*eigenvectors.row(i).t()
+@endcode
+
+@param src input matrix (CV_32FC1 or CV_64FC1 type).
+@param eigenvalues output vector of eigenvalues (type is the same type as src).
+@param eigenvectors output matrix of eigenvectors (type is the same type as src). The eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding eigenvalues.
+@sa eigen
+*/
+CV_EXPORTS_W void eigenNonSymmetric(InputArray src, OutputArray eigenvalues,
+                                    OutputArray eigenvectors);
+
+/** @brief Calculates the covariance matrix of a set of vectors.
+
+The function cv::calcCovarMatrix calculates the covariance matrix and, optionally, the mean vector of
+the set of input vectors.
+@param samples samples stored as separate matrices
+@param nsamples number of samples
+@param covar output covariance matrix of the type ctype and square size.
+@param mean input or output (depending on the flags) array as the average value of the input vectors.
+@param flags operation flags as a combination of #CovarFlags
+@param ctype type of the matrix; it equals 'CV_64F' by default.
+@sa PCA, mulTransposed, Mahalanobis
+@todo InputArrayOfArrays
+*/
+CV_EXPORTS void calcCovarMatrix( const Mat* samples, int nsamples, Mat& covar, Mat& mean,
+                                 int flags, int ctype = CV_64F);
+
+/** @overload
+@note use #COVAR_ROWS or #COVAR_COLS flag
+@param samples samples stored as rows/columns of a single matrix.
+@param covar output covariance matrix of the type ctype and square size.
+@param mean input or output (depending on the flags) array as the average value of the input vectors.
+@param flags operation flags as a combination of #CovarFlags
+@param ctype type of the matrix; it equals 'CV_64F' by default.
+*/
+CV_EXPORTS_W void calcCovarMatrix( InputArray samples, OutputArray covar,
+                                   InputOutputArray mean, int flags, int ctype = CV_64F);
+
+/** wrap PCA::operator() */
+CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
+                             OutputArray eigenvectors, int maxComponents = 0);
+
+/** wrap PCA::operator() and add eigenvalues output parameter */
+CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
+                                           OutputArray eigenvectors, OutputArray eigenvalues,
+                                           int maxComponents = 0);
+
+/** wrap PCA::operator() */
+CV_EXPORTS_W void PCACompute(InputArray data, InputOutputArray mean,
+                             OutputArray eigenvectors, double retainedVariance);
+
+/** wrap PCA::operator() and add eigenvalues output parameter */
+CV_EXPORTS_AS(PCACompute2) void PCACompute(InputArray data, InputOutputArray mean,
+                                           OutputArray eigenvectors, OutputArray eigenvalues,
+                                           double retainedVariance);
+
+/** wrap PCA::project */
+CV_EXPORTS_W void PCAProject(InputArray data, InputArray mean,
+                             InputArray eigenvectors, OutputArray result);
+
+/** wrap PCA::backProject */
+CV_EXPORTS_W void PCABackProject(InputArray data, InputArray mean,
+                                 InputArray eigenvectors, OutputArray result);
+
+/** wrap SVD::compute */
+CV_EXPORTS_W void SVDecomp( InputArray src, OutputArray w, OutputArray u, OutputArray vt, int flags = 0 );
+
+/** wrap SVD::backSubst */
+CV_EXPORTS_W void SVBackSubst( InputArray w, InputArray u, InputArray vt,
+                               InputArray rhs, OutputArray dst );
+
+/** @brief Calculates the Mahalanobis distance between two vectors.
+
+The function cv::Mahalanobis calculates and returns the weighted distance between two vectors:
+\f[d( \texttt{vec1} , \texttt{vec2} )= \sqrt{\sum_{i,j}{\texttt{icovar(i,j)}\cdot(\texttt{vec1}(i)-\texttt{vec2}(i))\cdot(\texttt{vec1(j)}-\texttt{vec2(j)})} }\f]
+where vec1 and vec2 denote the input vectors v1 and v2.
+The covariance matrix may be calculated using the #calcCovarMatrix function and then inverted using
+the invert function (preferably using the #DECOMP_SVD method, as the most accurate).
+@param v1 first 1D input vector.
+@param v2 second 1D input vector.
+@param icovar inverse covariance matrix.
+*/
+CV_EXPORTS_W double Mahalanobis(InputArray v1, InputArray v2, InputArray icovar);
+
+/** @brief Performs a forward or inverse Discrete Fourier transform of a 1D or 2D floating-point array.
+
+The function cv::dft performs one of the following:
+-   Forward the Fourier transform of a 1D vector of N elements:
+    \f[Y = F^{(N)}  \cdot X,\f]
+    where \f$F^{(N)}_{jk}=\exp(-2\pi i j k/N)\f$ and \f$i=\sqrt{-1}\f$
+-   Inverse the Fourier transform of a 1D vector of N elements:
+    \f[\begin{array}{l} X'=  \left (F^{(N)} \right )^*  \cdot Y  \\ X = (1/N)  \cdot X', \end{array}\f]
+    where \f$F^*=\left(\textrm{Re}(F^{(N)})-i\,\textrm{Im}(F^{(N)})\right)^T\f$ is the conjugate transpose of \f$F^{(N)}\f$, so that \f$\left(F^{(N)}\right)^{-1} = (1/N) \cdot \left(F^{(N)}\right)^*\f$
+-   Forward the 2D Fourier transform of a M x N matrix:
+    \f[Y = F^{(M)}  \cdot X  \cdot F^{(N)}\f]
+-   Inverse the 2D Fourier transform of a M x N matrix:
+    \f[\begin{array}{l} X'=  \left (F^{(M)} \right )^*  \cdot Y  \cdot \left (F^{(N)} \right )^* \\ X =  \frac{1}{M \cdot N} \cdot X' \end{array}\f]
+
+In case of real (single-channel) data, the output spectrum of the forward Fourier transform or input
+spectrum of the inverse Fourier transform can be represented in a packed format called *CCS*
+(complex-conjugate-symmetrical). It was borrowed from IPL (Intel\* Image Processing Library). Here
+is how 2D *CCS* spectrum looks:
+\f[\begin{bmatrix} Re Y_{0,0} & Re Y_{0,1} & Im Y_{0,1} & Re Y_{0,2} & Im Y_{0,2} &  \cdots & Re Y_{0,N/2-1} & Im Y_{0,N/2-1} & Re Y_{0,N/2}  \\ Re Y_{1,0} & Re Y_{1,1} & Im Y_{1,1} & Re Y_{1,2} & Im Y_{1,2} &  \cdots & Re Y_{1,N/2-1} & Im Y_{1,N/2-1} & Re Y_{1,N/2}  \\ Im Y_{1,0} & Re Y_{2,1} & Im Y_{2,1} & Re Y_{2,2} & Im Y_{2,2} &  \cdots & Re Y_{2,N/2-1} & Im Y_{2,N/2-1} & Im Y_{1,N/2}  \\ \hdotsfor{9} \\ Re Y_{M/2-1,0} &  Re Y_{M-3,1}  & Im Y_{M-3,1} &  \hdotsfor{3} & Re Y_{M-3,N/2-1} & Im Y_{M-3,N/2-1}& Re Y_{M/2-1,N/2}  \\ Im Y_{M/2-1,0} &  Re Y_{M-2,1}  & Im Y_{M-2,1} &  \hdotsfor{3} & Re Y_{M-2,N/2-1} & Im Y_{M-2,N/2-1}& Im Y_{M/2-1,N/2}  \\ Re Y_{M/2,0}  &  Re Y_{M-1,1} &  Im Y_{M-1,1} &  \hdotsfor{3} & Re Y_{M-1,N/2-1} & Im Y_{M-1,N/2-1}& Re Y_{M/2,N/2} \end{bmatrix}\f]
+
+In case of 1D transform of a real vector, the output looks like the first row of the matrix above.
+
+So, the function chooses an operation mode depending on the flags and size of the input array:
+-   If #DFT_ROWS is set or the input array has a single row or single column, the function
+    performs a 1D forward or inverse transform of each row of a matrix when #DFT_ROWS is set.
+    Otherwise, it performs a 2D transform.
+-   If the input array is real and #DFT_INVERSE is not set, the function performs a forward 1D or
+    2D transform:
+    -   When #DFT_COMPLEX_OUTPUT is set, the output is a complex matrix of the same size as
+        input.
+    -   When #DFT_COMPLEX_OUTPUT is not set, the output is a real matrix of the same size as
+        input. In case of 2D transform, it uses the packed format as shown above. In case of a
+        single 1D transform, it looks like the first row of the matrix above. In case of
+        multiple 1D transforms (when using the #DFT_ROWS flag), each row of the output matrix
+        looks like the first row of the matrix above.
+-   If the input array is complex and either #DFT_INVERSE or #DFT_REAL_OUTPUT is not set, the
+    output is a complex array of the same size as input. The function performs a forward or
+    inverse 1D or 2D transform of the whole input array or each row of the input array
+    independently, depending on the flags #DFT_INVERSE and #DFT_ROWS.
+-   When #DFT_INVERSE is set and the input array is real, or it is complex but #DFT_REAL_OUTPUT
+    is set, the output is a real array of the same size as input. The function performs a 1D or 2D
+    inverse transformation of the whole input array or each individual row, depending on the flags
+    #DFT_INVERSE and #DFT_ROWS.
+
+If #DFT_SCALE is set, the scaling is done after the transformation.
+
+Unlike dct , the function supports arrays of arbitrary size. But only those arrays are processed
+efficiently, whose sizes can be factorized in a product of small prime numbers (2, 3, and 5 in the
+current implementation). Such an efficient DFT size can be calculated using the getOptimalDFTSize
+method.
+
+The sample below illustrates how to calculate a DFT-based convolution of two 2D real arrays:
+@code
+    void convolveDFT(InputArray A, InputArray B, OutputArray C)
+    {
+        // reallocate the output array if needed
+        C.create(abs(A.rows - B.rows)+1, abs(A.cols - B.cols)+1, A.type());
+        Size dftSize;
+        // calculate the size of DFT transform
+        dftSize.width = getOptimalDFTSize(A.cols + B.cols - 1);
+        dftSize.height = getOptimalDFTSize(A.rows + B.rows - 1);
+
+        // allocate temporary buffers and initialize them with 0's
+        Mat tempA(dftSize, A.type(), Scalar::all(0));
+        Mat tempB(dftSize, B.type(), Scalar::all(0));
+
+        // copy A and B to the top-left corners of tempA and tempB, respectively
+        Mat roiA(tempA, Rect(0,0,A.cols,A.rows));
+        A.copyTo(roiA);
+        Mat roiB(tempB, Rect(0,0,B.cols,B.rows));
+        B.copyTo(roiB);
+
+        // now transform the padded A & B in-place;
+        // use "nonzeroRows" hint for faster processing
+        dft(tempA, tempA, 0, A.rows);
+        dft(tempB, tempB, 0, B.rows);
+
+        // multiply the spectrums;
+        // the function handles packed spectrum representations well
+        mulSpectrums(tempA, tempB, tempA);
+
+        // transform the product back from the frequency domain.
+        // Even though all the result rows will be non-zero,
+        // you need only the first C.rows of them, and thus you
+        // pass nonzeroRows == C.rows
+        dft(tempA, tempA, DFT_INVERSE + DFT_SCALE, C.rows);
+
+        // now copy the result back to C.
+        tempA(Rect(0, 0, C.cols, C.rows)).copyTo(C);
+
+        // all the temporary buffers will be deallocated automatically
+    }
+@endcode
+To optimize this sample, consider the following approaches:
+-   Since nonzeroRows != 0 is passed to the forward transform calls and since A and B are copied to
+    the top-left corners of tempA and tempB, respectively, it is not necessary to clear the whole
+    tempA and tempB. It is only necessary to clear the tempA.cols - A.cols ( tempB.cols - B.cols)
+    rightmost columns of the matrices.
+-   This DFT-based convolution does not have to be applied to the whole big arrays, especially if B
+    is significantly smaller than A or vice versa. Instead, you can calculate convolution by parts.
+    To do this, you need to split the output array C into multiple tiles. For each tile, estimate
+    which parts of A and B are required to calculate convolution in this tile. If the tiles in C are
+    too small, the speed will decrease a lot because of repeated work. In the ultimate case, when
+    each tile in C is a single pixel, the algorithm becomes equivalent to the naive convolution
+    algorithm. If the tiles are too big, the temporary arrays tempA and tempB become too big and
+    there is also a slowdown because of bad cache locality. So, there is an optimal tile size
+    somewhere in the middle.
+-   If different tiles in C can be calculated in parallel and, thus, the convolution is done by
+    parts, the loop can be threaded.
+
+All of the above improvements have been implemented in #matchTemplate and #filter2D . Therefore, by
+using them, you can get the performance even better than with the above theoretically optimal
+implementation. Though, those two functions actually calculate cross-correlation, not convolution,
+so you need to "flip" the second convolution operand B vertically and horizontally using flip .
+@note
+-   An example using the discrete fourier transform can be found at
+    opencv_source_code/samples/cpp/dft.cpp
+-   (Python) An example using the dft functionality to perform Wiener deconvolution can be found
+    at opencv_source/samples/python/deconvolution.py
+-   (Python) An example rearranging the quadrants of a Fourier image can be found at
+    opencv_source/samples/python/dft.py
+@param src input array that could be real or complex.
+@param dst output array whose size and type depends on the flags .
+@param flags transformation flags, representing a combination of the #DftFlags
+@param nonzeroRows when the parameter is not zero, the function assumes that only the first
+nonzeroRows rows of the input array (#DFT_INVERSE is not set) or only the first nonzeroRows of the
+output array (#DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the
+rows more efficiently and save some time; this technique is very useful for calculating array
+cross-correlation or convolution using DFT.
+@sa dct , getOptimalDFTSize , mulSpectrums, filter2D , matchTemplate , flip , cartToPolar ,
+magnitude , phase
+*/
+CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
+
+/** @brief Calculates the inverse Discrete Fourier Transform of a 1D or 2D array.
+
+idft(src, dst, flags) is equivalent to dft(src, dst, flags | #DFT_INVERSE) .
+@note Neither dft nor idft scales the result by default. So, you should pass #DFT_SCALE to one of
+dft or idft explicitly to make these transforms mutually inverse.
+@sa dft, dct, idct, mulSpectrums, getOptimalDFTSize
+@param src input floating-point real or complex array.
+@param dst output array whose size and type depend on the flags.
+@param flags operation flags (see dft and #DftFlags).
+@param nonzeroRows number of dst rows to process; the rest of the rows have undefined content (see
+the convolution sample in the dft description).
+*/
+CV_EXPORTS_W void idft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0);
+
+/** @brief Performs a forward or inverse discrete Cosine transform of 1D or 2D array.
+
+The function cv::dct performs a forward or inverse discrete Cosine transform (DCT) of a 1D or 2D
+floating-point array:
+-   Forward Cosine transform of a 1D vector of N elements:
+    \f[Y = C^{(N)}  \cdot X\f]
+    where
+    \f[C^{(N)}_{jk}= \sqrt{\alpha_j/N} \cos \left ( \frac{\pi(2k+1)j}{2N} \right )\f]
+    and
+    \f$\alpha_0=1\f$, \f$\alpha_j=2\f$ for *j \> 0*.
+-   Inverse Cosine transform of a 1D vector of N elements:
+    \f[X =  \left (C^{(N)} \right )^{-1}  \cdot Y =  \left (C^{(N)} \right )^T  \cdot Y\f]
+    (since \f$C^{(N)}\f$ is an orthogonal matrix, \f$C^{(N)} \cdot \left(C^{(N)}\right)^T = I\f$ )
+-   Forward 2D Cosine transform of M x N matrix:
+    \f[Y = C^{(M)}  \cdot X  \cdot \left (C^{(N)} \right )^T\f]
+-   Inverse 2D Cosine transform of M x N matrix:
+    \f[X =  \left (C^{(M)} \right )^T  \cdot Y  \cdot C^{(N)}\f]
+
+The function chooses the mode of operation by looking at the flags and size of the input array:
+-   If (flags & #DCT_INVERSE) == 0 , the function does a forward 1D or 2D transform. Otherwise, it
+    is an inverse 1D or 2D transform.
+-   If (flags & #DCT_ROWS) != 0 , the function performs a 1D transform of each row.
+-   If the array is a single column or a single row, the function performs a 1D transform.
+-   If none of the above is true, the function performs a 2D transform.
+
+@note Currently dct supports even-size arrays (2, 4, 6 ...). For data analysis and approximation, you
+can pad the array when necessary.
+Also, the function performance depends very much, and not monotonically, on the array size (see
+getOptimalDFTSize ). In the current implementation DCT of a vector of size N is calculated via DFT
+of a vector of size N/2 . Thus, the optimal DCT size N1 \>= N can be calculated as:
+@code
+    size_t getOptimalDCTSize(size_t N) { return 2*getOptimalDFTSize((N+1)/2); }
+    N1 = getOptimalDCTSize(N);
+@endcode
+@param src input floating-point array.
+@param dst output array of the same size and type as src .
+@param flags transformation flags as a combination of cv::DftFlags (DCT_*)
+@sa dft , getOptimalDFTSize , idct
+*/
+CV_EXPORTS_W void dct(InputArray src, OutputArray dst, int flags = 0);
+
+/** @brief Calculates the inverse Discrete Cosine Transform of a 1D or 2D array.
+
+idct(src, dst, flags) is equivalent to dct(src, dst, flags | #DCT_INVERSE).
+@param src input floating-point single-channel array.
+@param dst output array of the same size and type as src.
+@param flags operation flags (see dct and cv::DftFlags, DCT_*).
+@sa  dct, dft, idft, getOptimalDFTSize
+*/
+CV_EXPORTS_W void idct(InputArray src, OutputArray dst, int flags = 0);
+
+/** @brief Performs the per-element multiplication of two Fourier spectrums.
+
+The function cv::mulSpectrums performs the per-element multiplication of the two CCS-packed or complex
+matrices that are results of a real or complex Fourier transform.
+
+The function, together with dft and idft , may be used to calculate convolution (pass conjB=false )
+or correlation (pass conjB=true ) of two arrays rapidly. When the arrays are complex, they are
+simply multiplied (per element) with an optional conjugation of the second-array elements. When the
+arrays are real, they are assumed to be CCS-packed (see dft for details).
+@param a first input array.
+@param b second input array of the same size and type as a .
+@param c output array of the same size and type as a .
+@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
+each row of a and b is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply pass `0` as the value.
+@param conjB optional flag that conjugates the second input array before the multiplication (true)
+or not (false).
+*/
+CV_EXPORTS_W void mulSpectrums(InputArray a, InputArray b, OutputArray c,
+                               int flags, bool conjB = false);
+
+/** @brief Returns the optimal DFT size for a given vector size.
+
+DFT performance is not a monotonic function of a vector size. Therefore, when you calculate
+convolution of two arrays or perform the spectral analysis of an array, it usually makes sense to
+pad the input data with zeros to get a bit larger array that can be transformed much faster than the
+original one. Arrays whose size is a power-of-two (2, 4, 8, 16, 32, ...) are the fastest to process.
+Though, the arrays whose size is a product of 2's, 3's, and 5's (for example, 300 = 5\*5\*3\*2\*2)
+are also processed quite efficiently.
+
+The function cv::getOptimalDFTSize returns the minimum number N that is greater than or equal to vecsize
+so that the DFT of a vector of size N can be processed efficiently. In the current implementation N
+= 2 ^p^ \* 3 ^q^ \* 5 ^r^ for some integer p, q, r.
+
+The function returns a negative number if vecsize is too large (very close to INT_MAX ).
+
+While the function cannot be used directly to estimate the optimal vector size for DCT transform
+(since the current DCT implementation supports only even-size vectors), the optimal DCT size can be
+easily computed as getOptimalDFTSize((vecsize+1)/2)\*2.
+@param vecsize vector size.
+@return the minimum efficient size N \>= vecsize, or a negative number if vecsize is too large.
+@sa dft , dct , idft , idct , mulSpectrums
+*/
+CV_EXPORTS_W int getOptimalDFTSize(int vecsize);
+
+/** @brief Returns the default random number generator.
+
+The function cv::theRNG returns the default random number generator. For each thread, there is a
+separate random number generator, so you can use the function safely in multi-thread environments.
+If you just need to get a single random number using this generator or initialize an array, you can
+use randu or randn instead. But if you are going to generate many random numbers inside a loop, it
+is much faster to use this function to retrieve the generator and then use RNG::operator _Tp() .
+@return reference to the default random number generator of the calling thread.
+@sa RNG, randu, randn
+*/
+CV_EXPORTS RNG& theRNG();
+
+/** @brief Sets the state of the default random number generator.
+
+The function cv::setRNGSeed sets the state of the default random number generator to a custom value.
+@param seed new state for the default random number generator
+@sa RNG, randu, randn
+*/
+CV_EXPORTS_W void setRNGSeed(int seed);
+
+/** @brief Generates a single uniformly-distributed random number or an array of random numbers.
+
+The non-template variant of the function fills the matrix dst with uniformly-distributed
+random numbers from the specified range:
+\f[\texttt{low} _c  \leq \texttt{dst} (I)_c <  \texttt{high} _c\f]
+@param dst output array of random numbers; the array must be pre-allocated.
+@param low inclusive lower boundary of the generated random numbers.
+@param high exclusive upper boundary of the generated random numbers.
+@sa RNG, randn, theRNG
+*/
+CV_EXPORTS_W void randu(InputOutputArray dst, InputArray low, InputArray high);
+
+/** @brief Fills the array with normally distributed random numbers.
+
+The function cv::randn fills the matrix dst with normally distributed random numbers with the specified
+mean vector and the standard deviation matrix. The generated random numbers are clipped to fit the
+value range of the output array data type.
+@param dst output array of random numbers; the array must be pre-allocated and have 1 to 4 channels.
+@param mean mean value (expectation) of the generated random numbers.
+@param stddev standard deviation of the generated random numbers; it can be either a vector (in
+which case a diagonal standard deviation matrix is assumed) or a square matrix.
+@sa RNG, randu, theRNG
+*/
+CV_EXPORTS_W void randn(InputOutputArray dst, InputArray mean, InputArray stddev);
+
+/** @brief Shuffles the array elements randomly.
+
+The function cv::randShuffle shuffles the specified 1D array by randomly choosing pairs of elements and
+swapping them. The number of such swap operations will be dst.rows\*dst.cols\*iterFactor .
+@param dst input/output numerical 1D array.
+@param iterFactor scale factor that determines the number of random swap operations (see the details
+above).
+@param rng optional random number generator used for shuffling; if it is zero, theRNG() is used
+instead.
+@sa RNG, sort
+*/
+CV_EXPORTS_W void randShuffle(InputOutputArray dst, double iterFactor = 1., RNG* rng = 0);
+
+/** @brief Principal Component Analysis
+
+The class is used to calculate a special basis for a set of vectors. The
+basis will consist of eigenvectors of the covariance matrix calculated
+from the input set of vectors. The class %PCA can also transform
+vectors to/from the new coordinate space defined by the basis. Usually,
+in this new coordinate system, each vector from the original set (and
+any linear combination of such vectors) can be quite accurately
+approximated by taking its first few components, corresponding to the
+eigenvectors of the largest eigenvalues of the covariance matrix.
+Geometrically it means that you calculate a projection of the vector to
+a subspace formed by a few eigenvectors corresponding to the dominant
+eigenvalues of the covariance matrix. And usually such a projection is
+very close to the original vector. So, you can represent the original
+vector from a high-dimensional space with a much shorter vector
+consisting of the projected vector's coordinates in the subspace. Such a
+transformation is also known as Karhunen-Loeve Transform, or KLT.
+See http://en.wikipedia.org/wiki/Principal_component_analysis
+
+The sample below is a function that takes two matrices. The first
+matrix stores a set of vectors (a row per vector) that is used to
+calculate PCA. The second matrix stores another "test" set of vectors
+(a row per vector). First, these vectors are compressed with PCA, then
+reconstructed back, and then the reconstruction error norm is computed
+and printed for each vector:
+
+@code{.cpp}
+using namespace cv;
+
+PCA compressPCA(const Mat& pcaset, int maxComponents,
+                const Mat& testset, Mat& compressed)
+{
+    PCA pca(pcaset, // pass the data
+            Mat(), // we do not have a pre-computed mean vector,
+                   // so let the PCA engine to compute it
+            PCA::DATA_AS_ROW, // indicate that the vectors
+                                // are stored as matrix rows
+                                // (use PCA::DATA_AS_COL if the vectors are
+                                // the matrix columns)
+            maxComponents // specify, how many principal components to retain
+            );
+    // if there is no test data, just return the computed basis, ready-to-use
+    if( !testset.data )
+        return pca;
+    CV_Assert( testset.cols == pcaset.cols );
+
+    compressed.create(testset.rows, maxComponents, testset.type());
+
+    Mat reconstructed;
+    for( int i = 0; i < testset.rows; i++ )
+    {
+        Mat vec = testset.row(i), coeffs = compressed.row(i);
+        // compress the vector, the result will be stored
+        // in the i-th row of the output matrix
+        pca.project(vec, coeffs);
+        // and then reconstruct it
+        pca.backProject(coeffs, reconstructed);
+        // and measure the error
+        printf("%d. diff = %g\n", i, norm(vec, reconstructed, NORM_L2));
+    }
+    return pca;
+}
+@endcode
+@sa calcCovarMatrix, mulTransposed, SVD, dft, dct
+*/
+class CV_EXPORTS PCA
+{
+public:
+    enum Flags { DATA_AS_ROW = 0, //!< indicates that the input samples are stored as matrix rows
+                 DATA_AS_COL = 1, //!< indicates that the input samples are stored as matrix columns
+                 USE_AVG     = 2  //!< NOTE(review): meaning is not documented in this header - confirm before use
+               };
+
+    /** @brief default constructor
+
+    The default constructor initializes an empty %PCA structure. The other
+    constructors initialize the structure and call PCA::operator()().
+    */
+    PCA();
+
+    /** @overload
+    @param data input samples stored as matrix rows or matrix columns.
+    @param mean optional mean value; if the matrix is empty (@c noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout (PCA::Flags)
+    @param maxComponents maximum number of components that %PCA should
+    retain; by default, all the components are retained.
+    */
+    PCA(InputArray data, InputArray mean, int flags, int maxComponents = 0);
+
+    /** @overload
+    @param data input samples stored as matrix rows or matrix columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout (PCA::Flags)
+    @param retainedVariance Percentage of variance that %PCA should retain.
+    Using this parameter will let the %PCA decide how many components to
+    retain but it will always keep at least 2.
+    */
+    PCA(InputArray data, InputArray mean, int flags, double retainedVariance);
+
+    /** @brief performs %PCA
+
+    The operator performs %PCA of the supplied dataset. It is safe to reuse
+    the same PCA structure for multiple datasets. That is, if the structure
+    has been previously used with another dataset, the existing internal
+    data is reclaimed and the new @ref eigenvalues, @ref eigenvectors and @ref
+    mean are allocated and computed.
+
+    The computed @ref eigenvalues are sorted from the largest to the smallest and
+    the corresponding eigenvectors are stored as rows of @ref eigenvectors.
+
+    @param data input samples stored as the matrix rows or as the matrix
+    columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout. (PCA::Flags)
+    @param maxComponents maximum number of components that %PCA should
+    retain; by default, all the components are retained.
+    */
+    PCA& operator()(InputArray data, InputArray mean, int flags, int maxComponents = 0);
+
+    /** @overload
+    @param data input samples stored as the matrix rows or as the matrix
+    columns.
+    @param mean optional mean value; if the matrix is empty (noArray()),
+    the mean is computed from the data.
+    @param flags operation flags; currently the parameter is only used to
+    specify the data layout. (PCA::Flags)
+    @param retainedVariance Percentage of variance that %PCA should retain.
+    Using this parameter will let the %PCA decide how many components to
+    retain but it will always keep at least 2.
+     */
+    PCA& operator()(InputArray data, InputArray mean, int flags, double retainedVariance);
+
+    /** @brief Projects vector(s) to the principal component subspace.
+
+    The methods project one or more vectors to the principal component
+    subspace, where each vector projection is represented by coefficients in
+    the principal component basis. The first form of the method returns the
+    matrix that the second form writes to the result. So the first form can
+    be used as a part of expression while the second form can be more
+    efficient in a processing loop.
+    @param vec input vector(s); must have the same dimensionality and the
+    same layout as the input data used at %PCA phase, that is, if
+    PCA::DATA_AS_ROW is specified, then `vec.cols==data.cols`
+    (vector dimensionality) and `vec.rows` is the number of vectors to
+    project, and the same is true for the PCA::DATA_AS_COL case.
+    */
+    Mat project(InputArray vec) const;
+
+    /** @overload
+    @param vec input vector(s); must have the same dimensionality and the
+    same layout as the input data used at %PCA phase, that is, if
+    PCA::DATA_AS_ROW is specified, then `vec.cols==data.cols`
+    (vector dimensionality) and `vec.rows` is the number of vectors to
+    project, and the same is true for the PCA::DATA_AS_COL case.
+    @param result output vectors; in case of PCA::DATA_AS_COL, the
+    output matrix has as many columns as the number of input vectors, this
+    means that `result.cols==vec.cols` and the number of rows match the
+    number of principal components (for example, `maxComponents` parameter
+    passed to the constructor).
+     */
+    void project(InputArray vec, OutputArray result) const;
+
+    /** @brief Reconstructs vectors from their PC projections.
+
+    The methods are inverse operations to PCA::project. They take PC
+    coordinates of projected vectors and reconstruct the original vectors.
+    Unless all the principal components have been retained, the
+    reconstructed vectors are different from the originals. But typically,
+    the difference is small if the number of components is large enough (but
+    still much smaller than the original vector dimensionality). This
+    property is the reason %PCA is used.
+    @param vec coordinates of the vectors in the principal component
+    subspace, the layout and size are the same as of PCA::project output
+    vectors.
+     */
+    Mat backProject(InputArray vec) const;
+
+    /** @overload
+    @param vec coordinates of the vectors in the principal component
+    subspace, the layout and size are the same as of PCA::project output
+    vectors.
+    @param result reconstructed vectors; the layout and size are the same as
+    of PCA::project input vectors.
+     */
+    void backProject(InputArray vec, OutputArray result) const;
+
+    /** @brief write PCA objects
+
+    Writes @ref eigenvalues, @ref eigenvectors and @ref mean to the specified FileStorage
+     */
+    void write(FileStorage& fs) const;
+
+    /** @brief load PCA objects
+
+    Loads @ref eigenvalues, @ref eigenvectors and @ref mean from the specified FileNode
+     */
+    void read(const FileNode& fn);
+
+    Mat eigenvectors; //!< eigenvectors of the covariance matrix
+    Mat eigenvalues; //!< eigenvalues of the covariance matrix
+    Mat mean; //!< mean value subtracted before the projection and added after the back projection
+};
+
+/** @example samples/cpp/pca.cpp
+An example using %PCA for dimensionality reduction while maintaining an amount of variance
+*/
+
+/** @example samples/cpp/tutorial_code/ml/introduction_to_pca/introduction_to_pca.cpp
+Check @ref tutorial_introduction_to_pca "the corresponding tutorial" for more details
+*/
+
+/**
+@brief Linear Discriminant Analysis
+@todo document this class
+*/
+class CV_EXPORTS LDA
+{
+public:
+    /** @brief constructor
+    Initializes a LDA with num_components (default 0).
+    */
+    explicit LDA(int num_components = 0);
+
+    /** Initializes and performs a Discriminant Analysis with Fisher's
+     Optimization Criterion on given data in src and corresponding labels
+     in labels. If the given number of components is 0 (or less), it is
+     automatically determined from the given data during computation.
+    */
+    LDA(InputArrayOfArrays src, InputArray labels, int num_components = 0);
+
+    /** Serializes this object to a given filename.
+      */
+    void save(const String& filename) const;
+
+    /** Deserializes this object from a given filename.
+      */
+    void load(const String& filename);
+
+    /** Serializes this object to a given cv::FileStorage.
+      */
+    void save(FileStorage& fs) const;
+
+    /** Deserializes this object from a given cv::FileStorage.
+      */
+    void load(const FileStorage& node);
+
+    /** destructor
+      */
+    ~LDA();
+
+    /** Compute the discriminants for data in src (row aligned) and labels.
+      */
+    void compute(InputArrayOfArrays src, InputArray labels);
+
+    /** Projects samples into the LDA subspace.
+        src may be one or more row aligned samples.
+      */
+    Mat project(InputArray src);
+
+    /** Reconstructs projections from the LDA subspace.
+        src may be one or more row aligned projections.
+      */
+    Mat reconstruct(InputArray src);
+
+    /** Returns the eigenvectors of this LDA.
+      */
+    Mat eigenvectors() const { return _eigenvectors; }
+
+    /** Returns the eigenvalues of this LDA.
+      */
+    Mat eigenvalues() const { return _eigenvalues; }
+
+    /** @todo document.
+        NOTE(review): presumably projects src into the subspace given by W after
+        removing mean; confirm against the implementation.
+      */
+    static Mat subspaceProject(InputArray W, InputArray mean, InputArray src);
+    /** @todo document.
+        NOTE(review): presumably the inverse of subspaceProject (maps src back from
+        the subspace given by W and restores mean); confirm against the implementation.
+      */
+    static Mat subspaceReconstruct(InputArray W, InputArray mean, InputArray src);
+
+protected:
+    int _num_components; // presumably the num_components value passed to the constructors - TODO confirm
+    Mat _eigenvectors;   // matrix returned by eigenvectors()
+    Mat _eigenvalues;    // matrix returned by eigenvalues()
+    // Internal worker taking the same arguments as compute(); NOTE(review): presumably
+    // performs the actual analysis and fills the members above - confirm in the .cpp.
+    void lda(InputArrayOfArrays src, InputArray labels);
+};
+
+/** @brief Singular Value Decomposition
+
+Class for computing Singular Value Decomposition of a floating-point
+matrix. The Singular Value Decomposition is used to solve least-square
+problems, under-determined linear systems, invert matrices, compute
+condition numbers, and so on.
+
+If you want to compute a condition number of a matrix or an absolute value of
+its determinant, you do not need `u` and `vt`. You can pass
+flags=SVD::NO_UV|... . Another flag SVD::FULL_UV indicates that full-size u
+and vt must be computed, which is not necessary most of the time.
+
+@sa invert, solve, eigen, determinant
+*/
+class CV_EXPORTS SVD
+{
+public:
+    enum Flags {
+        /** allow the algorithm to modify the decomposed matrix; it can save space and speed up
+            processing. currently ignored. */
+        MODIFY_A = 1,
+        /** indicates that only a vector of singular values `w` is to be processed, while u and vt
+            will be set to empty matrices */
+        NO_UV    = 2,
+        /** when the matrix is not square, by default the algorithm produces u and vt matrices of
+            sufficiently large size for the further A reconstruction; if, however, FULL_UV flag is
+            specified, u and vt will be full-size square orthogonal matrices.*/
+        FULL_UV  = 4
+    };
+
+    /** @brief the default constructor
+
+    initializes an empty SVD structure
+      */
+    SVD();
+
+    /** @overload
+    initializes an empty SVD structure and then calls SVD::operator()
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param flags operation flags (SVD::Flags)
+      */
+    SVD( InputArray src, int flags = 0 );
+
+    /** @brief the operator that performs SVD. The previously allocated u, w and vt are released.
+
+    The operator performs the singular value decomposition of the supplied
+    matrix. The u,`vt` , and the vector of singular values w are stored in
+    the structure. The same SVD structure can be reused many times with
+    different matrices. Each time, if needed, the previous u,`vt` , and w
+    are reclaimed and the new matrices are created, which is all handled by
+    Mat::create.
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param flags operation flags (SVD::Flags)
+      */
+    SVD& operator ()( InputArray src, int flags = 0 );
+
+    /** @brief decomposes matrix and stores the results to user-provided matrices
+
+    The methods/functions perform SVD of matrix. Unlike SVD::SVD constructor
+    and SVD::operator(), they store the results to the user-provided
+    matrices:
+
+    @code{.cpp}
+    Mat A, w, u, vt;
+    SVD::compute(A, w, u, vt);
+    @endcode
+
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param w calculated singular values
+    @param u calculated left singular vectors
+    @param vt transposed matrix of right singular vectors
+    @param flags operation flags - see SVD::Flags.
+      */
+    static void compute( InputArray src, OutputArray w,
+                         OutputArray u, OutputArray vt, int flags = 0 );
+
+    /** @overload
+    computes singular values of a matrix
+    @param src decomposed matrix. The depth has to be CV_32F or CV_64F.
+    @param w calculated singular values
+    @param flags operation flags - see SVD::Flags.
+      */
+    static void compute( InputArray src, OutputArray w, int flags = 0 );
+
+    /** @brief performs back substitution
+
+    Static variant that takes the decomposition (w, u, vt) as explicit
+    arguments instead of reading it from an SVD object; see the non-static
+    SVD::backSubst for the formula.
+      */
+    static void backSubst( InputArray w, InputArray u,
+                           InputArray vt, InputArray rhs,
+                           OutputArray dst );
+
+    /** @brief solves an under-determined singular linear system
+
+    The method finds a unit-length solution x of a singular linear system
+    A\*x = 0. Depending on the rank of A, there can be no solutions, a
+    single solution or an infinite number of solutions. In general, the
+    algorithm solves the following problem:
+    \f[dst =  \arg \min _{x:  \| x \| =1}  \| src  \cdot x  \|\f]
+    @param src left-hand-side matrix.
+    @param dst found solution.
+      */
+    static void solveZ( InputArray src, OutputArray dst );
+
+    /** @brief performs a singular value back substitution.
+
+    The method calculates a back substitution for the specified right-hand
+    side:
+
+    \f[\texttt{x} =  \texttt{vt} ^T  \cdot diag( \texttt{w} )^{-1}  \cdot \texttt{u} ^T  \cdot \texttt{rhs} \sim \texttt{A} ^{-1}  \cdot \texttt{rhs}\f]
+
+    Using this technique you can either get a very accurate solution of the
+    convenient linear system, or the best (in the least-squares terms)
+    pseudo-solution of an overdetermined linear system.
+
+    @param rhs right-hand side of a linear system (u\*w\*v')\*dst = rhs to
+    be solved, where A has been previously decomposed.
+
+    @param dst found solution of the system.
+
+    @note Explicit SVD with the further back substitution only makes sense
+    if you need to solve many linear systems with the same left-hand side
+    (for example, src ). If all you need is to solve a single system
+    (possibly with multiple rhs immediately available), simply call solve
+    and pass #DECOMP_SVD there. It does absolutely the same thing.
+      */
+    void backSubst( InputArray rhs, OutputArray dst ) const;
+
+    /** @todo document; fixed-size cv::Matx overload of SVD::compute (w, u and vt outputs) */
+    template<typename _Tp, int m, int n, int nm> static
+    void compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w, Matx<_Tp, m, nm>& u, Matx<_Tp, n, nm>& vt );
+
+    /** @todo document; fixed-size cv::Matx overload of SVD::compute (w output only) */
+    template<typename _Tp, int m, int n, int nm> static
+    void compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w );
+
+    /** @todo document; fixed-size cv::Matx overload of the static SVD::backSubst */
+    template<typename _Tp, int m, int n, int nm, int nb> static
+    void backSubst( const Matx<_Tp, nm, 1>& w, const Matx<_Tp, m, nm>& u, const Matx<_Tp, n, nm>& vt, const Matx<_Tp, m, nb>& rhs, Matx<_Tp, n, nb>& dst );
+
+    Mat u, w, vt; //!< decomposition results stored by SVD::operator(): u, singular values w, and vt
+};
+
+/** @brief Random Number Generator
+
+Random number generator. It encapsulates the state (currently, a 64-bit
+integer) and has methods to return scalar random values and to fill
+arrays with random values. Currently it supports uniform and Gaussian
+(normal) distributions. The generator uses Multiply-With-Carry
+algorithm, introduced by G. Marsaglia (
+<http://en.wikipedia.org/wiki/Multiply-with-carry> ).
+Gaussian-distribution random numbers are generated using the Ziggurat
+algorithm ( <http://en.wikipedia.org/wiki/Ziggurat_algorithm> ),
+introduced by G. Marsaglia and W. W. Tsang.
+*/
+class CV_EXPORTS RNG
+{
+public:
+    enum { UNIFORM = 0,
+           NORMAL  = 1
+         };
+
+    /** @brief constructor
+
+    These are the RNG constructors. The first form sets the state to some
+    pre-defined value, equal to 2\*\*32-1 in the current implementation. The
+    second form sets the state to the specified value. If you passed state=0
+    , the constructor uses the above default value instead to avoid the
+    singular random number sequence, consisting of all zeros.
+    */
+    RNG();
+    /** @overload
+    @param state 64-bit value used to initialize the RNG.
+    */
+    RNG(uint64 state);
+    /**The method updates the state using the MWC algorithm and returns the
+    next 32-bit random number.*/
+    unsigned next();
+
+    /**Each of the methods updates the state using the MWC algorithm and
+    returns the next random number of the specified type. In case of integer
+    types, the returned number is from the available value range for the
+    specified type. In case of floating-point types, the returned value is
+    from [0,1) range.
+    */
+    operator uchar();
+    /** @overload */
+    operator schar();
+    /** @overload */
+    operator ushort();
+    /** @overload */
+    operator short();
+    /** @overload */
+    operator unsigned();
+    /** @overload */
+    operator int();
+    /** @overload */
+    operator float();
+    /** @overload */
+    operator double();
+
+    /** @brief returns a random integer sampled uniformly from [0, N).
+
+    The methods transform the state using the MWC algorithm and return the
+    next random number. The first form is equivalent to RNG::next . The
+    second form returns the random number modulo N , which means that the
+    result is in the range [0, N) .
+    */
+    unsigned operator ()();
+    /** @overload
+    @param N upper non-inclusive boundary of the returned random number.
+    */
+    unsigned operator ()(unsigned N);
+
+    /** @brief returns uniformly distributed integer random number from [a,b) range
+
+    The methods transform the state using the MWC algorithm and return the
+    next uniformly-distributed random number of the specified type, deduced
+    from the input parameter type, from the range [a, b) . There is a nuance
+    illustrated by the following sample:
+
+    @code{.cpp}
+    RNG rng;
+
+    // always produces 0
+    double a = rng.uniform(0, 1);
+
+    // produces double from [0, 1)
+    double a1 = rng.uniform((double)0, (double)1);
+
+    // produces float from [0, 1)
+    float b = rng.uniform(0.f, 1.f);
+
+    // produces double from [0, 1)
+    double c = rng.uniform(0., 1.);
+
+    // may cause compiler error because of ambiguity:
+    //  RNG::uniform(0, (int)0.999999)? or RNG::uniform((double)0, 0.99999)?
+    double d = rng.uniform(0, 0.999999);
+    @endcode
+
+    The compiler does not take into account the type of the variable to
+    which you assign the result of RNG::uniform . The only thing that
+    matters to the compiler is the type of a and b parameters. So, if you
+    want a floating-point random number, but the range boundaries are
+    integer numbers, either put dots in the end, if they are constants, or
+    use explicit type cast operators, as in the a1 initialization above.
+    @param a lower inclusive boundary of the returned random number.
+    @param b upper non-inclusive boundary of the returned random number.
+    */
+    int uniform(int a, int b);
+    /** @overload */
+    float uniform(float a, float b);
+    /** @overload */
+    double uniform(double a, double b);
+
+    /** @brief Fills arrays with random numbers.
+
+    @param mat 2D or N-dimensional matrix; currently matrices with more than
+    4 channels are not supported by the methods, use Mat::reshape as a
+    possible workaround.
+    @param distType distribution type, RNG::UNIFORM or RNG::NORMAL.
+    @param a first distribution parameter; in case of the uniform
+    distribution, this is an inclusive lower boundary, in case of the normal
+    distribution, this is a mean value.
+    @param b second distribution parameter; in case of the uniform
+    distribution, this is a non-inclusive upper boundary, in case of the
+    normal distribution, this is a standard deviation (diagonal of the
+    standard deviation matrix or the full standard deviation matrix).
+    @param saturateRange pre-saturation flag; for uniform distribution only;
+    if true, the method will first convert a and b to the acceptable value
+    range (according to the mat datatype) and then will generate uniformly
+    distributed random numbers within the range [saturate(a), saturate(b)),
+    if saturateRange=false, the method will generate uniformly distributed
+    random numbers in the original range [a, b) and then will saturate them,
+    it means, for example, that
+    <tt>theRNG().fill(mat_8u, RNG::UNIFORM, -DBL_MAX, DBL_MAX)</tt> will likely
+    produce array mostly filled with 0's and 255's, since the range (0, 255)
+    is significantly smaller than [-DBL_MAX, DBL_MAX).
+
+    Each of the methods fills the matrix with the random values from the
+    specified distribution. As the new numbers are generated, the RNG state
+    is updated accordingly. In case of multiple-channel images, every
+    channel is filled independently, which means that RNG cannot generate
+    samples from the multi-dimensional Gaussian distribution with
+    non-diagonal covariance matrix directly. To do that, the method
+    generates samples from multi-dimensional standard Gaussian distribution
+    with zero mean and identity covariance matrix, and then transforms them
+    using transform to get samples from the specified Gaussian distribution.
+    */
+    void fill( InputOutputArray mat, int distType, InputArray a, InputArray b, bool saturateRange = false );
+
+    /** @brief Returns the next random number sampled from the Gaussian distribution
+    @param sigma standard deviation of the distribution.
+
+    The method transforms the state using the MWC algorithm and returns the
+    next random number from the Gaussian distribution N(0,sigma) . That is,
+    the mean value of the returned random numbers is zero and the standard
+    deviation is the specified sigma .
+    */
+    double gaussian(double sigma);
+
+    uint64 state;
+
+    bool operator ==(const RNG& other) const;
+};
+
+/** @brief Mersenne Twister random number generator
+
+Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c
+@todo document the individual members; the private state is N = 624 words (see PeriodParameters)
+*/
+class CV_EXPORTS RNG_MT19937
+{
+public:
+    RNG_MT19937();
+    RNG_MT19937(unsigned s);
+    void seed(unsigned s);
+
+    unsigned next();
+
+    operator int();
+    operator unsigned();
+    operator float();
+    operator double();
+
+    unsigned operator ()(unsigned N);
+    unsigned operator ()();
+
+    /** @brief returns uniformly distributed integer random number from [a,b) range*/
+    int uniform(int a, int b);
+    /** @brief returns uniformly distributed floating-point random number from [a,b) range*/
+    float uniform(float a, float b);
+    /** @brief returns uniformly distributed double-precision floating-point random number from [a,b) range*/
+    double uniform(double a, double b);
+
+private:
+    enum PeriodParameters {N = 624, M = 397};
+    unsigned state[N];  // N = 624 state words; unlike RNG::state this is not publicly accessible
+    int mti;  // NOTE(review): presumably the current position in state, as in mt19937ar.c -- confirm
+};
+
+//! @} core_array
+
+//! @addtogroup core_cluster
+//!  @{
+
+/** @example samples/cpp/kmeans.cpp
+An example on K-means clustering
+*/
+
+/** @brief Finds centers of clusters and groups input samples around the clusters.
+
+The function kmeans implements a k-means algorithm that finds the centers of K clusters
+and groups the input samples around the clusters. As an output, \f$\texttt{bestLabels}_i\f$ contains a
+0-based cluster index for the sample stored in the \f$i^{th}\f$ row of the data matrix.
+
+@note
+-   (Python) An example on K-means clustering can be found at
+    opencv_source_code/samples/python/kmeans.py
+@param data Data for clustering. An array of N-Dimensional points with float coordinates is needed.
+Examples of this array can be:
+-   Mat points(count, 2, CV_32F);
+-   Mat points(count, 1, CV_32FC2);
+-   Mat points(1, count, CV_32FC2);
+-   std::vector\<cv::Point2f\> points(sampleCount);
+@param K Number of clusters to split the set by.
+@param bestLabels Input/output integer array that stores the cluster indices for every sample.
+@param criteria The algorithm termination criteria, that is, the maximum number of iterations and/or
+the desired accuracy. The accuracy is specified as criteria.epsilon. As soon as each of the cluster
+centers moves by less than criteria.epsilon on some iteration, the algorithm stops.
+@param attempts Number of times the algorithm is executed using different
+initial labellings. The algorithm returns the labels that yield the best compactness (see the
+return value).
+@param flags Flag that can take values of cv::KmeansFlags
+@param centers Output matrix of the cluster centers, one row per each cluster center.
+@return The function returns the compactness measure that is computed as
+\f[\sum _i  \| \texttt{data} _i -  \texttt{centers} _{ \texttt{bestLabels} _i} \| ^2\f]
+after every attempt. The best (minimum) value is chosen and the corresponding labels and the
+compactness value are returned by the function. Basically, you can use only the core of the
+function, set the number of attempts to 1, initialize labels each time using a custom algorithm,
+pass them with the ( flags = #KMEANS_USE_INITIAL_LABELS ) flag, and then choose the best
+(most-compact) clustering.
+*/
+CV_EXPORTS_W double kmeans( InputArray data, int K, InputOutputArray bestLabels,
+                            TermCriteria criteria, int attempts,
+                            int flags, OutputArray centers = noArray() );
+
+//! @} core_cluster
+
+//! @addtogroup core_basic
+//! @{
+
+/////////////////////////////// Formatted output of cv::Mat ///////////////////////////
+
+/** @brief Source of formatted text chunks: call reset(), then next() repeatedly until it returns a null pointer. */
+class CV_EXPORTS Formatted
+{
+public:
+    virtual const char* next() = 0;
+    virtual void reset() = 0;
+    virtual ~Formatted();
+};
+
+/** @brief Abstract producer of Formatted output for a Mat; get() returns the formatter for a given FormatType. */
+class CV_EXPORTS Formatter
+{
+public:
+    enum FormatType {
+           FMT_DEFAULT = 0,
+           FMT_MATLAB  = 1,
+           FMT_CSV     = 2,
+           FMT_PYTHON  = 3,
+           FMT_NUMPY   = 4,
+           FMT_C       = 5
+         };
+
+    virtual ~Formatter();
+
+    virtual Ptr<Formatted> format(const Mat& mtx) const = 0;
+
+    virtual void set16fPrecision(int p = 4) = 0;
+    virtual void set32fPrecision(int p = 8) = 0;
+    virtual void set64fPrecision(int p = 16) = 0;
+    virtual void setMultiline(bool ml = true) = 0;
+
+    static Ptr<Formatter> get(Formatter::FormatType fmt = FMT_DEFAULT);
+
+};
+
+//! Calls fmtd->reset(), then appends every chunk returned by fmtd->next() to out until next() yields null.
+static inline String& operator << (String& out, Ptr<Formatted> fmtd)
+{
+    fmtd->reset();
+    while (const char* chunk = fmtd->next())
+        out += cv::String(chunk);
+    return out;
+}
+
+//! Formats mtx with the default formatter (Formatter::get()) and appends the text to out.
+static inline String& operator << (String& out, const Mat& mtx)
+{
+    const Ptr<Formatter> defaultFormatter = Formatter::get();
+    return out << defaultFormatter->format(mtx);
+}
+
+//////////////////////////////////////// Algorithm ////////////////////////////////////
+
+class CV_EXPORTS Algorithm;
+
+template<typename _Tp, typename _EnumTp = void> struct ParamType {};
+
+
+/** @brief This is a base class for all more or less complex algorithms in OpenCV
+
+especially for classes of algorithms, for which there can be multiple implementations. The examples
+are stereo correspondence (for which there are algorithms like block matching, semi-global block
+matching, graph-cut etc.), background subtraction (which can be done using mixture-of-gaussians
+models, codebook-based algorithm etc.), optical flow (block matching, Lucas-Kanade, Horn-Schunck
+etc.).
+
+Here is an example of SimpleBlobDetector use in your application via Algorithm interface:
+@snippet snippets/core_various.cpp Algorithm
+*/
+class CV_EXPORTS_W Algorithm
+{
+public:
+    Algorithm();
+    virtual ~Algorithm();
+
+    /** @brief Clears the algorithm state */
+    CV_WRAP virtual void clear() {}
+
+    /** @brief Stores algorithm parameters in a file storage */
+    virtual void write(FileStorage& fs) const { CV_UNUSED(fs); }
+
+    /** @brief simplified API for language bindings
+    * @overload
+    */
+    CV_WRAP void write(const Ptr<FileStorage>& fs, const String& name = String()) const;
+
+    /** @brief Reads algorithm parameters from a file storage */
+    CV_WRAP virtual void read(const FileNode& fn) { CV_UNUSED(fn); }
+
+    /** @brief Returns true if the Algorithm is empty (e.g. in the very beginning or after unsuccessful read) */
+    CV_WRAP virtual bool empty() const { return false; }
+
+    /** @brief Reads algorithm from the file node
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    cv::FileStorage fsRead("example.xml", FileStorage::READ);
+    Ptr<SVM> svm = Algorithm::read<SVM>(fsRead.root());
+    @endcode
+    In order to make this method work, the derived class must override Algorithm::read(const
+    FileNode& fn) and also have static create() method without parameters
+    (or with all the optional parameters)
+    @return the created object, or an empty Ptr if it reports empty() after reading.
+    */
+    template<typename _Tp> static Ptr<_Tp> read(const FileNode& fn)
+    {
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from the file
+
+    @param filename Name of the file to read.
+    @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+    @return the loaded object; an empty Ptr if the node is missing or the object is empty() after reading.
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    Ptr<SVM> svm = Algorithm::load<SVM>("my_svm_model.xml");
+    @endcode
+    In order to make this method work, the derived class must override Algorithm::read(const
+    FileNode& fn).
+    */
+    template<typename _Tp> static Ptr<_Tp> load(const String& filename, const String& objname=String())
+    {
+        FileStorage fs(filename, FileStorage::READ);
+        CV_Assert(fs.isOpened());
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        if (fn.empty()) return Ptr<_Tp>();
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** @brief Loads algorithm from a String
+
+    @param strModel The string variable containing the model you want to load.
+    @param objname The optional name of the node to read (if empty, the first top-level node will be used)
+    @return the loaded object; an empty Ptr if the node is missing or the object is empty() after reading.
+
+    This is a static template method of Algorithm. Its usage is as follows (in the case of SVM):
+    @code
+    Ptr<SVM> svm = Algorithm::loadFromString<SVM>(myStringModel);
+    @endcode
+    */
+    template<typename _Tp> static Ptr<_Tp> loadFromString(const String& strModel, const String& objname=String())
+    {
+        FileStorage fs(strModel, FileStorage::READ + FileStorage::MEMORY);
+        FileNode fn = objname.empty() ? fs.getFirstTopLevelNode() : fs[objname];
+        if (fn.empty()) return Ptr<_Tp>();  // same missing-node handling as load(): do not read() from an empty node
+        Ptr<_Tp> obj = _Tp::create();
+        obj->read(fn);
+        return !obj->empty() ? obj : Ptr<_Tp>();
+    }
+
+    /** Saves the algorithm to a file.
+    In order to make this method work, the derived class must implement Algorithm::write(FileStorage& fs). */
+    CV_WRAP virtual void save(const String& filename) const;
+
+    /** Returns the algorithm string identifier.
+    This string is used as top level xml/yml node tag when the object is saved to a file or string. */
+    CV_WRAP virtual String getDefaultName() const;
+
+protected:
+    void writeFormat(FileStorage& fs) const;
+};
+
+//! Type tags for algorithm parameters; each ParamType<T> specialization reports one of these
+//! as its `type` constant. Note that the value 10 is not assigned.
+enum struct Param {
+    INT=0, BOOLEAN=1, REAL=2, STRING=3, MAT=4, MAT_VECTOR=5, ALGORITHM=6, FLOAT=7,
+    UNSIGNED_INT=8, UINT64=9, UCHAR=11, SCALAR=12
+};
+
+template<> struct ParamType<bool>
+{
+    using const_param_type = bool;
+    using member_type = bool;
+    // In this family const_param_type is T itself for small types and `const X&` for the class types below.
+    static const Param type = Param::BOOLEAN;
+};
+
+template<> struct ParamType<int>
+{
+    using const_param_type = int;
+    using member_type = int;
+
+    static const Param type = Param::INT;
+};
+
+template<> struct ParamType<double>
+{
+    using const_param_type = double;
+    using member_type = double;
+
+    static const Param type = Param::REAL;
+};
+
+template<> struct ParamType<String>
+{
+    using const_param_type = const String&;
+    using member_type = String;
+
+    static const Param type = Param::STRING;
+};
+
+template<> struct ParamType<Mat>
+{
+    using const_param_type = const Mat&;
+    using member_type = Mat;
+
+    static const Param type = Param::MAT;
+};
+
+template<> struct ParamType<std::vector<Mat> >
+{
+    using const_param_type = const std::vector<Mat>&;
+    using member_type = std::vector<Mat>;
+
+    static const Param type = Param::MAT_VECTOR;
+};
+
+template<> struct ParamType<Algorithm>
+{
+    using const_param_type = const Ptr<Algorithm>&;
+    using member_type = Ptr<Algorithm>;
+
+    static const Param type = Param::ALGORITHM;
+};
+
+template<> struct ParamType<float>
+{
+    using const_param_type = float;
+    using member_type = float;
+
+    static const Param type = Param::FLOAT;
+};
+
+template<> struct ParamType<unsigned>
+{
+    using const_param_type = unsigned;
+    using member_type = unsigned;
+
+    static const Param type = Param::UNSIGNED_INT;
+};
+
+template<> struct ParamType<uint64>
+{
+    using const_param_type = uint64;
+    using member_type = uint64;
+
+    static const Param type = Param::UINT64;
+};
+
+template<> struct ParamType<uchar>
+{
+    using const_param_type = uchar;
+    using member_type = uchar;
+
+    static const Param type = Param::UCHAR;
+};
+
+template<> struct ParamType<Scalar>
+{
+    using const_param_type = const Scalar&;
+    using member_type = Scalar;
+
+    static const Param type = Param::SCALAR;
+};
+
+template<typename _Tp>
+struct ParamType<_Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type>
+{
+    using const_param_type = typename std::underlying_type<_Tp>::type;
+    using member_type = typename std::underlying_type<_Tp>::type;
+    // Any enum type is handled through its underlying integer type and tagged Param::INT.
+    static const Param type = Param::INT;
+};
+
+//! @} core_basic
+
+} //namespace cv
+
+#include "opencv2/core/operations.hpp"
+#include "opencv2/core/cvstd.inl.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/optim.hpp"
+#include "opencv2/core/ovx.hpp"
+
+#endif /*OPENCV_CORE_HPP*/
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/affine.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/affine.hpp
new file mode 100644
index 0000000..1806382
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/affine.hpp
@@ -0,0 +1,678 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_AFFINE3_HPP
+#define OPENCV_CORE_AFFINE3_HPP
+
+#ifdef __cplusplus
+
+#include <opencv2/core.hpp>
+
+namespace cv
+{
+
+//! @addtogroup core
+//! @{
+
+    /** @brief Affine transform
+     *
+     * It represents a 4x4 homogeneous transformation matrix \f$T\f$
+     *
+     *  \f[T =
+     *  \begin{bmatrix}
+     *  R & t\\
+     *  0 & 1\\
+     *  \end{bmatrix}
+     *  \f]
+     *
+     *  where \f$R\f$ is a 3x3 rotation matrix and \f$t\f$ is a 3x1 translation vector.
+     *
+     *  You can specify \f$R\f$ either by a 3x3 rotation matrix or by a 3x1 rotation vector,
+     *  which is converted to a 3x3 rotation matrix by the Rodrigues formula.
+     *
+     *  To construct a matrix \f$T\f$ representing first rotation around the axis \f$r\f$ with rotation
+     *  angle \f$|r|\f$ in radian (right hand rule) and then translation by the vector \f$t\f$, you can use
+     *
+     *  @code
+     *  cv::Vec3f r, t;
+     *  cv::Affine3f T(r, t);
+     *  @endcode
+     *
+     *  If you already have the rotation matrix \f$R\f$, then you can use
+     *
+     *  @code
+     *  cv::Matx33f R;
+     *  cv::Affine3f T(R, t);
+     *  @endcode
+     *
+     *  To extract the rotation matrix \f$R\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Matx33f R = T.rotation();
+     *  @endcode
+     *
+     *  To extract the translation vector \f$t\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Vec3f t = T.translation();
+     *  @endcode
+     *
+     *  To extract the rotation vector \f$r\f$ from \f$T\f$, use
+     *
+     *  @code
+     *  cv::Vec3f r = T.rvec();
+     *  @endcode
+     *
+     *  Note that the mapping from rotation vectors to rotation matrices
+     *  is many to one, so the returned rotation vector is not necessarily the one
+     *  you used before to set the matrix.
+     *
+     *  If you have two transformations \f$T = T_1 * T_2\f$, use
+     *
+     *  @code
+     *  cv::Affine3f T, T1, T2;
+     *  T = T2.concatenate(T1);
+     *  @endcode
+     *
+     *  To get the inverse transform of \f$T\f$, use
+     *
+     *  @code
+     *  cv::Affine3f T, T_inv;
+     *  T_inv = T.inv();
+     *  @endcode
+     *
+     */
+    template<typename T>
+    class Affine3
+    {
+    public:
+        typedef T float_type;
+        typedef Matx<float_type, 3, 3> Mat3;
+        typedef Matx<float_type, 4, 4> Mat4;
+        typedef Vec<float_type, 3> Vec3;
+
+        //! Default constructor. It represents a 4x4 identity matrix.
+        Affine3();
+
+        //! Augmented affine matrix
+        Affine3(const Mat4& affine);
+
+        /**
+         *  The resulting 4x4 matrix is
+         *
+         *  \f[
+         *  \begin{bmatrix}
+         *  R & t\\
+         *  0 & 1\\
+         *  \end{bmatrix}
+         *  \f]
+         *
+         * @param R 3x3 rotation matrix.
+         * @param t 3x1 translation vector.
+         */
+        Affine3(const Mat3& R, const Vec3& t = Vec3::all(0));
+
+        /**
+         * Rodrigues vector.
+         *
+         * The last row of the current matrix is set to [0,0,0,1].
+         *
+         * @param rvec 3x1 rotation vector. Its direction indicates the rotation axis and its length
+         *             indicates the rotation angle in radians (using the right hand rule).
+         * @param t 3x1 translation vector.
+         */
+        Affine3(const Vec3& rvec, const Vec3& t = Vec3::all(0));
+
+        /**
+         * Combines all constructors above. Supports 4x4, 3x4, 3x3, 1x3, 3x1 sizes of data matrix.
+         *
+         * The last row of the current matrix is set to [0,0,0,1] when data is not 4x4.
+         *
+         * @param data 1-channel matrix.
+         *             When it is 4x4, it is copied to the current matrix and t is not used.
+         *             When it is 3x4, it is copied to the upper part 3x4 of the current matrix and t is not used.
+         *             When it is 3x3, it is copied to the upper left 3x3 part of the current matrix.
+         *             When it is 3x1 or 1x3, it is treated as a rotation vector and the Rodrigues formula is used
+         *                             to compute a 3x3 rotation matrix.
+         * @param t 3x1 translation vector. It is used only when data is neither 4x4 nor 3x4.
+         */
+        explicit Affine3(const Mat& data, const Vec3& t = Vec3::all(0));
+
+        //! From 16-element array
+        explicit Affine3(const float_type* vals);
+
+        //! Create a 4x4 identity transform
+        static Affine3 Identity();
+
+        /**
+         * Rotation matrix.
+         *
+         * Copy the rotation matrix to the upper left 3x3 part of the current matrix.
+         * The remaining elements of the current matrix are not changed.
+         *
+         * @param R 3x3 rotation matrix.
+         *
+         */
+        void rotation(const Mat3& R);
+
+        /**
+         * Rodrigues vector.
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param rvec 3x1 rotation vector. The direction indicates the rotation axis and
+         *             its length indicates the rotation angle in radians (using the right hand rule).
+         */
+        void rotation(const Vec3& rvec);
+
+        /**
+         * Combines rotation methods above. Supports 3x3, 1x3, 3x1 sizes of data matrix.
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param data 1-channel matrix.
+         *             When it is a 3x3 matrix, it sets the upper left 3x3 part of the current matrix.
+         *             When it is a 1x3 or 3x1 matrix, it is used as a rotation vector. The Rodrigues formula
+         *             is used to compute the rotation matrix and sets the upper left 3x3 part of the current matrix.
+         */
+        void rotation(const Mat& data);
+
+        /**
+         * Copy the 3x3 matrix L to the upper left part of the current matrix
+         *
+         * It sets the upper left 3x3 part of the matrix. The remaining part is unaffected.
+         *
+         * @param L 3x3 matrix.
+         */
+        void linear(const Mat3& L);
+
+        /**
+         * Copy t to the first three elements of the last column of the current matrix
+         *
+         * It sets the upper right 3x1 part of the matrix. The remaining part is unaffected.
+         *
+         * @param t 3x1 translation vector.
+         */
+        void translation(const Vec3& t);
+
+        //! @return the upper left 3x3 part
+        Mat3 rotation() const;
+
+        //! @return the upper left 3x3 part
+        Mat3 linear() const;
+
+        //! @return the upper right 3x1 part
+        Vec3 translation() const;
+
+        //! Rodrigues vector.
+        //! @return a vector representing the upper left 3x3 rotation matrix of the current matrix.
+        //! @warning  Since the mapping between rotation vectors and rotation matrices is many to one,
+        //!           this function returns only one rotation vector that represents the current rotation matrix,
+        //!           which is not necessarily the same one set by `rotation(const Vec3& rvec)`.
+        Vec3 rvec() const;
+
+        //! @return the inverse of the current matrix.
+        Affine3 inv(int method = cv::DECOMP_SVD) const;
+
+        //! a.rotate(R) is equivalent to Affine(R, 0) * a;
+        Affine3 rotate(const Mat3& R) const;
+
+        //! a.rotate(rvec) is equivalent to Affine(rvec, 0) * a;
+        Affine3 rotate(const Vec3& rvec) const;
+
+        //! a.translate(t) is equivalent to Affine(E, t) * a, where E is an identity matrix
+        Affine3 translate(const Vec3& t) const;
+
+        //! a.concatenate(affine) is equivalent to affine * a;
+        Affine3 concatenate(const Affine3& affine) const;
+        //! Conversion to an Affine3 with a different element type Y.
+        template <typename Y> operator Affine3<Y>() const;
+
+        template <typename Y> Affine3<Y> cast() const;
+        //! The underlying 4x4 matrix (public, so it can be read or modified directly).
+        Mat4 matrix;
+
+#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
+        Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine);
+        Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine);
+        operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const;
+        operator Eigen::Transform<T, 3, Eigen::Affine>() const;
+#endif
+    };
+
+    template<typename T> static
+    Affine3<T> operator*(const Affine3<T>& affine1, const Affine3<T>& affine2);
+
+    //! V is a 3-element vector with member fields x, y and z
+    template<typename T, typename V> static
+    V operator*(const Affine3<T>& affine, const V& vector);
+
+    typedef Affine3<float> Affine3f;
+    typedef Affine3<double> Affine3d;
+
+    static Vec3f operator*(const Affine3f& affine, const Vec3f& vector);
+    static Vec3d operator*(const Affine3d& affine, const Vec3d& vector);
+
+    template<typename _Tp> class DataType< Affine3<_Tp> >
+    {
+    public:
+        using value_type   = Affine3<_Tp>;
+        using work_type    = Affine3<typename DataType<_Tp>::work_type>;
+        using channel_type = _Tp;
+        // An Affine3 is described as 16 channels of _Tp, i.e. the entries of its 4x4 matrix.
+        enum { generic_type = 0,
+               channels     = 16,
+               fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+               ,depth        = DataType<channel_type>::depth
+               ,type         = CV_MAKETYPE(depth, channels)
+#endif
+             };
+
+        using vec_type = Vec<channel_type, channels>;
+    };
+
+    namespace traits {
+    template<typename _Tp>
+    struct Depth< Affine3<_Tp> > { enum { value = Depth<_Tp>::value }; };
+    template<typename _Tp>
+    struct Type< Affine3<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 16) }; };
+    } // namespace traits
+
+//! @} core
+
+}
+
+//! @cond IGNORED
+
+///////////////////////////////////////////////////////////////////////////////////
+// Implementation
+
+template<typename T> inline
+cv::Affine3<T>::Affine3()
+    : matrix(Mat4::eye())  // the default transform is the 4x4 identity
+{}
+
+template<typename T> inline
+cv::Affine3<T>::Affine3(const Mat4& affine)
+    : matrix(affine)  // the 4x4 matrix is stored as given; its last row is not checked or rewritten here
+{}
+
+template<typename T> inline
+cv::Affine3<T>::Affine3(const Mat3& R, const Vec3& t)
+{
+    linear(R);                                          // upper-left 3x3 block (this is all rotation(R) does)
+    translation(t);                                     // upper-right 3x1 column
+    for (int i = 12; i < 15; ++i) matrix.val[i] = 0;    // homogeneous last row: [0 0 0 1]
+    matrix.val[15] = 1;
+}
+
+template<typename T> inline
+cv::Affine3<T>::Affine3(const Vec3& _rvec, const Vec3& t)
+{
+    matrix.val[15] = 1;                                      // homogeneous last row: [0 0 0 1]
+    matrix.val[14] = matrix.val[13] = matrix.val[12] = 0;
+    translation(t);                                          // upper-right 3x1 column
+    rotation(_rvec);                                         // rotation vector -> upper-left 3x3 block
+}
+
+template<typename T> inline
+cv::Affine3<T>::Affine3(const cv::Mat& data, const Vec3& t)
+{
+    CV_Assert(data.type() == cv::traits::Type<T>::value);
+    CV_Assert(data.channels() == 1);
+
+    // A 4x4 input is copied verbatim (its own last row is kept) and t is ignored.
+    if (data.rows == 4 && data.cols == 4)
+    {
+        data.copyTo(matrix);
+        return;
+    }
+
+    const bool is3x4 = (data.rows == 3 && data.cols == 4);  // [R | t] layout: t comes from the 4th column
+    const cv::Mat rotationPart = is3x4 ? data(Rect(0, 0, 3, 3)) : data;
+    rotation(rotationPart);
+    if (is3x4)
+        translation(data(Rect(3, 0, 1, 3)));
+    else
+        translation(t);
+
+    // Every input except 4x4 gets the homogeneous last row [0 0 0 1].
+    matrix.val[12] = matrix.val[13] = matrix.val[14] = 0;
+    matrix.val[15] = 1;
+}
+
+template<typename T> inline
+cv::Affine3<T>::Affine3(const float_type* vals) : matrix(vals)  // vals is handed straight to the Mat4 constructor
+{}
+
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::Identity()
+{
+    return Affine3<T>(cv::Affine3<T>::Mat4::eye());  // same matrix the default constructor starts from
+}
+
+template<typename T> inline
+void cv::Affine3<T>::rotation(const Mat3& R)
+{
+    linear(R);  // a rotation matrix is stored exactly like any other 3x3 linear part
+}
+
+template<typename T> inline
+void cv::Affine3<T>::rotation(const Vec3& _rvec)
+{
+    // Rodrigues formula: _rvec = theta * r, with rotation axis r (unit length) and angle theta = |_rvec|.
+    double theta = norm(_rvec);
+
+    if (theta < DBL_EPSILON)
+        rotation(Mat3::eye());  // (near-)zero angle: identity rotation
+    else
+    {
+        double c = std::cos(theta);
+        double s = std::sin(theta);
+        double c1 = 1. - c;
+        // theta cannot be 0 on this branch (it failed the theta < DBL_EPSILON test), so divide directly.
+        double itheta = 1./theta;
+        Point3_<T> r = _rvec*itheta;
+
+        Mat3 rrt( r.x*r.x, r.x*r.y, r.x*r.z, r.x*r.y, r.y*r.y, r.y*r.z, r.x*r.z, r.y*r.z, r.z*r.z );
+        Mat3 r_x( 0, -r.z, r.y, r.z, 0, -r.x, -r.y, r.x, 0 );
+
+        // R = cos(theta)*I + (1 - cos(theta))*r*rT + sin(theta)*[r_x]
+        // where [r_x] is [0 -rz ry; rz 0 -rx; -ry rx 0]
+        Mat3 R = c*Mat3::eye() + c1*rrt + s*r_x;
+        rotation(R);
+    }
+}
+
+// Sets the rotation from a Mat that is either a 3x3 matrix or a 3-element (1x3 / 3x1) rotation vector.
+template<typename T> inline
+void cv::Affine3<T>::rotation(const cv::Mat& data)
+{
+    CV_Assert(data.type() == cv::traits::Type<T>::value);
+    CV_Assert(data.channels() == 1);
+
+    const bool is3x3 = (data.rows == 3 && data.cols == 3);
+    const bool is3Vector = (data.rows == 1 && data.cols == 3) || (data.rows == 3 && data.cols == 1);
+    if (!is3x3 && !is3Vector)
+        CV_Error(Error::StsError, "Input matrix can only be 3x3, 1x3 or 3x1");
+    if (is3x3)
+    {
+        Mat3 R;
+        data.copyTo(R);
+        rotation(R);
+        return;
+    }
+    Vec3 _rvec;  // 1x3 and 3x1 are both reshaped to a 3-row column before copying
+    data.reshape(1, 3).copyTo(_rvec);
+    rotation(_rvec);
+}
+
+template<typename T> inline
+void cv::Affine3<T>::linear(const Mat3& L)
+{
+    for (int row = 0; row < 3; ++row)        // source L: 3 values per row
+        for (int col = 0; col < 3; ++col)    // destination matrix: 4 values per row
+            matrix.val[4*row + col] = L.val[3*row + col];
+}
+
+// Stores t as the translation: elements 3, 7 and 11 of the 4x4 matrix,
+// i.e. the first three entries of its last column.
+template<typename T> inline
+void cv::Affine3<T>::translation(const Vec3& t)
+{ for (int axis = 0; axis < 3; ++axis) matrix.val[4*axis + 3] = t[axis]; }
+
+// Returns the 3x3 rotation part; this is exactly linear(), no orthonormality
+// check or re-normalisation is applied.
+template<typename T> inline
+typename cv::Affine3<T>::Mat3 cv::Affine3<T>::rotation() const
+{ return this->linear(); }
+
+// Returns a copy of the upper-left 3x3 (linear) block of the 4x4 matrix.
+template<typename T> inline
+typename cv::Affine3<T>::Mat3 cv::Affine3<T>::linear() const
+{
+    Mat3 block;
+    for (int row = 0; row < 3; ++row)
+        for (int col = 0; col < 3; ++col) block.val[3*row + col] = matrix.val[4*row + col];
+    return block;
+}
+
+// Returns the translation vector, read from elements 3, 7 and 11 of the
+// 4x4 matrix (the first three entries of its last column).
+template<typename T> inline
+typename cv::Affine3<T>::Vec3 cv::Affine3<T>::translation() const
+{ Vec3 shift(matrix.val[3], matrix.val[7], matrix.val[11]); return shift; }
+
+template<typename T> inline
+typename cv::Affine3<T>::Vec3 cv::Affine3<T>::rvec() const // rotation part -> rotation vector (unit axis * angle in radians)
+{
+    cv::Vec3d w;
+    cv::Matx33d u, vt, R = rotation();
+    cv::SVD::compute(R, w, u, vt, cv::SVD::FULL_UV + cv::SVD::MODIFY_A);
+    R = u * vt; // rebuild R from the SVD factors only (singular values w are discarded)
+    // (rx, ry, rz): differences of the mirrored off-diagonal entries, i.e. the antisymmetric part of R
+    double rx = R.val[7] - R.val[5];
+    double ry = R.val[2] - R.val[6];
+    double rz = R.val[3] - R.val[1];
+    // for a rotation by theta about a unit axis: s = |sin(theta)| and c = cos(theta) = (trace(R) - 1)/2
+    double s = std::sqrt((rx*rx + ry*ry + rz*rz)*0.25);
+    double c = (R.val[0] + R.val[4] + R.val[8] - 1) * 0.5;
+    c = c > 1.0 ? 1.0 : c < -1.0 ? -1.0 : c; // clamp so that acos stays in its domain
+    double theta = std::acos(c);
+    // s close to 0: theta is near 0 or near pi, and the antisymmetric part is too small to give the axis
+    if( s < 1e-5 )
+    {
+        if( c > 0 )
+            rx = ry = rz = 0; // theta near 0: zero rotation vector
+        else
+        {
+            double t; // theta near pi: R ~ 2*r*rT - I, so r.x^2 = (R[0][0] + 1)/2 and likewise for y, z
+            t = (R.val[0] + 1) * 0.5;
+            rx = std::sqrt(std::max(t, 0.0));
+            t = (R.val[4] + 1) * 0.5;
+            ry = std::sqrt(std::max(t, 0.0)) * (R.val[1] < 0 ? -1.0 : 1.0); // sign taken from R[0][1]
+            t = (R.val[8] + 1) * 0.5;
+            rz = std::sqrt(std::max(t, 0.0)) * (R.val[2] < 0 ? -1.0 : 1.0); // sign taken from R[0][2]
+            // when rx is the smallest component, make the sign of ry*rz agree with R[1][2]
+            if( fabs(rx) < fabs(ry) && fabs(rx) < fabs(rz) && (R.val[5] > 0) != (ry*rz > 0) )
+                rz = -rz;
+            theta /= std::sqrt(rx*rx + ry*ry + rz*rz); // after scaling, the vector length equals the angle
+            rx *= theta;
+            ry *= theta;
+            rz *= theta;
+        }
+    }
+    else
+    {
+        double vth = 1/(2*s); // (rx, ry, rz) has length 2*s, so this normalises it to a unit axis
+        vth *= theta;
+        rx *= vth; ry *= vth; rz *= vth;
+    }
+    // computed in double; converted to Vec3 on return
+    return cv::Vec3d(rx, ry, rz);
+}
+
+// Returns the inverse transform: the 4x4 matrix is inverted with Matx::inv,
+// `method` is forwarded unchanged, and the result is wrapped in an Affine3.
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::inv(int method) const
+{ return Affine3<T>(matrix.inv(method)); }
+
+// Returns a copy of this transform with R applied on the left: the linear part becomes
+// R * linear() and the translation becomes R * translation().
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::rotate(const Mat3& R) const
+{
+    Mat3 curLinear = linear();
+    Vec3 curShift = translation();
+    Mat4 out;
+    for(int row = 0; row < 3; ++row)
+    {
+        for(int col = 0; col < 3; ++col)
+        {
+            float_type acc = 0;
+            for(int k = 0; k < 3; ++k)
+                acc += R(row, k) * curLinear(k, col);
+            out(row, col) = acc;
+        }
+        out(row, 3) = R.row(row).dot(curShift.t());
+    }
+    out.val[12] = out.val[13] = out.val[14] = 0; // last row of the 4x4 matrix is 0 0 0 1
+    out.val[15] = 1;
+    return out;
+}
+
+// Returns this transform with the rotation encoded by the rotation vector _rvec applied on the left (see rotate(const Mat3&)).
+// The rotation matrix is built with Affine3<T> instead of Affine3f, so Affine3d no longer drops to float precision here.
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::rotate(const Vec3& _rvec) const
+{ return rotate(Affine3<T>(_rvec).rotation()); }
+
+// Returns a copy of this transform whose translation is increased by t;
+// the linear part is left untouched.
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::translate(const Vec3& t) const
+{
+    Mat4 shifted = matrix;
+    for (int axis = 0; axis < 3; ++axis) shifted.val[4*axis + 3] += t[axis];
+    return shifted;
+}
+
+// Returns the transform equivalent to applying *this first and `affine` second:
+// *this is rotated by affine's rotation part, then shifted by affine's translation.
+template<typename T> inline
+cv::Affine3<T> cv::Affine3<T>::concatenate(const Affine3<T>& affine) const
+{ return this->rotate(affine.rotation()).translate(affine.translation()); }
+
+// Conversion to an Affine3 with another element type Y; the new object is
+// constructed from this transform's 4x4 matrix.
+template<typename T> template <typename Y> inline
+cv::Affine3<T>::operator Affine3<Y>() const
+{ Affine3<Y> converted(matrix); return converted; }
+
+// Named counterpart of the conversion operator: returns an Affine3<Y>
+// constructed from this transform's 4x4 matrix.
+template<typename T> template <typename Y> inline
+cv::Affine3<Y> cv::Affine3<T>::cast() const
+{ Affine3<Y> converted(matrix); return converted; }
+
+// Composition of two transforms: the product applies affine2 first and
+// affine1 second, i.e. it equals affine2.concatenate(affine1).
+template<typename T> inline
+cv::Affine3<T> cv::operator*(const cv::Affine3<T>& affine1, const cv::Affine3<T>& affine2)
+{ const cv::Affine3<T>& appliedFirst = affine2; return appliedFirst.concatenate(affine1); }
+
+// Applies the transform to v (any type V exposing x, y, z): result = linear part * v + translation.
+template<typename T, typename V> inline
+V cv::operator*(const cv::Affine3<T>& affine, const V& v)
+{
+    const typename Affine3<T>::Mat4& a = affine.matrix;
+    V out;
+    out.x = a.val[0] * v.x + a.val[1] * v.y + a.val[ 2] * v.z + a.val[ 3];
+    out.y = a.val[4] * v.x + a.val[5] * v.y + a.val[ 6] * v.z + a.val[ 7];
+    out.z = a.val[8] * v.x + a.val[9] * v.y + a.val[10] * v.z + a.val[11];
+    return out;
+}
+
+// Applies a single-precision transform to a 3-vector: linear part * v + translation.
+static inline
+cv::Vec3f cv::operator*(const cv::Affine3f& affine, const cv::Vec3f& v)
+{
+    const float* rowPtr = affine.matrix.val; // start of the current 4-element matrix row
+    cv::Vec3f out;
+    for (int i = 0; i < 3; ++i, rowPtr += 4)
+        out.val[i] = rowPtr[0] * v[0] + rowPtr[1] * v[1] + rowPtr[2] * v[2] + rowPtr[3];
+    return out;
+}
+
+// Applies a double-precision transform to a 3-vector: linear part * v + translation.
+static inline
+cv::Vec3d cv::operator*(const cv::Affine3d& affine, const cv::Vec3d& v)
+{
+    const double* rowPtr = affine.matrix.val; // start of the current 4-element matrix row
+    cv::Vec3d out;
+    for (int i = 0; i < 3; ++i, rowPtr += 4)
+        out.val[i] = rowPtr[0] * v[0] + rowPtr[1] * v[1] + rowPtr[2] * v[2] + rowPtr[3];
+    return out;
+}
+
+
+
+#if defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H
+
+// Builds the transform by copying the 4x4 coefficients of a row-major Eigen affine transform.
+// const_cast: data() on the const argument is const T*, which cv::Mat's void* data constructor cannot take; the buffer is only read.
+template<typename T> inline
+cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>& affine)
+{ cv::Mat(4, 4, cv::traits::Type<T>::value, const_cast<T*>(affine.matrix().data())).copyTo(matrix); }
+
+template<typename T> inline // constructor from an Eigen affine transform with default storage options
+cv::Affine3<T>::Affine3(const Eigen::Transform<T, 3, Eigen::Affine>& affine)
+{
+    Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> a = affine; // row-major copy of the input
+    cv::Mat(4, 4, cv::traits::Type<T>::value, a.matrix().data()).copyTo(matrix); // copy its 4x4 coefficients into matrix
+}
+
+template<typename T> inline // conversion to a row-major Eigen affine transform
+cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>() const
+{
+    Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)> r;
+    cv::Mat hdr(4, 4, cv::traits::Type<T>::value, r.matrix().data()); // cv::Mat header wrapping r's own storage
+    cv::Mat(matrix, false).copyTo(hdr); // writes the 4x4 matrix into r through that header
+    return r;
+}
+
+template<typename T> inline // conversion to an Eigen affine transform with default storage options
+cv::Affine3<T>::operator Eigen::Transform<T, 3, Eigen::Affine>() const
+{
+    return this->operator Eigen::Transform<T, 3, Eigen::Affine, (Eigen::RowMajor)>(); // reuses the row-major conversion; presumably Eigen converts the layout on return - confirm
+}
+
+#endif /* defined EIGEN_WORLD_VERSION && defined EIGEN_GEOMETRY_MODULE_H */
+
+//! @endcond
+
+#endif /* __cplusplus */
+
+#endif /* OPENCV_CORE_AFFINE3_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/async.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/async.hpp
new file mode 100644
index 0000000..54560c7
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/async.hpp
@@ -0,0 +1,105 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_HPP
+#define OPENCV_CORE_ASYNC_HPP
+
+#include <opencv2/core/mat.hpp>
+
+#ifdef CV_CXX11
+//#include <future>
+#include <chrono>
+#endif
+
+namespace cv {
+
+/** @addtogroup core_async
+
+@{
+*/
+
+
+/** @brief Returns result of asynchronous operations
+
+Object has attached asynchronous state.
+Assignment operator doesn't clone asynchronous state (it is shared between all instances).
+
+Result can be fetched via get() method only once.
+Move construction and move assignment only hand over / swap the internal state pointer and never throw.
+*/
+class CV_EXPORTS_W AsyncArray
+{
+public:
+    ~AsyncArray() CV_NOEXCEPT;
+    CV_WRAP AsyncArray() CV_NOEXCEPT;
+    AsyncArray(const AsyncArray& o) CV_NOEXCEPT;
+    AsyncArray& operator=(const AsyncArray& o) CV_NOEXCEPT;
+    CV_WRAP void release() CV_NOEXCEPT;
+
+    /** Fetch the result.
+    @param[out] dst destination array
+
+    Waits for result until container has valid result.
+    Throws exception if exception was stored as a result.
+
+    Throws exception on invalid container state.
+
+    @note Result or stored exception can be fetched only once.
+    */
+    CV_WRAP void get(OutputArray dst) const;
+
+    /** Retrieving the result with timeout
+    @param[out] dst destination array
+    @param[in] timeoutNs timeout in nanoseconds, -1 for infinite wait
+
+    @returns true if result is ready, false if the timeout has expired
+
+    @note Result or stored exception can be fetched only once.
+    */
+    bool get(OutputArray dst, int64 timeoutNs) const;
+    /** @overload The `double` timeout is truncated to int64 nanoseconds by a C-style cast. */
+    CV_WRAP inline
+    bool get(OutputArray dst, double timeoutNs) const { return get(dst, (int64)timeoutNs); }
+    /** Waits for the result without fetching it; timeout in nanoseconds. NOTE(review): return value presumably mirrors get() (true = ready, false = timed out) - confirm against the implementation. */
+    bool wait_for(int64 timeoutNs) const;
+    /** @overload The `double` timeout is truncated to int64 nanoseconds by a C-style cast. */
+    CV_WRAP inline
+    bool wait_for(double timeoutNs) const { return wait_for((int64)timeoutNs); }
+
+    CV_WRAP bool valid() const CV_NOEXCEPT;
+
+#ifdef CV_CXX11
+    inline AsyncArray(AsyncArray&& o) CV_NOEXCEPT { p = o.p; o.p = NULL; } // steals o's state pointer; o is left empty
+    inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
+    // std::chrono overloads: duration_cast converts any duration (also floating-point or finer than 1 ns) to whole nanoseconds
+    template<typename _Rep, typename _Period>
+    inline bool get(OutputArray dst, const std::chrono::duration<_Rep, _Period>& timeout)
+    {
+        return get(dst, (int64)(std::chrono::duration_cast<std::chrono::nanoseconds>(timeout).count()));
+    }
+
+    template<typename _Rep, typename _Period>
+    inline bool wait_for(const std::chrono::duration<_Rep, _Period>& timeout)
+    {
+        return wait_for((int64)(std::chrono::duration_cast<std::chrono::nanoseconds>(timeout).count()));
+    }
+
+#if 0
+    std::future<Mat> getFutureMat() const;
+    std::future<UMat> getFutureUMat() const;
+#endif
+#endif
+
+
+    // PImpl
+    struct Impl; friend struct Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/base.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/base.hpp
new file mode 100644
index 0000000..21a61a4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/base.hpp
@@ -0,0 +1,664 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_BASE_HPP
+#define OPENCV_CORE_BASE_HPP
+
+#ifndef __cplusplus
+#  error base.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/opencv_modules.hpp"
+
+#include <climits>
+#include <algorithm>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+namespace Error {
+//! error codes
+enum Code {
+    StsOk=                       0,  //!< everything is ok
+    StsBackTrace=               -1,  //!< pseudo error for back trace
+    StsError=                   -2,  //!< unknown/unspecified error
+    StsInternal=                -3,  //!< internal error (bad state)
+    StsNoMem=                   -4,  //!< insufficient memory
+    StsBadArg=                  -5,  //!< function arg/param is bad
+    StsBadFunc=                 -6,  //!< unsupported function
+    StsNoConv=                  -7,  //!< iteration didn't converge
+    StsAutoTrace=               -8,  //!< tracing
+    HeaderIsNull=               -9,  //!< image header is NULL
+    BadImageSize=              -10,  //!< image size is invalid
+    BadOffset=                 -11,  //!< offset is invalid
+    BadDataPtr=                -12,  //!< NOTE(review): undocumented; name suggests an invalid data pointer - confirm
+    BadStep=                   -13,  //!< image step is wrong, this may happen for a non-continuous matrix.
+    BadModelOrChSeq=           -14,  //!< NOTE(review): undocumented; name suggests a bad color model or channel sequence - confirm
+    BadNumChannels=            -15,  //!< bad number of channels, for example, some functions accept only single channel matrices.
+    BadNumChannel1U=           -16,  //!< NOTE(review): undocumented; meaning is not evident from this header
+    BadDepth=                  -17,  //!< input image depth is not supported by the function
+    BadAlphaChannel=           -18,  //!< NOTE(review): undocumented; name suggests a problem with the alpha channel - confirm
+    BadOrder=                  -19,  //!< number of dimensions is out of range
+    BadOrigin=                 -20,  //!< incorrect input origin
+    BadAlign=                  -21,  //!< incorrect input align
+    BadCallBack=               -22,  //!< NOTE(review): undocumented; name suggests an invalid callback - confirm
+    BadTileSize=               -23,  //!< NOTE(review): undocumented; name suggests an invalid tile size - confirm
+    BadCOI=                    -24,  //!< input COI is not supported
+    BadROISize=                -25,  //!< incorrect input roi
+    MaskIsTiled=               -26,  //!< NOTE(review): undocumented; name suggests a tiled mask was passed - confirm
+    StsNullPtr=                -27,  //!< null pointer
+    StsVecLengthErr=           -28,  //!< incorrect vector length
+    StsFilterStructContentErr= -29,  //!< incorrect filter structure content
+    StsKernelStructContentErr= -30,  //!< incorrect transform kernel content
+    StsFilterOffsetErr=        -31,  //!< incorrect filter offset value
+    StsBadSize=                -201, //!< the input/output structure size is incorrect
+    StsDivByZero=              -202, //!< division by zero
+    StsInplaceNotSupported=    -203, //!< in-place operation is not supported
+    StsObjectNotFound=         -204, //!< request can't be completed
+    StsUnmatchedFormats=       -205, //!< formats of input/output arrays differ
+    StsBadFlag=                -206, //!< flag is wrong or not supported
+    StsBadPoint=               -207, //!< bad CvPoint
+    StsBadMask=                -208, //!< bad format of mask (neither 8uC1 nor 8sC1)
+    StsUnmatchedSizes=         -209, //!< sizes of input/output structures do not match
+    StsUnsupportedFormat=      -210, //!< the data format/type is not supported by the function
+    StsOutOfRange=             -211, //!< some of parameters are out of range
+    StsParseError=             -212, //!< invalid syntax/structure of the parsed file
+    StsNotImplemented=         -213, //!< the requested function/feature is not implemented
+    StsBadMemBlock=            -214, //!< an allocated block has been corrupted
+    StsAssert=                 -215, //!< assertion failed
+    GpuNotSupported=           -216, //!< no CUDA support
+    GpuApiCallError=           -217, //!< GPU API call error
+    OpenGlNotSupported=        -218, //!< no OpenGL support
+    OpenGlApiCallError=        -219, //!< OpenGL API call error
+    OpenCLApiCallError=        -220, //!< OpenCL API call error
+    OpenCLDoubleNotSupported=  -221, //!< NOTE(review): undocumented; name suggests no OpenCL double-precision support - confirm
+    OpenCLInitError=           -222, //!< OpenCL initialization error
+    OpenCLNoAMDBlasFft=        -223  //!< NOTE(review): undocumented; name suggests the AMD BLAS/FFT libraries are unavailable - confirm
+};
+} //Error
+
+//! @} core_utils
+
+//! @addtogroup core_array
+//! @{
+
+//! matrix decomposition types
+enum DecompTypes {
+    /** Gaussian elimination with the optimal pivot element chosen. */
+    DECOMP_LU       = 0,
+    /** singular value decomposition (SVD) method; the system can be over-defined and/or the matrix
+    src1 can be singular */
+    DECOMP_SVD      = 1,
+    /** eigenvalue decomposition; the matrix src1 must be symmetric */
+    DECOMP_EIG      = 2,
+    /** Cholesky \f$LL^T\f$ factorization; the matrix src1 must be symmetric and
+    positive-definite */
+    DECOMP_CHOLESKY = 3,
+    /** QR factorization; the system can be over-defined and/or the matrix src1 can be singular */
+    DECOMP_QR       = 4,
+    /** while all the previous flags are mutually exclusive, this flag can be used together with
+    any of the previous; it means that the normal equations
+    \f$\texttt{src1}^T\cdot\texttt{src1}\cdot\texttt{dst}=\texttt{src1}^T\texttt{src2}\f$ are
+    solved instead of the original system
+    \f$\texttt{src1}\cdot\texttt{dst}=\texttt{src2}\f$ */
+    DECOMP_NORMAL   = 16
+};
+
+/** norm types and norm flags
+
+src1 and src2 denote input arrays. In each formula the first case applies to one input array, the second and third to two.
+*/
+
+enum NormTypes {
+                /**
+                \f[
+                norm =  \forkthree
+                {\|\texttt{src1}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
+                {\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}} =  \max _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_INF}\) }
+                {\frac{\|\texttt{src1}-\texttt{src2}\|_{L_{\infty}}    }{\|\texttt{src2}\|_{L_{\infty}} }}{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_INF}\) }
+                \f]
+                */
+                NORM_INF       = 1,
+                /**
+                \f[
+                norm =  \forkthree
+                {\| \texttt{src1} \| _{L_1} =  \sum _I | \texttt{src1} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\)}
+                { \| \texttt{src1} - \texttt{src2} \| _{L_1} =  \sum _I | \texttt{src1} (I) -  \texttt{src2} (I)|}{if  \(\texttt{normType} = \texttt{NORM_L1}\) }
+                { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_1} }{\|\texttt{src2}\|_{L_1}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L1}\) }
+                \f]*/
+                 NORM_L1        = 2,
+                 /**
+                 \f[
+                 norm =  \forkthree
+                 { \| \texttt{src1} \| _{L_2} =  \sqrt{\sum_I \texttt{src1}(I)^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
+                 { \| \texttt{src1} - \texttt{src2} \| _{L_2} =  \sqrt{\sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2} }{if  \(\texttt{normType} = \texttt{NORM_L2}\) }
+                 { \frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}} }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
+                 \f]
+                 */
+                 NORM_L2        = 4,
+                 /**
+                 \f[
+                 norm =  \forkthree
+                 { \| \texttt{src1} \| _{L_2} ^{2} = \sum_I \texttt{src1}(I)^2} {if  \(\texttt{normType} = \texttt{NORM_L2SQR}\)}
+                 { \| \texttt{src1} - \texttt{src2} \| _{L_2} ^{2} =  \sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2 }{if  \(\texttt{normType} = \texttt{NORM_L2SQR}\) }
+                 { \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if  \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2SQR}\) }
+                 \f]
+                 */
+                 NORM_L2SQR     = 5,
+                 /**
+                 In the case of one input array, calculates the Hamming distance of the array from zero,
+                 In the case of two input arrays, calculates the Hamming distance between the arrays.
+                 */
+                 NORM_HAMMING   = 6,
+                 /**
+                 Similar to NORM_HAMMING, but in the calculation, each two bits of the input sequence will
+                 be added and treated as a single bit to be used in the same calculation as NORM_HAMMING.
+                 */
+                 NORM_HAMMING2  = 7,
+                 NORM_TYPE_MASK = 7, //!< bit-mask which can be used to separate norm type from norm flags
+                 NORM_RELATIVE  = 8, //!< flag: relative norm, OR-ed with a norm type as in the NORM_RELATIVE | NORM_* cases above
+                 NORM_MINMAX    = 32 //!< flag
+               };
+
+//! comparison types (which relation between src1 and src2 is tested)
+enum CmpTypes { CMP_EQ = 0, //!< src1 is equal to src2.
+                CMP_GT = 1, //!< src1 is greater than src2.
+                CMP_GE = 2, //!< src1 is greater than or equal to src2.
+                CMP_LT = 3, //!< src1 is less than src2.
+                CMP_LE = 4, //!< src1 is less than or equal to src2.
+                CMP_NE = 5  //!< src1 is not equal to src2.
+              };
+
+//! generalized matrix multiplication flags; the values are distinct bits (1, 2, 4)
+enum GemmFlags { GEMM_1_T = 1, //!< transposes src1
+                 GEMM_2_T = 2, //!< transposes src2
+                 GEMM_3_T = 4 //!< transposes src3
+               };
+
+enum DftFlags { // flags for the DFT and DCT transforms; the two DCT_* values alias the matching DFT_* values
+    /** performs an inverse 1D or 2D transform instead of the default forward
+        transform. */
+    DFT_INVERSE        = 1,
+    /** scales the result: divide it by the number of array elements. Normally, it is
+        combined with DFT_INVERSE. */
+    DFT_SCALE          = 2,
+    /** performs a forward or inverse transform of every individual row of the input
+        matrix; this flag enables you to transform multiple vectors simultaneously and can be used to
+        decrease the overhead (which is sometimes several times larger than the processing itself) to
+        perform 3D and higher-dimensional transformations and so forth.*/
+    DFT_ROWS           = 4,
+    /** performs a forward transformation of 1D or 2D real array; the result,
+        though being a complex array, has complex-conjugate symmetry (*CCS*, see the function
+        description below for details), and such an array can be packed into a real array of the same
+        size as input, which is the fastest option and which is what the function does by default;
+        however, you may wish to get a full complex array (for simpler spectrum analysis, and so on) -
+        pass the flag to enable the function to produce a full-size complex output array. */
+    DFT_COMPLEX_OUTPUT = 16,
+    /** performs an inverse transformation of a 1D or 2D complex array; the
+        result is normally a complex array of the same size, however, if the input array has
+        conjugate-complex symmetry (for example, it is a result of forward transformation with
+        DFT_COMPLEX_OUTPUT flag), the output is a real array; while the function itself does not
+        check whether the input is symmetrical or not, you can pass the flag and then the function
+        will assume the symmetry and produce the real output array (note that when the input is packed
+        into a real array and inverse transformation is executed, the function treats the input as a
+        packed complex-conjugate symmetrical array, and the output will also be a real array). */
+    DFT_REAL_OUTPUT    = 32,
+    /** specifies that input is complex input. If this flag is set, the input must have 2 channels.
+        On the other hand, for backwards compatibility reasons, if input has 2 channels, input is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
+    /** performs an inverse 1D or 2D transform instead of the default forward transform. */
+    DCT_INVERSE        = DFT_INVERSE,
+    /** performs a forward or inverse transform of every individual row of the input
+        matrix. This flag enables you to transform multiple vectors simultaneously and can be used to
+        decrease the overhead (which is sometimes several times larger than the processing itself) to
+        perform 3D and higher-dimensional transforms and so forth.*/
+    DCT_ROWS           = DFT_ROWS
+};
+
+//! Various border types; in the diagrams below `|` marks the image boundaries and `abcdefgh` the image pixels
+//! @see borderInterpolate, copyMakeBorder
+enum BorderTypes {
+    BORDER_CONSTANT    = 0, //!< `iiiiii|abcdefgh|iiiiiii`  with some specified `i`
+    BORDER_REPLICATE   = 1, //!< `aaaaaa|abcdefgh|hhhhhhh`
+    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
+    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
+    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+
+    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+};
+
+//! @} core_array
+
+//! @addtogroup core_utils
+//! @{
+
+/*! @brief Signals an error and raises the exception.
+
+By default the function prints information about the error to stderr,
+then it either stops if setBreakOnError() had been called before or raises the exception.
+It is possible to alter error processing by using redirectError().
+@param _code - error code (Error::Code)
+@param _err - error description
+@param _func - function name. Available only when the compiler supports getting it
+@param _file - source file name where the error has occurred
+@param _line - line number in the source file where the error has occurred
+@see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert
+ */
+CV_EXPORTS CV_NORETURN void error(int _code, const String& _err, const char* _func, const char* _file, int _line);
+
+#ifdef CV_STATIC_ANALYSIS
+
+// In practice, some macros are not processed correctly (noreturn is not detected).
+// We need to use simplified definition for them.
+#define CV_Error(code, msg) do { (void)(code); (void)(msg); abort(); } while (0)
+#define CV_Error_(code, args) do { (void)(code); (void)(cv::format args); abort(); } while (0)
+#define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
+
+#else // CV_STATIC_ANALYSIS
+
+/** @brief Call the error handler.
+
+Currently, the error handler prints the error code and the error message to the standard
+error stream `stderr`. In the Debug configuration, it then provokes memory access violation, so that
+the execution stack and all the parameters can be analyzed by the debugger. In the Release
+configuration, the exception is thrown.
+
+@param code one of Error::Code
+@param msg error message
+*/
+#define CV_Error( code, msg ) cv::error( code, msg, CV_Func, __FILE__, __LINE__ )
+
+/**  @brief Call the error handler.
+
+This macro can be used to construct an error message on the fly to include some dynamic information,
+for example:
+@code
+    // note the extra parentheses around the formatted text message
+    CV_Error_(Error::StsOutOfRange,
+    ("the value at (%d, %d)=%g is out of range", badPt.x, badPt.y, badValue));
+@endcode
+@param code one of Error::Code
+@param args printf-like formatted error message in parentheses
+*/
+#define CV_Error_( code, args ) cv::error( code, cv::format args, CV_Func, __FILE__, __LINE__ )
+
+/** @brief Checks a condition at runtime and throws exception if it fails
+
+The macros CV_Assert (and CV_DbgAssert(expr)) evaluate the specified expression. If it is 0, the macros
+raise an error (see cv::error). The macro CV_Assert checks the condition in both Debug and Release
+configurations while CV_DbgAssert is only retained in the Debug configuration.
+*/
+#define CV_Assert( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0)
+
+#endif // CV_STATIC_ANALYSIS
+
+//! @cond IGNORED
+#if !defined(__OPENCV_BUILD)  // TODO: backward compatibility only
+#ifndef CV_ErrorNoReturn
+#define CV_ErrorNoReturn CV_Error
+#endif
+#ifndef CV_ErrorNoReturn_
+#define CV_ErrorNoReturn_ CV_Error_
+#endif
+#endif
+
+#define CV_Assert_1 CV_Assert // a single expression: plain CV_Assert
+#define CV_Assert_2( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_1( __VA_ARGS__ )) // CV_Assert_k: assert the first argument, pass the rest to CV_Assert_(k-1)
+#define CV_Assert_3( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_2( __VA_ARGS__ ))
+#define CV_Assert_4( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_3( __VA_ARGS__ ))
+#define CV_Assert_5( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_4( __VA_ARGS__ ))
+#define CV_Assert_6( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_5( __VA_ARGS__ ))
+#define CV_Assert_7( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_6( __VA_ARGS__ ))
+#define CV_Assert_8( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_7( __VA_ARGS__ ))
+#define CV_Assert_9( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_8( __VA_ARGS__ ))
+#define CV_Assert_10( expr, ... ) CV_Assert_1(expr); __CV_EXPAND(CV_Assert_9( __VA_ARGS__ )) // largest arity defined in this list
+// CV_Assert_N presumably dispatches on the argument count via __CV_VA_NUM_ARGS (defined elsewhere); its do/while makes the multi-statement expansion one statement
+#define CV_Assert_N(...) do { __CV_EXPAND(__CV_CAT(CV_Assert_, __CV_VA_NUM_ARGS(__VA_ARGS__)) (__VA_ARGS__)); } while(0)
+
+//! @endcond
+
+#if defined _DEBUG || defined CV_STATIC_ANALYSIS
+#  define CV_DbgAssert(expr) CV_Assert(expr)
+#else
+/** replaced with CV_Assert(expr) in Debug configuration */
+#  define CV_DbgAssert(expr)
+#endif
+
+/*
+ * Hamming distance functor - counts the bit differences between two byte strings - useful for the Brief descriptor
+ * (the distance is the number of set bits in A XOR B)
+ */
+struct CV_EXPORTS Hamming
+{
+    static const NormTypes normType = NORM_HAMMING; // norm type associated with this functor
+    typedef unsigned char ValueType; // element type of the compared arrays
+    typedef int ResultType; // type of the returned distance
+
+    /** this will count the bits in a ^ b
+     */
+    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const; // size: presumably the number of bytes compared - defined outside this header
+};
+
+typedef Hamming HammingLUT; // HammingLUT is another name for the same functor
+
+/////////////////////////////////// inline norms ////////////////////////////////////
+
+template<typename _Tp> inline _Tp cv_abs(_Tp x) { return std::abs(x); } // generic case: result keeps the argument type
+inline int cv_abs(uchar x) { return static_cast<int>(x); } // unsigned input is already non-negative
+inline int cv_abs(schar x) { const int wide = x; return std::abs(wide); } // widened to int first, so -128 maps to 128
+inline int cv_abs(ushort x) { return static_cast<int>(x); } // unsigned input is already non-negative
+inline int cv_abs(short x) { const int wide = x; return std::abs(wide); } // widened to int first, so -32768 maps to 32768
+
+// Squared L2 norm: sum of a[i]*a[i] over the n elements of a, accumulated in _AccTp.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, int n)
+{
+    _AccTp acc = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 )
+    {
+        _AccTp e0 = a[k], e1 = a[k+1], e2 = a[k+2], e3 = a[k+3];
+        acc += e0*e0 + e1*e1 + e2*e2 + e3*e3;
+    }
+#endif
+    for( ; k < n; ++k )
+    {
+        _AccTp e = a[k]; acc += e*e;
+    }
+    return acc;
+}
+
+// L1 norm: sum of cv_abs(a[i]) over the n elements of a, accumulated in _AccTp.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, int n)
+{
+    _AccTp total = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 )
+    {
+        total += (_AccTp)cv_abs(a[k]) + (_AccTp)cv_abs(a[k+1]) + (_AccTp)cv_abs(a[k+2]) + (_AccTp)cv_abs(a[k+3]);
+    }
+#endif
+    for( ; k < n; ++k )
+        total += cv_abs(a[k]);
+    return total;
+}
+
+// Infinity norm: the largest cv_abs(a[i]) over the n elements of a, as _AccTp (0 when n <= 0).
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, int n)
+{
+    _AccTp peak = 0;
+    for( int k = 0; k < n; ++k ) peak = std::max(peak, (_AccTp)cv_abs(a[k]));
+    return peak;
+}
+
+// Squared L2 distance: sum of (a[i] - b[i])^2 over n elements; each difference is converted to _AccTp before squaring.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp acc = 0;
+    int k = 0;
+#if CV_ENABLE_UNROLLED
+    for( ; k <= n - 4; k += 4 )
+    {
+        _AccTp d0 = _AccTp(a[k] - b[k]), d1 = _AccTp(a[k+1] - b[k+1]), d2 = _AccTp(a[k+2] - b[k+2]), d3 = _AccTp(a[k+3] - b[k+3]);
+        acc += d0*d0 + d1*d1 + d2*d2 + d3*d3;
+    }
+#endif
+    for( ; k < n; ++k )
+    {
+        _AccTp d = _AccTp(a[k] - b[k]); acc += d*d;
+    }
+    return acc;
+}
+
+// Non-template overload for float arrays: sum of squared differences, accumulated in float.
+static inline float normL2Sqr(const float* a, const float* b, int n)
+{
+    float acc = 0.f;
+    for( int k = 0; k < n; ++k )
+    {
+        float d = a[k] - b[k]; acc += d*d;
+    }
+    return acc;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp s = 0;
+    int i= 0;
+#if CV_ENABLE_UNROLLED
+    for(; i <= n - 4; i += 4 )
+    {
+        _AccTp v0 = _AccTp(a[i] - b[i]), v1 = _AccTp(a[i+1] - b[i+1]), v2 = _AccTp(a[i+2] - b[i+2]), v3 = _AccTp(a[i+3] - b[i+3]);
+        s += std::abs(v0) + std::abs(v1) + std::abs(v2) + std::abs(v3);
+    }
+#endif
+    for( ; i < n; i++ )
+    {
+        _AccTp v = _AccTp(a[i] - b[i]);
+        s += std::abs(v);
+    }
+    return s;
+}
+
+inline float normL1(const float* a, const float* b, int n)
+{
+    // Accumulate |a[k] - b[k]| in index order.
+    float acc = 0.f;
+    for( int k = 0; k < n; k++ )
+        acc += std::abs(a[k] - b[k]);
+
+    return acc;
+}
+
+inline int normL1(const uchar* a, const uchar* b, int n)
+{
+    int s = 0;
+    for( int i = 0; i < n; i++ )
+    {
+        s += std::abs(a[i] - b[i]);
+    }
+    return s;
+}
+
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp best = 0;  // largest absolute element-wise difference seen so far
+    for( int k = 0; k < n; k++ )
+    {
+        const _AccTp diff = a[k] - b[k];
+        best = std::max(best, std::abs(diff));
+    }
+    return best;
+}
+
+/** @brief Computes the cube root of an argument.
+
+ The function cubeRoot computes \f$\sqrt[3]{\texttt{val}}\f$. Negative arguments are handled correctly.
+ NaN and Inf are not handled. The accuracy approaches the maximum possible accuracy for
+ single-precision data.
+ @param val A function argument.
+ */
+CV_EXPORTS_W float cubeRoot(float val);
+
+/** @overload
+
+cubeRoot with argument of `double` type calls `std::cbrt(double)`
+*/
+static inline
+double cubeRoot(double val)
+{
+    return std::cbrt(val);
+}
+
+/** @brief Calculates the angle of a 2D vector in degrees.
+
+ The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured
+ in degrees and varies from 0 to 360 degrees. The accuracy is about 0.3 degrees.
+ @param x x-coordinate of the vector.
+ @param y y-coordinate of the vector.
+ */
+CV_EXPORTS_W float fastAtan2(float y, float x);
+
+/** proxy for hal::LU */
+CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+/** proxy for hal::LU */
+CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+/** proxy for hal::Cholesky */
+CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+/** proxy for hal::Cholesky */
+CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+////////////////// forward declarations for important OpenCV types //////////////////
+
+//! @cond IGNORED
+
+template<typename _Tp, int cn> class Vec;
+template<typename _Tp, int m, int n> class Matx;
+
+template<typename _Tp> class Complex;
+template<typename _Tp> class Point_;
+template<typename _Tp> class Point3_;
+template<typename _Tp> class Size_;
+template<typename _Tp> class Rect_;
+template<typename _Tp> class Scalar_;
+
+class CV_EXPORTS RotatedRect;
+class CV_EXPORTS Range;
+class CV_EXPORTS TermCriteria;
+class CV_EXPORTS KeyPoint;
+class CV_EXPORTS DMatch;
+class CV_EXPORTS RNG;
+
+class CV_EXPORTS Mat;
+class CV_EXPORTS MatExpr;
+
+class CV_EXPORTS UMat;
+
+class CV_EXPORTS SparseMat;
+typedef Mat MatND;
+
+template<typename _Tp> class Mat_;
+template<typename _Tp> class SparseMat_;
+
+class CV_EXPORTS MatConstIterator;
+class CV_EXPORTS SparseMatIterator;
+class CV_EXPORTS SparseMatConstIterator;
+template<typename _Tp> class MatIterator_;
+template<typename _Tp> class MatConstIterator_;
+template<typename _Tp> class SparseMatIterator_;
+template<typename _Tp> class SparseMatConstIterator_;
+
+namespace ogl
+{
+    class CV_EXPORTS Buffer;
+    class CV_EXPORTS Texture2D;
+    class CV_EXPORTS Arrays;
+}
+
+namespace cuda
+{
+    class CV_EXPORTS GpuMat;
+    class CV_EXPORTS HostMem;
+    class CV_EXPORTS Stream;
+    class CV_EXPORTS Event;
+}
+
+namespace cudev
+{
+    template <typename _Tp> class GpuMat_;
+}
+
+namespace ipp
+{
+CV_EXPORTS   unsigned long long getIppFeatures();
+CV_EXPORTS   void setIppStatus(int status, const char * const funcname = NULL, const char * const filename = NULL,
+                             int line = 0);
+CV_EXPORTS   int getIppStatus();
+CV_EXPORTS   String getIppErrorLocation();
+CV_EXPORTS_W bool   useIPP();
+CV_EXPORTS_W void   setUseIPP(bool flag);
+CV_EXPORTS_W String getIppVersion();
+
+// IPP Not-Exact mode. This function may force use of IPP when both IPP and OpenCV provide proper results
+// but have internal accuracy differences which have too much direct or indirect impact on accuracy tests.
+CV_EXPORTS_W bool useIPP_NotExact();
+CV_EXPORTS_W void setUseIPP_NotExact(bool flag);
+#ifndef DISABLE_OPENCV_3_COMPATIBILITY
+static inline bool useIPP_NE() { return useIPP_NotExact(); }
+static inline void setUseIPP_NE(bool flag) { setUseIPP_NotExact(flag); }
+#endif
+
+} // ipp
+
+//! @endcond
+
+//! @} core_utils
+
+
+
+
+} // cv
+
+#include "opencv2/core/neon_utils.hpp"
+#include "opencv2/core/vsx_utils.hpp"
+#include "opencv2/core/check.hpp"
+
+#endif //OPENCV_CORE_BASE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bindings_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bindings_utils.hpp
new file mode 100644
index 0000000..1c14334
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bindings_utils.hpp
@@ -0,0 +1,287 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_BINDINGS_UTILS_HPP
+#define OPENCV_CORE_BINDINGS_UTILS_HPP
+
+#include <opencv2/core/async.hpp>
+#include <opencv2/core/detail/async_promise.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <stdexcept>
+
+namespace cv { namespace utils {
+//! @addtogroup core_utils
+//! @{
+
+CV_EXPORTS_W String dumpInputArray(InputArray argument);
+
+CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument);
+
+CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument);
+
+CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argument);
+
+CV_WRAP static inline
+String dumpBool(bool argument)
+{
+    return String(argument ? "Bool: True" : "Bool: False");
+}
+
+CV_WRAP static inline
+String dumpInt(int argument)
+{
+    return cv::format("Int: %d", argument);
+}
+
+CV_WRAP static inline
+String dumpSizeT(size_t argument)
+{
+    std::ostringstream stream;
+    stream << "size_t: " << argument;
+    return stream.str();
+}
+
+CV_WRAP static inline
+String dumpFloat(float argument)
+{
+    return cv::format("Float: %.2f", argument);
+}
+
+CV_WRAP static inline
+String dumpDouble(double argument)
+{
+    return cv::format("Double: %.2f", argument);
+}
+
+CV_WRAP static inline
+String dumpCString(const char* argument)
+{
+    return cv::format("String: %s", argument ? argument : "(null)");  // a NULL pointer with %s is undefined behavior
+}
+
+CV_WRAP static inline
+String dumpString(const String& argument)
+{
+    return cv::format("String: %s", argument.c_str());
+}
+
+CV_WRAP static inline
+String testOverloadResolution(int value, const Point& point = Point(42, 24))
+{
+    return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x,
+                  point.y);
+}
+
+CV_WRAP static inline
+String testOverloadResolution(const Rect& rect)
+{
+    return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y,
+                  rect.width, rect.height);
+}
+
+CV_WRAP static inline
+String dumpRect(const Rect& argument)
+{
+    return format("rect: (x=%d, y=%d, w=%d, h=%d)", argument.x, argument.y,
+                  argument.width, argument.height);
+}
+
+CV_WRAP static inline
+String dumpTermCriteria(const TermCriteria& argument)
+{
+    // Same "name: (field=value, ...)" layout as dumpRect and dumpRotatedRect, with balanced parentheses.
+    return format("term_criteria: (type=%d, max_count=%d, epsilon=%lf)", argument.type, argument.maxCount, argument.epsilon);
+}
+
+CV_WRAP static inline
+String dumpRotatedRect(const RotatedRect& argument)
+{
+    return format("rotated_rect: (c_x=%f, c_y=%f, w=%f, h=%f, a=%f)",
+                  argument.center.x, argument.center.y, argument.size.width,
+                  argument.size.height, argument.angle);
+}
+
+CV_WRAP static inline
+RotatedRect testRotatedRect(float x, float y, float w, float h, float angle)
+{
+    return RotatedRect(Point2f(x, y), Size2f(w, h), angle);
+}
+
+CV_WRAP static inline
+std::vector<RotatedRect> testRotatedRectVector(float x, float y, float w, float h, float angle)
+{
+    std::vector<RotatedRect> result;
+    for (int i = 0; i < 10; i++)
+        result.push_back(RotatedRect(Point2f(x + i, y + 2 * i), Size2f(w, h), angle + 10 * i));
+    return result;
+}
+
+CV_WRAP static inline
+String dumpRange(const Range& argument)
+{
+    // A value equal to Range::all() is printed symbolically.
+    if (argument == Range::all())
+    {
+        return "range: all";
+    }
+
+    // Any other value is printed with its explicit start/end bounds.
+    return format("range: (s=%d, e=%d)", argument.start, argument.end);
+}
+
+CV_WRAP static inline
+int testOverwriteNativeMethod(int argument)
+{
+    return argument;
+}
+
+CV_WRAP static inline
+String testReservedKeywordConversion(int positional_argument, int lambda = 2, int from = 3)
+{
+    return format("arg=%d, lambda=%d, from=%d", positional_argument, lambda, from);
+}
+
+CV_EXPORTS_W String dumpVectorOfInt(const std::vector<int>& vec);
+
+CV_EXPORTS_W String dumpVectorOfDouble(const std::vector<double>& vec);
+
+CV_EXPORTS_W String dumpVectorOfRect(const std::vector<Rect>& vec);
+
+CV_WRAP static inline
+void generateVectorOfRect(size_t len, CV_OUT std::vector<Rect>& vec)
+{
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(12345);
+        Mat tmp(static_cast<int>(len), 1, CV_32SC4);
+        rng.fill(tmp, RNG::UNIFORM, 10, 20);
+        tmp.copyTo(vec);
+    }
+}
+
+CV_WRAP static inline
+void generateVectorOfInt(size_t len, CV_OUT std::vector<int>& vec)
+{
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(554433);
+        Mat tmp(static_cast<int>(len), 1, CV_32SC1);
+        rng.fill(tmp, RNG::UNIFORM, -10, 10);
+        tmp.copyTo(vec);
+    }
+}
+
+CV_WRAP static inline
+void generateVectorOfMat(size_t len, int rows, int cols, int dtype, CV_OUT std::vector<Mat>& vec)
+{
+    vec.resize(len);
+    if (len > 0)
+    {
+        RNG rng(65431);
+        for (size_t i = 0; i < len; ++i)
+        {
+            vec[i].create(rows, cols, dtype);
+            rng.fill(vec[i], RNG::UNIFORM, 0, 10);
+        }
+    }
+}
+
+CV_WRAP static inline
+AsyncArray testAsyncArray(InputArray argument)
+{
+    AsyncPromise p;
+    p.setValue(argument);
+    return p.getArrayResult();
+}
+
+CV_WRAP static inline
+AsyncArray testAsyncException()
+{
+    AsyncPromise p;
+    return p.getArrayResult();
+}
+
+namespace nested {
+CV_WRAP static inline bool testEchoBooleanFunction(bool flag) {
+    return flag;
+}
+
+class CV_EXPORTS_W CV_WRAP_AS(ExportClassName) OriginalClassName
+{
+public:
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_PROP_RW int int_value;
+        CV_PROP_RW float float_value;
+
+        CV_WRAP explicit Params(int int_param = 123, float float_param = 3.5f)
+        {
+            int_value = int_param;
+            float_value = float_param;
+        }
+    };
+
+    explicit OriginalClassName(const OriginalClassName::Params& params = OriginalClassName::Params())
+    {
+        params_ = params;
+    }
+
+    CV_WRAP int getIntParam() const
+    {
+        return params_.int_value;
+    }
+
+    CV_WRAP float getFloatParam() const
+    {
+        return params_.float_value;
+    }
+
+    CV_WRAP static std::string originalName()
+    {
+        return "OriginalClassName";
+    }
+
+    CV_WRAP static Ptr<OriginalClassName>
+    create(const OriginalClassName::Params& params = OriginalClassName::Params())
+    {
+        return makePtr<OriginalClassName>(params);
+    }
+
+private:
+    OriginalClassName::Params params_;
+};
+
+typedef OriginalClassName::Params OriginalClassName_Params;
+} // namespace nested
+
+namespace fs {
+    CV_EXPORTS_W cv::String getCacheDirectoryForDownloads();
+} // namespace fs
+
+//! @}  // core_utils
+}  // namespace cv::utils
+
+//! @cond IGNORED
+
+CV_WRAP static inline
+int setLogLevel(int level)
+{
+    // NB: Binding generators don't work properly with enums yet, so we define a separate int-based overload here
+    return cv::utils::logging::setLogLevel((cv::utils::logging::LogLevel)level);
+}
+
+CV_WRAP static inline
+int getLogLevel()
+{
+    return cv::utils::logging::getLogLevel();
+}
+
+//! @endcond IGNORED
+
+} // namespace cv
+
+#endif // OPENCV_CORE_BINDINGS_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bufferpool.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bufferpool.hpp
new file mode 100644
index 0000000..4698e5d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/bufferpool.hpp
@@ -0,0 +1,40 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+
+#ifndef OPENCV_CORE_BUFFER_POOL_HPP
+#define OPENCV_CORE_BUFFER_POOL_HPP
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4265)
+#endif
+
+namespace cv
+{
+
+//! @addtogroup core
+//! @{
+
+class BufferPoolController
+{
+protected:
+    ~BufferPoolController() { }
+public:
+    virtual size_t getReservedSize() const = 0;
+    virtual size_t getMaxReservedSize() const = 0;
+    virtual void setMaxReservedSize(size_t size) = 0;
+    virtual void freeAllReservedBuffers() = 0;
+};
+
+//! @}
+
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif // OPENCV_CORE_BUFFER_POOL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/check.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/check.hpp
new file mode 100644
index 0000000..a32b811
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/check.hpp
@@ -0,0 +1,160 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_CHECK_HPP
+#define OPENCV_CORE_CHECK_HPP
+
+#include <opencv2/core/base.hpp>
+
+namespace cv {
+
+/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or "<invalid depth>" */
+CV_EXPORTS const char* depthToString(int depth);
+
+/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or "<invalid type>" */
+CV_EXPORTS String typeToString(int type);
+
+
+//! @cond IGNORED
+namespace detail {
+
+/** Returns string of cv::Mat depth value: CV_8U -> "CV_8U" or NULL */
+CV_EXPORTS const char* depthToString_(int depth);
+
+/** Returns string of cv::Mat type value: CV_8UC3 -> "CV_8UC3" or cv::String() */
+CV_EXPORTS cv::String typeToString_(int type);
+
+enum TestOp {
+  TEST_CUSTOM = 0,
+  TEST_EQ = 1,
+  TEST_NE = 2,
+  TEST_LE = 3,
+  TEST_LT = 4,
+  TEST_GE = 5,
+  TEST_GT = 6,
+  CV__LAST_TEST_OP
+};
+
+struct CheckContext {
+    const char* func;
+    const char* file;
+    int line;
+    enum TestOp testOp;
+    const char* message;
+    const char* p1_str;
+    const char* p2_str;
+};
+
+#ifndef CV__CHECK_FILENAME
+# define CV__CHECK_FILENAME __FILE__
+#endif
+
+#ifndef CV__CHECK_FUNCTION
+# if defined _MSC_VER
+#   define CV__CHECK_FUNCTION __FUNCSIG__
+# elif defined __GNUC__
+#   define CV__CHECK_FUNCTION __PRETTY_FUNCTION__
+# else
+#   define CV__CHECK_FUNCTION "<unknown>"
+# endif
+#endif
+
+#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
+#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
+    static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
+            { CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, "" message, "" p1_str, "" p2_str }
+
+CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v1, const Size_<int> v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);
+
+CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_auto(const std::string& v1, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
+CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);
+
+
+#define CV__TEST_EQ(v1, v2) ((v1) == (v2))
+#define CV__TEST_NE(v1, v2) ((v1) != (v2))
+#define CV__TEST_LE(v1, v2) ((v1) <= (v2))
+#define CV__TEST_LT(v1, v2) ((v1) < (v2))
+#define CV__TEST_GE(v1, v2) ((v1) >= (v2))
+#define CV__TEST_GT(v1, v2) ((v1) > (v2))
+
+#define CV__CHECK(id, op, type, v1, v2, v1_str, v2_str, msg_str) do { \
+    if(CV__TEST_##op((v1), (v2))) ; else { \
+        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_ ## op, v1_str, v2_str); \
+        cv::detail::check_failed_ ## type((v1), (v2), CV__CHECK_LOCATION_VARNAME(id)); \
+    } \
+} while (0)
+
+#define CV__CHECK_CUSTOM_TEST(id, type, v, test_expr, v_str, test_expr_str, msg_str) do { \
+    if(!!(test_expr)) ; else { \
+        CV__DEFINE_CHECK_CONTEXT(id, msg_str, cv::detail::TEST_CUSTOM, v_str, test_expr_str); \
+        cv::detail::check_failed_ ## type((v), CV__CHECK_LOCATION_VARNAME(id)); \
+    } \
+} while (0)
+
+} // namespace
+//! @endcond
+
+
+/// Supported values of these types: int, float, double
+#define CV_CheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
+#define CV_CheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
+
+/// Check with additional "decoding" of type values in error message
+#define CV_CheckTypeEQ(t1, t2, msg)  CV__CHECK(_, EQ, MatType, t1, t2, #t1, #t2, msg)
+/// Check with additional "decoding" of depth values in error message
+#define CV_CheckDepthEQ(d1, d2, msg)  CV__CHECK(_, EQ, MatDepth, d1, d2, #d1, #d2, msg)
+
+#define CV_CheckChannelsEQ(c1, c2, msg)  CV__CHECK(_, EQ, MatChannels, c1, c2, #c1, #c2, msg)
+
+/// Example: type == CV_8UC1 || type == CV_8UC3
+#define CV_CheckType(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatType, t, (test_expr), #t, #test_expr, msg)
+
+/// Example: depth == CV_32F || depth == CV_64F
+#define CV_CheckDepth(t, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg)
+
+/// Example: v == A || v == B
+#define CV_Check(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
+
+/// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1")
+// TODO define pretty-printers
+
+#ifndef NDEBUG
+#define CV_DbgCheck(v, test_expr, msg)  CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg)
+#define CV_DbgCheckEQ(v1, v2, msg)  CV__CHECK(_, EQ, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckNE(v1, v2, msg)  CV__CHECK(_, NE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckLE(v1, v2, msg)  CV__CHECK(_, LE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckLT(v1, v2, msg)  CV__CHECK(_, LT, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckGE(v1, v2, msg)  CV__CHECK(_, GE, auto, v1, v2, #v1, #v2, msg)
+#define CV_DbgCheckGT(v1, v2, msg)  CV__CHECK(_, GT, auto, v1, v2, #v1, #v2, msg)
+#else
+#define CV_DbgCheck(v, test_expr, msg)  do { } while (0)
+#define CV_DbgCheckEQ(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckNE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckLE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckLT(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckGE(v1, v2, msg)  do { } while (0)
+#define CV_DbgCheckGT(v1, v2, msg)  do { } while (0)
+#endif
+
+} // namespace
+
+#endif // OPENCV_CORE_CHECK_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core.hpp
new file mode 100644
index 0000000..4389183
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/core.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core_c.h
new file mode 100644
index 0000000..7b686b8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/core_c.h
@@ -0,0 +1,3128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef OPENCV_CORE_C_H
+#define OPENCV_CORE_C_H
+
+#include "opencv2/core/types_c.h"
+
+#ifdef __cplusplus
+/* disable MSVC warning C4190 / clang-cl -Wreturn-type-c-linkage:
+       'function' has C-linkage specified, but returns UDT 'typename'
+       which is incompatible with C
+
+   It is OK to disable it because we only extend few plain structures with
+   C++ constructors for simpler interoperability with C++ API of the library
+*/
+#  if defined(__clang__)
+     // handle clang on Linux and clang-cl (i. e. clang on Windows) first
+#    pragma GCC diagnostic ignored "-Wreturn-type-c-linkage"
+#  elif defined(_MSC_VER)
+     // then handle MSVC
+#    pragma warning(disable:4190)
+#  endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup core_c
+    @{
+*/
+
+/****************************************************************************************\
+*          Array allocation, deallocation, initialization and access to elements         *
+\****************************************************************************************/
+
+/** `malloc` wrapper.
+   If there is not enough memory, the function
+   (as well as other OpenCV functions that call cvAlloc)
+   raises an error. */
+CVAPI(void*)  cvAlloc( size_t size );
+
+/** `free` wrapper.
+   Here and further all the memory releasing functions
+   (that all call cvFree) take a double pointer in order
+   to clear the pointer to the data after releasing it.
+   Passing pointer to NULL pointer is Ok: nothing happens in this case
+*/
+CVAPI(void)   cvFree_( void* ptr );
+#define cvFree(ptr) (cvFree_(*(ptr)), *(ptr)=0)
+
+/** @brief Creates an image header but does not allocate the image data.
+
+@param size Image width and height
+@param depth Image depth (see cvCreateImage )
+@param channels Number of channels (see cvCreateImage )
+ */
+CVAPI(IplImage*)  cvCreateImageHeader( CvSize size, int depth, int channels );
+
+/** @brief Initializes an image header that was previously allocated.
+
+The returned IplImage\* points to the initialized header.
+@param image Image header to initialize
+@param size Image width and height
+@param depth Image depth (see cvCreateImage )
+@param channels Number of channels (see cvCreateImage )
+@param origin Top-left IPL_ORIGIN_TL or bottom-left IPL_ORIGIN_BL
+@param align Alignment for image rows, typically 4 or 8 bytes
+ */
+CVAPI(IplImage*) cvInitImageHeader( IplImage* image, CvSize size, int depth,
+                                   int channels, int origin CV_DEFAULT(0),
+                                   int align CV_DEFAULT(4));
+
+/** @brief Creates an image header and allocates the image data.
+
+This function call is equivalent to the following code:
+@code
+    header = cvCreateImageHeader(size, depth, channels);
+    cvCreateData(header);
+@endcode
+@param size Image width and height
+@param depth Bit depth of image elements. See IplImage for valid depths.
+@param channels Number of channels per pixel. See IplImage for details. This function only creates
+images with interleaved channels.
+ */
+CVAPI(IplImage*)  cvCreateImage( CvSize size, int depth, int channels );
+
+/** @brief Deallocates an image header.
+
+This call is an analogue of :
+@code
+    if(image )
+    {
+        iplDeallocate(*image, IPL_IMAGE_HEADER | IPL_IMAGE_ROI);
+        *image = 0;
+    }
+@endcode
+but it does not use IPL functions by default (see the CV_TURN_ON_IPL_COMPATIBILITY macro).
+@param image Double pointer to the image header
+ */
+CVAPI(void)  cvReleaseImageHeader( IplImage** image );
+
+/** @brief Deallocates the image header and the image data.
+
+This call is a shortened form of :
+@code
+    if(*image )
+    {
+        cvReleaseData(*image);
+        cvReleaseImageHeader(image);
+    }
+@endcode
+@param image Double pointer to the image header
+*/
+CVAPI(void)  cvReleaseImage( IplImage** image );
+
+/** Creates a copy of IPL image (widthStep may differ) */
+CVAPI(IplImage*) cvCloneImage( const IplImage* image );
+
+/** @brief Sets the channel of interest in an IplImage.
+
+If the ROI is set to NULL and the coi is *not* 0, the ROI is allocated. Most OpenCV functions do
+*not* support the COI setting, so to process an individual image/matrix channel one may copy (via
+cvCopy or cvSplit) the channel to a separate image/matrix, process it and then copy the result
+back (via cvCopy or cvMerge) if needed.
+@param image A pointer to the image header
+@param coi The channel of interest. 0 - all channels are selected, 1 - first channel is selected,
+etc. Note that the channel indices become 1-based.
+ */
+CVAPI(void)  cvSetImageCOI( IplImage* image, int coi );
+
+/** @brief Returns the index of the channel of interest.
+
+Returns the channel of interest of an IplImage. Returned values correspond to the coi in
+cvSetImageCOI.
+@param image A pointer to the image header
+ */
+CVAPI(int)  cvGetImageCOI( const IplImage* image );
+
+/** @brief Sets an image Region Of Interest (ROI) for a given rectangle.
+
+If the original image ROI was NULL and the rect is not the whole image, the ROI structure is
+allocated.
+
+Most OpenCV functions support the use of ROI and treat the image rectangle as a separate image. For
+example, all of the pixel coordinates are counted from the top-left (or bottom-left) corner of the
+ROI, not the original image.
+@param image A pointer to the image header
+@param rect The ROI rectangle
+ */
+CVAPI(void)  cvSetImageROI( IplImage* image, CvRect rect );
+
+/** @brief Resets the image ROI to include the entire image and releases the ROI structure.
+
+This produces a similar result to the following, but in addition it releases the ROI structure. :
+@code
+    cvSetImageROI(image, cvRect(0, 0, image->width, image->height ));
+    cvSetImageCOI(image, 0);
+@endcode
+@param image A pointer to the image header
+ */
+CVAPI(void)  cvResetImageROI( IplImage* image );
+
+/** @brief Returns the image ROI.
+
+If there is no ROI set, cvRect(0,0,image-\>width,image-\>height) is returned.
+@param image A pointer to the image header
+ */
+CVAPI(CvRect) cvGetImageROI( const IplImage* image );
+
+/** @brief Creates a matrix header but does not allocate the matrix data.
+
+The function allocates a new matrix header and returns a pointer to it. The matrix data can then be
+allocated using cvCreateData or set explicitly to user-allocated data via cvSetData.
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type Type of the matrix elements, see cvCreateMat
+ */
+CVAPI(CvMat*)  cvCreateMatHeader( int rows, int cols, int type );
+
+#define CV_AUTOSTEP  0x7fffffff
+
+/** @brief Initializes a pre-allocated matrix header.
+
+This function is often used to process raw data with OpenCV matrix functions. For example, the
+following code computes the matrix product of two matrices, stored as ordinary arrays:
+@code
+    double a[] = { 1, 2, 3, 4,
+                   5, 6, 7, 8,
+                   9, 10, 11, 12 };
+
+    double b[] = { 1, 5, 9,
+                   2, 6, 10,
+                   3, 7, 11,
+                   4, 8, 12 };
+
+    double c[9];
+    CvMat Ma, Mb, Mc ;
+
+    cvInitMatHeader(&Ma, 3, 4, CV_64FC1, a);
+    cvInitMatHeader(&Mb, 4, 3, CV_64FC1, b);
+    cvInitMatHeader(&Mc, 3, 3, CV_64FC1, c);
+
+    cvMatMulAdd(&Ma, &Mb, 0, &Mc);
+    // the c array now contains the product of a (3x4) and b (4x3)
+@endcode
+@param mat A pointer to the matrix header to be initialized
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type Type of the matrix elements, see cvCreateMat .
+@param data Optional: data pointer assigned to the matrix header
+@param step Optional: full row width in bytes of the assigned data. By default, the minimal
+possible step is used which assumes there are no gaps between subsequent rows of the matrix.
+ */
+CVAPI(CvMat*) cvInitMatHeader( CvMat* mat, int rows, int cols,
+                              int type, void* data CV_DEFAULT(NULL),
+                              int step CV_DEFAULT(CV_AUTOSTEP) );
+
+/** @brief Creates a matrix header and allocates the matrix data.
+
+The function call is equivalent to the following code:
+@code
+    CvMat* mat = cvCreateMatHeader(rows, cols, type);
+    cvCreateData(mat);
+@endcode
+@param rows Number of rows in the matrix
+@param cols Number of columns in the matrix
+@param type The type of the matrix elements in the form
+CV_\<bit depth\>\<S|U|F\>C\<number of channels\> , where S=signed, U=unsigned, F=float. For
+example, CV_8UC1 means the elements are 8-bit unsigned and there is 1 channel, and
+CV_32SC2 means the elements are 32-bit signed and there are 2 channels.
+ */
+CVAPI(CvMat*)  cvCreateMat( int rows, int cols, int type );
+
+/** @brief Deallocates a matrix.
+
+The function decrements the matrix data reference counter and deallocates matrix header. If the data
+reference counter is 0, it also deallocates the data. :
+@code
+    if(*mat )
+        cvDecRefData(*mat);
+    cvFree((void**)mat);
+@endcode
+@param mat Double pointer to the matrix
+ */
+CVAPI(void)  cvReleaseMat( CvMat** mat );
+
+/** @brief Decrements an array data reference counter.
+
+The function decrements the data reference counter in a CvMat or CvMatND if the reference counter
+
+pointer is not NULL. If the counter reaches zero, the data is deallocated. In the current
+implementation the reference counter is not NULL only if the data was allocated using the
+cvCreateData function. The counter will be NULL in other cases such as: external data was assigned
+to the header using cvSetData, header is part of a larger matrix or image, or the header was
+converted from an image or n-dimensional matrix header.
+@param arr Pointer to an array header
+ */
+CV_INLINE  void  cvDecRefData( CvArr* arr )
+{
+    if( CV_IS_MAT( arr ))
+    {
+        CvMat* hdr = (CvMat*)arr;
+        int* counter = hdr->refcount; /* remember the counter before clearing the header */
+        hdr->data.ptr = NULL;
+        hdr->refcount = NULL;
+        if( counter != NULL && --*counter == 0 ) cvFree( &counter );
+    }
+    else if( CV_IS_MATND( arr ))
+    {
+        CvMatND* hdr = (CvMatND*)arr;
+        int* counter = hdr->refcount; /* remember the counter before clearing the header */
+        hdr->data.ptr = NULL;
+        hdr->refcount = NULL;
+        if( counter != NULL && --*counter == 0 ) cvFree( &counter );
+    }
+}
+
+/** @brief Increments array data reference counter.
+
+The function increments CvMat or CvMatND data reference counter and returns the new counter value if
+the reference counter pointer is not NULL, otherwise it returns zero.
+@param arr Array header
+ */
+CV_INLINE  int  cvIncRefData( CvArr* arr )
+{
+    int* counter = NULL; /* NULL when arr is neither CvMat nor CvMatND, or has no counter */
+
+    if( CV_IS_MAT( arr ))
+    {
+        counter = ((CvMat*)arr)->refcount;
+    }
+    else if( CV_IS_MATND( arr ))
+    {
+        counter = ((CvMatND*)arr)->refcount;
+    }
+
+    if( counter == NULL )
+        return 0;
+    return ++*counter; /* the value after the increment */
+}
+
+
+/** Creates an exact copy of the input matrix (except, may be, step value) */
+CVAPI(CvMat*) cvCloneMat( const CvMat* mat );
+
+
+/** @brief Returns matrix header corresponding to the rectangular sub-array of input image or matrix.
+
+The function returns header, corresponding to a specified rectangle of the input array. In other
+
+words, it allows the user to treat a rectangular part of input array as a stand-alone array. ROI is
+taken into account by the function so the sub-array of ROI is actually extracted.
+@param arr Input array
+@param submat Pointer to the resultant sub-array header
+@param rect Zero-based coordinates of the rectangle of interest
+ */
+CVAPI(CvMat*) cvGetSubRect( const CvArr* arr, CvMat* submat, CvRect rect );
+#define cvGetSubArr cvGetSubRect
+
+/** @brief Returns array row or row span.
+
+The function returns the header, corresponding to a specified row/row span of the input array.
+cvGetRow(arr, submat, row) is a shortcut for cvGetRows(arr, submat, row, row+1).
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param start_row Zero-based index of the starting row (inclusive) of the span
+@param end_row Zero-based index of the ending row (exclusive) of the span
+@param delta_row Index step in the row span. That is, the function extracts every delta_row -th
+row from start_row and up to (but not including) end_row .
+ */
+CVAPI(CvMat*) cvGetRows( const CvArr* arr, CvMat* submat,
+                        int start_row, int end_row,
+                        int delta_row CV_DEFAULT(1));
+
+/** @overload
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param row Zero-based index of the selected row
+*/
+CV_INLINE  CvMat*  cvGetRow( const CvArr* arr, CvMat* submat, int row )
+{
+    return cvGetRows( arr, submat, row, row + 1, 1 ); /* span [row, row + 1), delta_row = 1 */
+}
+
+
+/** @brief Returns one or more array columns.
+
+The function returns the header, corresponding to a specified column span of the input array. That
+
+is, no data is copied. Therefore, any modifications of the submatrix will affect the original array.
+If you need to copy the columns, use cvCloneMat. cvGetCol(arr, submat, col) is a shortcut for
+cvGetCols(arr, submat, col, col+1).
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param start_col Zero-based index of the starting column (inclusive) of the span
+@param end_col Zero-based index of the ending column (exclusive) of the span
+ */
+CVAPI(CvMat*) cvGetCols( const CvArr* arr, CvMat* submat,
+                        int start_col, int end_col );
+
+/** @overload
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param col Zero-based index of the selected column
+*/
+CV_INLINE  CvMat*  cvGetCol( const CvArr* arr, CvMat* submat, int col )
+{
+    return cvGetCols( arr, submat, col, col + 1 ); /* span [col, col + 1) */
+}
+
+/** @brief Returns one of array diagonals.
+
+The function returns the header, corresponding to a specified diagonal of the input array.
+@param arr Input array
+@param submat Pointer to the resulting sub-array header
+@param diag Index of the array diagonal. Zero value corresponds to the main diagonal, -1
+corresponds to the diagonal above the main, 1 corresponds to the diagonal below the main, and so
+forth.
+ */
+CVAPI(CvMat*) cvGetDiag( const CvArr* arr, CvMat* submat,
+                            int diag CV_DEFAULT(0));
+
+/** low-level scalar <-> raw data conversion functions */
+CVAPI(void) cvScalarToRawData( const CvScalar* scalar, void* data, int type,
+                              int extend_to_12 CV_DEFAULT(0) );
+
+CVAPI(void) cvRawDataToScalar( const void* data, int type, CvScalar* scalar );
+
+/** @brief Creates a new matrix header but does not allocate the matrix data.
+
+The function allocates a header for a multi-dimensional dense array. The array data can further be
+allocated using cvCreateData or set explicitly to user-allocated data via cvSetData.
+@param dims Number of array dimensions
+@param sizes Array of dimension sizes
+@param type Type of array elements, see cvCreateMat
+ */
+CVAPI(CvMatND*)  cvCreateMatNDHeader( int dims, const int* sizes, int type );
+
+/** @brief Creates the header and allocates the data for a multi-dimensional dense array.
+
+This function call is equivalent to the following code:
+@code
+    CvMatND* mat = cvCreateMatNDHeader(dims, sizes, type);
+    cvCreateData(mat);
+@endcode
+@param dims Number of array dimensions. This must not exceed CV_MAX_DIM (32 by default, but can be
+changed at build time).
+@param sizes Array of dimension sizes.
+@param type Type of array elements, see cvCreateMat .
+ */
+CVAPI(CvMatND*)  cvCreateMatND( int dims, const int* sizes, int type );
+
+/** @brief Initializes a pre-allocated multi-dimensional array header.
+
+@param mat A pointer to the array header to be initialized
+@param dims The number of array dimensions
+@param sizes An array of dimension sizes
+@param type Type of array elements, see cvCreateMat
+@param data Optional data pointer assigned to the matrix header
+ */
+CVAPI(CvMatND*)  cvInitMatNDHeader( CvMatND* mat, int dims, const int* sizes,
+                                    int type, void* data CV_DEFAULT(NULL) );
+
+/** @brief Deallocates a multi-dimensional array.
+
+The function decrements the array data reference counter and releases the array header. If the
+reference counter reaches 0, it also deallocates the data. :
+@code
+    if(*mat )
+        cvDecRefData(*mat);
+    cvFree((void**)mat);
+@endcode
+@param mat Double pointer to the array
+ */
+CV_INLINE  void  cvReleaseMatND( CvMatND** mat )
+{
+    cvReleaseMat( (CvMat**)mat ); /* forwards to cvReleaseMat through a pointer cast */
+}
+
+/** Creates a copy of CvMatND (except, may be, steps) */
+CVAPI(CvMatND*) cvCloneMatND( const CvMatND* mat );
+
+/** @brief Creates sparse array.
+
+The function allocates a multi-dimensional sparse array. Initially the array contains no elements,
+that is PtrND and other related functions will return 0 for every index.
+@param dims Number of array dimensions. In contrast to the dense matrix, the number of dimensions is
+practically unlimited (up to \f$2^{16}\f$ ).
+@param sizes Array of dimension sizes
+@param type Type of array elements. The same as for CvMat
+ */
+CVAPI(CvSparseMat*)  cvCreateSparseMat( int dims, const int* sizes, int type );
+
+/** @brief Deallocates sparse array.
+
+The function releases the sparse array and clears the array pointer upon exit.
+@param mat Double pointer to the array
+ */
+CVAPI(void)  cvReleaseSparseMat( CvSparseMat** mat );
+
+/** Creates a copy of CvSparseMat (except, may be, zero items) */
+CVAPI(CvSparseMat*) cvCloneSparseMat( const CvSparseMat* mat );
+
+/** @brief Initializes sparse array elements iterator.
+
+The function initializes iterator of sparse array elements and returns pointer to the first element,
+or NULL if the array is empty.
+@param mat Input array
+@param mat_iterator Initialized iterator
+ */
+CVAPI(CvSparseNode*) cvInitSparseMatIterator( const CvSparseMat* mat,
+                                              CvSparseMatIterator* mat_iterator );
+
+/** @brief Returns the next sparse matrix element
+
+The function moves iterator to the next sparse matrix element and returns pointer to it. In the
+current version there is no particular order of the elements, because they are stored in the
+hash table. The sample below demonstrates how to iterate through the sparse matrix:
+@code
+    // print all the non-zero sparse matrix elements and compute their sum
+    double sum = 0;
+    int i, dims = cvGetDims(sparsemat);
+    CvSparseMatIterator it;
+    CvSparseNode* node = cvInitSparseMatIterator(sparsemat, &it);
+
+    for(; node != 0; node = cvGetNextSparseNode(&it))
+    {
+        int* idx = CV_NODE_IDX(sparsemat, node);
+        float val = *(float*)CV_NODE_VAL(sparsemat, node);
+        printf("M");
+        for(i = 0; i < dims; i++ )
+            printf("[%d]", idx[i]);
+        printf("=%g\n", val);
+
+        sum += val;
+    }
+
+    printf("\nTotal sum = %g\n", sum);
+@endcode
+@param mat_iterator Sparse array iterator
+ */
+CV_INLINE CvSparseNode* cvGetNextSparseNode( CvSparseMatIterator* mat_iterator )
+{
+    CvSparseNode* next = mat_iterator->node->next;
+    int slot;
+    /* The current node has a successor: step to it. */
+    if( next )
+        return mat_iterator->node = next;
+
+    /* Otherwise scan the hash table entries after curidx for a non-NULL node. */
+    for( slot = ++mat_iterator->curidx; slot < mat_iterator->mat->hashsize; slot++ )
+    {
+        next = (CvSparseNode*)mat_iterator->mat->hashtable[slot];
+        if( !next )
+            continue;
+        mat_iterator->curidx = slot;
+        return mat_iterator->node = next;
+    }
+    return NULL; /* nothing left: the iterator's node is left unchanged */
+}
+
+
+#define CV_MAX_ARR 10
+
+/** matrix iterator: used for n-ary operations on dense arrays */
+typedef struct CvNArrayIterator
+{
+    int count; /**< number of arrays */
+    int dims; /**< number of dimensions to iterate over */
+    CvSize size; /**< maximal common linear size: { width = size, height = 1 } */
+    uchar* ptr[CV_MAX_ARR]; /**< pointers to the array slices (up to CV_MAX_ARR of them) */
+    int stack[CV_MAX_DIM]; /**< for internal use only */
+    CvMatND* hdr[CV_MAX_ARR]; /**< pointers to the headers of the
+                                 matrices that are being processed */
+}
+CvNArrayIterator;
+
+#define CV_NO_DEPTH_CHECK     1
+#define CV_NO_CN_CHECK        2
+#define CV_NO_SIZE_CHECK      4
+
+/** initializes iterator that traverses through several arrays simultaneously
+   (the function together with cvNextNArraySlice is used for
+    N-ary element-wise operations) */
+CVAPI(int) cvInitNArrayIterator( int count, CvArr** arrs,
+                                 const CvArr* mask, CvMatND* stubs,
+                                 CvNArrayIterator* array_iterator,
+                                 int flags CV_DEFAULT(0) );
+
+/** returns zero value if iteration is finished, non-zero (slice length) otherwise */
+CVAPI(int) cvNextNArraySlice( CvNArrayIterator* array_iterator );
+
+
+/** @brief Returns type of array elements.
+
+The function returns type of the array elements. In the case of IplImage the type is converted to
+CvMat-like representation. For example, if the image has been created as:
+@code
+    IplImage* img = cvCreateImage(cvSize(640, 480), IPL_DEPTH_8U, 3);
+@endcode
+The code cvGetElemType(img) will return CV_8UC3.
+@param arr Input array
+ */
+CVAPI(int) cvGetElemType( const CvArr* arr );
+
+/** @brief Return number of array dimensions
+
+The function returns the array dimensionality and the array of dimension sizes. In the case of
+IplImage or CvMat it always returns 2 regardless of number of image/matrix rows. For example, the
+following code calculates total number of array elements:
+@code
+    int sizes[CV_MAX_DIM];
+    int i, total = 1;
+    int dims = cvGetDims(arr, sizes);
+    for(i = 0; i < dims; i++ )
+        total *= sizes[i];
+@endcode
+@param arr Input array
+@param sizes Optional output vector of the array dimension sizes. For 2d arrays the number of rows
+(height) goes first, number of columns (width) next.
+ */
+CVAPI(int) cvGetDims( const CvArr* arr, int* sizes CV_DEFAULT(NULL) );
+
+
+/** @brief Returns array size along the specified dimension.
+
+@param arr Input array
+@param index Zero-based dimension index (for matrices 0 means number of rows, 1 means number of
+columns; for images 0 means height, 1 means width)
+ */
+CVAPI(int) cvGetDimSize( const CvArr* arr, int index );
+
+
+/** @brief Return pointer to a particular array element.
+
+The functions return a pointer to a specific array element. The number of array dimensions should
+match the number of indices passed to the function except for cvPtr1D function that can be used for
+sequential access to 1D, 2D or nD dense arrays.
+
+The functions can be used for sparse arrays as well - if the requested node does not exist they
+create it and set it to zero.
+
+All these as well as other functions accessing array elements ( cvGetND , cvGetRealND , cvSet
+, cvSetND , cvSetRealND ) raise an error in case if the element index is out of range.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param type Optional output parameter: type of matrix elements
+ */
+CVAPI(uchar*) cvPtr1D( const CvArr* arr, int idx0, int* type CV_DEFAULT(NULL));
+/** @overload */
+CVAPI(uchar*) cvPtr2D( const CvArr* arr, int idx0, int idx1, int* type CV_DEFAULT(NULL) );
+/** @overload */
+CVAPI(uchar*) cvPtr3D( const CvArr* arr, int idx0, int idx1, int idx2,
+                      int* type CV_DEFAULT(NULL));
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param type Optional output parameter: type of matrix elements
+@param create_node Optional input parameter for sparse matrices. Non-zero value of the parameter
+means that the requested element is created if it does not exist already.
+@param precalc_hashval Optional input parameter for sparse matrices. If the pointer is not NULL,
+the function does not recalculate the node hash value, but takes it from the specified location.
+It is useful for speeding up pair-wise operations (TODO: provide an example)
+*/
+CVAPI(uchar*) cvPtrND( const CvArr* arr, const int* idx, int* type CV_DEFAULT(NULL),
+                      int create_node CV_DEFAULT(1),
+                      unsigned* precalc_hashval CV_DEFAULT(NULL));
+
+/** @brief Return a specific array element.
+
+The functions return a specific array element. In the case of a sparse array the functions return 0
+if the requested node does not exist (no new node is created by the functions).
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+ */
+CVAPI(CvScalar) cvGet1D( const CvArr* arr, int idx0 );
+/** @overload */
+CVAPI(CvScalar) cvGet2D( const CvArr* arr, int idx0, int idx1 );
+/** @overload */
+CVAPI(CvScalar) cvGet3D( const CvArr* arr, int idx0, int idx1, int idx2 );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+*/
+CVAPI(CvScalar) cvGetND( const CvArr* arr, const int* idx );
+
+/** @brief Return a specific element of single-channel 1D, 2D, 3D or nD array.
+
+Returns a specific element of a single-channel array. If the array has multiple channels, a runtime
+error is raised. Note that Get?D functions can be used safely for both single-channel and
+multiple-channel arrays though they are a bit slower.
+
+In the case of a sparse array the functions return 0 if the requested node does not exist (no new
+node is created by the functions).
+@param arr Input array. Must have a single channel.
+@param idx0 The first zero-based component of the element index
+ */
+CVAPI(double) cvGetReal1D( const CvArr* arr, int idx0 );
+/** @overload */
+CVAPI(double) cvGetReal2D( const CvArr* arr, int idx0, int idx1 );
+/** @overload */
+CVAPI(double) cvGetReal3D( const CvArr* arr, int idx0, int idx1, int idx2 );
+/** @overload
+@param arr Input array. Must have a single channel.
+@param idx Array of the element indices
+*/
+CVAPI(double) cvGetRealND( const CvArr* arr, const int* idx );
+
+/** @brief Change the particular array element.
+
+The functions assign the new value to a particular array element. In the case of a sparse array the
+functions create the node if it does not exist yet.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param value The assigned value
+ */
+CVAPI(void) cvSet1D( CvArr* arr, int idx0, CvScalar value );
+/** @overload */
+CVAPI(void) cvSet2D( CvArr* arr, int idx0, int idx1, CvScalar value );
+/** @overload */
+CVAPI(void) cvSet3D( CvArr* arr, int idx0, int idx1, int idx2, CvScalar value );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param value The assigned value
+*/
+CVAPI(void) cvSetND( CvArr* arr, const int* idx, CvScalar value );
+
+/** @brief Change a specific array element.
+
+The functions assign a new value to a specific element of a single-channel array. If the array has
+multiple channels, a runtime error is raised. Note that the Set\*D function can be used safely for
+both single-channel and multiple-channel arrays, though they are a bit slower.
+
+In the case of a sparse array the functions create the node if it does not yet exist.
+@param arr Input array
+@param idx0 The first zero-based component of the element index
+@param value The assigned value
+ */
+CVAPI(void) cvSetReal1D( CvArr* arr, int idx0, double value );
+/** @overload */
+CVAPI(void) cvSetReal2D( CvArr* arr, int idx0, int idx1, double value );
+/** @overload */
+CVAPI(void) cvSetReal3D( CvArr* arr, int idx0,
+                        int idx1, int idx2, double value );
+/** @overload
+@param arr Input array
+@param idx Array of the element indices
+@param value The assigned value
+*/
+CVAPI(void) cvSetRealND( CvArr* arr, const int* idx, double value );
+
+/** clears element of ND dense array,
+   in case of sparse arrays it deletes the specified node */
+CVAPI(void) cvClearND( CvArr* arr, const int* idx );
+
+/** @brief Returns matrix header for arbitrary array.
+
+The function returns a matrix header for the input array that can be a matrix - CvMat, an image -
+IplImage, or a multi-dimensional dense array - CvMatND (the third option is allowed only if
+allowND != 0) . In the case of matrix the function simply returns the input pointer. In the case of
+IplImage\* or CvMatND it initializes the header structure with parameters of the current image ROI
+and returns &header. Because COI is not supported by CvMat, it is returned separately.
+
+The function provides an easy way to handle both types of arrays - IplImage and CvMat using the same
+code. Input array must have non-zero data pointer, otherwise the function will report an error.
+
+@note If the input array is IplImage with planar data layout and COI set, the function returns the
+pointer to the selected plane and COI == 0. This feature allows user to process IplImage structures
+with planar data layout, even though OpenCV does not support such images.
+@param arr Input array
+@param header Pointer to CvMat structure used as a temporary buffer
+@param coi Optional output parameter for storing COI
+@param allowND If non-zero, the function accepts multi-dimensional dense arrays (CvMatND\*) and
+returns 2D matrix (if CvMatND has two dimensions) or 1D matrix (when CvMatND has 1 dimension or
+more than 2 dimensions). The CvMatND array must be continuous.
+@sa cvGetImage, cvarrToMat.
+ */
+CVAPI(CvMat*) cvGetMat( const CvArr* arr, CvMat* header,
+                       int* coi CV_DEFAULT(NULL),
+                       int allowND CV_DEFAULT(0));
+
+/** @brief Returns image header for arbitrary array.
+
+The function returns the image header for the input array that can be a matrix (CvMat) or image
+(IplImage). In the case of an image the function simply returns the input pointer. In the case of
+CvMat it initializes an image_header structure with the parameters of the input matrix. Note that
+if we transform IplImage to CvMat using cvGetMat and then transform CvMat back to IplImage using
+this function, we will get different headers if the ROI is set in the original image.
+@param arr Input array
+@param image_header Pointer to IplImage structure used as a temporary buffer
+ */
+CVAPI(IplImage*) cvGetImage( const CvArr* arr, IplImage* image_header );
+
+
+/** @brief Changes the shape of a multi-dimensional array without copying the data.
+
+The function is an advanced version of cvReshape that can work with multi-dimensional arrays as
+well (though it can work with ordinary images and matrices) and change the number of dimensions.
+
+Below are the two samples from the cvReshape description rewritten using cvReshapeMatND:
+@code
+    IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
+    IplImage gray_img_hdr, *gray_img;
+    gray_img = (IplImage*)cvReshapeMatND(color_img, sizeof(gray_img_hdr), &gray_img_hdr, 1, 0, 0);
+    ...
+    int size[] = { 2, 2, 2 };
+    CvMatND* mat = cvCreateMatND(3, size, CV_32F);
+    CvMat row_header, *row;
+    row = (CvMat*)cvReshapeMatND(mat, sizeof(row_header), &row_header, 0, 1, 0);
+@endcode
+In C, the header file for this function includes a convenient macro cvReshapeND that does away with
+the sizeof_header parameter. So, the lines containing the call to cvReshapeMatND in the examples
+may be replaced as follows:
+@code
+    gray_img = (IplImage*)cvReshapeND(color_img, &gray_img_hdr, 1, 0, 0);
+    ...
+    row = (CvMat*)cvReshapeND(mat, &row_header, 0, 1, 0);
+@endcode
+@param arr Input array
+@param sizeof_header Size of output header to distinguish between IplImage, CvMat and CvMatND
+output headers
+@param header Output header to be filled
+@param new_cn New number of channels. new_cn = 0 means that the number of channels remains
+unchanged.
+@param new_dims New number of dimensions. new_dims = 0 means that the number of dimensions
+remains the same.
+@param new_sizes Array of new dimension sizes. Only new_dims-1 values are used, because the
+total number of elements must remain the same. Thus, if new_dims = 1, new_sizes array is not
+used.
+ */
+CVAPI(CvArr*) cvReshapeMatND( const CvArr* arr,
+                             int sizeof_header, CvArr* header,
+                             int new_cn, int new_dims, int* new_sizes );
+
+#define cvReshapeND( arr, header, new_cn, new_dims, new_sizes )   \
+      cvReshapeMatND( (arr), sizeof(*(header)), (header),         \
+                      (new_cn), (new_dims), (new_sizes))
+
+/** @brief Changes shape of matrix/image without copying data.
+
+The function initializes the CvMat header so that it points to the same data as the original array
+but has a different shape - different number of channels, different number of rows, or both.
+
+The following example code creates one image buffer and two image headers, the first is for a
+320x240x3 image and the second is for a 960x240x1 image:
+@code
+    IplImage* color_img = cvCreateImage(cvSize(320,240), IPL_DEPTH_8U, 3);
+    CvMat gray_mat_hdr;
+    IplImage gray_img_hdr, *gray_img;
+    cvReshape(color_img, &gray_mat_hdr, 1);
+    gray_img = cvGetImage(&gray_mat_hdr, &gray_img_hdr);
+@endcode
+And the next example converts a 3x3 matrix to a single 1x9 vector:
+@code
+    CvMat* mat = cvCreateMat(3, 3, CV_32F);
+    CvMat row_header, *row;
+    row = cvReshape(mat, &row_header, 0, 1);
+@endcode
+@param arr Input array
+@param header Output header to be filled
+@param new_cn New number of channels. 'new_cn = 0' means that the number of channels remains
+unchanged.
+@param new_rows New number of rows. 'new_rows = 0' means that the number of rows remains
+unchanged unless it needs to be changed according to new_cn value.
+*/
+CVAPI(CvMat*) cvReshape( const CvArr* arr, CvMat* header,
+                        int new_cn, int new_rows CV_DEFAULT(0) );
+
+/** Repeats source 2d array several times in both horizontal and
+   vertical direction to fill destination array */
+CVAPI(void) cvRepeat( const CvArr* src, CvArr* dst );
+
+/** @brief Allocates array data
+
+The function allocates image, matrix or multi-dimensional dense array data. Note that in the case of
+matrix types OpenCV allocation functions are used. In the case of IplImage they are used unless
+CV_TURN_ON_IPL_COMPATIBILITY() has been called before. In the latter case IPL functions are used
+to allocate the data.
+@param arr Array header
+ */
+CVAPI(void)  cvCreateData( CvArr* arr );
+
+/** @brief Releases array data.
+
+The function releases the array data. In the case of CvMat or CvMatND it simply calls
+cvDecRefData(), that is the function can not deallocate external data. See also the note to
+cvCreateData .
+@param arr Array header
+ */
+CVAPI(void)  cvReleaseData( CvArr* arr );
+
+/** @brief Assigns user data to the array header.
+
+The function assigns user data to the array header. Header should be initialized before using
+cvCreateMatHeader, cvCreateImageHeader, cvCreateMatNDHeader, cvInitMatHeader,
+cvInitImageHeader or cvInitMatNDHeader.
+@param arr Array header
+@param data User data
+@param step Full row length in bytes
+ */
+CVAPI(void)  cvSetData( CvArr* arr, void* data, int step );
+
+/** @brief Retrieves low-level information about the array.
+
+The function fills output variables with low-level information about the array data. All output
+
+parameters are optional, so some of the pointers may be set to NULL. If the array is IplImage with
+ROI set, the parameters of ROI are returned.
+
+The following example shows how to get access to array elements. It computes absolute values of the
+array elements :
+@code
+    float* data;
+    int step;
+    CvSize size;
+
+    cvGetRawData(array, (uchar**)&data, &step, &size);
+    step /= sizeof(data[0]);
+
+    for(int y = 0; y < size.height; y++, data += step )
+        for(int x = 0; x < size.width; x++ )
+            data[x] = (float)fabs(data[x]);
+@endcode
+@param arr Array header
+@param data Output pointer to the whole image origin or ROI origin if ROI is set
+@param step Output full row length in bytes
+@param roi_size Output ROI size
+ */
+CVAPI(void) cvGetRawData( const CvArr* arr, uchar** data,
+                         int* step CV_DEFAULT(NULL),
+                         CvSize* roi_size CV_DEFAULT(NULL));
+
+/** @brief Returns size of matrix or image ROI.
+
+The function returns number of rows (CvSize::height) and number of columns (CvSize::width) of the
+input matrix or image. In the case of image the size of ROI is returned.
+@param arr array header
+ */
+CVAPI(CvSize) cvGetSize( const CvArr* arr );
+
+/** @brief Copies one array to another.
+
+The function copies selected elements from an input array to an output array:
+
+\f[\texttt{dst} (I)= \texttt{src} (I)  \quad \text{if} \quad \texttt{mask} (I)  \ne 0.\f]
+
+If any of the passed arrays is of IplImage type, then its ROI and COI fields are used. Both arrays
+must have the same type, the same number of dimensions, and the same size. The function can also
+copy sparse arrays (mask is not supported in this case).
+@param src The source array
+@param dst The destination array
+@param mask Operation mask, 8-bit single channel array; specifies elements of the destination array
+to be changed
+ */
+CVAPI(void)  cvCopy( const CvArr* src, CvArr* dst,
+                     const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Sets every element of an array to a given value.
+
+The function copies the scalar value to every selected element of the destination array:
+\f[\texttt{arr} (I)= \texttt{value} \quad \text{if} \quad \texttt{mask} (I)  \ne 0\f]
+If array arr is of IplImage type, then its ROI is used, but COI must not be set.
+@param arr The destination array
+@param value Fill value
+@param mask Operation mask, 8-bit single channel array; specifies elements of the destination
+array to be changed
+ */
+CVAPI(void)  cvSet( CvArr* arr, CvScalar value,
+                    const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Clears the array.
+
+The function clears the array. In the case of dense arrays (CvMat, CvMatND or IplImage),
+cvZero(array) is equivalent to cvSet(array,cvScalarAll(0),0). In the case of sparse arrays all the
+elements are removed.
+@param arr Array to be cleared
+ */
+CVAPI(void)  cvSetZero( CvArr* arr );
+#define cvZero  cvSetZero
+
+
+/** Splits a multi-channel array into the set of single-channel arrays or
+   extracts particular [color] plane */
+CVAPI(void)  cvSplit( const CvArr* src, CvArr* dst0, CvArr* dst1,
+                      CvArr* dst2, CvArr* dst3 );
+
+/** Merges a set of single-channel arrays into the single multi-channel array
+   or inserts one particular [color] plane to the array */
+CVAPI(void)  cvMerge( const CvArr* src0, const CvArr* src1,
+                      const CvArr* src2, const CvArr* src3,
+                      CvArr* dst );
+
+/** Copies several channels from input arrays to
+   certain channels of output arrays */
+CVAPI(void)  cvMixChannels( const CvArr** src, int src_count,
+                            CvArr** dst, int dst_count,
+                            const int* from_to, int pair_count );
+
+/** @brief Converts one array to another with optional linear transformation.
+
+The function has several different purposes, and thus has several different names. It copies one
+array to another with optional scaling, which is performed first, and/or optional type conversion,
+performed after:
+
+\f[\texttt{dst} (I) =  \texttt{scale} \texttt{src} (I) + ( \texttt{shift} _0, \texttt{shift} _1,...)\f]
+
+All the channels of multi-channel arrays are processed independently.
+
+The type of conversion is done with rounding and saturation, that is if the result of scaling +
+conversion can not be represented exactly by a value of the destination array element type, it is
+set to the nearest representable value on the real axis.
+@param src Source array
+@param dst Destination array
+@param scale Scale factor
+@param shift Value added to the scaled source array elements
+ */
+CVAPI(void)  cvConvertScale( const CvArr* src, CvArr* dst,
+                             double scale CV_DEFAULT(1),
+                             double shift CV_DEFAULT(0) );
+#define cvCvtScale cvConvertScale
+#define cvScale  cvConvertScale
+#define cvConvert( src, dst )  cvConvertScale( (src), (dst), 1, 0 )
+
+
+/** Performs linear transformation on every source array element,
+   stores absolute value of the result:
+   dst(x,y,c) = abs(scale*src(x,y,c)+shift).
+   destination array must have 8u type.
+   In other cases one may use cvConvertScale + cvAbsDiffS */
+CVAPI(void)  cvConvertScaleAbs( const CvArr* src, CvArr* dst,
+                                double scale CV_DEFAULT(1),
+                                double shift CV_DEFAULT(0) );
+#define cvCvtScaleAbs  cvConvertScaleAbs
+
+
+/** checks termination criteria validity and
+   sets eps to default_eps (if it is not set),
+   max_iter to default_max_iters (if it is not set)
+*/
+CVAPI(CvTermCriteria) cvCheckTermCriteria( CvTermCriteria criteria,
+                                           double default_eps,
+                                           int default_max_iters );
+
+/****************************************************************************************\
+*                   Arithmetic, logic and comparison operations                          *
+\****************************************************************************************/
+
+/** dst(mask) = src1(mask) + src2(mask) */
+CVAPI(void)  cvAdd( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                    const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(mask) = src(mask) + value */
+CVAPI(void)  cvAddS( const CvArr* src, CvScalar value, CvArr* dst,
+                     const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(mask) = src1(mask) - src2(mask) */
+CVAPI(void)  cvSub( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                    const CvArr* mask CV_DEFAULT(NULL));
+
+/** @brief Subtracts a scalar from every selected array element.
+
+dst(mask) = src(mask) - value, computed as src(mask) + (-value) by delegating to cvAddS.
+@param src Source array
+@param value Scalar to subtract; each of its four components is negated before the addition
+@param dst Destination array
+@param mask Operation mask, forwarded unchanged to cvAddS
+ */
+CV_INLINE  void  cvSubS( const CvArr* src, CvScalar value, CvArr* dst,
+                         const CvArr* mask CV_DEFAULT(NULL))
+{
+    CvScalar negated = cvScalar( -value.val[0], -value.val[1],
+                                 -value.val[2], -value.val[3] );
+    cvAddS( src, negated, dst, mask );
+}
+
+/** dst(mask) = value - src(mask) */
+CVAPI(void)  cvSubRS( const CvArr* src, CvScalar value, CvArr* dst,
+                      const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) * src2(idx) * scale
+   (scaled element-wise multiplication of 2 arrays) */
+CVAPI(void)  cvMul( const CvArr* src1, const CvArr* src2,
+                    CvArr* dst, double scale CV_DEFAULT(1) );
+
+/** element-wise division/inversion with scaling:
+    dst(idx) = src1(idx) * scale / src2(idx)
+    or dst(idx) = scale / src2(idx) if src1 == 0 */
+CVAPI(void)  cvDiv( const CvArr* src1, const CvArr* src2,
+                    CvArr* dst, double scale CV_DEFAULT(1));
+
+/** dst = src1 * scale + src2 */
+CVAPI(void)  cvScaleAdd( const CvArr* src1, CvScalar scale,
+                         const CvArr* src2, CvArr* dst );
+#define cvAXPY( A, real_scalar, B, C ) cvScaleAdd(A, cvRealScalar(real_scalar), B, C)
+
+/** dst = src1 * alpha + src2 * beta + gamma */
+CVAPI(void)  cvAddWeighted( const CvArr* src1, double alpha,
+                            const CvArr* src2, double beta,
+                            double gamma, CvArr* dst );
+
+/** @brief Calculates the dot product of two arrays in Euclidean metrics.
+
+The function calculates and returns the Euclidean dot product of two arrays.
+
+\f[src1  \bullet src2 =  \sum _I ( \texttt{src1} (I)  \texttt{src2} (I))\f]
+
+In the case of multiple channel arrays, the results for all channels are accumulated. In particular,
+cvDotProduct(a,a) where a is a complex vector, will return \f$||\texttt{a}||^2\f$. The function can
+process multi-dimensional arrays, row by row, layer by layer, and so on.
+@param src1 The first source array
+@param src2 The second source array
+ */
+CVAPI(double)  cvDotProduct( const CvArr* src1, const CvArr* src2 );
+
+/** dst(idx) = src1(idx) & src2(idx) */
+CVAPI(void) cvAnd( const CvArr* src1, const CvArr* src2,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) & value */
+CVAPI(void) cvAndS( const CvArr* src, CvScalar value,
+                   CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) | src2(idx) */
+CVAPI(void) cvOr( const CvArr* src1, const CvArr* src2,
+                 CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) | value */
+CVAPI(void) cvOrS( const CvArr* src, CvScalar value,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src1(idx) ^ src2(idx) */
+CVAPI(void) cvXor( const CvArr* src1, const CvArr* src2,
+                  CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = src(idx) ^ value */
+CVAPI(void) cvXorS( const CvArr* src, CvScalar value,
+                   CvArr* dst, const CvArr* mask CV_DEFAULT(NULL));
+
+/** dst(idx) = ~src(idx) */
+CVAPI(void) cvNot( const CvArr* src, CvArr* dst );
+
+/** dst(idx) = lower(idx) <= src(idx) < upper(idx) */
+CVAPI(void) cvInRange( const CvArr* src, const CvArr* lower,
+                      const CvArr* upper, CvArr* dst );
+
+/** dst(idx) = lower <= src(idx) < upper */
+CVAPI(void) cvInRangeS( const CvArr* src, CvScalar lower,
+                       CvScalar upper, CvArr* dst );
+
+#define CV_CMP_EQ   0
+#define CV_CMP_GT   1
+#define CV_CMP_GE   2
+#define CV_CMP_LT   3
+#define CV_CMP_LE   4
+#define CV_CMP_NE   5
+
+/** The comparison operations support single-channel arrays only.
+   Destination image should be 8uC1 or 8sC1 */
+
+/** dst(idx) = src1(idx) _cmp_op_ src2(idx) */
+CVAPI(void) cvCmp( const CvArr* src1, const CvArr* src2, CvArr* dst, int cmp_op );
+
+/** dst(idx) = src(idx) _cmp_op_ value */
+CVAPI(void) cvCmpS( const CvArr* src, double value, CvArr* dst, int cmp_op );
+
+/** dst(idx) = min(src1(idx),src2(idx)) */
+CVAPI(void) cvMin( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(idx) = max(src1(idx),src2(idx)) */
+CVAPI(void) cvMax( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(idx) = min(src(idx),value) */
+CVAPI(void) cvMinS( const CvArr* src, double value, CvArr* dst );
+
+/** dst(idx) = max(src(idx),value) */
+CVAPI(void) cvMaxS( const CvArr* src, double value, CvArr* dst );
+
+/** dst(x,y,c) = abs(src1(x,y,c) - src2(x,y,c)) */
+CVAPI(void) cvAbsDiff( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** dst(x,y,c) = abs(src(x,y,c) - value(c)) */
+CVAPI(void) cvAbsDiffS( const CvArr* src, CvArr* dst, CvScalar value );
+#define cvAbs( src, dst ) cvAbsDiffS( (src), (dst), cvScalarAll(0))
+
+/****************************************************************************************\
+*                                Math operations                                         *
+\****************************************************************************************/
+
+/** Does cartesian->polar coordinates conversion.
+   Either of output components (magnitude or angle) is optional */
+CVAPI(void)  cvCartToPolar( const CvArr* x, const CvArr* y,
+                            CvArr* magnitude, CvArr* angle CV_DEFAULT(NULL),
+                            int angle_in_degrees CV_DEFAULT(0));
+
+/** Does polar->cartesian coordinates conversion.
+   Either of output components (x or y) is optional.
+   If magnitude is missing it is assumed to be all 1's */
+CVAPI(void)  cvPolarToCart( const CvArr* magnitude, const CvArr* angle,
+                            CvArr* x, CvArr* y,
+                            int angle_in_degrees CV_DEFAULT(0));
+
+/** Does powering: dst(idx) = src(idx)^power */
+CVAPI(void)  cvPow( const CvArr* src, CvArr* dst, double power );
+
+/** Does exponentiation: dst(idx) = exp(src(idx)).
+   Overflow is not handled yet. Underflow is handled.
+   Maximal relative error is ~7e-6 for single-precision input */
+CVAPI(void)  cvExp( const CvArr* src, CvArr* dst );
+
+/** Calculates natural logarithms: dst(idx) = log(abs(src(idx))).
+   Logarithm of 0 gives large negative number(~-700)
+   Maximal relative error is ~3e-7 for single-precision output
+*/
+CVAPI(void)  cvLog( const CvArr* src, CvArr* dst );
+
+/** Fast arctangent calculation */
+CVAPI(float) cvFastArctan( float y, float x );
+
+/** Fast cubic root calculation */
+CVAPI(float)  cvCbrt( float value );
+
+#define  CV_CHECK_RANGE    1
+#define  CV_CHECK_QUIET    2
+/** Checks array values for NaNs, Infs or simply for too large numbers
+   (if CV_CHECK_RANGE is set). If CV_CHECK_QUIET is set,
+   no runtime error is raised (function returns zero value in case of "bad" values).
+   Otherwise cvError is called */
+CVAPI(int)  cvCheckArr( const CvArr* arr, int flags CV_DEFAULT(0),
+                        double min_val CV_DEFAULT(0), double max_val CV_DEFAULT(0));
+#define cvCheckArray cvCheckArr
+
+#define CV_RAND_UNI      0
+#define CV_RAND_NORMAL   1
+
+/** @brief Fills an array with random numbers and updates the RNG state.
+
+The function fills the destination array with uniformly or normally distributed random numbers.
+@param rng CvRNG state initialized by cvRNG
+@param arr The destination array
+@param dist_type Distribution type
+> -   **CV_RAND_UNI** uniform distribution
+> -   **CV_RAND_NORMAL** normal or Gaussian distribution
+@param param1 The first parameter of the distribution. In the case of a uniform distribution it is
+the inclusive lower boundary of the random numbers range. In the case of a normal distribution it
+is the mean value of the random numbers.
+@param param2 The second parameter of the distribution. In the case of a uniform distribution it
+is the exclusive upper boundary of the random numbers range. In the case of a normal distribution
+it is the standard deviation of the random numbers.
+@sa randu, randn, RNG::fill.
+ */
+CVAPI(void) cvRandArr( CvRNG* rng, CvArr* arr, int dist_type,
+                      CvScalar param1, CvScalar param2 );
+
+CVAPI(void) cvRandShuffle( CvArr* mat, CvRNG* rng,
+                           double iter_factor CV_DEFAULT(1.));
+
+#define CV_SORT_EVERY_ROW 0
+#define CV_SORT_EVERY_COLUMN 1
+#define CV_SORT_ASCENDING 0
+#define CV_SORT_DESCENDING 16
+
+CVAPI(void) cvSort( const CvArr* src, CvArr* dst CV_DEFAULT(NULL),
+                    CvArr* idxmat CV_DEFAULT(NULL),
+                    int flags CV_DEFAULT(0));
+
+/** Finds real roots of a cubic equation */
+CVAPI(int) cvSolveCubic( const CvMat* coeffs, CvMat* roots );
+
+/** Finds all real and complex roots of a polynomial equation */
+CVAPI(void) cvSolvePoly(const CvMat* coeffs, CvMat *roots2,
+      int maxiter CV_DEFAULT(20), int fig CV_DEFAULT(100));
+
+/****************************************************************************************\
+*                                Matrix operations                                       *
+\****************************************************************************************/
+
+/** @brief Calculates the cross product of two 3D vectors.
+
+The function calculates the cross product of two 3D vectors:
+\f[\texttt{dst} =  \texttt{src1} \times \texttt{src2}\f]
+or:
+\f[\begin{array}{l} \texttt{dst} _1 =  \texttt{src1} _2  \texttt{src2} _3 -  \texttt{src1} _3  \texttt{src2} _2 \\ \texttt{dst} _2 =  \texttt{src1} _3  \texttt{src2} _1 -  \texttt{src1} _1  \texttt{src2} _3 \\ \texttt{dst} _3 =  \texttt{src1} _1  \texttt{src2} _2 -  \texttt{src1} _2  \texttt{src2} _1 \end{array}\f]
+@param src1 The first source vector
+@param src2 The second source vector
+@param dst The destination vector
+ */
+CVAPI(void)  cvCrossProduct( const CvArr* src1, const CvArr* src2, CvArr* dst );
+
+/** Matrix transform: dst = A*B + C, C is optional */
+#define cvMatMulAdd( src1, src2, src3, dst ) cvGEMM( (src1), (src2), 1., (src3), 1., (dst), 0 )
+#define cvMatMul( src1, src2, dst )  cvMatMulAdd( (src1), (src2), NULL, (dst))
+
+#define CV_GEMM_A_T 1
+#define CV_GEMM_B_T 2
+#define CV_GEMM_C_T 4
+/** Extended matrix transform:
+   dst = alpha*op(A)*op(B) + beta*op(C), where op(X) is X or X^T */
+CVAPI(void)  cvGEMM( const CvArr* src1, const CvArr* src2, double alpha,
+                     const CvArr* src3, double beta, CvArr* dst,
+                     int tABC CV_DEFAULT(0));
+#define cvMatMulAddEx cvGEMM
+
+/** Transforms each element of source array and stores
+   resultant vectors in destination array */
+CVAPI(void)  cvTransform( const CvArr* src, CvArr* dst,
+                          const CvMat* transmat,
+                          const CvMat* shiftvec CV_DEFAULT(NULL));
+#define cvMatMulAddS cvTransform
+
+/** Does perspective transform on every element of input array */
+CVAPI(void)  cvPerspectiveTransform( const CvArr* src, CvArr* dst,
+                                     const CvMat* mat );
+
+/** Calculates (A-delta)*(A-delta)^T (order=0) or (A-delta)^T*(A-delta) (order=1) */
+CVAPI(void) cvMulTransposed( const CvArr* src, CvArr* dst, int order,
+                             const CvArr* delta CV_DEFAULT(NULL),
+                             double scale CV_DEFAULT(1.) );
+
+/** Transposes matrix. Square matrices can be transposed in-place */
+CVAPI(void)  cvTranspose( const CvArr* src, CvArr* dst );
+#define cvT cvTranspose
+
+/** Completes the symmetric matrix from the lower (LtoR=0) or from the upper (LtoR!=0) part */
+CVAPI(void)  cvCompleteSymm( CvMat* matrix, int LtoR CV_DEFAULT(0) );
+
+/** Mirrors array data around horizontal (flip_mode=0),
+   vertical (flip_mode=1) or both (flip_mode=-1) axes:
+   cvFlip(src) flips images vertically and sequences horizontally (inplace) */
+CVAPI(void)  cvFlip( const CvArr* src, CvArr* dst CV_DEFAULT(NULL),
+                     int flip_mode CV_DEFAULT(0));
+#define cvMirror cvFlip
+
+
+#define CV_SVD_MODIFY_A   1
+#define CV_SVD_U_T        2
+#define CV_SVD_V_T        4
+
+/** Performs Singular Value Decomposition of a matrix */
+CVAPI(void)   cvSVD( CvArr* A, CvArr* W, CvArr* U CV_DEFAULT(NULL),
+                     CvArr* V CV_DEFAULT(NULL), int flags CV_DEFAULT(0));
+
+/** Performs Singular Value Back Substitution (solves A*X = B):
+   flags must be the same as in cvSVD */
+CVAPI(void)   cvSVBkSb( const CvArr* W, const CvArr* U,
+                        const CvArr* V, const CvArr* B,
+                        CvArr* X, int flags );
+
+#define CV_LU  0
+#define CV_SVD 1
+#define CV_SVD_SYM 2
+#define CV_CHOLESKY 3
+#define CV_QR  4
+#define CV_NORMAL 16
+
+/** Inverts matrix */
+CVAPI(double)  cvInvert( const CvArr* src, CvArr* dst,
+                         int method CV_DEFAULT(CV_LU));
+#define cvInv cvInvert
+
+/** Solves linear system (src1)*(dst) = (src2)
+   (returns 0 if src1 is singular and CV_LU method is used) */
+CVAPI(int)  cvSolve( const CvArr* src1, const CvArr* src2, CvArr* dst,
+                     int method CV_DEFAULT(CV_LU));
+
+/** Calculates determinant of input matrix */
+CVAPI(double) cvDet( const CvArr* mat );
+
+/** Calculates trace of the matrix (sum of elements on the main diagonal) */
+CVAPI(CvScalar) cvTrace( const CvArr* mat );
+
+/** Finds eigen values and vectors of a symmetric matrix */
+CVAPI(void)  cvEigenVV( CvArr* mat, CvArr* evects, CvArr* evals,
+                        double eps CV_DEFAULT(0),
+                        int lowindex CV_DEFAULT(-1),
+                        int highindex CV_DEFAULT(-1));
+
+///* Finds selected eigen values and vectors of a symmetric matrix */
+//CVAPI(void)  cvSelectedEigenVV( CvArr* mat, CvArr* evects, CvArr* evals,
+//                                int lowindex, int highindex );
+
+/** Makes an identity matrix (mat_ij = i == j) */
+CVAPI(void)  cvSetIdentity( CvArr* mat, CvScalar value CV_DEFAULT(cvRealScalar(1)) );
+
+/** Fills matrix with given range of numbers */
+CVAPI(CvArr*)  cvRange( CvArr* mat, double start, double end );
+
+/**   @anchor core_c_CovarFlags
+@name Flags for cvCalcCovarMatrix
+@see cvCalcCovarMatrix
+  @{
+*/
+
+/** flag for cvCalcCovarMatrix, transpose([v1-avg, v2-avg,...]) * [v1-avg,v2-avg,...] */
+#define CV_COVAR_SCRAMBLED 0
+
+/** flag for cvCalcCovarMatrix, [v1-avg, v2-avg,...] * transpose([v1-avg,v2-avg,...]) */
+#define CV_COVAR_NORMAL    1
+
+/** flag for cvCalcCovarMatrix, do not calc average (i.e. mean vector) - use the input vector instead
+   (useful for calculating covariance matrix by parts) */
+#define CV_COVAR_USE_AVG   2
+
+/** flag for cvCalcCovarMatrix, scale the covariance matrix coefficients by number of the vectors */
+#define CV_COVAR_SCALE     4
+
+/** flag for cvCalcCovarMatrix, all the input vectors are stored in a single matrix, as its rows */
+#define CV_COVAR_ROWS      8
+
+/** flag for cvCalcCovarMatrix, all the input vectors are stored in a single matrix, as its columns */
+#define CV_COVAR_COLS     16
+
+/** @} */
+
+/** Calculates covariance matrix for a set of vectors
+@see @ref core_c_CovarFlags "flags"
+*/
+CVAPI(void)  cvCalcCovarMatrix( const CvArr** vects, int count,
+                                CvArr* cov_mat, CvArr* avg, int flags );
+
+#define CV_PCA_DATA_AS_ROW 0
+#define CV_PCA_DATA_AS_COL 1
+#define CV_PCA_USE_AVG 2
+CVAPI(void)  cvCalcPCA( const CvArr* data, CvArr* mean,
+                        CvArr* eigenvals, CvArr* eigenvects, int flags );
+
+CVAPI(void)  cvProjectPCA( const CvArr* data, const CvArr* mean,
+                           const CvArr* eigenvects, CvArr* result );
+
+CVAPI(void)  cvBackProjectPCA( const CvArr* proj, const CvArr* mean,
+                               const CvArr* eigenvects, CvArr* result );
+
+/** Calculates Mahalanobis(weighted) distance */
+CVAPI(double)  cvMahalanobis( const CvArr* vec1, const CvArr* vec2, const CvArr* mat );
+#define cvMahalonobis  cvMahalanobis
+
+/****************************************************************************************\
+*                                    Array Statistics                                    *
+\****************************************************************************************/
+
+/** Finds sum of array elements */
+CVAPI(CvScalar)  cvSum( const CvArr* arr );
+
+/** Calculates number of non-zero pixels */
+CVAPI(int)  cvCountNonZero( const CvArr* arr );
+
+/** Calculates mean value of array elements */
+CVAPI(CvScalar)  cvAvg( const CvArr* arr, const CvArr* mask CV_DEFAULT(NULL) );
+
+/** Calculates mean and standard deviation of pixel values */
+CVAPI(void)  cvAvgSdv( const CvArr* arr, CvScalar* mean, CvScalar* std_dev,
+                       const CvArr* mask CV_DEFAULT(NULL) );
+
+/** Finds global minimum, maximum and their positions */
+CVAPI(void)  cvMinMaxLoc( const CvArr* arr, double* min_val, double* max_val,
+                          CvPoint* min_loc CV_DEFAULT(NULL),
+                          CvPoint* max_loc CV_DEFAULT(NULL),
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @anchor core_c_NormFlags
+  @name Flags for cvNorm and cvNormalize
+  @{
+*/
+#define CV_C            1
+#define CV_L1           2
+#define CV_L2           4
+#define CV_NORM_MASK    7
+#define CV_RELATIVE     8
+#define CV_DIFF         16
+#define CV_MINMAX       32
+
+#define CV_DIFF_C       (CV_DIFF | CV_C)
+#define CV_DIFF_L1      (CV_DIFF | CV_L1)
+#define CV_DIFF_L2      (CV_DIFF | CV_L2)
+#define CV_RELATIVE_C   (CV_RELATIVE | CV_C)
+#define CV_RELATIVE_L1  (CV_RELATIVE | CV_L1)
+#define CV_RELATIVE_L2  (CV_RELATIVE | CV_L2)
+/** @} */
+
+/** Finds norm, difference norm or relative difference norm for an array (or two arrays)
+@see @ref core_c_NormFlags "flags"
+*/
+CVAPI(double)  cvNorm( const CvArr* arr1, const CvArr* arr2 CV_DEFAULT(NULL),
+                       int norm_type CV_DEFAULT(CV_L2),
+                       const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @see @ref core_c_NormFlags "flags" */
+CVAPI(void)  cvNormalize( const CvArr* src, CvArr* dst,
+                          double a CV_DEFAULT(1.), double b CV_DEFAULT(0.),
+                          int norm_type CV_DEFAULT(CV_L2),
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @anchor core_c_ReduceFlags
+  @name Flags for cvReduce
+  @{
+*/
+#define CV_REDUCE_SUM 0
+#define CV_REDUCE_AVG 1
+#define CV_REDUCE_MAX 2
+#define CV_REDUCE_MIN 3
+/** @} */
+
+/** @see @ref core_c_ReduceFlags "flags" */
+CVAPI(void)  cvReduce( const CvArr* src, CvArr* dst, int dim CV_DEFAULT(-1),
+                       int op CV_DEFAULT(CV_REDUCE_SUM) );
+
+/****************************************************************************************\
+*                      Discrete Linear Transforms and Related Functions                  *
+\****************************************************************************************/
+
+/** @anchor core_c_DftFlags
+  @name Flags for cvDFT, cvDCT and cvMulSpectrums
+  @{
+  */
+#define CV_DXT_FORWARD  0
+#define CV_DXT_INVERSE  1
+#define CV_DXT_SCALE    2 /**< divide result by size of array */
+#define CV_DXT_INV_SCALE (CV_DXT_INVERSE + CV_DXT_SCALE)
+#define CV_DXT_INVERSE_SCALE CV_DXT_INV_SCALE
+#define CV_DXT_ROWS     4 /**< transform each row individually */
+#define CV_DXT_MUL_CONJ 8 /**< conjugate the second argument of cvMulSpectrums */
+/** @} */
+
+/** Discrete Fourier Transform:
+    complex->complex,
+    real->ccs (forward),
+    ccs->real (inverse)
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvDFT( const CvArr* src, CvArr* dst, int flags,
+                    int nonzero_rows CV_DEFAULT(0) );
+#define cvFFT cvDFT
+
+/** Multiply results of DFTs: DFT(X)*DFT(Y) or DFT(X)*conj(DFT(Y))
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvMulSpectrums( const CvArr* src1, const CvArr* src2,
+                             CvArr* dst, int flags );
+
+/** Finds optimal DFT vector size >= size0 */
+CVAPI(int)  cvGetOptimalDFTSize( int size0 );
+
+/** Discrete Cosine Transform
+@see @ref core_c_DftFlags "flags"
+*/
+CVAPI(void)  cvDCT( const CvArr* src, CvArr* dst, int flags );
+
+/****************************************************************************************\
+*                              Dynamic data structures                                   *
+\****************************************************************************************/
+
+/** Calculates length of sequence slice (with support of negative indices). */
+CVAPI(int) cvSliceLength( CvSlice slice, const CvSeq* seq );
+
+
+/** Creates new memory storage.
+   block_size == 0 means that default,
+   somewhat optimal size, is used (currently, it is 64K) */
+CVAPI(CvMemStorage*)  cvCreateMemStorage( int block_size CV_DEFAULT(0));
+
+
+/** Creates a memory storage that will borrow memory blocks from parent storage */
+CVAPI(CvMemStorage*)  cvCreateChildMemStorage( CvMemStorage* parent );
+
+
+/** Releases memory storage. All the children of a parent must be released before
+   the parent. A child storage returns all the blocks to parent when it is released */
+CVAPI(void)  cvReleaseMemStorage( CvMemStorage** storage );
+
+
+/** Clears memory storage. This is the only way(!!!) (besides cvRestoreMemStoragePos)
+   to reuse memory allocated for the storage - cvClearSeq,cvClearSet ...
+   do not free any memory.
+   A child storage returns all the blocks to the parent when it is cleared */
+CVAPI(void)  cvClearMemStorage( CvMemStorage* storage );
+
+/** Remember a storage "free memory" position */
+CVAPI(void)  cvSaveMemStoragePos( const CvMemStorage* storage, CvMemStoragePos* pos );
+
+/** Restore a storage "free memory" position */
+CVAPI(void)  cvRestoreMemStoragePos( CvMemStorage* storage, CvMemStoragePos* pos );
+
+/** Allocates continuous buffer of the specified size in the storage */
+CVAPI(void*) cvMemStorageAlloc( CvMemStorage* storage, size_t size );
+
+/** Allocates string in memory storage */
+//CVAPI(CvString) cvMemStorageAllocString( CvMemStorage* storage, const char* ptr,
+//                                         int len CV_DEFAULT(-1) );
+
+/** Creates new empty sequence that will reside in the specified storage */
+CVAPI(CvSeq*)  cvCreateSeq( int seq_flags, size_t header_size,
+                            size_t elem_size, CvMemStorage* storage );
+
+/** Changes default size (granularity) of sequence blocks.
+   The default size is ~1Kbyte */
+CVAPI(void)  cvSetSeqBlockSize( CvSeq* seq, int delta_elems );
+
+
+/** Adds new element to the end of sequence. Returns pointer to the element */
+CVAPI(schar*)  cvSeqPush( CvSeq* seq, const void* element CV_DEFAULT(NULL));
+
+
+/** Adds new element to the beginning of sequence. Returns pointer to it */
+CVAPI(schar*)  cvSeqPushFront( CvSeq* seq, const void* element CV_DEFAULT(NULL));
+
+
+/** Removes the last element from sequence and optionally saves it */
+CVAPI(void)  cvSeqPop( CvSeq* seq, void* element CV_DEFAULT(NULL));
+
+
+/** Removes the first element from sequence and optionally saves it */
+CVAPI(void)  cvSeqPopFront( CvSeq* seq, void* element CV_DEFAULT(NULL));
+
+
+#define CV_FRONT 1
+#define CV_BACK 0
+/** Adds several new elements to the end of sequence */
+CVAPI(void)  cvSeqPushMulti( CvSeq* seq, const void* elements,
+                             int count, int in_front CV_DEFAULT(0) );
+
+/** Removes several elements from the end of sequence and optionally saves them */
+CVAPI(void)  cvSeqPopMulti( CvSeq* seq, void* elements,
+                            int count, int in_front CV_DEFAULT(0) );
+
+/** Inserts a new element in the middle of sequence.
+   cvSeqInsert(seq,0,elem) == cvSeqPushFront(seq,elem) */
+CVAPI(schar*)  cvSeqInsert( CvSeq* seq, int before_index,
+                            const void* element CV_DEFAULT(NULL));
+
+/** Removes specified sequence element */
+CVAPI(void)  cvSeqRemove( CvSeq* seq, int index );
+
+
+/** Removes all the elements from the sequence. The freed memory
+   can be reused later only by the same sequence unless cvClearMemStorage
+   or cvRestoreMemStoragePos is called */
+CVAPI(void)  cvClearSeq( CvSeq* seq );
+
+
+/** Retrieves pointer to specified sequence element.
+   Negative indices are supported and mean counting from the end
+   (e.g -1 means the last sequence element) */
+CVAPI(schar*)  cvGetSeqElem( const CvSeq* seq, int index );
+
+/** Calculates index of the specified sequence element.
+   Returns -1 if element does not belong to the sequence */
+CVAPI(int)  cvSeqElemIdx( const CvSeq* seq, const void* element,
+                         CvSeqBlock** block CV_DEFAULT(NULL) );
+
+/** Initializes sequence writer. The new elements will be added to the end of sequence */
+CVAPI(void)  cvStartAppendToSeq( CvSeq* seq, CvSeqWriter* writer );
+
+
+/** Combination of cvCreateSeq and cvStartAppendToSeq */
+CVAPI(void)  cvStartWriteSeq( int seq_flags, int header_size,
+                              int elem_size, CvMemStorage* storage,
+                              CvSeqWriter* writer );
+
+/** Closes sequence writer, updates sequence header and returns pointer
+   to the resultant sequence
+   (which may be useful if the sequence was created using cvStartWriteSeq)
+*/
+CVAPI(CvSeq*)  cvEndWriteSeq( CvSeqWriter* writer );
+
+
+/** Updates sequence header. May be useful to get access to some of previously
+   written elements via cvGetSeqElem or sequence reader */
+CVAPI(void)   cvFlushSeqWriter( CvSeqWriter* writer );
+
+
+/** Initializes sequence reader.
+   The sequence can be read in forward or backward direction */
+CVAPI(void) cvStartReadSeq( const CvSeq* seq, CvSeqReader* reader,
+                           int reverse CV_DEFAULT(0) );
+
+
+/** Returns current sequence reader position (currently observed sequence element) */
+CVAPI(int)  cvGetSeqReaderPos( CvSeqReader* reader );
+
+
+/** Changes sequence reader position. It may seek to an absolute position or
+   to a position relative to the current one */
+CVAPI(void)   cvSetSeqReaderPos( CvSeqReader* reader, int index,
+                                 int is_relative CV_DEFAULT(0));
+
+/** Copies sequence content to a continuous piece of memory */
+CVAPI(void*)  cvCvtSeqToArray( const CvSeq* seq, void* elements,
+                               CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ) );
+
+/** Creates sequence header for array.
+   After that all the operations on sequences that do not alter the content
+   can be applied to the resultant sequence */
+CVAPI(CvSeq*) cvMakeSeqHeaderForArray( int seq_type, int header_size,
+                                       int elem_size, void* elements, int total,
+                                       CvSeq* seq, CvSeqBlock* block );
+
+/** Extracts sequence slice (with or without copying sequence elements) */
+CVAPI(CvSeq*) cvSeqSlice( const CvSeq* seq, CvSlice slice,
+                         CvMemStorage* storage CV_DEFAULT(NULL),
+                         int copy_data CV_DEFAULT(0));
+
+/** Clones a sequence: takes the CV_WHOLE_SEQ slice of seq via cvSeqSlice with
+   copy_data = 1, so the sequence elements are copied.
+   storage is forwarded to cvSeqSlice as-is and its result is returned unchanged */
+CV_INLINE CvSeq* cvCloneSeq( const CvSeq* seq, CvMemStorage* storage CV_DEFAULT(NULL))
+{
+    CvSeq* clone = cvSeqSlice( seq, CV_WHOLE_SEQ, storage, 1 /* copy_data */ );
+    return clone;
+}
+
+/** Removes sequence slice */
+CVAPI(void)  cvSeqRemoveSlice( CvSeq* seq, CvSlice slice );
+
+/** Inserts a sequence or array into another sequence */
+CVAPI(void)  cvSeqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr );
+
+/** a < b ? -1 : a > b ? 1 : 0 */
+typedef int (CV_CDECL* CvCmpFunc)(const void* a, const void* b, void* userdata );
+
+/** Sorts sequence in-place given element comparison function */
+CVAPI(void) cvSeqSort( CvSeq* seq, CvCmpFunc func, void* userdata CV_DEFAULT(NULL) );
+
+/** Finds element in a [sorted] sequence */
+CVAPI(schar*) cvSeqSearch( CvSeq* seq, const void* elem, CvCmpFunc func,
+                           int is_sorted, int* elem_idx,
+                           void* userdata CV_DEFAULT(NULL) );
+
+/** Reverses order of sequence elements in-place */
+CVAPI(void) cvSeqInvert( CvSeq* seq );
+
+/** Splits sequence into one or more equivalence classes using the specified criteria */
+CVAPI(int)  cvSeqPartition( const CvSeq* seq, CvMemStorage* storage,
+                            CvSeq** labels, CvCmpFunc is_equal, void* userdata );
+
+/************ Internal sequence functions ************/
+CVAPI(void)  cvChangeSeqBlock( void* reader, int direction );
+CVAPI(void)  cvCreateSeqBlock( CvSeqWriter* writer );
+
+
+/** Creates a new set */
+CVAPI(CvSet*)  cvCreateSet( int set_flags, int header_size,
+                            int elem_size, CvMemStorage* storage );
+
+/** Adds new element to the set and optionally returns a pointer to it via inserted_elem */
+CVAPI(int)  cvSetAdd( CvSet* set_header, CvSetElem* elem CV_DEFAULT(NULL),
+                      CvSetElem** inserted_elem CV_DEFAULT(NULL) );
+
+/** Fast variant of cvSetAdd: reuses the head of the free list when one is
+   available and only falls back to cvSetAdd when the free list is empty */
+CV_INLINE  CvSetElem* cvSetNew( CvSet* set_header )
+{
+    CvSetElem* node = set_header->free_elems;
+
+    /* empty free list: let cvSetAdd provide the element */
+    if( !node )
+    {
+        cvSetAdd( set_header, NULL, &node );
+        return node;
+    }
+
+    /* pop the head of the free list and keep only the index bits of its flags */
+    set_header->free_elems = node->next_free;
+    node->flags &= CV_SET_ELEM_IDX_MASK;
+    set_header->active_count++;
+    return node;
+}
+
+/** Removes set element given its pointer: the element is flagged as free
+   and pushed onto the head of the set's free list */
+CV_INLINE  void cvSetRemoveByPtr( CvSet* set_header, void* elem )
+{
+    CvSetElem* freed = (CvSetElem*)elem;
+
+    /* the element's flags must be non-negative on entry */
+    assert( freed->flags >= 0 );
+
+    set_header->active_count--;
+
+    /* link in front of the current free list, then mark as free keeping the index bits */
+    freed->next_free = set_header->free_elems;
+    freed->flags = (freed->flags & CV_SET_ELEM_IDX_MASK) | CV_SET_ELEM_FREE_FLAG;
+    set_header->free_elems = freed;
+}
+
+/** Removes element from the set by its index  */
+CVAPI(void)   cvSetRemove( CvSet* set_header, int index );
+
+/** Looks up a set element by index. NULL is returned when the lookup yields
+   nothing or when the slot does not pass CV_IS_SET_ELEM */
+CV_INLINE CvSetElem* cvGetSetElem( const CvSet* set_header, int idx )
+{
+    CvSetElem* candidate = (CvSetElem*)(void *)cvGetSeqElem( (CvSeq*)set_header, idx );
+
+    if( !candidate || !CV_IS_SET_ELEM( candidate ) )
+        return 0;
+    return candidate;
+}
+
+/** Removes all the elements from the set */
+CVAPI(void)  cvClearSet( CvSet* set_header );
+
+/** Creates new graph */
+CVAPI(CvGraph*)  cvCreateGraph( int graph_flags, int header_size,
+                                int vtx_size, int edge_size,
+                                CvMemStorage* storage );
+
+/** Adds new vertex to the graph */
+CVAPI(int)  cvGraphAddVtx( CvGraph* graph, const CvGraphVtx* vtx CV_DEFAULT(NULL),
+                           CvGraphVtx** inserted_vtx CV_DEFAULT(NULL) );
+
+
+/** Removes vertex from the graph together with all incident edges */
+CVAPI(int)  cvGraphRemoveVtx( CvGraph* graph, int index );
+CVAPI(int)  cvGraphRemoveVtxByPtr( CvGraph* graph, CvGraphVtx* vtx );
+
+
+/** Link two vertices specified by indices or pointers if they
+   are not connected or return pointer to already existing edge
+   connecting the vertices.
+   Functions return 1 if a new edge was created, 0 otherwise */
+CVAPI(int)  cvGraphAddEdge( CvGraph* graph,
+                            int start_idx, int end_idx,
+                            const CvGraphEdge* edge CV_DEFAULT(NULL),
+                            CvGraphEdge** inserted_edge CV_DEFAULT(NULL) );
+
+CVAPI(int)  cvGraphAddEdgeByPtr( CvGraph* graph,
+                               CvGraphVtx* start_vtx, CvGraphVtx* end_vtx,
+                               const CvGraphEdge* edge CV_DEFAULT(NULL),
+                               CvGraphEdge** inserted_edge CV_DEFAULT(NULL) );
+
+/** Remove edge connecting two vertices */
+CVAPI(void)  cvGraphRemoveEdge( CvGraph* graph, int start_idx, int end_idx );
+CVAPI(void)  cvGraphRemoveEdgeByPtr( CvGraph* graph, CvGraphVtx* start_vtx,
+                                     CvGraphVtx* end_vtx );
+
+/** Find edge connecting two vertices */
+CVAPI(CvGraphEdge*)  cvFindGraphEdge( const CvGraph* graph, int start_idx, int end_idx );
+CVAPI(CvGraphEdge*)  cvFindGraphEdgeByPtr( const CvGraph* graph,
+                                           const CvGraphVtx* start_vtx,
+                                           const CvGraphVtx* end_vtx );
+#define cvGraphFindEdge cvFindGraphEdge
+#define cvGraphFindEdgeByPtr cvFindGraphEdgeByPtr
+
+/** Remove all vertices and edges from the graph */
+CVAPI(void)  cvClearGraph( CvGraph* graph );
+
+
+/** Count number of edges incident to the vertex */
+CVAPI(int)  cvGraphVtxDegree( const CvGraph* graph, int vtx_idx );
+CVAPI(int)  cvGraphVtxDegreeByPtr( const CvGraph* graph, const CvGraphVtx* vtx );
+
+
+/** Retrieves graph vertex by given index.
+   The whole expansion is parenthesized so that a postfix operator applied to
+   the macro result (e.g. cvGetGraphVtx(g, i)->flags) binds to the CvGraphVtx*
+   value rather than to the un-cast cvGetSetElem() result */
+#define cvGetGraphVtx( graph, idx ) ((CvGraphVtx*)cvGetSetElem((CvSet*)(graph), (idx)))
+
+/** Retrieves index of a graph vertex given its pointer */
+#define cvGraphVtxIdx( graph, vtx ) ((vtx)->flags & CV_SET_ELEM_IDX_MASK)
+
+/** Retrieves index of a graph edge given its pointer */
+#define cvGraphEdgeIdx( graph, edge ) ((edge)->flags & CV_SET_ELEM_IDX_MASK)
+
+#define cvGraphGetVtxCount( graph ) ((graph)->active_count)
+#define cvGraphGetEdgeCount( graph ) ((graph)->edges->active_count)
+
+/* Graph traversal item codes. The positive values are distinct bits, so they
+   can be OR-ed together.
+   NOTE(review): presumably these are the values returned by cvNextGraphItem
+   and accepted in the cvCreateGraphScanner mask - confirm against the implementation. */
+#define  CV_GRAPH_VERTEX        1
+#define  CV_GRAPH_TREE_EDGE     2
+#define  CV_GRAPH_BACK_EDGE     4
+#define  CV_GRAPH_FORWARD_EDGE  8
+#define  CV_GRAPH_CROSS_EDGE    16
+/* 30 == 2|4|8|16: tree, back, forward and cross edges together */
+#define  CV_GRAPH_ANY_EDGE      30
+#define  CV_GRAPH_NEW_TREE      32
+#define  CV_GRAPH_BACKTRACKING  64
+#define  CV_GRAPH_OVER          -1
+
+/* all bits set; used as the default mask of cvCreateGraphScanner */
+#define  CV_GRAPH_ALL_ITEMS    -1
+
+/** flags for graph vertices and edges */
+#define  CV_GRAPH_ITEM_VISITED_FLAG  (1 << 30)
+#define  CV_IS_GRAPH_VERTEX_VISITED(vtx) \
+    (((CvGraphVtx*)(vtx))->flags & CV_GRAPH_ITEM_VISITED_FLAG)
+#define  CV_IS_GRAPH_EDGE_VISITED(edge) \
+    (((CvGraphEdge*)(edge))->flags & CV_GRAPH_ITEM_VISITED_FLAG)
+#define  CV_GRAPH_SEARCH_TREE_NODE_FLAG   (1 << 29)
+#define  CV_GRAPH_FORWARD_EDGE_FLAG       (1 << 28)
+
+/** Graph traversal state. Instances are created by cvCreateGraphScanner,
+   passed to cvNextGraphItem and released with cvReleaseGraphScanner. */
+typedef struct CvGraphScanner
+{
+    CvGraphVtx* vtx;       /* current graph vertex (or current edge origin) */
+    CvGraphVtx* dst;       /* current graph edge destination vertex */
+    CvGraphEdge* edge;     /* current edge */
+
+    CvGraph* graph;        /* the graph */
+    CvSeq*   stack;        /* the graph vertex stack */
+    int      index;        /* the lower bound of certainly visited vertices */
+    int      mask;         /* event mask; presumably a combination of the CV_GRAPH_* item codes - confirm */
+}
+CvGraphScanner;
+
+/** Creates new graph scanner. */
+CVAPI(CvGraphScanner*)  cvCreateGraphScanner( CvGraph* graph,
+                                             CvGraphVtx* vtx CV_DEFAULT(NULL),
+                                             int mask CV_DEFAULT(CV_GRAPH_ALL_ITEMS));
+
+/** Releases graph scanner. */
+CVAPI(void) cvReleaseGraphScanner( CvGraphScanner** scanner );
+
+/** Get next graph element */
+CVAPI(int)  cvNextGraphItem( CvGraphScanner* scanner );
+
+/** Creates a copy of graph */
+CVAPI(CvGraph*) cvCloneGraph( const CvGraph* graph, CvMemStorage* storage );
+
+
+/** Does look-up transformation. Elements of the source array
+   (that should be 8uC1 or 8sC1) are used as indexes into the 256-element table lut */
+CVAPI(void) cvLUT( const CvArr* src, CvArr* dst, const CvArr* lut );
+
+
+/******************* Iteration through the sequence tree *****************/
+/** Iterator state for walking a tree of sequences. It is initialized by
+   cvInitTreeNodeIterator and passed to cvNextTreeNode / cvPrevTreeNode. */
+typedef struct CvTreeNodeIterator
+{
+    const void* node;   /* presumably the node the iterator currently points at - confirm */
+    int level;          /* presumably the depth of that node within the tree - confirm */
+    int max_level;      /* presumably the depth limit given to cvInitTreeNodeIterator - confirm */
+}
+CvTreeNodeIterator;
+
+CVAPI(void) cvInitTreeNodeIterator( CvTreeNodeIterator* tree_iterator,
+                                   const void* first, int max_level );
+CVAPI(void*) cvNextTreeNode( CvTreeNodeIterator* tree_iterator );
+CVAPI(void*) cvPrevTreeNode( CvTreeNodeIterator* tree_iterator );
+
+/** Inserts sequence into tree with specified "parent" sequence.
+   If parent is equal to frame (e.g. the most external contour),
+   then added contour will have null pointer to parent. */
+CVAPI(void) cvInsertNodeIntoTree( void* node, void* parent, void* frame );
+
+/** Removes contour from tree (together with the contour children). */
+CVAPI(void) cvRemoveNodeFromTree( void* node, void* frame );
+
+/** Gathers pointers to all the sequences,
+   accessible from the `first`, to the single sequence */
+CVAPI(CvSeq*) cvTreeToNodeSeq( const void* first, int header_size,
+                              CvMemStorage* storage );
+
+/** The function implements the K-means algorithm for clustering an array of sample
+   vectors in a specified number of classes */
+#define CV_KMEANS_USE_INITIAL_LABELS    1
+CVAPI(int) cvKMeans2( const CvArr* samples, int cluster_count, CvArr* labels,
+                      CvTermCriteria termcrit, int attempts CV_DEFAULT(1),
+                      CvRNG* rng CV_DEFAULT(0), int flags CV_DEFAULT(0),
+                      CvArr* _centers CV_DEFAULT(0), double* compactness CV_DEFAULT(0) );
+
+/****************************************************************************************\
+*                                    System functions                                    *
+\****************************************************************************************/
+
+/** Loads optimized functions from IPP, MKL etc. or switches back to pure C code */
+CVAPI(int)  cvUseOptimized( int on_off );
+
+typedef IplImage* (CV_STDCALL* Cv_iplCreateImageHeader)
+                            (int,int,int,char*,char*,int,int,int,int,int,
+                            IplROI*,IplImage*,void*,IplTileInfo*);
+typedef void (CV_STDCALL* Cv_iplAllocateImageData)(IplImage*,int,int);
+typedef void (CV_STDCALL* Cv_iplDeallocate)(IplImage*,int);
+typedef IplROI* (CV_STDCALL* Cv_iplCreateROI)(int,int,int,int,int);
+typedef IplImage* (CV_STDCALL* Cv_iplCloneImage)(const IplImage*);
+
+/** @brief Makes OpenCV use IPL functions for allocating IplImage and IplROI structures.
+
+Normally, the function is not called directly. Instead, a simple macro
+CV_TURN_ON_IPL_COMPATIBILITY() is used that calls cvSetIPLAllocators and passes there pointers
+to IPL allocation functions. :
+@code
+    ...
+    CV_TURN_ON_IPL_COMPATIBILITY()
+    ...
+@endcode
+@param create_header pointer to a function, creating IPL image header.
+@param allocate_data pointer to a function, allocating IPL image data.
+@param deallocate pointer to a function, deallocating IPL image.
+@param create_roi pointer to a function, creating IPL image ROI (i.e. Region of Interest).
+@param clone_image pointer to a function, cloning an IPL image.
+ */
+CVAPI(void) cvSetIPLAllocators( Cv_iplCreateImageHeader create_header,
+                               Cv_iplAllocateImageData allocate_data,
+                               Cv_iplDeallocate deallocate,
+                               Cv_iplCreateROI create_roi,
+                               Cv_iplCloneImage clone_image );
+
+#define CV_TURN_ON_IPL_COMPATIBILITY()                                  \
+    cvSetIPLAllocators( iplCreateImageHeader, iplAllocateImage,         \
+                        iplDeallocate, iplCreateROI, iplCloneImage )
+
+/****************************************************************************************\
+*                                    Data Persistence                                    *
+\****************************************************************************************/
+
+#if 0
+/********************************** High-level functions ********************************/
+
+/** @brief Opens file storage for reading or writing data.
+
+The function opens file storage for reading or writing data. In the latter case, a new file is
+created or an existing file is rewritten. The type of the read or written file is determined by the
+filename extension: .xml for XML, .yml or .yaml for YAML and .json for JSON.
+
+At the same time, it also supports adding parameters like "example.xml?base64".
+
+The function returns a pointer to the CvFileStorage structure.
+If the file cannot be opened then the function returns NULL.
+@param filename Name of the file associated with the storage
+@param memstorage Memory storage used for temporary data and for
+:   storing dynamic structures, such as CvSeq or CvGraph . If it is NULL, a temporary memory
+    storage is created and used.
+@param flags Can be one of the following:
+> -   **CV_STORAGE_READ** the storage is open for reading
+> -   **CV_STORAGE_WRITE** the storage is open for writing
+      (use **CV_STORAGE_WRITE | CV_STORAGE_WRITE_BASE64** to write rawdata in Base64)
+@param encoding
+ */
+CVAPI(CvFileStorage*)  cvOpenFileStorage( const char* filename, CvMemStorage* memstorage,
+                                          int flags, const char* encoding CV_DEFAULT(NULL) );
+
+/** @brief Releases file storage.
+
+The function closes the file associated with the storage and releases all the temporary structures.
+It must be called after all I/O operations with the storage are finished.
+@param fs Double pointer to the released file storage
+ */
+CVAPI(void) cvReleaseFileStorage( CvFileStorage** fs );
+
+/** returns attribute value or 0 (NULL) if there is no such attribute */
+CVAPI(const char*) cvAttrValue( const CvAttrList* attr, const char* attr_name );
+
+/** @brief Starts writing a new structure.
+
+The function starts writing a compound structure (collection) that can be a sequence or a map. After
+all the structure fields, which can be scalars or structures, are written, cvEndWriteStruct should
+be called. The function can be used to group some objects or to implement the write function for
+some user object (see CvTypeInfo).
+@param fs File storage
+@param name Name of the written structure. The structure can be accessed by this name when the
+storage is read.
+@param struct_flags A combination of the following values:
+-   **CV_NODE_SEQ** the written structure is a sequence (see discussion of CvFileStorage ),
+    that is, its elements do not have a name.
+-   **CV_NODE_MAP** the written structure is a map (see discussion of CvFileStorage ), that
+    is, all its elements have names.
+One and only one of the two above flags must be specified
+-   **CV_NODE_FLOW** the optional flag that makes sense only for YAML streams. It means that
+     the structure is written as a flow (not as a block), which is more compact. It is
+     recommended to use this flag for structures or arrays whose elements are all scalars.
+@param type_name Optional parameter - the object type name. In
+    case of XML it is written as a type_id attribute of the structure opening tag. In the case of
+    YAML it is written after a colon following the structure name (see the example in
+    CvFileStorage description). In case of JSON it is written as a name/value pair.
+    Mainly it is used with user objects. When the storage is read, the
+    encoded type name is used to determine the object type (see CvTypeInfo and cvFindType ).
+@param attributes This parameter is not used in the current implementation
+ */
+CVAPI(void) cvStartWriteStruct( CvFileStorage* fs, const char* name,
+                                int struct_flags, const char* type_name CV_DEFAULT(NULL),
+                                CvAttrList attributes CV_DEFAULT(cvAttrList()));
+
+/** @brief Finishes writing to a file node collection.
+@param fs File storage
+@sa cvStartWriteStruct.
+ */
+CVAPI(void) cvEndWriteStruct( CvFileStorage* fs );
+
+/** @brief Writes an integer value.
+
+The function writes a single integer value (with or without a name) to the file storage.
+@param fs File storage
+@param name Name of the written value. Should be NULL if and only if the parent structure is a
+sequence.
+@param value The written value
+ */
+CVAPI(void) cvWriteInt( CvFileStorage* fs, const char* name, int value );
+
+/** @brief Writes a floating-point value.
+
+The function writes a single floating-point value (with or without a name) to file storage. Special
+values are encoded as follows: NaN (Not A Number) as .NaN, infinity as +.Inf or -.Inf.
+
+The following example shows how to use the low-level writing functions to store custom structures,
+such as termination criteria, without registering a new type. :
+@code
+    void write_termcriteria( CvFileStorage* fs, const char* struct_name,
+                             CvTermCriteria* termcrit )
+    {
+        cvStartWriteStruct( fs, struct_name, CV_NODE_MAP, NULL, cvAttrList(0,0));
+        cvWriteComment( fs, "termination criteria", 1 ); // just a description
+        if( termcrit->type & CV_TERMCRIT_ITER )
+            cvWriteInt( fs, "max_iterations", termcrit->max_iter );
+        if( termcrit->type & CV_TERMCRIT_EPS )
+            cvWriteReal( fs, "accuracy", termcrit->epsilon );
+        cvEndWriteStruct( fs );
+    }
+@endcode
+@param fs File storage
+@param name Name of the written value. Should be NULL if and only if the parent structure is a
+sequence.
+@param value The written value
+*/
+CVAPI(void) cvWriteReal( CvFileStorage* fs, const char* name, double value );
+
+/** @brief Writes a text string.
+
+The function writes a text string to file storage.
+@param fs File storage
+@param name Name of the written string . Should be NULL if and only if the parent structure is a
+sequence.
+@param str The written text string
+@param quote If non-zero, the written string is put in quotes, regardless of whether they are
+required. Otherwise, if the flag is zero, quotes are used only when they are required (e.g. when
+the string starts with a digit or contains spaces).
+ */
+CVAPI(void) cvWriteString( CvFileStorage* fs, const char* name,
+                           const char* str, int quote CV_DEFAULT(0) );
+
+/** @brief Writes a comment.
+
+The function writes a comment into file storage. The comments are skipped when the storage is read.
+@param fs File storage
+@param comment The written comment, single-line or multi-line
+@param eol_comment If non-zero, the function tries to put the comment at the end of current line.
+If the flag is zero, if the comment is multi-line, or if it does not fit at the end of the current
+line, the comment starts a new line.
+ */
+CVAPI(void) cvWriteComment( CvFileStorage* fs, const char* comment,
+                            int eol_comment );
+
+/** @brief Writes an object to file storage.
+
+The function writes an object to file storage. First, the appropriate type info is found using
+cvTypeOf. Then, the write method associated with the type info is called.
+
+Attributes are used to customize the writing procedure. The standard types support the following
+attributes (all the dt attributes have the same format as in cvWriteRawData):
+
+-# CvSeq
+    -   **header_dt** description of user fields of the sequence header that follow CvSeq, or
+        CvChain (if the sequence is a Freeman chain) or CvContour (if the sequence is a contour or
+        point sequence)
+    -   **dt** description of the sequence elements.
+    -   **recursive** if the attribute is present and is not equal to "0" or "false", the whole
+        tree of sequences (contours) is stored.
+-# CvGraph
+    -   **header_dt** description of user fields of the graph header that follows CvGraph;
+    -   **vertex_dt** description of user fields of graph vertices
+    -   **edge_dt** description of user fields of graph edges (note that the edge weight is
+        always written, so there is no need to specify it explicitly)
+
+Below is the code that creates the YAML file shown in the CvFileStorage description:
+@code
+    #include "cxcore.h"
+
+    int main( int argc, char** argv )
+    {
+        CvMat* mat = cvCreateMat( 3, 3, CV_32F );
+        CvFileStorage* fs = cvOpenFileStorage( "example.yml", 0, CV_STORAGE_WRITE );
+
+        cvSetIdentity( mat );
+        cvWrite( fs, "A", mat, cvAttrList(0,0) );
+
+        cvReleaseFileStorage( &fs );
+        cvReleaseMat( &mat );
+        return 0;
+    }
+@endcode
+@param fs File storage
+@param name Name of the written object. Should be NULL if and only if the parent structure is a
+sequence.
+@param ptr Pointer to the object
+@param attributes The attributes of the object. They are specific for each particular type (see
+the discussion above).
+ */
+CVAPI(void) cvWrite( CvFileStorage* fs, const char* name, const void* ptr,
+                         CvAttrList attributes CV_DEFAULT(cvAttrList()));
+
+/** @brief Starts the next stream.
+
+The function finishes the currently written stream and starts the next stream. In the case of XML
+the file with multiple streams looks like this:
+@code{.xml}
+    <opencv_storage>
+    <!-- stream #1 data -->
+    </opencv_storage>
+    <opencv_storage>
+    <!-- stream #2 data -->
+    </opencv_storage>
+    ...
+@endcode
+The YAML file will look like this:
+@code{.yaml}
+    %YAML 1.0
+    # stream #1 data
+    ...
+    ---
+    # stream #2 data
+@endcode
+This is useful for concatenating files or for resuming the writing process.
+@param fs File storage
+ */
+CVAPI(void) cvStartNextStream( CvFileStorage* fs );
+
+/** @brief Writes multiple numbers.
+
+The function writes an array, whose elements consist of single or multiple numbers. The function
+call can be replaced with a loop containing a few cvWriteInt and cvWriteReal calls, but a single
+call is more efficient. Note that because none of the elements have a name, they should be written
+to a sequence rather than a map.
+@param fs File storage
+@param src Pointer to the written array
+@param len Number of the array elements to write
+@param dt Specification of each array element, see @ref format_spec "format specification"
+ */
+CVAPI(void) cvWriteRawData( CvFileStorage* fs, const void* src,
+                                int len, const char* dt );
+
+/** @brief Writes multiple numbers in Base64.
+
+If either CV_STORAGE_WRITE_BASE64 or cv::FileStorage::WRITE_BASE64 is used,
+this function will be the same as cvWriteRawData. If neither, the main
+difference is that it outputs a sequence in Base64 encoding rather than
+in plain text.
+
+This function can only be used to write a sequence with a type "binary".
+
+@param fs File storage
+@param src Pointer to the written array
+@param len Number of the array elements to write
+@param dt Specification of each array element, see @ref format_spec "format specification"
+*/
+CVAPI(void) cvWriteRawDataBase64( CvFileStorage* fs, const void* src,
+                                 int len, const char* dt );
+
+/** @brief Returns a unique pointer for a given name.
+
+The function returns a unique pointer for each particular file node name. This pointer can be then
+passed to the cvGetFileNode function that is faster than cvGetFileNodeByName because it compares
+text strings by comparing pointers rather than the strings' content.
+
+Consider the following example where an array of points is encoded as a sequence of 2-entry maps:
+@code
+    points:
+      - { x: 10, y: 10 }
+      - { x: 20, y: 20 }
+      - { x: 30, y: 30 }
+      # ...
+@endcode
+Then, it is possible to get hashed "x" and "y" pointers to speed up decoding of the points. :
+@code
+    #include "cxcore.h"
+
+    int main( int argc, char** argv )
+    {
+        CvFileStorage* fs = cvOpenFileStorage( "points.yml", 0, CV_STORAGE_READ );
+        CvStringHashNode* x_key = cvGetHashedKey( fs, "x", -1, 1 );
+        CvStringHashNode* y_key = cvGetHashedKey( fs, "y", -1, 1 );
+        CvFileNode* points = cvGetFileNodeByName( fs, 0, "points" );
+
+        if( CV_NODE_IS_SEQ(points->tag) )
+        {
+            CvSeq* seq = points->data.seq;
+            int i, total = seq->total;
+            CvSeqReader reader;
+            cvStartReadSeq( seq, &reader, 0 );
+            for( i = 0; i < total; i++ )
+            {
+                CvFileNode* pt = (CvFileNode*)reader.ptr;
+    #if 1 // faster variant
+                CvFileNode* xnode = cvGetFileNode( fs, pt, x_key, 0 );
+                CvFileNode* ynode = cvGetFileNode( fs, pt, y_key, 0 );
+                assert( xnode && CV_NODE_IS_INT(xnode->tag) &&
+                        ynode && CV_NODE_IS_INT(ynode->tag));
+                int x = xnode->data.i; // or x = cvReadInt( xnode, 0 );
+                int y = ynode->data.i; // or y = cvReadInt( ynode, 0 );
+    #elif 1 // slower variant; does not use x_key & y_key
+                CvFileNode* xnode = cvGetFileNodeByName( fs, pt, "x" );
+                CvFileNode* ynode = cvGetFileNodeByName( fs, pt, "y" );
+                assert( xnode && CV_NODE_IS_INT(xnode->tag) &&
+                        ynode && CV_NODE_IS_INT(ynode->tag));
+                int x = xnode->data.i; // or x = cvReadInt( xnode, 0 );
+                int y = ynode->data.i; // or y = cvReadInt( ynode, 0 );
+    #else // the slowest yet the easiest to use variant
+                int x = cvReadIntByName( fs, pt, "x", 0 );
+                int y = cvReadIntByName( fs, pt, "y", 0 );
+    #endif
+                CV_NEXT_SEQ_ELEM( seq->elem_size, reader );
+                printf("%d: (%d, %d)\n", i, x, y );
+            }
+        }
+        cvReleaseFileStorage( &fs );
+        return 0;
+    }
+@endcode
+Please note that whatever method of accessing a map you are using, it is still much slower than
+using plain sequences; for example, in the above example, it is more efficient to encode the points
+as pairs of integers in a single numeric sequence.
+@param fs File storage
+@param name Literal node name
+@param len Length of the name (if it is known a priori), or -1 if it needs to be calculated
+@param create_missing Flag that specifies, whether an absent key should be added into the hash table
+*/
+CVAPI(CvStringHashNode*) cvGetHashedKey( CvFileStorage* fs, const char* name,
+                                        int len CV_DEFAULT(-1),
+                                        int create_missing CV_DEFAULT(0));
+
+/** @brief Retrieves one of the top-level nodes of the file storage.
+
+The function returns one of the top-level file nodes. The top-level nodes do not have a name, they
+correspond to the streams that are stored one after another in the file storage. If the index is out
+of range, the function returns a NULL pointer, so all the top-level nodes can be iterated by
+subsequent calls to the function with stream_index=0,1,..., until the NULL pointer is returned.
+This function can be used as a base for recursive traversal of the file storage.
+@param fs File storage
+@param stream_index Zero-based index of the stream. See cvStartNextStream . In most cases,
+there is only one stream in the file; however, there can be several.
+ */
+CVAPI(CvFileNode*) cvGetRootFileNode( const CvFileStorage* fs,
+                                     int stream_index CV_DEFAULT(0) );
+
+/** @brief Finds a node in a map or file storage.
+
+The function finds a file node. It is a faster version of cvGetFileNodeByName (see
+cvGetHashedKey discussion). Also, the function can insert a new node, if it is not in the map yet.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node. If both map and
+key are NULLs, the function returns the root file node - a map that contains top-level nodes.
+@param key Unique pointer to the node name, retrieved with cvGetHashedKey
+@param create_missing Flag that specifies whether an absent node should be added to the map
+ */
+CVAPI(CvFileNode*) cvGetFileNode( CvFileStorage* fs, CvFileNode* map,
+                                 const CvStringHashNode* key,
+                                 int create_missing CV_DEFAULT(0) );
+
+/** @brief Finds a node in a map or file storage.
+
+The function finds a file node by name. The node is searched either in map or, if the pointer is
+NULL, among the top-level file storage nodes. Using this function for maps and cvGetSeqElem (or
+sequence reader) for sequences, it is possible to navigate through the file storage. To speed up
+multiple queries for a certain key (e.g., in the case of an array of structures) one may use a
+combination of cvGetHashedKey and cvGetFileNode.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches in all the top-level nodes
+(streams), starting with the first one.
+@param name The file node name
+ */
+CVAPI(CvFileNode*) cvGetFileNodeByName( const CvFileStorage* fs,
+                                       const CvFileNode* map,
+                                       const char* name );
+
+/** @brief Retrieves an integer value from a file node.
+
+The function returns an integer that is represented by the file node. If the file node is NULL, the
+default_value is returned (thus, it is convenient to call the function right after cvGetFileNode
+without checking for a NULL pointer). If the file node has type CV_NODE_INT, then node-\>data.i is
+returned. If the file node has type CV_NODE_REAL, then node-\>data.f is rounded to an integer
+and returned. Otherwise 0x7fffffff is returned.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE int cvReadInt( const CvFileNode* node, int default_value CV_DEFAULT(0) )
+{
+    /* missing node: hand back the caller-supplied default */
+    if( !node )
+        return default_value;
+
+    if( CV_NODE_IS_INT(node->tag) )
+        return node->data.i;
+
+    /* real-valued nodes are rounded to an integer */
+    if( CV_NODE_IS_REAL(node->tag) )
+        return cvRound(node->data.f);
+
+    /* any other node type yields this fixed value */
+    return 0x7fffffff;
+}
+
+/** @brief Finds a file node and returns its value.
+
+The function is a simple superposition of cvGetFileNodeByName and cvReadInt.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE int cvReadIntByName( const CvFileStorage* fs, const CvFileNode* map,
+                         const char* name, int default_value CV_DEFAULT(0) )
+{
+    /* look the node up by name, then decode it (cvReadInt handles a NULL node) */
+    const CvFileNode* found = cvGetFileNodeByName( fs, map, name );
+    return cvReadInt( found, default_value );
+}
+
+/** @brief Retrieves a floating-point value from a file node.
+
+The function returns a floating-point value that is represented by the file node. If the file node
+is NULL, the default_value is returned (thus, it is convenient to call the function right after
+cvGetFileNode without checking for a NULL pointer). If the file node has type CV_NODE_REAL ,
+then node-\>data.f is returned. If the file node has type CV_NODE_INT , then node-\>data.i
+is converted to floating-point and returned. Otherwise the result is not determined.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE double cvReadReal( const CvFileNode* node, double default_value CV_DEFAULT(0.) )
+{
+    /* missing node: hand back the caller-supplied default */
+    if( !node )
+        return default_value;
+
+    /* integer nodes are widened to double */
+    if( CV_NODE_IS_INT(node->tag) )
+        return (double)node->data.i;
+
+    if( CV_NODE_IS_REAL(node->tag) )
+        return node->data.f;
+
+    /* any other node type yields this fixed value */
+    return 1e300;
+}
+
+/** @brief Finds a file node and returns its value.
+
+The function is a simple superposition of cvGetFileNodeByName and cvReadReal .
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE double cvReadRealByName( const CvFileStorage* fs, const CvFileNode* map,
+                        const char* name, double default_value CV_DEFAULT(0.) )
+{
+    /* look the node up by name, then decode it (cvReadReal handles a NULL node) */
+    const CvFileNode* found = cvGetFileNodeByName( fs, map, name );
+    return cvReadReal( found, default_value );
+}
+
+/** @brief Retrieves a text string from a file node.
+
+The function returns the text string represented by the file node. If the file node is NULL,
+default_value is returned, so the function can be called right after cvGetFileNode without
+checking for a NULL pointer. If the file node has type CV_NODE_STR, node-\>data.str.ptr is
+returned. For any other node type the function returns a NULL pointer.
+@param node File node
+@param default_value The value that is returned if node is NULL
+ */
+CV_INLINE const char* cvReadString( const CvFileNode* node,
+                        const char* default_value CV_DEFAULT(NULL) )
+{
+    /* Missing node: hand back the caller-supplied fallback. */
+    if( !node )
+        return default_value;
+    /* String node: expose the stored character data. */
+    if( CV_NODE_IS_STRING(node->tag) )
+        return node->data.str.ptr;
+    /* Node exists but is not a string. */
+    return 0;
+}
+
+/** @brief Finds a file node by its name and returns its string value.
+
+The function looks the node up with cvGetFileNodeByName and passes the result to cvReadString.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param default_value The value that is returned if the file node is not found
+ */
+CV_INLINE const char* cvReadStringByName( const CvFileStorage* fs, const CvFileNode* map,
+                        const char* name, const char* default_value CV_DEFAULT(NULL) )
+{
+    /* The lookup result is forwarded as is; cvReadString handles a NULL node. */
+    const CvFileNode* found = cvGetFileNodeByName( fs, map, name );
+    return cvReadString( found, default_value );
+}
+
+
+/** @brief Decodes an object and returns a pointer to it.
+
+The function decodes a user object (creates an object in a native representation from the file
+storage subtree) and returns it. The object to be decoded must be an instance of a registered type
+that supports the read method (see CvTypeInfo). The type of the object is determined by the type
+name that is encoded in the file. If the object is a dynamic structure, it is created either in
+the memory storage that was passed to cvOpenFileStorage or, if a NULL pointer was passed, in
+temporary memory storage, which is released when cvReleaseFileStorage is called. Otherwise, if the
+object is not a dynamic structure, it is created on the heap and should be released with a
+specialized function or by using the generic cvRelease.
+@param fs File storage
+@param node The root object node
+@param attributes Unused parameter
+ */
+CVAPI(void*) cvRead( CvFileStorage* fs, CvFileNode* node,
+                        CvAttrList* attributes CV_DEFAULT(NULL));
+
+/** @brief Finds an object by name and decodes it.
+
+The function looks the node up with cvGetFileNodeByName and passes the result to cvRead.
+@param fs File storage
+@param map The parent map. If it is NULL, the function searches a top-level node.
+@param name The node name
+@param attributes Unused parameter
+ */
+CV_INLINE void* cvReadByName( CvFileStorage* fs, const CvFileNode* map,
+                              const char* name, CvAttrList* attributes CV_DEFAULT(NULL) )
+{
+    /* cvRead takes a non-const node, so the lookup result is kept non-const. */
+    CvFileNode* found = cvGetFileNodeByName( fs, map, name );
+    return cvRead( fs, found, attributes );
+}
+
+
+/** @brief Initializes the file node sequence reader.
+
+The function initializes the sequence reader to read data from a file node. The initialized reader
+can then be passed to cvReadRawDataSlice.
+@param fs File storage
+@param src The file node (a sequence) to read numbers from
+@param reader Pointer to the sequence reader
+ */
+CVAPI(void) cvStartReadRawData( const CvFileStorage* fs, const CvFileNode* src,
+                               CvSeqReader* reader );
+
+/** @brief Reads elements from a file node sequence through a sequence reader.
+
+The function reads one or more elements from the file node, representing a sequence, to a
+user-specified array. The total number of read sequence elements is the product of count and the
+number of components in each array element. For example, if dt=2if, the function will read
+count\*3 sequence elements. As with any sequence, some parts of the file node sequence can be
+skipped or read repeatedly by repositioning the reader using cvSetSeqReaderPos.
+@param fs File storage
+@param reader The sequence reader. Initialize it with cvStartReadRawData .
+@param count The number of elements to read
+@param dst Pointer to the destination array
+@param dt Specification of each array element. It has the same format as in cvWriteRawData .
+ */
+CVAPI(void) cvReadRawDataSlice( const CvFileStorage* fs, CvSeqReader* reader,
+                               int count, void* dst, const char* dt );
+
+/** @brief Reads multiple numbers.
+
+The function reads elements from a file node that represents a sequence of scalars.
+@param fs File storage
+@param src The file node (a sequence) to read numbers from
+@param dst Pointer to the destination array
+@param dt Specification of each array element. It has the same format as in cvWriteRawData .
+ */
+CVAPI(void) cvReadRawData( const CvFileStorage* fs, const CvFileNode* src,
+                          void* dst, const char* dt );
+
+/** @brief Writes a file node to another file storage.
+
+The function writes a copy of a file node to file storage. Possible applications of the function are
+merging several file storages into one and conversion between XML, YAML and JSON formats.
+@param fs Destination file storage
+@param new_node_name New name of the file node in the destination file storage. To keep the
+existing name, use cvGetFileNodeName
+@param node The written node
+@param embed If the written node is a collection and this parameter is not zero, no extra level of
+hierarchy is created. Instead, all the elements of node are written into the currently written
+structure. Map elements can only be embedded into another map, and sequence elements can only be
+embedded into another sequence.
+ */
+CVAPI(void) cvWriteFileNode( CvFileStorage* fs, const char* new_node_name,
+                            const CvFileNode* node, int embed );
+
+/** @brief Returns the name of a file node.
+
+The function returns the name of a file node or NULL, if the file node does not have a name or if
+node is NULL.
+@param node File node
+ */
+CVAPI(const char*) cvGetFileNodeName( const CvFileNode* node );
+
+/*********************************** Adding own types ***********************************/
+
+/** @brief Registers a new type.
+
+The function registers a new type, which is described by info . The function creates a copy of the
+structure, so the user should delete their own copy after calling the function.
+@param info Type info structure
+ */
+CVAPI(void) cvRegisterType( const CvTypeInfo* info );
+
+/** @brief Unregisters the type.
+
+The function unregisters a type with a specified name. If the name is unknown, it is possible to
+locate the type info by an instance of the type using cvTypeOf or by iterating the type list,
+starting from cvFirstType, and then calling cvUnregisterType(info-\>typeName).
+@param type_name Name of the type to unregister
+ */
+CVAPI(void) cvUnregisterType( const char* type_name );
+
+/** @brief Returns the beginning of a type list.
+
+The function returns the first type in the list of registered types. Navigation through the list can
+be done via the prev and next fields of the CvTypeInfo structure.
+ */
+CVAPI(CvTypeInfo*) cvFirstType(void);
+
+/** @brief Finds a type by its name.
+
+The function finds a registered type by its name. It returns NULL if there is no type with the
+specified name.
+@param type_name Type name
+ */
+CVAPI(CvTypeInfo*) cvFindType( const char* type_name );
+
+/** @brief Returns the type of an object.
+
+The function finds the type of a given object. It iterates through the list of registered types and
+calls the is_instance function/method for every type info structure with that object until one of
+them returns non-zero or until the whole list has been traversed. In the latter case, the function
+returns NULL.
+@param struct_ptr The object pointer
+ */
+CVAPI(CvTypeInfo*) cvTypeOf( const void* struct_ptr );
+
+#endif
+
+/** @brief Releases an object.
+
+The function finds the type of a given object and calls release with the double pointer.
+@param struct_ptr Double pointer to the object
+ */
+CVAPI(void) cvRelease( void** struct_ptr );
+
+/** @brief Makes a clone of an object.
+
+The function finds the type of a given object and calls clone with the passed object. If the
+object type is known, for example, struct_ptr is CvMat\*, it is faster to call the specific
+function, like cvCloneMat.
+@param struct_ptr The object to clone
+ */
+CVAPI(void*) cvClone( const void* struct_ptr );
+
+/*********************************** Measuring Execution Time ***************************/
+
+/** Helper functions for RNG initialization and accurate time measurement;
+   they use the internal clock counter on x86.
+   NOTE(review): the unit of cvGetTickFrequency is not stated in this header - confirm
+   against the implementation before converting tick counts to time. */
+CVAPI(int64)  cvGetTickCount( void );
+CVAPI(double) cvGetTickFrequency( void );
+
+/*********************************** CPU capabilities ***********************************/
+
+/** NOTE(review): presumably reports whether the given CPU feature is available - the
+   feature codes and return convention are not visible here; confirm before use. */
+CVAPI(int) cvCheckHardwareSupport(int feature);
+
+/*********************************** Multi-Threading ************************************/
+
+/** Retrieve/set the number of threads used in OpenMP implementations */
+CVAPI(int)  cvGetNumThreads( void );
+CVAPI(void) cvSetNumThreads( int threads CV_DEFAULT(0) );
+/** Get the index of the thread being executed */
+CVAPI(int)  cvGetThreadNum( void );
+
+
+/********************************** Error Handling **************************************/
+
+/** Returns the current OpenCV error status */
+CVAPI(int) cvGetErrStatus( void );
+
+/** Sets the error status silently */
+CVAPI(void) cvSetErrStatus( int status );
+
+#define CV_ErrModeLeaf     0   /* Print error and exit program */
+#define CV_ErrModeParent   1   /* Print error and continue */
+#define CV_ErrModeSilent   2   /* Don't print and continue */
+
+/** Retrieves the current error processing mode (one of the CV_ErrMode* values above) */
+CVAPI(int)  cvGetErrMode( void );
+
+/** Sets the error processing mode and returns the previously used mode */
+CVAPI(int) cvSetErrMode( int mode );
+
+/** Sets the error status and performs some additional actions (displaying a message box,
+ writing a message to stderr, terminating the application etc.)
+ depending on the current error mode */
+CVAPI(void) cvError( int status, const char* func_name,
+                    const char* err_msg, const char* file_name, int line );
+
+/** Retrieves a textual description of the error given its code */
+CVAPI(const char*) cvErrorStr( int status );
+
+/** Retrieves detailed information about the last error that occurred */
+CVAPI(int) cvGetErrInfo( const char** errcode_desc, const char** description,
+                        const char** filename, int* line );
+
+/** Maps IPP error codes to their counterparts from OpenCV */
+CVAPI(int) cvErrorFromIppStatus( int ipp_status );
+
+/** Signature of an error-handling function: it takes the same arguments as cvError plus a
+ userdata pointer.
+ NOTE(review): presumably userdata is the pointer registered through cvRedirectError - confirm. */
+typedef int (CV_CDECL *CvErrorCallback)( int status, const char* func_name,
+                                        const char* err_msg, const char* file_name, int line, void* userdata );
+
+/** Assigns a new error-handling function */
+CVAPI(CvErrorCallback) cvRedirectError( CvErrorCallback error_handler,
+                                       void* userdata CV_DEFAULT(NULL),
+                                       void** prev_userdata CV_DEFAULT(NULL) );
+
+/** Error handler with the CvErrorCallback signature: outputs nothing */
+CVAPI(int) cvNulDevReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+/** Error handler with the CvErrorCallback signature: outputs to console (fprintf(stderr,...)) */
+CVAPI(int) cvStdErrReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+/** Error handler with the CvErrorCallback signature: outputs to MessageBox (WIN32) */
+CVAPI(int) cvGuiBoxReport( int status, const char* func_name, const char* err_msg,
+                          const char* file_name, int line, void* userdata );
+
+/** Raises an error through cvError, filling in the current source file and line. */
+#define OPENCV_ERROR(status,func,context)                           \
+cvError((status),(func),(context),__FILE__,__LINE__)
+
+/** Raises CV_StsInternal through OPENCV_ERROR when expr evaluates to false.
+ Expands to a brace block, not to a single expression statement. */
+#define OPENCV_ASSERT(expr,func,context)                            \
+{if (! (expr))                                      \
+{OPENCV_ERROR(CV_StsInternal,(func),(context));}}
+
+/** Evaluates Func inside its own brace block; no error check is performed. */
+#define OPENCV_CALL( Func )                                         \
+{                                                                   \
+Func;                                                           \
+}
+
+
+/** CV_FUNCNAME macro defines the static cvFuncName character array which is used by the
+ CV_ERROR macro. When CV_NO_FUNC_NAMES is defined, CV_FUNCNAME expands to nothing and
+ cvFuncName is the empty string literal. */
+#ifdef CV_NO_FUNC_NAMES
+#define CV_FUNCNAME( Name )
+#define cvFuncName ""
+#else
+#define CV_FUNCNAME( Name )  \
+static char cvFuncName[] = Name
+#endif
+
+
+/**
+ CV_ERROR macro unconditionally raises an error with the passed code and message,
+ using cvFuncName as the function name.
+ After raising the error, control is transferred to the exit label (via __CV_EXIT__).
+ */
+#define CV_ERROR( Code, Msg )                                       \
+{                                                                   \
+    cvError( (Code), cvFuncName, Msg, __FILE__, __LINE__ );        \
+    __CV_EXIT__;                                                   \
+}
+
+/**
+ CV_CHECK macro checks the error status after a CV (or IPL)
+ function call. If an error is detected (cvGetErrStatus() is negative), CV_ERROR is
+ raised with CV_StsBackTrace and control is transferred to the exit label.
+ */
+#define CV_CHECK()                                                  \
+{                                                                   \
+    if( cvGetErrStatus() < 0 )                                      \
+        CV_ERROR( CV_StsBackTrace, "Inner function failed." );      \
+}
+
+
+/**
+ CV_CALL macro calls a CV (or IPL) function, checks the error status and
+ signals an error if the function failed. Useful in "parent node"
+ error processing mode
+ */
+#define CV_CALL( Func )                                             \
+{                                                                   \
+    Func;                                                           \
+    CV_CHECK();                                                     \
+}
+
+
+/** Runtime assertion macro: raises CV_StsInternal through CV_ERROR when Condition is false */
+#define CV_ASSERT( Condition )                                          \
+{                                                                       \
+    if( !(Condition) )                                                  \
+        CV_ERROR( CV_StsInternal, "Assertion: " #Condition " failed" ); \
+}
+
+/* __CV_BEGIN__ opens a block, __CV_END__ defines the `exit` label and closes the block,
+   and __CV_EXIT__ jumps to that label. */
+#define __CV_BEGIN__       {
+#define __CV_END__         goto exit; exit: ; }
+#define __CV_EXIT__        goto exit
+
+/** @} core_c */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_c_glue
+//! @{
+
+/////////////////////////////////////////// glue ///////////////////////////////////////////
+
+//! converts array (CvMat or IplImage) to cv::Mat
+//! NOTE(review): the exact semantics of copyData, allowND, coiMode and buf are defined by the
+//! implementation, which is not part of this header - confirm before relying on the defaults.
+CV_EXPORTS Mat cvarrToMat(const CvArr* arr, bool copyData=false,
+                          bool allowND=true, int coiMode=0,
+                          AutoBuffer<double>* buf=0);
+
+//! convenience wrapper around cvarrToMat that always passes allowND=true
+static inline Mat cvarrToMatND(const CvArr* arr, bool copyData=false, int coiMode=0)
+{
+    const bool allowND = true;
+    return cvarrToMat(arr, copyData, allowND, coiMode);
+}
+
+
+//! extracts the Channel of Interest (COI) from CvMat or IplImage and makes cv::Mat out of it.
+CV_EXPORTS void extractImageCOI(const CvArr* arr, OutputArray coiimg, int coi=-1);
+//! inserts a single-channel cv::Mat into a multi-channel CvMat or IplImage
+CV_EXPORTS void insertImageCOI(InputArray coiimg, CvArr* arr, int coi=-1);
+
+
+
+////// specialized implementations of DefaultDeleter::operator() for classic OpenCV types //////
+
+// Only the call operators are declared here; their definitions live outside this header.
+template<> struct DefaultDeleter<CvMat>{ CV_EXPORTS void operator ()(CvMat* obj) const; };
+template<> struct DefaultDeleter<IplImage>{ CV_EXPORTS void operator ()(IplImage* obj) const; };
+template<> struct DefaultDeleter<CvMatND>{ CV_EXPORTS void operator ()(CvMatND* obj) const; };
+template<> struct DefaultDeleter<CvSparseMat>{ CV_EXPORTS void operator ()(CvSparseMat* obj) const; };
+template<> struct DefaultDeleter<CvMemStorage>{ CV_EXPORTS void operator ()(CvMemStorage* obj) const; };
+
+////////////// convenient wrappers for operating old-style dynamic structures //////////////
+
+// forward declaration: Seq refers to its iterator type before SeqIterator is defined
+template<typename _Tp> class SeqIterator;
+
+// smart pointer to a CvMemStorage
+typedef Ptr<CvMemStorage> MemStorage;
+
+/*!
+ Template Sequence Class derived from CvSeq
+
+ The class provides more convenient access to sequence elements,
+ STL-style operations and iterators.
+
+ \note The class is targeted for simple data types,
+    i.e. no constructors or destructors
+    are called for the sequence elements.
+*/
+template<typename _Tp> class Seq
+{
+public:
+    typedef SeqIterator<_Tp> iterator;
+    typedef SeqIterator<_Tp> const_iterator;
+
+    //! the default constructor; the wrapper is not attached to any CvSeq
+    Seq();
+    //! the constructor for wrapping CvSeq structure. The real element type in CvSeq should match _Tp.
+    Seq(const CvSeq* seq);
+    //! creates the empty sequence that resides in the specified storage
+    Seq(MemStorage& storage, int headerSize = sizeof(CvSeq));
+    //! returns read-write reference to the specified element
+    _Tp& operator [](int idx);
+    //! returns read-only reference to the specified element
+    const _Tp& operator[](int idx) const;
+    //! returns iterator pointing to the beginning of the sequence
+    SeqIterator<_Tp> begin() const;
+    //! returns iterator pointing to the element following the last sequence element
+    SeqIterator<_Tp> end() const;
+    //! returns the number of elements in the sequence
+    size_t size() const;
+    //! returns the type of sequence elements (CV_8UC1 ... CV_64FC(CV_CN_MAX) ...)
+    int type() const;
+    //! returns the depth of sequence elements (CV_8U ... CV_64F)
+    int depth() const;
+    //! returns the number of channels in each sequence element
+    int channels() const;
+    //! returns the size of each sequence element
+    size_t elemSize() const;
+    //! returns index of the specified sequence element
+    size_t index(const _Tp& elem) const;
+    //! appends the specified element to the end of the sequence
+    void push_back(const _Tp& elem);
+    //! inserts the specified element at the front of the sequence
+    void push_front(const _Tp& elem);
+    //! appends zero or more elements to the end of the sequence
+    void push_back(const _Tp* elems, size_t count);
+    //! inserts zero or more elements at the front of the sequence
+    void push_front(const _Tp* elems, size_t count);
+    //! inserts the specified element to the specified position
+    void insert(int idx, const _Tp& elem);
+    //! inserts zero or more elements to the specified position
+    void insert(int idx, const _Tp* elems, size_t count);
+    //! removes element at the specified position
+    void remove(int idx);
+    //! removes the specified subsequence
+    void remove(const Range& r);
+
+    //! returns reference to the first sequence element
+    _Tp& front();
+    //! returns read-only reference to the first sequence element
+    const _Tp& front() const;
+    //! returns reference to the last sequence element
+    _Tp& back();
+    //! returns read-only reference to the last sequence element
+    const _Tp& back() const;
+    //! returns true iff the sequence contains no elements
+    bool empty() const;
+
+    //! removes all the elements from the sequence
+    void clear();
+    //! removes the first element from the sequence
+    void pop_front();
+    //! removes the last element from the sequence
+    void pop_back();
+    //! removes zero or more elements from the beginning of the sequence
+    void pop_front(_Tp* elems, size_t count);
+    //! removes zero or more elements from the end of the sequence
+    void pop_back(_Tp* elems, size_t count);
+
+    //! copies the whole sequence or the sequence slice to the specified vector
+    void copyTo(std::vector<_Tp>& vec, const Range& range=Range::all()) const;
+    //! returns the vector containing all the sequence elements
+    operator std::vector<_Tp>() const;
+
+    //! the wrapped CvSeq; NULL for a default-constructed Seq
+    CvSeq* seq;
+};
+
+
+/*!
+ STL-style Sequence Iterator inherited from the CvSeqReader structure
+*/
+template<typename _Tp> class SeqIterator : public CvSeqReader
+{
+public:
+    //! the default constructor; zero-fills the whole iterator
+    SeqIterator();
+    //! the constructor setting the iterator to the beginning or to the end of the sequence
+    SeqIterator(const Seq<_Tp>& seq, bool seekEnd=false);
+    //! positions the iterator within the sequence
+    void seek(size_t pos);
+    //! reports the current iterator position
+    size_t tell() const;
+    //! returns reference to the current sequence element
+    _Tp& operator *();
+    //! returns read-only reference to the current sequence element
+    const _Tp& operator *() const;
+    //! pre-increment: moves iterator to the next sequence element
+    SeqIterator& operator ++();
+    //! post-increment: moves iterator to the next sequence element, returns the old position
+    SeqIterator operator ++(int) const;
+    //! pre-decrement: moves iterator to the previous sequence element
+    SeqIterator& operator --();
+    //! post-decrement: moves iterator to the previous sequence element, returns the old position
+    SeqIterator operator --(int) const;
+
+    //! moves iterator forward by the specified offset (possibly negative)
+    SeqIterator& operator +=(int);
+    //! moves iterator backward by the specified offset (possibly negative)
+    SeqIterator& operator -=(int);
+
+    // this is the index of the current element modulo seq->total*2
+    // (to distinguish between 0 and seq->total)
+    int index;
+};
+
+
+
+// bridge C++ => C Seq API
+// (cv-namespace entry points; several are used by the Seq<> member definitions in this header)
+CV_EXPORTS schar*  seqPush( CvSeq* seq, const void* element=0);
+CV_EXPORTS schar*  seqPushFront( CvSeq* seq, const void* element=0);
+CV_EXPORTS void  seqPop( CvSeq* seq, void* element=0);
+CV_EXPORTS void  seqPopFront( CvSeq* seq, void* element=0);
+CV_EXPORTS void  seqPopMulti( CvSeq* seq, void* elements,
+                              int count, int in_front=0 );
+CV_EXPORTS void  seqRemove( CvSeq* seq, int index );
+CV_EXPORTS void  clearSeq( CvSeq* seq );
+CV_EXPORTS schar*  getSeqElem( const CvSeq* seq, int index );
+CV_EXPORTS void  seqRemoveSlice( CvSeq* seq, CvSlice slice );
+CV_EXPORTS void  seqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr );
+
+//! default constructor: the wrapper starts out detached (seq is NULL)
+template<typename _Tp> inline Seq<_Tp>::Seq()
+    : seq(0)
+{
+}
+
+//! wraps an existing CvSeq; a non-NULL sequence must store elements of sizeof(_Tp) bytes
+template<typename _Tp> inline Seq<_Tp>::Seq( const CvSeq* _seq )
+    : seq(const_cast<CvSeq*>(_seq))
+{
+    CV_Assert(!_seq || _seq->elem_size == sizeof(_Tp));
+}
+
+//! creates a new empty sequence in the given storage; headerSize must be at least sizeof(CvSeq)
+template<typename _Tp> inline Seq<_Tp>::Seq( MemStorage& storage,
+                                             int headerSize )
+{
+    CV_Assert(headerSize >= (int)sizeof(CvSeq));
+    const size_t elemBytes = sizeof(_Tp);
+    seq = cvCreateSeq(DataType<_Tp>::type, headerSize, elemBytes, storage);
+}
+
+//! read-write element access; the element address is obtained from getSeqElem
+template<typename _Tp> inline _Tp& Seq<_Tp>::operator [](int idx)
+{
+    schar* rawElem = getSeqElem(seq, idx);
+    return *(_Tp*)rawElem;
+}
+
+//! read-only element access; the element address is obtained from getSeqElem
+template<typename _Tp> inline const _Tp& Seq<_Tp>::operator [](int idx) const
+{
+    schar* rawElem = getSeqElem(seq, idx);
+    return *(_Tp*)rawElem;
+}
+
+//! iterator constructed at the start of this sequence
+template<typename _Tp> inline SeqIterator<_Tp> Seq<_Tp>::begin() const
+{
+    SeqIterator<_Tp> first(*this);
+    return first;
+}
+
+//! iterator constructed with seekEnd=true
+template<typename _Tp> inline SeqIterator<_Tp> Seq<_Tp>::end() const
+{
+    SeqIterator<_Tp> last(*this, true);
+    return last;
+}
+
+//! number of elements; 0 when no CvSeq is attached
+template<typename _Tp> inline size_t Seq<_Tp>::size() const
+{
+    if( !seq )
+        return 0;
+    return seq->total;
+}
+
+//! element type taken from the sequence flags; 0 when no CvSeq is attached
+template<typename _Tp> inline int Seq<_Tp>::type() const
+{
+    if( !seq )
+        return 0;
+    return CV_MAT_TYPE(seq->flags);
+}
+
+//! element depth taken from the sequence flags; 0 when no CvSeq is attached
+template<typename _Tp> inline int Seq<_Tp>::depth() const
+{
+    if( !seq )
+        return 0;
+    return CV_MAT_DEPTH(seq->flags);
+}
+
+//! channel count taken from the sequence flags; 0 when no CvSeq is attached
+template<typename _Tp> inline int Seq<_Tp>::channels() const
+{
+    if( !seq )
+        return 0;
+    return CV_MAT_CN(seq->flags);
+}
+
+//! size of one element in bytes; 0 when no CvSeq is attached
+template<typename _Tp> inline size_t Seq<_Tp>::elemSize() const
+{
+    if( !seq )
+        return 0;
+    return seq->elem_size;
+}
+
+//! index of the given element, as reported by cvSeqElemIdx for the element's address
+template<typename _Tp> inline size_t Seq<_Tp>::index(const _Tp& elem) const
+{
+    const void* elemAddr = &elem;
+    return cvSeqElemIdx(seq, elemAddr);
+}
+
+//! appends one element at the end
+template<typename _Tp> inline void Seq<_Tp>::push_back(const _Tp& elem)
+{
+    const void* src = &elem;
+    cvSeqPush(seq, src);
+}
+
+//! inserts one element at the front
+template<typename _Tp> inline void Seq<_Tp>::push_front(const _Tp& elem)
+{
+    const void* src = &elem;
+    cvSeqPushFront(seq, src);
+}
+
+//! appends count elements at the end (in_front = 0)
+template<typename _Tp> inline void Seq<_Tp>::push_back(const _Tp* elems, size_t count)
+{
+    const int n = (int)count;
+    cvSeqPushMulti(seq, elems, n, 0);
+}
+
+//! inserts count elements at the front (in_front = 1)
+template<typename _Tp> inline void Seq<_Tp>::push_front(const _Tp* elems, size_t count)
+{
+    const int n = (int)count;
+    cvSeqPushMulti(seq, elems, n, 1);
+}
+
+//! last element (index -1), read-write
+template<typename _Tp> inline _Tp& Seq<_Tp>::back()
+{
+    _Tp* last = (_Tp*)getSeqElem(seq, -1);
+    return *last;
+}
+
+//! last element (index -1), read-only
+template<typename _Tp> inline const _Tp& Seq<_Tp>::back() const
+{
+    const _Tp* last = (const _Tp*)getSeqElem(seq, -1);
+    return *last;
+}
+
+//! first element (index 0), read-write
+template<typename _Tp> inline _Tp& Seq<_Tp>::front()
+{
+    _Tp* first = (_Tp*)getSeqElem(seq, 0);
+    return *first;
+}
+
+//! first element (index 0), read-only
+template<typename _Tp> inline const _Tp& Seq<_Tp>::front() const
+{
+    const _Tp* first = (const _Tp*)getSeqElem(seq, 0);
+    return *first;
+}
+
+//! true when no CvSeq is attached or the attached sequence holds no elements
+template<typename _Tp> inline bool Seq<_Tp>::empty() const
+{
+    if( !seq )
+        return true;
+    return seq->total == 0;
+}
+
+//! removes all elements; does nothing when no CvSeq is attached
+template<typename _Tp> inline void Seq<_Tp>::clear()
+{
+    if( !seq )
+        return;
+    clearSeq(seq);
+}
+
+//! drops the last element without returning it
+template<typename _Tp> inline void Seq<_Tp>::pop_back()
+{
+    seqPop(seq);
+}
+
+//! drops the first element without returning it
+template<typename _Tp> inline void Seq<_Tp>::pop_front()
+{
+    seqPopFront(seq);
+}
+
+//! removes count elements from the end (in_front = 0), passing elems as the output buffer
+template<typename _Tp> inline void Seq<_Tp>::pop_back(_Tp* elems, size_t count)
+{
+    const int n = (int)count;
+    seqPopMulti(seq, elems, n, 0);
+}
+
+//! removes count elements from the front (in_front = 1), passing elems as the output buffer
+template<typename _Tp> inline void Seq<_Tp>::pop_front(_Tp* elems, size_t count)
+{
+    const int n = (int)count;
+    seqPopMulti(seq, elems, n, 1);
+}
+
+//! inserts one element before position idx.
+//! Calls the C API cvSeqInsert directly (as push_back does with cvSeqPush): the cv-namespace
+//! bridge declared in this header has no seqInsert, so the previous call could not be resolved
+//! when the template was instantiated.
+template<typename _Tp> inline void Seq<_Tp>::insert(int idx, const _Tp& elem)
+{ cvSeqInsert(seq, idx, &elem); }
+
+//! inserts count elements before position idx.
+//! The elements are wrapped in a temporary 1 x count CvMat header (no copy) and handed to
+//! seqInsertSlice.
+template<typename _Tp> inline void Seq<_Tp>::insert(int idx, const _Tp* elems, size_t count)
+{
+    // cvMat() takes an int column count and a non-const void* data pointer, so both
+    // conversions must be explicit; the header is only passed to seqInsertSlice as const CvArr*.
+    CvMat m = cvMat(1, (int)count, DataType<_Tp>::type, (void*)elems);
+    seqInsertSlice(seq, idx, &m);
+}
+
+//! removes the element at position idx
+template<typename _Tp> inline void Seq<_Tp>::remove(int idx)
+{
+    seqRemove(seq, idx);
+}
+
+//! removes the elements in the range [r.start, r.end) as a slice
+template<typename _Tp> inline void Seq<_Tp>::remove(const Range& r)
+{
+    CvSlice slice = cvSlice(r.start, r.end);
+    seqRemoveSlice(seq, slice);
+}
+
+//! copies the whole sequence (range == Range::all()) or the given slice into vec.
+//! vec is always resized; with no attached CvSeq it is resized to zero.
+template<typename _Tp> inline void Seq<_Tp>::copyTo(std::vector<_Tp>& vec, const Range& range) const
+{
+    size_t len = 0;
+    if( seq )
+    {
+        if( range == Range::all() )
+            len = seq->total;
+        else
+            len = range.end - range.start;
+    }
+    vec.resize(len);
+    // nothing to copy: avoid taking the address of vec[0] on an empty vector
+    if( !seq || len == 0 )
+        return;
+    cvCvtSeqToArray(seq, &vec[0], cvSlice(range));
+}
+
+//! conversion to std::vector: copies all the elements through copyTo
+template<typename _Tp> inline Seq<_Tp>::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> result;
+    this->copyTo(result);
+    return result;
+}
+
+//! default constructor: zero-fills the whole object (reader state and index)
+template<typename _Tp> inline SeqIterator<_Tp>::SeqIterator()
+{
+    memset(this, 0, sizeof(SeqIterator<_Tp>));
+}
+
+//! starts reading _seq; index is 0, or the element count when seekEnd is set
+template<typename _Tp> inline SeqIterator<_Tp>::SeqIterator(const Seq<_Tp>& _seq, bool seekEnd)
+{
+    cvStartReadSeq(_seq.seq, this);
+    if( seekEnd )
+        index = _seq.seq->total;
+    else
+        index = 0;
+}
+
+//! moves the reader to absolute position pos and records it as the current index
+template<typename _Tp> inline void SeqIterator<_Tp>::seek(size_t pos)
+{
+    const int target = (int)pos;
+    cvSetSeqReaderPos(this, target, 0);   // 0: absolute, not relative, position
+    index = target;
+}
+
+//! returns the recorded index
+template<typename _Tp> inline size_t SeqIterator<_Tp>::tell() const
+{
+    return index;
+}
+
+//! read-write access to the element at the reader's current pointer
+template<typename _Tp> inline _Tp& SeqIterator<_Tp>::operator *()
+{
+    _Tp* current = (_Tp*)ptr;
+    return *current;
+}
+
+//! read-only access to the element at the reader's current pointer
+template<typename _Tp> inline const _Tp& SeqIterator<_Tp>::operator *() const
+{
+    const _Tp* current = (const _Tp*)ptr;
+    return *current;
+}
+
+//! pre-increment: advances the reader by one element; index wraps to 0 at seq->total*2
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator ++()
+{
+    CV_NEXT_SEQ_ELEM(sizeof(_Tp), *this);
+    ++index;
+    const int period = seq->total*2;
+    if( index >= period )
+        index = 0;
+    return *this;
+}
+
+//! post-increment: advances the iterator and returns a copy of its previous state.
+//! NOTE(review): the class declares this operator const, yet it has to advance *this. Calling the
+//! non-const pre-increment on a const *this did not compile once the template was instantiated,
+//! so constness is cast away here to keep the declared signature. Do not call it on an iterator
+//! object that is itself defined const.
+template<typename _Tp> inline SeqIterator<_Tp> SeqIterator<_Tp>::operator ++(int) const
+{
+    SeqIterator<_Tp> it = *this;
+    ++const_cast<SeqIterator<_Tp>&>(*this);
+    return it;
+}
+
+//! pre-decrement: moves the reader back by one element; index wraps to seq->total*2-1 below 0
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator --()
+{
+    CV_PREV_SEQ_ELEM(sizeof(_Tp), *this);
+    --index;
+    if( index < 0 )
+    {
+        const int period = seq->total*2;
+        index = period - 1;
+    }
+    return *this;
+}
+
+//! post-decrement: moves the iterator back and returns a copy of its previous state.
+//! NOTE(review): the class declares this operator const, yet it has to move *this. Calling the
+//! non-const pre-decrement on a const *this did not compile once the template was instantiated,
+//! so constness is cast away here to keep the declared signature. Do not call it on an iterator
+//! object that is itself defined const.
+template<typename _Tp> inline SeqIterator<_Tp> SeqIterator<_Tp>::operator --(int) const
+{
+    SeqIterator<_Tp> it = *this;
+    --const_cast<SeqIterator<_Tp>&>(*this);
+    return it;
+}
+
+//! moves the reader by delta elements (relative seek) and keeps index within [0, seq->total*2)
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator +=(int delta)
+{
+    cvSetSeqReaderPos(this, delta, 1);   // 1: delta is relative to the current position
+    index += delta;
+    const int period = seq->total*2;
+    if( index < 0 )
+        index += period;
+    if( index >= period )
+        index -= period;
+    return *this;
+}
+
+//! moves the reader by -delta elements; implemented through operator +=
+template<typename _Tp> inline SeqIterator<_Tp>& SeqIterator<_Tp>::operator -=(int delta)
+{
+    const int negated = -delta;
+    return (*this += negated);
+}
+
+//! signed distance between two iterators, computed from their indices; a difference whose
+//! magnitude exceeds a.seq->total is folded back by one sequence length
+template<typename _Tp> inline ptrdiff_t operator - (const SeqIterator<_Tp>& a,
+                                                    const SeqIterator<_Tp>& b)
+{
+    const ptrdiff_t n = a.seq->total;
+    ptrdiff_t delta = a.index - b.index;
+    if( delta > n )
+        delta -= n;
+    else if( delta < -n )
+        delta += n;
+    return delta;
+}
+
+//! two iterators are equal when they refer to the same CvSeq and have the same index
+template<typename _Tp> inline bool operator == (const SeqIterator<_Tp>& a,
+                                                const SeqIterator<_Tp>& b)
+{
+    if( a.seq != b.seq )
+        return false;
+    return a.index == b.index;
+}
+
+//! negation of operator ==
+template<typename _Tp> inline bool operator != (const SeqIterator<_Tp>& a,
+                                                const SeqIterator<_Tp>& b)
+{
+    const bool same = (a == b);
+    return !same;
+}
+
+//! @}
+
+} // cv
+
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.hpp
new file mode 100644
index 0000000..1ebea07
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.hpp
@@ -0,0 +1,1271 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_HPP
+#define OPENCV_CORE_CUDA_HPP
+
+#ifndef __cplusplus
+#  error cuda.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/cuda_types.hpp"
+
+/**
+  @defgroup cuda CUDA-accelerated Computer Vision
+  @{
+    @defgroup cudacore Core part
+    @{
+      @defgroup cudacore_init Initialization and Information
+      @defgroup cudacore_struct Data Structures
+    @}
+  @}
+ */
+
+namespace cv { namespace cuda {
+
+//! @addtogroup cudacore_struct
+//! @{
+
+//===================================================================================
+// GpuMat
+//===================================================================================
+
+/** @brief Base storage class for GPU memory with reference counting.
+
+Its interface matches the Mat interface with the following limitations:
+
+-   no arbitrary dimensions support (only 2D)
+-   no functions that return references to their data (because references on GPU are not valid for
+    CPU)
+-   no expression templates technique support
+
+Beware that the latter limitation may lead to overloaded matrix operators that cause memory
+allocations. The GpuMat class is convertible to cuda::PtrStepSz and cuda::PtrStep so it can be
+passed directly to the kernel.
+
+@note In contrast with Mat, in most cases GpuMat::isContinuous() == false . This means that rows are
+aligned to a size depending on the hardware. Single-row GpuMat is always a continuous matrix.
+
+@note It is not recommended to leave static or global GpuMat variables allocated, that is, to rely
+on their destructors. The destruction order of such variables and the CUDA context is undefined. The GPU
+memory release function returns an error if the CUDA context has already been destroyed.
+
+Some member functions are described as a "Blocking Call" while some are described as a
+"Non-Blocking Call". Blocking functions are synchronous to host. It is guaranteed that the GPU
+operation is finished when the function returns. However, non-blocking functions are asynchronous to
+host. Those functions may return even if the GPU operation is not finished.
+
+Compared to their blocking counterpart, non-blocking functions accept Stream as an additional
+argument. If a non-default stream is passed, the GPU operation may overlap with operations in other
+streams.
+
+@sa Mat
+ */
+class CV_EXPORTS_W GpuMat
+{
+public:
+    class CV_EXPORTS_W Allocator
+    {
+    public:
+        virtual ~Allocator() {}
+
+        // allocator must fill the data, step and refcount fields of mat
+        virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
+        virtual void free(GpuMat* mat) = 0;
+    };
+
+    //! default allocator
+    CV_WRAP static GpuMat::Allocator* defaultAllocator();
+    CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator);
+
+    //! default constructor
+    CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! constructs GpuMat of the specified size and type
+    CV_WRAP GpuMat(int rows, int cols, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+    CV_WRAP GpuMat(Size size, int type, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! constructs GpuMat and fills it with the specified value s
+    CV_WRAP GpuMat(int rows, int cols, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+    CV_WRAP GpuMat(Size size, int type, Scalar s, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! copy constructor
+    CV_WRAP GpuMat(const GpuMat& m);
+
+    //! constructor for GpuMat headers pointing to user-allocated data
+    GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
+    GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
+
+    //! creates a GpuMat header for a part of the bigger matrix
+    CV_WRAP GpuMat(const GpuMat& m, Range rowRange, Range colRange);
+    CV_WRAP GpuMat(const GpuMat& m, Rect roi);
+
+    //! builds GpuMat from host memory (Blocking call)
+    CV_WRAP explicit GpuMat(InputArray arr, GpuMat::Allocator* allocator = GpuMat::defaultAllocator());
+
+    //! destructor - calls release()
+    ~GpuMat();
+
+    //! assignment operators
+    GpuMat& operator =(const GpuMat& m);
+
+    //! allocates new GpuMat data unless the GpuMat already has specified size and type
+    CV_WRAP void create(int rows, int cols, int type);
+    CV_WRAP void create(Size size, int type);
+
+    //! decreases the reference counter and deallocates the data when the counter reaches 0
+    CV_WRAP void release();
+
+    //! swaps with other smart pointer
+    CV_WRAP void swap(GpuMat& mat);
+
+    /** @brief Performs data upload to GpuMat (Blocking call)
+
+    This function copies data from host memory to device memory. As being a blocking call, it is
+    guaranteed that the copy operation is finished when this function returns.
+    */
+    CV_WRAP void upload(InputArray arr);
+
+    /** @brief Performs data upload to GpuMat (Non-Blocking call)
+
+    This function copies data from host memory to device memory. As being a non-blocking call, this
+    function may return even if the copy operation is not finished.
+
+    The copy operation may be overlapped with operations in other non-default streams if \p stream is
+    not the default stream and \p arr is HostMem allocated with HostMem::PAGE_LOCKED option.
+    */
+    CV_WRAP void upload(InputArray arr, Stream& stream);
+
+    /** @brief Performs data download from GpuMat (Blocking call)
+
+    This function copies data from device memory to host memory. As being a blocking call, it is
+    guaranteed that the copy operation is finished when this function returns.
+    */
+    CV_WRAP void download(OutputArray dst) const;
+
+    /** @brief Performs data download from GpuMat (Non-Blocking call)
+
+    This function copies data from device memory to host memory. As being a non-blocking call, this
+    function may return even if the copy operation is not finished.
+
+    The copy operation may be overlapped with operations in other non-default streams if \p stream is
+    not the default stream and \p dst is HostMem allocated with HostMem::PAGE_LOCKED option.
+    */
+    CV_WRAP void download(OutputArray dst, Stream& stream) const;
+
+    //! returns deep copy of the GpuMat, i.e. the data is copied
+    CV_WRAP GpuMat clone() const;
+
+    //! copies the GpuMat content to device memory (Blocking call)
+    CV_WRAP void copyTo(OutputArray dst) const;
+
+    //! copies the GpuMat content to device memory (Non-Blocking call)
+    CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
+
+    //! copies those GpuMat elements to dst that are marked with non-zero mask elements (Blocking call)
+    CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
+
+    //! copies those GpuMat elements to dst that are marked with non-zero mask elements (Non-Blocking call)
+    CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+
+    //! sets some of the GpuMat elements to s (Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s);
+
+    //! sets some of the GpuMat elements to s (Non-Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, Stream& stream);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask);
+
+    //! sets some of the GpuMat elements to s, according to the mask (Non-Blocking call)
+    CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
+
+    //! converts GpuMat to another datatype (Blocking call)
+    CV_WRAP void convertTo(OutputArray dst, int rtype) const;
+
+    //! converts GpuMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+
+    //! converts GpuMat to another datatype with scaling (Blocking call)
+    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+
+    //! converts GpuMat to another datatype with scaling (Non-Blocking call)
+    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
+
+    //! converts GpuMat to another datatype with scaling (Non-Blocking call)
+    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+
+    CV_WRAP void assignTo(GpuMat& m, int type = -1) const;
+
+    //! returns pointer to y-th row
+    uchar* ptr(int y = 0);
+    const uchar* ptr(int y = 0) const;
+
+    //! template version of the above method
+    template<typename _Tp> _Tp* ptr(int y = 0);
+    template<typename _Tp> const _Tp* ptr(int y = 0) const;
+
+    template <typename _Tp> operator PtrStepSz<_Tp>() const;
+    template <typename _Tp> operator PtrStep<_Tp>() const;
+
+    //! returns a new GpuMat header for the specified row
+    CV_WRAP GpuMat row(int y) const;
+
+    //! returns a new GpuMat header for the specified column
+    CV_WRAP GpuMat col(int x) const;
+
+    //! ... for the specified row span
+    CV_WRAP GpuMat rowRange(int startrow, int endrow) const;
+    CV_WRAP GpuMat rowRange(Range r) const;
+
+    //! ... for the specified column span
+    CV_WRAP GpuMat colRange(int startcol, int endcol) const;
+    CV_WRAP GpuMat colRange(Range r) const;
+
+    //! extracts a rectangular sub-GpuMat (this is a generalized form of row, rowRange etc.)
+    GpuMat operator ()(Range rowRange, Range colRange) const;
+    GpuMat operator ()(Rect roi) const;
+
+    //! creates alternative GpuMat header for the same data, with different
+    //! number of channels and/or different number of rows
+    CV_WRAP GpuMat reshape(int cn, int rows = 0) const;
+
+    //! locates GpuMat header within a parent GpuMat
+    CV_WRAP void locateROI(Size& wholeSize, Point& ofs) const;
+
+    //! moves/resizes the current GpuMat ROI inside the parent GpuMat
+    CV_WRAP GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
+
+    //! returns true iff the GpuMat data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    CV_WRAP bool isContinuous() const;
+
+    //! returns element size in bytes
+    CV_WRAP size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    CV_WRAP size_t elemSize1() const;
+
+    //! returns element type
+    CV_WRAP int type() const;
+
+    //! returns element depth
+    CV_WRAP int depth() const;
+
+    //! returns number of channels
+    CV_WRAP int channels() const;
+
+    //! returns step/elemSize1()
+    CV_WRAP size_t step1() const;
+
+    //! returns GpuMat size : width == number of columns, height == number of rows
+    CV_WRAP Size size() const;
+
+    //! returns true if GpuMat data is NULL
+    CV_WRAP bool empty() const;
+
+    //! returns pointer to cuda memory
+    CV_WRAP void* cudaPtr() const;
+
+    //! internal use method: updates the continuity flag
+    CV_WRAP void updateContinuityFlag();
+
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    CV_PROP size_t step;
+
+    //! pointer to the data
+    uchar* data;
+
+    //! pointer to the reference counter;
+    //! when GpuMat points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    const uchar* dataend;
+
+    //! allocator
+    Allocator* allocator;
+};
+
+struct CV_EXPORTS_W GpuData
+{   // Pairs a raw byte pointer with a size; GpuMatND keeps instances of it in a std::shared_ptr (data_).
+    explicit GpuData(size_t _size);
+     ~GpuData();
+
+    GpuData(const GpuData&) = delete;             // not copy-constructible
+    GpuData& operator=(const GpuData&) = delete;  // not copy-assignable
+
+    GpuData(GpuData&&) = delete;                  // not move-constructible
+    GpuData& operator=(GpuData&&) = delete;       // not move-assignable
+
+    uchar* data;
+    size_t size;
+};
+
+class CV_EXPORTS_W GpuMatND
+{
+public:
+    using SizeArray = std::vector<int>;
+    using StepArray = std::vector<size_t>;
+    using IndexArray = std::vector<int>;
+
+    //! destructor
+    ~GpuMatND();
+
+    //! default constructor
+    GpuMatND();
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    GpuMatND(SizeArray size, int type);
+
+    /** @overload
+    @param size Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_16FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Array of size.size()-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray());
+
+    /** @brief Allocates GPU memory.
+    Suppose there is some GPU memory already allocated. In that case, this method may choose to reuse that
+    GPU memory under the specific condition: it must be of the same size and type, not externally allocated,
+    the GPU memory is continuous (i.e., isContinuous() is true), and is not a sub-matrix of another GpuMatND
+    (i.e., isSubmatrix() is false). In other words, this method guarantees that the GPU memory allocated by
+    this method is always continuous and is not a sub-region of another GpuMatND.
+    */
+    void create(SizeArray size, int type);
+
+    void release();
+
+    void swap(GpuMatND& m) noexcept;
+
+    /** @brief Creates a full copy of the array and the underlying data.
+    The method creates a full copy of the array. It mimics the behavior of Mat::clone(), i.e.
+    the original step is not taken into account. So, the array copy is a continuous array
+    occupying total()\*elemSize() bytes.
+    */
+    GpuMatND clone() const;
+
+    /** @overload
+    This overload is non-blocking, so it may return even if the copy operation is not finished.
+    */
+    GpuMatND clone(Stream& stream) const;
+
+    /** @brief Extracts a sub-matrix.
+    The operator makes a new header for the specified sub-array of \*this.
+    The operator is an O(1) operation, that is, no matrix data is copied.
+    @param ranges Array of selected ranges along each dimension.
+    */
+    GpuMatND operator()(const std::vector<Range>& ranges) const;
+
+    /** @brief Creates a GpuMat header for a 2D plane part of an n-dim matrix.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @overload
+    Creates a GpuMat header if this GpuMatND is effectively 2D.
+    @note The returned GpuMat is constructed with the constructor for user-allocated data.
+    That is, it does not perform reference counting.
+    @note This function does not increment this GpuMatND's reference counter.
+    */
+    GpuMat createGpuMatHeader() const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix.
+    It differs from createGpuMatHeader(IndexArray, Range, Range) in that it clones a part of this
+    GpuMatND to the returned GpuMat.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    GpuMat operator()(IndexArray idx, Range rowRange, Range colRange) const;
+
+    /** @brief Extracts a 2D plane part of an n-dim matrix if this GpuMatND is effectively 2D.
+    It differs from createGpuMatHeader() in that it clones a part of this GpuMatND.
+    @note This operator does not increment this GpuMatND's reference counter.
+    */
+    operator GpuMat() const;
+
+    GpuMatND(const GpuMatND&) = default;
+    GpuMatND& operator=(const GpuMatND&) = default;
+
+#if defined(__GNUC__) && __GNUC__ < 5
+    // error: function '...' defaulted on its first declaration with an exception-specification
+    // that differs from the implicit declaration '...'
+
+    GpuMatND(GpuMatND&&) = default;
+    GpuMatND& operator=(GpuMatND&&) = default;
+#else
+    GpuMatND(GpuMatND&&) noexcept = default;
+    GpuMatND& operator=(GpuMatND&&) noexcept = default;
+#endif
+
+    void upload(InputArray src);
+    void upload(InputArray src, Stream& stream);
+    void download(OutputArray dst) const;
+    void download(OutputArray dst, Stream& stream) const;
+
+    //! returns true iff the GpuMatND data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a sub-matrix of another matrix
+    bool isSubmatrix() const;
+
+    //! returns element size in bytes
+    size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    size_t elemSize1() const;
+
+    //! returns true if data is null
+    bool empty() const;
+
+    //! returns true if not empty and points to external (user-allocated) gpu memory
+    bool external() const;
+
+    //! returns pointer to the first byte of the GPU memory
+    uchar* getDevicePtr() const;
+
+    //! returns the total number of array elements
+    size_t total() const;
+
+    //! returns the size of underlying memory in bytes
+    size_t totalMemSize() const;
+
+    //! returns element type
+    int type() const;
+
+private:
+    //! internal use
+    void setFields(SizeArray size, int type, StepArray step = StepArray());
+
+public:
+    /*! includes several bit-fields:
+    - the magic signature
+    - continuity flag
+    - depth
+    - number of channels
+    */
+    int flags;
+
+    //! matrix dimensionality
+    int dims;
+
+    //! shape of this array
+    SizeArray size;
+
+    /*! step values
+    Their semantics are identical to the semantics of step for Mat.
+    */
+    StepArray step;
+
+private:
+    /*! internal use
+    If this GpuMatND holds external memory, this is empty.
+    */
+    std::shared_ptr<GpuData> data_;
+
+    /*! internal use
+    If this GpuMatND manages memory with reference counting, this value is
+    always equal to data_->data. If this GpuMatND holds external memory,
+    data_ is empty and data points to the external memory.
+    */
+    uchar* data;
+
+    /*! internal use
+    If this GpuMatND is a sub-matrix of a larger matrix, this value is the
+    difference of the first byte between the sub-matrix and the whole matrix.
+    */
+    size_t offset;
+};
+
+/** @brief Creates a continuous matrix.
+
+@param rows Row count.
+@param cols Column count.
+@param type Type of the matrix.
+@param arr Destination matrix. This parameter changes only if it has a proper type and area (
+\f$\texttt{rows} \times \texttt{cols}\f$ ).
+
+Matrix is called continuous if its elements are stored continuously, that is, without gaps at the
+end of each row.
+ */
+CV_EXPORTS_W void createContinuous(int rows, int cols, int type, OutputArray arr);
+
+/** @brief Ensures that the size of a matrix is big enough and the matrix has a proper type.
+
+@param rows Minimum desired number of rows.
+@param cols Minimum desired number of columns.
+@param type Desired matrix type.
+@param arr Destination matrix.
+
+The function does not reallocate memory if the matrix has proper attributes already.
+ */
+CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr);
+
+/** @brief BufferPool for use with CUDA streams
+
+BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is
+only useful when enabled with #setBufferPoolUsage.
+
+@code
+    setBufferPoolUsage(true);
+@endcode
+
+@note #setBufferPoolUsage must be called \em before any Stream declaration.
+
+Users may specify custom allocator for Stream and may implement their own stream based
+functions utilizing the same underlying GPU memory management.
+
+If custom allocator is not specified, BufferPool utilizes StackAllocator by
+default. StackAllocator allocates a chunk of GPU device memory beforehand,
+and when GpuMat is declared later on, it is given the pre-allocated memory.
+This kind of strategy reduces the number of calls for memory allocating APIs
+such as cudaMalloc or cudaMallocPitch.
+
+Below is an example that utilizes BufferPool with StackAllocator:
+
+@code
+    #include <opencv2/opencv.hpp>
+
+    using namespace cv;
+    using namespace cv::cuda;
+
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
+
+        Stream stream1, stream2;                                // Each stream uses 1 stack
+        BufferPool pool1(stream1), pool2(stream2);
+
+        GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
+        GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full
+
+        GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
+        GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB
+
+        cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1);
+        cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2);
+    }
+@endcode
+
+If we allocate another GpuMat on pool1 in the above example, it will be carried out by
+the DefaultAllocator since the stack for pool1 is full.
+
+@code
+    GpuMat d_add1 = pool1.getBuffer(1024, 1024, CV_8UC1);   // Stack for pool1 is full, memory is allocated with DefaultAllocator
+@endcode
+
+If a third stream is declared in the above example, allocating with #getBuffer
+within that stream will also be carried out by the DefaultAllocator because we've run out of
+stacks.
+
+@code
+    Stream stream3;                                         // Only 2 stacks were allocated, we've run out of stacks
+    BufferPool pool3(stream3);
+    GpuMat d_src3 = pool3.getBuffer(1024, 1024, CV_8UC1);   // Memory is allocated with DefaultAllocator
+@endcode
+
+@warning When utilizing StackAllocator, deallocation order is important.
+
+Just like a stack, deallocation must be done in LIFO order. Below is an example of
+erroneous usage that violates LIFO rule. If OpenCV is compiled in Debug mode, this
+sample code will emit CV_Assert error.
+
+@code
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        Stream stream;                                          // A default size (10 MB) stack is allocated to this stream
+        BufferPool pool(stream);
+
+        GpuMat mat1 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat1 (1MB)
+        GpuMat mat2 = pool.getBuffer(1024, 1024, CV_8UC1);      // Allocate mat2 (1MB)
+
+        mat1.release();                                         // erroneous usage : mat2 must be deallocated before mat1
+    }
+@endcode
+
+Since C++ local variables are destroyed in the reverse order of construction,
+the code sample below satisfies the LIFO rule. Local GpuMat's are deallocated
+and the corresponding memory is automatically returned to the pool for later usage.
+
+@code
+    int main()
+    {
+        setBufferPoolUsage(true);                               // Tell OpenCV that we are going to utilize BufferPool
+        setBufferPoolConfig(getDevice(), 1024 * 1024 * 64, 2);  // Allocate 64 MB, 2 stacks (default is 10 MB, 5 stacks)
+
+        Stream stream1, stream2;                                // Each stream uses 1 stack
+        BufferPool pool1(stream1), pool2(stream2);
+
+        for (int i = 0; i < 10; i++)
+        {
+            GpuMat d_src1 = pool1.getBuffer(4096, 4096, CV_8UC1);   // 16MB
+            GpuMat d_dst1 = pool1.getBuffer(4096, 4096, CV_8UC3);   // 48MB, pool1 is now full
+
+            GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1);   // 1MB
+            GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3);   // 3MB
+
+            d_src1.setTo(Scalar(i), stream1);
+            d_src2.setTo(Scalar(i), stream2);
+
+            cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1);
+            cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2);
+                                                                    // The order of destruction of the local variables is:
+                                                                    //   d_dst2 => d_src2 => d_dst1 => d_src1
+                                                                    // LIFO rule is satisfied, this code runs without error
+        }
+    }
+@endcode
+ */
+class CV_EXPORTS_W BufferPool
+{
+public:
+
+    //! Gets the BufferPool for the given stream.
+    CV_WRAP explicit BufferPool(Stream& stream);
+
+    //! Allocates a new GpuMat of given size and type.
+    CV_WRAP GpuMat getBuffer(int rows, int cols, int type);
+
+    //! @overload Forwards to getBuffer(size.height, size.width, type).
+    CV_WRAP GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
+
+    //! Returns the allocator associated with the stream.
+    CV_WRAP Ptr<GpuMat::Allocator> getAllocator() const { return allocator_; }
+
+private:
+    Ptr<GpuMat::Allocator> allocator_; // handed out unchanged by getAllocator()
+};
+
+//! BufferPool management (must be called before Stream creation)
+CV_EXPORTS_W void setBufferPoolUsage(bool on);
+CV_EXPORTS_W void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
+
+//===================================================================================
+// HostMem
+//===================================================================================
+
+/** @brief Class with reference counting wrapping special memory type allocation functions from CUDA.
+
+Its interface is also Mat-like but with additional memory type parameters.
+
+-   **PAGE_LOCKED** sets a page locked memory type used commonly for fast and asynchronous
+    uploading/downloading data from/to GPU.
+-   **SHARED** specifies a zero copy memory allocation that enables mapping the host memory to GPU
+    address space, if supported.
+-   **WRITE_COMBINED** sets the write combined buffer that is not cached by CPU. Such buffers are
+    used to supply GPU with data when GPU only reads it. The advantage is a better CPU cache
+    utilization.
+
+@note Allocation size of such memory types is usually limited. For more details, see *CUDA 2.2
+Pinned Memory APIs* document or *CUDA C Programming Guide*.
+ */
+class CV_EXPORTS_W HostMem
+{
+public:
+    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
+
+    static MatAllocator* getAllocator(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    CV_WRAP explicit HostMem(HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    HostMem(const HostMem& m);
+
+    CV_WRAP HostMem(int rows, int cols, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+    CV_WRAP HostMem(Size size, int type, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    //! creates from host memory, copying the data
+    CV_WRAP explicit HostMem(InputArray arr, HostMem::AllocType alloc_type = HostMem::AllocType::PAGE_LOCKED);
+
+    ~HostMem();
+
+    HostMem& operator =(const HostMem& m);
+
+    //! swaps with other smart pointer
+    CV_WRAP void swap(HostMem& b);
+
+    //! returns deep copy of the matrix, i.e. the data is copied
+    CV_WRAP HostMem clone() const;
+
+    //! allocates new matrix data unless the matrix already has specified size and type.
+    CV_WRAP void create(int rows, int cols, int type);
+    void create(Size size, int type);
+
+    //! creates alternative HostMem header for the same data, with different
+    //! number of channels and/or different number of rows
+    CV_WRAP HostMem reshape(int cn, int rows = 0) const;
+
+    //! decrements reference counter and releases memory if needed.
+    void release();
+
+    //! returns matrix header with disabled reference counting for HostMem data.
+    CV_WRAP Mat createMatHeader() const;
+
+    /** @brief Maps CPU memory to GPU address space and creates the cuda::GpuMat header without reference counting
+    for it.
+
+    This can be done only if memory was allocated with the SHARED flag and if it is supported by the
+    hardware. Laptops often share video and CPU memory, so address spaces can be mapped, which
+    eliminates an extra copy.
+     */
+    GpuMat createGpuMatHeader() const;
+
+    // Please see cv::Mat for descriptions
+    CV_WRAP bool isContinuous() const;
+    CV_WRAP size_t elemSize() const;
+    CV_WRAP size_t elemSize1() const;
+    CV_WRAP int type() const;
+    CV_WRAP int depth() const;
+    CV_WRAP int channels() const;
+    CV_WRAP size_t step1() const;
+    CV_WRAP Size size() const;
+    CV_WRAP bool empty() const;
+
+    // Please see cv::Mat for descriptions
+    int flags;
+    int rows, cols;
+    CV_PROP size_t step;
+
+    uchar* data;
+    int* refcount;
+
+    uchar* datastart;
+    const uchar* dataend;
+
+    AllocType alloc_type;
+};
+
+/** @brief Page-locks the memory of the matrix and maps it for the device(s).
+
+@param m Input matrix.
+ */
+CV_EXPORTS_W void registerPageLocked(Mat& m);
+
+/** @brief Unmaps the memory of the matrix and makes it pageable again.
+
+@param m Input matrix.
+ */
+CV_EXPORTS_W void unregisterPageLocked(Mat& m);
+
+//===================================================================================
+// Stream
+//===================================================================================
+
+/** @brief This class encapsulates a queue of asynchronous calls.
+
+@note Currently, you may face problems if an operation is enqueued twice with different data. Some
+functions use the constant GPU memory, and the next call may update the memory before the previous one
+has finished. But calling different operations asynchronously is safe because each operation
+has its own constant buffer. Memory copy/upload/download/set operations to the buffers you hold are
+also safe.
+
+@note The Stream class is not thread-safe. Please use different Stream objects for different CPU threads.
+
+@code
+void thread1()
+{
+    cv::cuda::Stream stream1;
+    cv::cuda::func1(..., stream1);
+}
+
+void thread2()
+{
+    cv::cuda::Stream stream2;
+    cv::cuda::func2(..., stream2);
+}
+@endcode
+
+@note By default, all CUDA routines are launched in the Stream::Null() object if the stream is not specified by the user.
+In a multi-threaded environment the stream objects must be passed explicitly (see the previous note).
+ */
+class CV_EXPORTS_W Stream
+{
+    typedef void (Stream::*bool_type)() const; // safe-bool idiom: result type of operator bool_type() below
+    void this_type_does_not_support_comparisons() const {} // presumably the non-null value handed out by operator bool_type()
+
+public:
+    typedef void (*StreamCallback)(int status, void* userData); // signature of the host callback taken by enqueueHostCallback()
+
+    //! creates a new asynchronous stream
+    CV_WRAP Stream();
+
+    //! creates a new asynchronous stream with custom allocator
+    CV_WRAP Stream(const Ptr<GpuMat::Allocator>& allocator);
+
+    /** @brief creates a new Stream using the cudaFlags argument to determine the behaviors of the stream
+
+    @note The cudaFlags parameter is passed to the underlying api cudaStreamCreateWithFlags() and
+    supports the same parameter values.
+    @code
+        // creates an OpenCV cuda::Stream that manages an asynchronous, non-blocking,
+        // non-default CUDA stream
+        cv::cuda::Stream cvStream(cudaStreamNonBlocking);
+    @endcode
+     */
+    CV_WRAP Stream(const size_t cudaFlags);
+
+    /** @brief Returns true if the current stream queue is finished. Otherwise, it returns false.
+    */
+    CV_WRAP bool queryIfComplete() const;
+
+    /** @brief Blocks the current CPU thread until all operations in the stream are complete.
+    */
+    CV_WRAP void waitForCompletion();
+
+    /** @brief Makes a compute stream wait on an event.
+    */
+    CV_WRAP void waitEvent(const Event& event);
+
+    /** @brief Adds a callback to be called on the host after all currently enqueued items in the stream have
+    completed.
+
+    @note Callbacks must not make any CUDA API calls. Callbacks must not perform any synchronization
+    that may depend on outstanding device work or other callbacks that are not mandated to run earlier.
+    Callbacks without a mandated order (in independent streams) execute in undefined order and may be
+    serialized.
+     */
+    void enqueueHostCallback(StreamCallback callback, void* userData);
+
+    //! returns the Stream object for the default CUDA stream
+    CV_WRAP static Stream& Null();
+
+    //! returns true if stream object is not default (!= 0)
+    operator bool_type() const;
+
+    //! returns a pointer to the underlying CUDA stream
+    CV_WRAP void* cudaPtr() const;
+
+    class Impl; // opaque implementation type; only forward-declared in this header
+
+private:
+    Ptr<Impl> impl_;
+    Stream(const Ptr<Impl>& impl); // wraps an already existing implementation object
+
+    friend struct StreamAccessor;
+    friend class BufferPool;
+    friend class DefaultDeviceInitializer;
+};
+
+class CV_EXPORTS_W Event
+{
+public:
+    enum CreateFlags
+    {
+        DEFAULT        = 0x00,  /**< Default event flag */
+        BLOCKING_SYNC  = 0x01,  /**< Event uses blocking synchronization */
+        DISABLE_TIMING = 0x02,  /**< Event will not record timing data */
+        INTERPROCESS   = 0x04   /**< Event is suitable for interprocess use. DISABLE_TIMING must also be set */
+    };
+
+    CV_WRAP explicit Event(const Event::CreateFlags flags = Event::CreateFlags::DEFAULT);
+
+    //! records an event
+    CV_WRAP void record(Stream& stream = Stream::Null());
+
+    //! queries an event's status
+    CV_WRAP bool queryIfComplete() const;
+
+    //! waits for an event to complete
+    CV_WRAP void waitForCompletion();
+
+    //! computes the elapsed time between events
+    CV_WRAP static float elapsedTime(const Event& start, const Event& end);
+
+    class Impl; // opaque implementation type; only forward-declared in this header
+
+private:
+    Ptr<Impl> impl_;
+    Event(const Ptr<Impl>& impl); // wraps an already existing implementation object
+
+    friend struct EventAccessor;
+};
+CV_ENUM_FLAGS(Event::CreateFlags)
+
+//! @} cudacore_struct
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+//! @addtogroup cudacore_init
+//! @{
+
+/** @brief Returns the number of installed CUDA-enabled devices.
+
+Use this function before any other CUDA function calls. If OpenCV is compiled without CUDA support,
+this function returns 0. If the CUDA driver is not installed, or is incompatible, this function
+returns -1.
+ */
+CV_EXPORTS_W int getCudaEnabledDeviceCount();
+
+/** @brief Sets a device and initializes it for the current thread.
+
+@param device System index of a CUDA device starting with 0.
+
+If the call of this function is omitted, a default device is initialized at the first CUDA usage.
+ */
+CV_EXPORTS_W void setDevice(int device);
+
+/** @brief Returns the current device index set by cuda::setDevice or initialized by default.
+ */
+CV_EXPORTS_W int getDevice();
+
+/** @brief Explicitly destroys and cleans up all resources associated with the current device in the current
+process.
+
+Any subsequent API call to this device will reinitialize the device.
+ */
+CV_EXPORTS_W void resetDevice();
+
+/** @brief Enumeration providing CUDA computing features. Values are major*10 + minor of a compute
+capability; named features alias the lowest capability DeviceInfo::supports() accepts for them. */
+enum FeatureSet
+{
+    FEATURE_SET_COMPUTE_10 = 10,
+    FEATURE_SET_COMPUTE_11 = 11,
+    FEATURE_SET_COMPUTE_12 = 12,
+    FEATURE_SET_COMPUTE_13 = 13,
+    FEATURE_SET_COMPUTE_20 = 20,
+    FEATURE_SET_COMPUTE_21 = 21,
+    FEATURE_SET_COMPUTE_30 = 30,
+    FEATURE_SET_COMPUTE_32 = 32,
+    FEATURE_SET_COMPUTE_35 = 35,
+    FEATURE_SET_COMPUTE_50 = 50,
+
+    GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
+    SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
+    NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+    WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30,
+    DYNAMIC_PARALLELISM = FEATURE_SET_COMPUTE_35
+};
+
+//! checks whether current device supports the given feature
+CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
+
+/** @brief Class providing a set of static methods to check what NVIDIA\* card architecture the CUDA module was
+built for.
+
+According to the CUDA C Programming Guide Version 3.2: "PTX code produced for some specific compute
+capability can always be compiled to binary code of greater or equal compute capability".
+ */
+class CV_EXPORTS_W TargetArchs
+{
+public:
+    /** @brief The following method checks whether the module was built with the support of the given feature:
+
+    @param feature_set Features to be checked. See cuda::FeatureSet.
+     */
+    static bool builtWith(FeatureSet feature_set);
+
+    /** @brief There is a set of methods to check whether the module contains intermediate (PTX) or binary CUDA
+    code for the given architecture(s):
+
+    @param major Major compute capability version.
+    @param minor Minor compute capability version.
+     */
+    CV_WRAP static bool has(int major, int minor);
+    CV_WRAP static bool hasPtx(int major, int minor);
+    CV_WRAP static bool hasBin(int major, int minor);
+
+    CV_WRAP static bool hasEqualOrLessPtx(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreater(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreaterPtx(int major, int minor);
+    CV_WRAP static bool hasEqualOrGreaterBin(int major, int minor);
+};
+
+/** @brief Class providing functionality for querying the specified GPU properties.
+ */
+class CV_EXPORTS_W DeviceInfo
+{
+public:
+    //! creates DeviceInfo object for the current GPU
+    CV_WRAP DeviceInfo();
+
+    /** @brief The constructors.
+
+    @param device_id System index of the CUDA device starting with 0.
+
+    Constructs the DeviceInfo object for the specified device. If the device_id parameter is omitted, it
+    constructs an object for the current device.
+     */
+    CV_WRAP DeviceInfo(int device_id);
+
+    /** @brief Returns system index of the CUDA device starting with 0.
+    */
+    CV_WRAP int deviceID() const;
+
+    //! ASCII string identifying device
+    const char* name() const;
+
+    //! global memory available on device in bytes
+    CV_WRAP size_t totalGlobalMem() const;
+
+    //! shared memory available per block in bytes
+    CV_WRAP size_t sharedMemPerBlock() const;
+
+    //! 32-bit registers available per block
+    CV_WRAP int regsPerBlock() const;
+
+    //! warp size in threads
+    CV_WRAP int warpSize() const;
+
+    //! maximum pitch in bytes allowed by memory copies
+    CV_WRAP size_t memPitch() const;
+
+    //! maximum number of threads per block
+    CV_WRAP int maxThreadsPerBlock() const;
+
+    //! maximum size of each dimension of a block
+    CV_WRAP Vec3i maxThreadsDim() const;
+
+    //! maximum size of each dimension of a grid
+    CV_WRAP Vec3i maxGridSize() const;
+
+    //! clock frequency in kilohertz
+    CV_WRAP int clockRate() const;
+
+    //! constant memory available on device in bytes
+    CV_WRAP size_t totalConstMem() const;
+
+    //! major compute capability
+    CV_WRAP int majorVersion() const;
+
+    //! minor compute capability
+    CV_WRAP int minorVersion() const;
+
+    //! alignment requirement for textures
+    CV_WRAP size_t textureAlignment() const;
+
+    //! pitch alignment requirement for texture references bound to pitched memory
+    CV_WRAP size_t texturePitchAlignment() const;
+
+    //! number of multiprocessors on device
+    CV_WRAP int multiProcessorCount() const;
+
+    //! specifies whether there is a run time limit on kernels
+    CV_WRAP bool kernelExecTimeoutEnabled() const;
+
+    //! device is integrated as opposed to discrete
+    CV_WRAP bool integrated() const;
+
+    //! device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
+    CV_WRAP bool canMapHostMemory() const;
+
+    enum ComputeMode
+    {
+        ComputeModeDefault,         /**< default compute mode (Multiple threads can use cudaSetDevice with this device) */
+        ComputeModeExclusive,       /**< compute-exclusive-thread mode (Only one thread in one process will be able to use cudaSetDevice with this device) */
+        ComputeModeProhibited,      /**< compute-prohibited mode (No threads can use cudaSetDevice with this device) */
+        ComputeModeExclusiveProcess /**< compute-exclusive-process mode (Many threads in one process will be able to use cudaSetDevice with this device) */
+    };
+
+    //! compute mode
+    CV_WRAP DeviceInfo::ComputeMode computeMode() const;
+
+    //! maximum 1D texture size
+    CV_WRAP int maxTexture1D() const;
+
+    //! maximum 1D mipmapped texture size
+    CV_WRAP int maxTexture1DMipmap() const;
+
+    //! maximum size for 1D textures bound to linear memory
+    CV_WRAP int maxTexture1DLinear() const;
+
+    //! maximum 2D texture dimensions
+    CV_WRAP Vec2i maxTexture2D() const;
+
+    //! maximum 2D mipmapped texture dimensions
+    CV_WRAP Vec2i maxTexture2DMipmap() const;
+
+    //! maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
+    CV_WRAP Vec3i maxTexture2DLinear() const;
+
+    //! maximum 2D texture dimensions if texture gather operations have to be performed
+    CV_WRAP Vec2i maxTexture2DGather() const;
+
+    //! maximum 3D texture dimensions
+    CV_WRAP Vec3i maxTexture3D() const;
+
+    //! maximum Cubemap texture dimensions
+    CV_WRAP int maxTextureCubemap() const;
+
+    //! maximum 1D layered texture dimensions
+    CV_WRAP Vec2i maxTexture1DLayered() const;
+
+    //! maximum 2D layered texture dimensions
+    CV_WRAP Vec3i maxTexture2DLayered() const;
+
+    //! maximum Cubemap layered texture dimensions
+    CV_WRAP Vec2i maxTextureCubemapLayered() const;
+
+    //! maximum 1D surface size
+    CV_WRAP int maxSurface1D() const;
+
+    //! maximum 2D surface dimensions
+    CV_WRAP Vec2i maxSurface2D() const;
+
+    //! maximum 3D surface dimensions
+    CV_WRAP Vec3i maxSurface3D() const;
+
+    //! maximum 1D layered surface dimensions
+    CV_WRAP Vec2i maxSurface1DLayered() const;
+
+    //! maximum 2D layered surface dimensions
+    CV_WRAP Vec3i maxSurface2DLayered() const;
+
+    //! maximum Cubemap surface dimensions
+    CV_WRAP int maxSurfaceCubemap() const;
+
+    //! maximum Cubemap layered surface dimensions
+    CV_WRAP Vec2i maxSurfaceCubemapLayered() const;
+
+    //! alignment requirements for surfaces
+    CV_WRAP size_t surfaceAlignment() const;
+
+    //! device can possibly execute multiple kernels concurrently
+    CV_WRAP bool concurrentKernels() const;
+
+    //! device has ECC support enabled
+    CV_WRAP bool ECCEnabled() const;
+
+    //! PCI bus ID of the device
+    CV_WRAP int pciBusID() const;
+
+    //! PCI device ID of the device
+    CV_WRAP int pciDeviceID() const;
+
+    //! PCI domain ID of the device
+    CV_WRAP int pciDomainID() const;
+
+    //! true if device is a Tesla device using TCC driver, false otherwise
+    CV_WRAP bool tccDriver() const;
+
+    //! number of asynchronous engines
+    CV_WRAP int asyncEngineCount() const;
+
+    //! device shares a unified address space with the host
+    CV_WRAP bool unifiedAddressing() const;
+
+    //! peak memory clock frequency in kilohertz
+    CV_WRAP int memoryClockRate() const;
+
+    //! global memory bus width in bits
+    CV_WRAP int memoryBusWidth() const;
+
+    //! size of L2 cache in bytes
+    CV_WRAP int l2CacheSize() const;
+
+    //! maximum resident threads per multiprocessor
+    CV_WRAP int maxThreadsPerMultiProcessor() const;
+
+    //! gets total and free device memory (note the output order: total first, then free)
+    CV_WRAP void queryMemory(size_t& totalMemory, size_t& freeMemory) const;
+    CV_WRAP size_t freeMemory() const;
+    CV_WRAP size_t totalMemory() const;
+
+    /** @brief Provides information on CUDA feature support.
+
+    @param feature_set Features to be checked. See cuda::FeatureSet.
+
+    This function returns true if the device has the specified CUDA feature. Otherwise, it returns false.
+     */
+    bool supports(FeatureSet feature_set) const;
+
+    /** @brief Checks the CUDA module and device compatibility.
+
+    This function returns true if the CUDA module can be run on the specified device. Otherwise, it
+    returns false.
+     */
+    CV_WRAP bool isCompatible() const;
+
+private:
+    int device_id_; // system index of the CUDA device described by this object
+};
+
+CV_EXPORTS_W void printCudaDeviceInfo(int device);
+CV_EXPORTS_W void printShortCudaDeviceInfo(int device);
+
+/** @brief Converts an array to half-precision floating-point numbers.
+
+@param _src input array.
+@param _dst output array.
+@param stream Stream for the asynchronous version.
+@sa convertFp16
+*/
+CV_EXPORTS void convertFp16(InputArray _src, OutputArray _dst, Stream& stream = Stream::Null());
+
+//! @} cudacore_init
+
+}} // namespace cv { namespace cuda {
+
+
+#include "opencv2/core/cuda.inl.hpp"
+
+#endif /* OPENCV_CORE_CUDA_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.inl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.inl.hpp
new file mode 100644
index 0000000..3f2a0c7
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda.inl.hpp
@@ -0,0 +1,723 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDAINL_HPP
+#define OPENCV_CORE_CUDAINL_HPP
+
+#include "opencv2/core/cuda.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda {
+
+//===================================================================================
+// GpuMat
+//===================================================================================
+
+inline // builds an empty header (no data, no refcount); only the allocator is recorded
+GpuMat::GpuMat(Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{}
+
+inline // starts as an empty header, then allocates rows_ x cols_ elements of type_ via create()
+GpuMat::GpuMat(int rows_, int cols_, int type_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (rows_ > 0 && cols_ > 0) // non-positive dimensions leave the matrix empty
+        create(rows_, cols_, type_);
+}
+
+inline // Size overload of the sized constructor: height maps to rows, width to cols
+GpuMat::GpuMat(Size size_, int type_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (size_.height > 0 && size_.width > 0) // non-positive dimensions leave the matrix empty
+        create(size_.height, size_.width, type_);
+}
+
+inline // like the sized constructor, but additionally calls setTo(s_) on the freshly created matrix
+GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (rows_ > 0 && cols_ > 0) // non-positive dimensions: nothing is allocated and setTo() is skipped
+    {
+        create(rows_, cols_, type_);
+        setTo(s_);
+    }
+}
+
+inline // Size overload of the create-and-setTo constructor: height maps to rows, width to cols
+GpuMat::GpuMat(Size size_, int type_, Scalar s_, Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    if (size_.height > 0 && size_.width > 0) // non-positive dimensions: nothing is allocated and setTo() is skipped
+    {
+        create(size_.height, size_.width, type_);
+        setTo(s_);
+    }
+}
+
+inline // shallow copy: duplicates m's header fields, so both objects point at the same data
+GpuMat::GpuMat(const GpuMat& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
+{
+    if (refcount) // refcount may be null (e.g. for an empty matrix); otherwise count this new owner
+        CV_XADD(refcount, 1);
+}
+
+inline // starts as an empty header, then fills it through upload(arr)
+GpuMat::GpuMat(InputArray arr, Allocator* allocator_) :
+    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    upload(arr);
+}
+
+inline // destructor: all cleanup is delegated to release()
+GpuMat::~GpuMat()
+{
+    release();
+}
+
+inline
+GpuMat& GpuMat::operator =(const GpuMat& m)
+{
+    if (this == &m)
+        return *this; // self-assignment: nothing to do
+
+    // Copy-and-swap: the copy takes a reference on m's data, the swap hands this object's
+    // previous header to the copy, whose destructor then calls release() on it.
+    GpuMat previous(m);
+    swap(previous);
+    return *this;
+}
+
+inline // Size overload: forwards to create(rows, cols, type) with rows = height and cols = width
+void GpuMat::create(Size size_, int type_)
+{
+    create(size_.height, size_.width, type_);
+}
+
+inline // exchanges every header field (including refcount and allocator) with b; only pointers and scalars move
+void GpuMat::swap(GpuMat& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);
+    std::swap(allocator, b.allocator);
+}
+
+inline // returns a new GpuMat that is filled by copyTo() from this one
+GpuMat GpuMat::clone() const
+{
+    GpuMat m; // default-constructed, so it starts out empty
+    copyTo(m);
+    return m;
+}
+
+inline // masked copy without an explicit stream: forwards with Stream::Null()
+void GpuMat::copyTo(OutputArray dst, InputArray mask) const
+{
+    copyTo(dst, mask, Stream::Null());
+}
+
+inline // setTo without an explicit stream: forwards with Stream::Null() and returns that call's result
+GpuMat& GpuMat::setTo(Scalar s)
+{
+    return setTo(s, Stream::Null());
+}
+
+inline // masked setTo without an explicit stream: forwards with Stream::Null()
+GpuMat& GpuMat::setTo(Scalar s, InputArray mask)
+{
+    return setTo(s, mask, Stream::Null());
+}
+
+inline // type conversion without an explicit stream: forwards with Stream::Null()
+void GpuMat::convertTo(OutputArray dst, int rtype) const
+{
+    convertTo(dst, rtype, Stream::Null());
+}
+
+inline // conversion with alpha and beta but no explicit stream: forwards with Stream::Null()
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) const
+{
+    convertTo(dst, rtype, alpha, beta, Stream::Null());
+}
+
+inline // overload without beta: forwards to the full overload with beta = 0.0 on the given stream
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const
+{
+    convertTo(dst, rtype, alpha, 0.0, stream);
+}
+
+inline
+void GpuMat::assignTo(GpuMat& m, int _type) const
+{
+    if (_type >= 0)
+        convertTo(m, _type); // a target type was requested: convert into m
+    else
+        m = *this;           // negative _type: plain header assignment, no conversion
+}
+
+inline // address of row y, computed as data + step * y
+uchar* GpuMat::ptr(int y)
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows ); // the unsigned cast also rejects negative y
+    return data + step * y;
+}
+
+inline // const overload: address of row y, computed as data + step * y
+const uchar* GpuMat::ptr(int y) const
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows ); // the unsigned cast also rejects negative y
+    return data + step * y;
+}
+
+template<typename _Tp> inline // typed row pointer: plain cast of ptr(y), _Tp is not checked against type()
+_Tp* GpuMat::ptr(int y)
+{
+    return (_Tp*)ptr(y);
+}
+
+template<typename _Tp> inline // const typed row pointer: plain cast of ptr(y), _Tp is not checked against type()
+const _Tp* GpuMat::ptr(int y) const
+{
+    return (const _Tp*)ptr(y);
+}
+
+template <class T> inline // packs rows, cols, data and step into a PtrStepSz; data is cast to T* unchecked
+GpuMat::operator PtrStepSz<T>() const
+{
+    return PtrStepSz<T>(rows, cols, (T*)data, step);
+}
+
+template <class T> inline // packs data and step (no dimensions) into a PtrStep; data is cast to T* unchecked
+GpuMat::operator PtrStep<T>() const
+{
+    return PtrStep<T>((T*)data, step);
+}
+
+inline // sub-matrix made of the single row y: rows [y, y+1), all columns
+GpuMat GpuMat::row(int y) const
+{
+    return GpuMat(*this, Range(y, y+1), Range::all());
+}
+
+inline // sub-matrix made of the single column x: all rows, columns [x, x+1)
+GpuMat GpuMat::col(int x) const
+{
+    return GpuMat(*this, Range::all(), Range(x, x+1));
+}
+
+inline // sub-matrix of rows Range(startrow, endrow), all columns
+GpuMat GpuMat::rowRange(int startrow, int endrow) const
+{
+    return GpuMat(*this, Range(startrow, endrow), Range::all());
+}
+
+inline // Range overload: sub-matrix of the rows in r, all columns
+GpuMat GpuMat::rowRange(Range r) const
+{
+    return GpuMat(*this, r, Range::all());
+}
+
+inline // sub-matrix of columns Range(startcol, endcol), all rows
+GpuMat GpuMat::colRange(int startcol, int endcol) const
+{
+    return GpuMat(*this, Range::all(), Range(startcol, endcol));
+}
+
+inline // Range overload: sub-matrix of the columns in r, all rows
+GpuMat GpuMat::colRange(Range r) const
+{
+    return GpuMat(*this, Range::all(), r);
+}
+
+inline // sub-matrix selected by a row range and a column range
+GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const
+{
+    return GpuMat(*this, rowRange_, colRange_);
+}
+
+inline // sub-matrix selected by a rectangle; forwards to the (GpuMat, Rect) constructor
+GpuMat GpuMat::operator ()(Rect roi) const
+{
+    return GpuMat(*this, roi);
+}
+
+inline // true when Mat::CONTINUOUS_FLAG is set in flags
+bool GpuMat::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline // element size decoded from flags by CV_ELEM_SIZE
+size_t GpuMat::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline // per-channel element size decoded from flags by CV_ELEM_SIZE1
+size_t GpuMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline // matrix type decoded from flags by CV_MAT_TYPE
+int GpuMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline // element depth decoded from flags by CV_MAT_DEPTH
+int GpuMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline // channel count decoded from flags by CV_MAT_CN
+int GpuMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline // row step measured in elemSize1() units instead of the raw step value
+size_t GpuMat::step1() const
+{
+    return step / elemSize1();
+}
+
+inline // note the argument order: Size takes (width, height), i.e. (cols, rows)
+Size GpuMat::size() const
+{
+    return Size(cols, rows);
+}
+
+inline // a GpuMat counts as empty exactly when its data pointer is null
+bool GpuMat::empty() const
+{
+    return data == 0;
+}
+
+inline // exposes the raw data pointer as void*
+void* GpuMat::cudaPtr() const
+{
+    return data;
+}
+
+static inline // value-returning overload: fills a local GpuMat through the OutputArray overload
+GpuMat createContinuous(int rows, int cols, int type)
+{
+    GpuMat m;
+    createContinuous(rows, cols, type, m);
+    return m;
+}
+
+static inline // Size overload: forwards with rows = size.height and cols = size.width
+void createContinuous(Size size, int type, OutputArray arr)
+{
+    createContinuous(size.height, size.width, type, arr);
+}
+
+static inline // value-returning Size overload: fills a local GpuMat through the OutputArray overload
+GpuMat createContinuous(Size size, int type)
+{
+    GpuMat m;
+    createContinuous(size, type, m);
+    return m;
+}
+
+static inline // Size overload: forwards with rows = size.height and cols = size.width
+void ensureSizeIsEnough(Size size, int type, OutputArray arr)
+{
+    ensureSizeIsEnough(size.height, size.width, type, arr);
+}
+
+static inline // non-member swap for GpuMat; forwards to the member swap
+void swap(GpuMat& a, GpuMat& b)
+{
+    a.swap(b);
+}
+
+//===================================================================================
+// GpuMatND
+//===================================================================================
+
+inline // empty array: zero flags, dims and offset, null data pointer
+GpuMatND::GpuMatND() :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+}
+
+inline // starts empty, then sets itself up through create()
+GpuMatND::GpuMatND(SizeArray _size, int _type) :
+    flags(0), dims(0), data(nullptr), offset(0)
+{
+    create(std::move(_size), _type); // _size was taken by value, so it is moved on instead of copied again
+}
+
+inline // whole-object exchange through std::swap; presumably relies on GpuMatND's move operations (not visible here)
+void GpuMatND::swap(GpuMatND& m) noexcept
+{
+    std::swap(*this, m);
+}
+
+inline // true when Mat::CONTINUOUS_FLAG is set in flags
+bool GpuMatND::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline // true when Mat::SUBMATRIX_FLAG is set in flags
+bool GpuMatND::isSubmatrix() const
+{
+    return (flags & Mat::SUBMATRIX_FLAG) != 0;
+}
+
+inline // element size decoded from flags by CV_ELEM_SIZE
+size_t GpuMatND::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline // per-channel element size decoded from flags by CV_ELEM_SIZE1
+size_t GpuMatND::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline // a GpuMatND counts as empty exactly when its data pointer is null
+bool GpuMatND::empty() const
+{
+    return data == nullptr;
+}
+
+inline // true for a non-empty array whose handle data_ reports use_count() == 0
+bool GpuMatND::external() const
+{
+    return !empty() && data_.use_count() == 0;
+}
+
+inline // start address of this array: base data pointer advanced by offset
+uchar* GpuMatND::getDevicePtr() const
+{
+    return data + offset;
+}
+
+inline // product of all entries of size
+size_t GpuMatND::total() const
+{
+    size_t p = 1; // NOTE: an array whose size has no entries therefore reports 1, not 0
+    for(auto s : size)
+        p *= s;
+    return p;
+}
+
+inline // memory spanned along the outermost dimension (size[0] * step[0]); 0 when no dimensions are set
+size_t GpuMatND::totalMemSize() const
+{
+    return size.empty() ? 0 : size[0] * step[0]; // guard: indexing an empty size/step array is undefined
+}
+
+inline // matrix type decoded from flags by CV_MAT_TYPE
+int GpuMatND::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+//===================================================================================
+// HostMem
+//===================================================================================
+
+inline // builds an empty header (no data, no refcount); only the allocation type is recorded
+HostMem::HostMem(AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+}
+
+inline // shallow copy: duplicates m's header fields, so both objects point at the same data
+HostMem::HostMem(const HostMem& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
+{
+    if( refcount ) // refcount may be null (e.g. for an empty HostMem); otherwise count this new owner
+        CV_XADD(refcount, 1);
+}
+
+inline // starts as an empty header, then allocates rows_ x cols_ elements of type_ via create()
+HostMem::HostMem(int rows_, int cols_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (rows_ > 0 && cols_ > 0) // non-positive dimensions leave the object empty
+        create(rows_, cols_, type_);
+}
+
+inline // Size overload of the sized constructor: height maps to rows, width to cols
+HostMem::HostMem(Size size_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (size_.height > 0 && size_.width > 0) // non-positive dimensions leave the object empty
+        create(size_.height, size_.width, type_);
+}
+
+inline // starts as an empty header, then receives a copy of arr's data
+HostMem::HostMem(InputArray arr, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    arr.getMat().copyTo(*this); // Mat::copyTo with this HostMem as the destination
+}
+
+inline // destructor: all cleanup is delegated to release()
+HostMem::~HostMem()
+{
+    release();
+}
+
+inline
+HostMem& HostMem::operator =(const HostMem& m)
+{
+    if (this == &m)
+        return *this; // self-assignment: nothing to do
+
+    // Copy-and-swap: the copy takes a reference on m's data, the swap hands this object's
+    // previous header to the copy, whose destructor then calls release() on it.
+    HostMem previous(m);
+    swap(previous);
+    return *this;
+}
+
+inline // exchanges every header field (including refcount and alloc_type) with b; only pointers and scalars move
+void HostMem::swap(HostMem& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);
+    std::swap(alloc_type, b.alloc_type);
+}
+
+inline // deep copy into a new HostMem with the same size, type and allocation type
+HostMem HostMem::clone() const
+{
+    HostMem m(size(), type(), alloc_type);
+    createMatHeader().copyTo(m); // the copy itself is done by Mat::copyTo through a Mat header over this data
+    return m;
+}
+
+inline // Size overload: forwards to create(rows, cols, type) with rows = height and cols = width
+void HostMem::create(Size size_, int type_)
+{
+    create(size_.height, size_.width, type_);
+}
+
+inline // Mat header constructed over this object's data pointer and step
+Mat HostMem::createMatHeader() const
+{
+    return Mat(size(), type(), data, step);
+}
+
+inline // true when Mat::CONTINUOUS_FLAG is set in flags
+bool HostMem::isContinuous() const
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline // element size decoded from flags by CV_ELEM_SIZE
+size_t HostMem::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline // per-channel element size decoded from flags by CV_ELEM_SIZE1
+size_t HostMem::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline // matrix type decoded from flags by CV_MAT_TYPE
+int HostMem::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline // element depth decoded from flags by CV_MAT_DEPTH
+int HostMem::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline // channel count decoded from flags by CV_MAT_CN
+int HostMem::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline // row step measured in elemSize1() units instead of the raw step value
+size_t HostMem::step1() const
+{
+    return step / elemSize1();
+}
+
+inline // note the argument order: Size takes (width, height), i.e. (cols, rows)
+Size HostMem::size() const
+{
+    return Size(cols, rows);
+}
+
+inline // a HostMem counts as empty exactly when its data pointer is null
+bool HostMem::empty() const
+{
+    return data == 0;
+}
+
+static inline // non-member swap for HostMem; forwards to the member swap
+void swap(HostMem& a, HostMem& b)
+{
+    a.swap(b);
+}
+
+//===================================================================================
+// Stream
+//===================================================================================
+
+inline // private constructor: adopts an existing implementation pointer as impl_
+Stream::Stream(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+//===================================================================================
+// Event
+//===================================================================================
+
+inline // private constructor: adopts an existing implementation pointer as impl_
+Event::Event(const Ptr<Impl>& impl)
+    : impl_(impl)
+{
+}
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+inline // true if hasPtx() or hasBin() holds for the given compute capability
+bool TargetArchs::has(int major, int minor)
+{
+    return hasPtx(major, minor) || hasBin(major, minor);
+}
+
+inline // true if hasEqualOrGreaterPtx() or hasEqualOrGreaterBin() holds for the given compute capability
+bool TargetArchs::hasEqualOrGreater(int major, int minor)
+{
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
+}
+
+inline // describes the current device, i.e. whatever index getDevice() reports
+DeviceInfo::DeviceInfo()
+{
+    device_id_ = getDevice();
+}
+
+inline // describes an explicitly chosen device
+DeviceInfo::DeviceInfo(int device_id)
+{
+    CV_Assert( device_id >= 0 && device_id < getCudaEnabledDeviceCount() ); // index must name an existing device
+    device_id_ = device_id;
+}
+
+inline // returns the device index stored at construction
+int DeviceInfo::deviceID() const
+{
+    return device_id_;
+}
+
+inline
+size_t DeviceInfo::freeMemory() const
+{
+    size_t totalBytes = 0, freeBytes = 0; // queryMemory() fills in both values; only the free amount is returned
+    queryMemory(totalBytes, freeBytes);
+    return freeBytes;
+}
+
+inline
+size_t DeviceInfo::totalMemory() const
+{
+    size_t totalBytes = 0, freeBytes = 0; // queryMemory() fills in both values; only the total amount is returned
+    queryMemory(totalBytes, freeBytes);
+    return totalBytes;
+}
+
+inline
+bool DeviceInfo::supports(FeatureSet feature_set) const
+{
+    // FeatureSet values are encoded as major*10 + minor (e.g. 35 for compute capability 3.5)
+    return majorVersion() * 10 + minorVersion() >= feature_set;
+}
+
+
+}} // namespace cv { namespace cuda {
+
+//===================================================================================
+// Mat
+//===================================================================================
+
+namespace cv {
+
+inline // starts as an empty Mat header, then receives the GpuMat contents through download()
+Mat::Mat(const cuda::GpuMat& m)
+    : flags(0), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows)
+{
+    m.download(*this);
+}
+
+}
+
+//! @endcond
+
+#endif // OPENCV_CORE_CUDAINL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/block.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/block.hpp
new file mode 100644
index 0000000..c277f0e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/block.hpp
@@ -0,0 +1,211 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_BLOCK_HPP
+#define OPENCV_CUDA_DEVICE_BLOCK_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Block // Static __device__ helpers for work shared by the threads of one thread block.
+    {
+        static __device__ __forceinline__ unsigned int id() // block index along x only; blockIdx.y/z are not read
+        {
+            return blockIdx.x;
+        }
+
+        static __device__ __forceinline__ unsigned int stride() // number of threads in the block (product of the three block dimensions)
+        {
+            return blockDim.x * blockDim.y * blockDim.z;
+        }
+
+        static __device__ __forceinline__ void sync() // thin wrapper around __syncthreads()
+        {
+            __syncthreads();
+        }
+
+        static __device__ __forceinline__ int flattenedThreadId() // linear index of the calling thread within its block, x varying fastest
+        {
+            return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
+        }
+
+        template<typename It, typename T>
+        static __device__ __forceinline__ void fill(It beg, It end, const T& value) // assigns value to the elements of [beg, end)
+        {
+            int STRIDE = stride();
+            It t = beg + flattenedThreadId(); // each thread starts at its own linear index...
+
+            for(; t < end; t += STRIDE) // ...and then advances by the block's thread count
+                *t = value;
+        }
+
+        template<typename OutIt, typename T>
+        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) // writes value + i to position beg + i over [beg, end)
+        {
+            int STRIDE = stride();
+            int tid = flattenedThreadId();
+            value += tid; // offset the start value by this thread's first position
+
+            for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE) // value advances in lock-step with the iterator
+                *t = value;
+        }
+
+        template<typename InIt, typename OutIt>
+        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out) // copies [beg, end) to the range starting at out
+        {
+            int STRIDE = stride();
+            InIt  t = beg + flattenedThreadId();
+            OutIt o = out + (t - beg); // output position matching this thread's input position
+
+            for(; t < end; t += STRIDE, o += STRIDE)
+                *o = *t;
+        }
+
+        template<typename InIt, typename OutIt, class UnOp>
+        static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op) // out[i] = op(beg[i]) over [beg, end)
+        {
+            int STRIDE = stride();
+            InIt  t = beg + flattenedThreadId();
+            OutIt o = out + (t - beg); // output position matching this thread's input position
+
+            for(; t < end; t += STRIDE, o += STRIDE)
+                *o = op(*t);
+        }
+
+        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
+        static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op) // out[i] = op(beg1[i], beg2[i]); only end1 bounds the loop
+        {
+            int STRIDE = stride();
+            InIt1 t1 = beg1 + flattenedThreadId();
+            InIt2 t2 = beg2 + flattenedThreadId();
+            OutIt o  = out + (t1 - beg1);
+
+            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE) // the second range is assumed to be at least as long as the first
+                *o = op(*t1, *t2);
+        }
+
+        template<int CTA_SIZE, typename T, class BinOp>
+        static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op) // in-place tree reduction of buffer with op; thread 0 writes the last combined value to buffer[0]
+        {
+            int tid = flattenedThreadId();
+            T val =  buffer[tid]; // NOTE(review): no barrier precedes this read -- presumably the caller synchronizes after filling buffer; confirm
+
+            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); } // each step folds the upper half onto the lower half
+            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
+            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
+            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
+
+            if (tid < 32) // the remaining steps run without barriers; buffer is volatile
+            { // NOTE(review): looks like classic warp-synchronous code -- confirm it is safe on GPUs with independent thread scheduling
+                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); } // all 32 threads execute every step, so reads reach up to buffer[31 + offset]
+                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
+                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
+                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
+                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
+                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
+            }
+        }
+
+        template<int CTA_SIZE, typename T, class BinOp>
+        static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op) // stores each thread's init into buffer, reduces with op, and returns buffer[0] to every thread
+        {
+            int tid = flattenedThreadId();
+            T val =  buffer[tid] = init; // publish this thread's contribution
+            __syncthreads(); // make all contributions visible before combining
+
+            if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); } // each step folds the upper half onto the lower half
+            if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
+            if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
+            if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
+
+            if (tid < 32) // the remaining steps run without barriers; buffer is volatile
+            { // NOTE(review): looks like classic warp-synchronous code -- confirm it is safe on GPUs with independent thread scheduling
+                if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); } // all 32 threads execute every step, so reads reach up to buffer[31 + offset]
+                if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
+                if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
+                if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
+                if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
+                if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
+            }
+            __syncthreads(); // wait for thread 0's final write before everyone reads it
+            return buffer[0];
+        }
+
+        template <typename T, class BinOp>
+        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op) // in-place reduction of data[0..n) with op for an n that is not a compile-time constant
+        {
+            int ftid = flattenedThreadId();
+            int sft = stride();
+
+            if (sft < n) // more elements than threads: first fold the excess onto data[0..sft)
+            {
+                for (unsigned int i = sft + ftid; i < n; i += sft)
+                    data[ftid] = op(data[ftid], data[i]);
+
+                __syncthreads();
+
+                n = sft;
+            }
+
+            while (n > 1) // each pass pairs element ftid with its mirror n - ftid - 1
+            {
+                unsigned int half = n/2;
+
+                if (ftid < half)
+                    data[ftid] = op(data[ftid], data[n - ftid - 1]);
+
+                __syncthreads();
+
+                n = n - half; // for odd n the middle element is carried over untouched
+            }
+        }
+    };
+}}}
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_DEVICE_BLOCK_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/border_interpolate.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/border_interpolate.hpp
new file mode 100644
index 0000000..874f705
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/border_interpolate.hpp
@@ -0,0 +1,722 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_BORDER_INTERPOLATE_HPP
+#define OPENCV_CUDA_BORDER_INTERPOLATE_HPP
+
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "vec_math.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    //////////////////////////////////////////////////////////////
+    // BrdConstant
+
+    template <typename D> struct BrdRowConstant // Row accessor that yields the constant val for x outside [0, width).
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowConstant(int rowWidth, const D& fill = VecTraits<D>::all(0)) : width(rowWidth), val(fill) {}
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // checks the lower bound only
+        {
+            return x < 0 ? val : saturate_cast<D>(data[x]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // checks the upper bound only
+        {
+            return x >= width ? val : saturate_cast<D>(data[x]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // checks both bounds
+        {
+            return (x < 0 || x >= width) ? val : saturate_cast<D>(data[x]);
+        }
+
+        int width; // valid columns are [0, width)
+        D val; // value returned for out-of-range x
+    };
+
+    template <typename D> struct BrdColConstant // Column accessor that yields the constant val for y outside [0, height).
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColConstant(int colHeight, const D& fill = VecTraits<D>::all(0)) : height(colHeight), val(fill) {}
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const // checks the lower bound only; step is the byte distance between rows
+        {
+            return y < 0 ? val : saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + y * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const // checks the upper bound only
+        {
+            return y >= height ? val : saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + y * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const // checks both bounds
+        {
+            return (y < 0 || y >= height) ? val : saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + y * step));
+        }
+
+        int height; // valid rows are [0, height)
+        D val; // value returned for out-of-range y
+    };
+
+    template <typename D> struct BrdConstant // 2D accessor that yields the constant val for any (y, x) outside [0, height) x [0, width).
+    {
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_)
+        {
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer form; step is the byte distance between rows
+        {
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // functor form; src(y, x) is evaluated only when in range
+        {
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
+        }
+
+        int height; // valid rows are [0, height)
+        int width; // valid columns are [0, width)
+        D val; // value returned outside the valid area
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReplicate
+
+    template <typename D> struct BrdRowReplicate // Row accessor that clamps x into [0, last_col], so edge elements are repeated.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReplicate(int rowWidth) : last_col(rowWidth - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int rowWidth, U) : last_col(rowWidth - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const // clamp from below only
+        {
+            return ::max(0, x);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // clamp from above only
+        {
+            return ::min(last_col, x);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // clamp on both sides: upper bound first, then lower
+        {
+            return ::max(0, ::min(last_col, x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // element at the lower-clamped index, converted to D
+        {
+            return saturate_cast<D>(*(data + idx_col_low(x)));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // element at the upper-clamped index, converted to D
+        {
+            return saturate_cast<D>(*(data + idx_col_high(x)));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // element at the fully clamped index, converted to D
+        {
+            return saturate_cast<D>(*(data + idx_col(x)));
+        }
+
+        int last_col; // width - 1, the largest valid column index
+    };
+
+    template <typename D> struct BrdColReplicate // Column accessor that clamps y into [0, last_row], so edge rows are repeated.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReplicate(int colHeight) : last_row(colHeight - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int colHeight, U) : last_row(colHeight - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // clamp from below only
+        {
+            return ::max(0, y);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // clamp from above only
+        {
+            return ::min(last_row, y);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // clamp on both sides: upper bound first, then lower
+        {
+            return ::max(0, ::min(last_row, y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const // step is the byte distance between rows
+        {
+            return saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + idx_row_low(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const // element at the upper-clamped row, converted to D
+        {
+            return saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + idx_row_high(y) * step));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const // element at the fully clamped row, converted to D
+        {
+            return saturate_cast<D>(*reinterpret_cast<const T*>(reinterpret_cast<const char*>(data) + idx_row(y) * step));
+        }
+
+        int last_row; // height - 1, the largest valid row index
+    };
+
+    template <typename D> struct BrdReplicate // 2D accessor that clamps (y, x) into [0, last_row] x [0, last_col], repeating edge elements.
+    {
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // the third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // clamp from below only
+        {
+            return ::max(y, 0);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // clamp from above only
+        {
+            return ::min(y, last_row);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // clamp on both sides
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const // clamp from below only
+        {
+            return ::max(x, 0);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // clamp from above only
+        {
+            return ::min(x, last_col);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // clamp on both sides
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer form; step is the byte distance between rows
+        {
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // functor form; the coordinates pass through the int-based clamps above
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1
+        int last_col; // width - 1
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReflect101
+
+    template <typename D> struct BrdRowReflect101 // Row accessor that mirrors out-of-range x about the first/last column without repeating the edge element.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const // mirrors about column 0 (x -> |x|), e.g. -1 -> 1
+        {
+            return ::abs(x) % (last_col + 1); // the modulo keeps the result inside [0, last_col]
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // mirrors about last_col, e.g. last_col + 1 -> last_col - 1
+        {
+            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // handles both sides: high mirror first, then low
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // element at idx_col_low(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // element at idx_col_high(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // element at idx_col(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int last_col; // width - 1
+    };
+
+    template <typename D> struct BrdColReflect101 // Column accessor that mirrors out-of-range y about the first/last row without repeating the edge row.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // mirrors about row 0 (y -> |y|), e.g. -1 -> 1
+        {
+            return ::abs(y) % (last_row + 1); // the modulo keeps the result inside [0, last_row]
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // mirrors about last_row, e.g. last_row + 1 -> last_row - 1
+        {
+            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // handles both sides: high mirror first, then low
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const // step is the byte distance between rows
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const // element at row idx_row_high(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const // element at row idx_row(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step)); // load as the source type T, then convert to D
+        }
+
+        int last_row; // height - 1
+    };
+
+    template <typename D> struct BrdReflect101 // 2D accessor that mirrors out-of-range (y, x) about the image edges without repeating the edge element.
+    {
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // the third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // mirrors about row 0 (y -> |y|), e.g. -1 -> 1
+        {
+            return ::abs(y) % (last_row + 1); // the modulo keeps the result inside [0, last_row]
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // mirrors about last_row, e.g. last_row + 1 -> last_row - 1
+        {
+            return ::abs(last_row - ::abs(last_row - y)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // handles both sides: high mirror first, then low
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const // mirrors about column 0 (x -> |x|)
+        {
+            return ::abs(x) % (last_col + 1); // the modulo keeps the result inside [0, last_col]
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // mirrors about last_col
+        {
+            return ::abs(last_col - ::abs(last_col - x)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // handles both sides: high mirror first, then low
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer form; step is the byte distance between rows
+        {
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // functor form; the coordinates pass through the int-based mirrors above
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1
+        int last_col; // width - 1
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdReflect
+
+    template <typename D> struct BrdRowReflect // Row accessor that mirrors out-of-range x with the edge element repeated (-1 -> 0, width -> last_col).
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect(int width, U) : last_col(width - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const // negative x maps to |x| - 1; non-negative x is unchanged before the modulo
+        {
+            return (::abs(x) - (x < 0)) % (last_col + 1); // (x < 0) contributes 1 only for negative x
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // x beyond last_col is mirrored back, with +1 so the edge element repeats
+        {
+            return ::abs(last_col - ::abs(last_col - x) + (x > last_col)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // low mirror applied inline (without the modulo), then the high mirror
+        {
+            return idx_col_high(::abs(x) - (x < 0));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // element at idx_col_low(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // element at idx_col_high(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // element at idx_col(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int last_col; // width - 1
+    };
+
+    template <typename D> struct BrdColReflect // Column accessor that mirrors out-of-range y with the edge row repeated (-1 -> 0, height -> last_row).
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColReflect(int height, U) : last_row(height - 1) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // negative y maps to |y| - 1; non-negative y is unchanged before the modulo
+        {
+            return (::abs(y) - (y < 0)) % (last_row + 1); // (y < 0) contributes 1 only for negative y
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // y beyond last_row is mirrored back, with +1 so the edge row repeats
+        {
+            return ::abs(last_row - ::abs(last_row - y) + (y > last_row)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // low mirror applied inline (without the modulo), then the high mirror
+        {
+            return idx_row_high(::abs(y) - (y < 0));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const // step is the byte distance between rows
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const // element at row idx_row_high(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const // element at row idx_row(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step)); // load as the source type T, then convert to D
+        }
+
+        int last_row; // height - 1
+    };
+
+    template <typename D> struct BrdReflect // 2D accessor that mirrors out-of-range (y, x) with the edge element repeated.
+    {
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdReflect(int height, int width, U) : last_row(height - 1), last_col(width - 1) {} // the third argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // negative y maps to |y| - 1; non-negative y is unchanged before the modulo
+        {
+            return (::abs(y) - (y < 0)) % (last_row + 1);
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // mirrors y beyond last_row; abs() and the modulo are deliberately disabled here, so the result may leave [0, last_row]
+        {
+            return /*::abs*/(last_row - ::abs(last_row - y) + (y > last_row)) /*% (last_row + 1)*/;
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // idx_row_low() supplies the abs/modulo that idx_row_high() omits
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const // negative x maps to |x| - 1; non-negative x is unchanged before the modulo
+        {
+            return (::abs(x) - (x < 0)) % (last_col + 1);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // mirrors x beyond last_col; no abs()/modulo is applied, matching idx_row_high()
+        {
+            return (last_col - ::abs(last_col - x) + (x > last_col));
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // idx_col_low() supplies the abs/modulo that idx_col_high() omits
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer form; step is the byte distance between rows
+        {
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // functor form; the coordinates pass through the int-based mirrors above
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int last_row; // height - 1
+        int last_col; // width - 1
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BrdWrap
+
+    template <typename D> struct BrdRowWrap // Row accessor that wraps out-of-range x around periodically with period width.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowWrap(int width_, U) : width(width_) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_col_low(int x) const // branch-free select: x itself when x >= 0, otherwise x shifted up by a multiple of width
+        {
+            return (x >= 0) * x + (x < 0) * (x - ((x - width + 1) / width) * width); // e.g. width 5: x = -1 -> 4
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // branch-free select: x itself when x < width, otherwise x % width
+        {
+            return (x < width) * x + (x >= width) * (x % width);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // handles both sides: low wrap first, then high
+        {
+            return idx_col_high(idx_col_low(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // element at idx_col_low(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // element at idx_col_high(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // element at idx_col(x), converted to D
+        {
+            return saturate_cast<D>(data[idx_col(x)]);
+        }
+
+        int width; // wrap period; valid columns are [0, width)
+    };
+
+    template <typename D> struct BrdColWrap // Column accessor that wraps out-of-range y around periodically with period height.
+    {
+        typedef D result_type;
+
+        explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdColWrap(int height_, U) : height(height_) {} // the second argument is ignored
+
+        __device__ __forceinline__ int idx_row_low(int y) const // branch-free select: y itself when y >= 0, otherwise y shifted up by a multiple of height
+        {
+            return (y >= 0) * y + (y < 0) * (y - ((y - height + 1) / height) * height); // e.g. height 5: y = -1 -> 4
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // branch-free select: y itself when y < height, otherwise y % height
+        {
+            return (y < height) * y + (y >= height) * (y % height);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // handles both sides: low wrap first, then high
+        {
+            return idx_row_high(idx_row_low(y));
+        }
+
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const // step is the byte distance between rows
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const // element at row idx_row_high(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step)); // load as the source type T, then convert to D
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const // element at row idx_row(y), converted to D
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step)); // load as the source type T, then convert to D
+        }
+
+        int height; // wrap period; valid rows are [0, height)
+    };
+
+    template <typename D> struct BrdWrap // 2D accessor that wraps out-of-range (y, x) around periodically (periods height and width).
+    {
+        typedef D result_type;
+
+        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
+            height(height_), width(width_)
+        {
+        }
+        template <typename U>
+        __host__ __device__ __forceinline__ BrdWrap(int height_, int width_, U) : // the third argument is ignored
+            height(height_), width(width_)
+        {
+        }
+
+        __device__ __forceinline__ int idx_row_low(int y) const // negative y is shifted up by a multiple of height; non-negative y is unchanged
+        {
+            return (y >= 0) ? y : (y - ((y - height + 1) / height) * height); // e.g. height 5: y = -1 -> 4
+        }
+
+        __device__ __forceinline__ int idx_row_high(int y) const // y >= height is reduced modulo height
+        {
+            return (y < height) ? y : (y % height);
+        }
+
+        __device__ __forceinline__ int idx_row(int y) const // handles both sides: low wrap first, then high
+        {
+            return idx_row_high(idx_row_low(y));
+        }
+
+        __device__ __forceinline__ int idx_col_low(int x) const // negative x is shifted up by a multiple of width; non-negative x is unchanged
+        {
+            return (x >= 0) ? x : (x - ((x - width + 1) / width) * width);
+        }
+
+        __device__ __forceinline__ int idx_col_high(int x) const // x >= width is reduced modulo width
+        {
+            return (x < width) ? x : (x % width);
+        }
+
+        __device__ __forceinline__ int idx_col(int x) const // handles both sides: low wrap first, then high
+        {
+            return idx_col_high(idx_col_low(x));
+        }
+
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer form; step is the byte distance between rows
+        {
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
+        }
+
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // functor form; the coordinates pass through the int-based wraps above
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+
+        int height; // row wrap period
+        int width; // column wrap period
+    };
+
+    //////////////////////////////////////////////////////////////
+    // BorderReader
+
+    template <typename Ptr2D, typename B> struct BorderReader // Pairs a 2D source with a border policy B; every read is routed through B::at().
+    {
+        typedef typename B::result_type elem_type;
+        typedef typename Ptr2D::index_type index_type;
+
+        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& source, const B& border) : ptr(source), b(border) {}
+
+        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const // the border policy decides how (y, x) is resolved
+        {
+            return this->b.at(y, x, this->ptr);
+        }
+
+        Ptr2D ptr; // copy of the source accessor
+        B b; // copy of the border policy
+    };
+
+    // Under win32 there is a bug with templated types that are passed as kernel parameters;
+    // with this specialization everything works fine.
+    template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> > // Specialization that stores the BrdConstant fields directly instead of the BrdConstant object.
+    {
+        typedef typename BrdConstant<D>::result_type elem_type;
+        typedef typename Ptr2D::index_type index_type;
+
+        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& src_, const BrdConstant<D>& b) : // copies height, width and val out of b
+            src(src_), height(b.height), width(b.width), val(b.val)
+        {
+        }
+
+        __device__ __forceinline__ D operator ()(index_type y, index_type x) const // same in-range test as BrdConstant::at(); src(y, x) is evaluated only when in range
+        {
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
+        }
+
+        Ptr2D src; // copy of the source accessor
+        int height; // valid rows are [0, height)
+        int width; // valid columns are [0, width)
+        D val; // value returned outside the valid area
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_BORDER_INTERPOLATE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/color.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/color.hpp
new file mode 100644
index 0000000..dcce280
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/color.hpp
@@ -0,0 +1,309 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COLOR_HPP
+#define OPENCV_CUDA_COLOR_HPP
+
+#include "detail/color_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Each OPENCV_CUDA_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macro implements
+    // template <typename T> class ColorSpace1_to_ColorSpace2_traits
+    // {
+    //     typedef ... functor_type;
+    //     static __host__ __device__ functor_type create_functor();
+    // };
+
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
+    OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
+    OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
+    OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgb_to_lab4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(rgba_to_lab4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgr_to_lab4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(bgra_to_lab4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgb_to_lab4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lrgba_to_lab4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgr_to_lab4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(lbgra_to_lab4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(lab4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgb_to_luv4, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(rgba_to_luv4, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgr_to_luv4, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(bgra_to_luv4, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgb_to_luv4, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lrgba_to_luv4, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgr_to_luv4, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(lbgra_to_luv4, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgb, 3, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgb, 4, 3, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_rgba, 3, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_rgba, 4, 4, true, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgr, 3, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgr, 4, 3, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_bgra, 3, 4, true, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_bgra, 4, 4, true, 0)
+
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgb, 3, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgb, 4, 3, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lrgba, 3, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lrgba, 4, 4, false, 2)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgr, 3, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgr, 4, 3, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv_to_lbgra, 3, 4, false, 0)
+    OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(luv4_to_lbgra, 4, 4, false, 0)
+
+    #undef OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COLOR_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/common.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/common.hpp
new file mode 100644
index 0000000..80b2ff0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/common.hpp
@@ -0,0 +1,123 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COMMON_HPP
+#define OPENCV_CUDA_COMMON_HPP
+
+#include <cuda_runtime.h>
+#include "opencv2/core/cuda_types.hpp"
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+#ifndef CV_PI_F // float-precision pi; left untouched if something already defines it
+    #ifndef CV_PI // no CV_PI available: fall back to a float literal
+        #define CV_PI_F 3.14159265f
+    #else // otherwise derive it from CV_PI with a float cast
+        #define CV_PI_F ((float)CV_PI)
+    #endif
+#endif
+
+namespace cv { namespace cuda {
+    static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func)
+    { // No-op for cudaSuccess; otherwise hands cudaGetErrorString(err) plus func/file/line to cv::error as GpuApiCallError.
+        if (cudaSuccess != err)
+            cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
+    }
+}}
+
+#ifndef cudaSafeCall // respects a cudaSafeCall macro that is already defined
+    #define cudaSafeCall(expr)  cv::cuda::checkCudaError(expr, __FILE__, __LINE__, CV_Func)
+#endif // cudaSafeCall(expr): runs expr through checkCudaError with the call site's file, line and CV_Func
+
+namespace cv { namespace cuda
+{ // Alignment predicates; size must be non-zero because both use the % operator with it.
+    template <typename T> static inline bool isAligned(const T* ptr, size_t size)
+    { // True when the address held in ptr is a multiple of size.
+        return reinterpret_cast<size_t>(ptr) % size == 0;
+    }
+
+    static inline bool isAligned(size_t step, size_t size)
+    { // True when step is a multiple of size.
+        return step % size == 0;
+    }
+}}
+
+namespace cv { namespace cuda
+{
+    namespace device
+    {
+        __host__ __device__ __forceinline__ int divUp(int total, int grain)
+        { // Integer ceiling of total / grain (presumably for non-negative total and positive grain; not checked here).
+            return (total + grain - 1) / grain;
+        }
+        // Binds img (ptr, cols, rows, step) to the texture reference tex with a channel descriptor created for T.
+        template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
+        { // NOTE(review): the texture-reference API used here is reportedly absent from CUDA 12+ toolkits; confirm if this header is ever built with nvcc.
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
+            cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
+        }
+        // Creates a texture object in *tex over img, described as a pitch-2D resource; no resource view is passed (NULL).
+        template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
+        {
+            cudaResourceDesc resDesc;
+            memset(&resDesc, 0, sizeof(resDesc));
+            resDesc.resType = cudaResourceTypePitch2D;
+            resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
+            resDesc.res.pitch2D.height = img.rows;
+            resDesc.res.pitch2D.width = img.cols;
+            resDesc.res.pitch2D.pitchInBytes = img.step;
+            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+
+            cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
+        }
+    }
+}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COMMON_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/datamov_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/datamov_utils.hpp
new file mode 100644
index 0000000..6820d0f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/datamov_utils.hpp
@@ -0,0 +1,113 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DATAMOV_UTILS_HPP
+#define OPENCV_CUDA_DATAMOV_UTILS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
+        // ForceGlob<T>::Load(ptr, offset, val) stores the element ptr[offset] into val.
+        // For Fermi (__CUDA_ARCH__ >= 200) the memory space is detected automatically, so a plain read is used.
+        template <typename T> struct ForceGlob
+        {
+            __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)  { val = ptr[offset];  }
+        };
+
+    #else // __CUDA_ARCH__ >= 200
+        // Otherwise Load is written as inline PTX that issues an explicit ld.global.
+        #if defined(_WIN64) || defined(__LP64__)
+            // 64-bit register modifier for inlined asm
+            #define OPENCV_CUDA_ASM_PTR "l"
+        #else
+            // 32-bit register modifier for inlined asm
+            #define OPENCV_CUDA_ASM_PTR "r"
+        #endif
+        // Primary template is declared only; Load exists just for the types specialized below.
+        template<class T> struct ForceGlob;
+        // Specialization that loads with ld.global.<ptx_type>, binding val through the register constraint reg_mod.
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
+                } \
+            };
+        // Same, but val is bound as a uint through the "r" constraint; used below for the char types.
+        #define OPENCV_CUDA_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_CUDA_ASM_PTR(ptr + offset)); \
+                } \
+            };
+
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(uchar,  u8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(schar,  s8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB_B(char,   b8)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (ushort, u16, h)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (short,  s16, h)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (uint,   u32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (int,    s32, r)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (float,  f32, f)
+            OPENCV_CUDA_DEFINE_FORCE_GLOB  (double, f64, d)
+
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB
+        #undef OPENCV_CUDA_DEFINE_FORCE_GLOB_B
+        #undef OPENCV_CUDA_ASM_PTR
+
+    #endif // __CUDA_ARCH__ >= 200
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_DATAMOV_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/color_detail.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/color_detail.hpp
new file mode 100644
index 0000000..f4b4796
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/color_detail.hpp
@@ -0,0 +1,2018 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_COLOR_DETAIL_HPP
+#define OPENCV_CUDA_COLOR_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+#include "../saturate_cast.hpp"
+#include "../limits.hpp"
+#include "../functional.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Rounding right shift: adds 2^(n-1) to x before shifting it right by n bits.
+    // Defined here only when no earlier header has already provided CV_DESCALE.
+    #ifndef CV_DESCALE
+        #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
+    #endif
+
+    namespace color_detail
+    {
+        // Per-channel range description used by the colour converters in this header:
+        // max() is the full-scale value and half() the mid-scale value of a channel of type T.
+        template <typename T> struct ColorChannel
+        {
+            typedef float worktype_f;
+
+            // Full-scale value: the largest value representable in T.
+            static __device__ __forceinline__ T max()
+            {
+                return numeric_limits<T>::max();
+            }
+
+            // Mid-scale value: max()/2 + 1, converted back to T.
+            static __device__ __forceinline__ T half()
+            {
+                return static_cast<T>(max() / 2 + 1);
+            }
+        };
+
+        // Floating-point channels use the range [0, 1], so the mid-scale value is 0.5.
+        template <> struct ColorChannel<float>
+        {
+            typedef float worktype_f;
+
+            static __device__ __forceinline__ float max()
+            {
+                return 1.f;
+            }
+
+            static __device__ __forceinline__ float half()
+            {
+                return 0.5f;
+            }
+        };
+
+        // Alpha helpers, overloaded on the vector type so the same call works for both
+        // 3- and 4-channel pixels.
+
+        // 3-channel pixels have no alpha component: the value is ignored.
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val)
+        {
+        }
+
+        // 4-channel pixels store alpha in the w component.
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val)
+        {
+            vec.w = val;
+        }
+
+        // A 3-channel pixel reports the channel maximum as its alpha.
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec)
+        {
+            return ColorChannel<T>::max();
+        }
+
+        // A 4-channel pixel reports its w component.
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec)
+        {
+            return vec.w;
+        }
+
+        // Constants for conversion from/to RGB and Gray, YUV, YCrCb according to BT.601.
+        // Names ending in F are floating-point factors; names ending in I are the same
+        // factors as fixed-point integers scaled by 16384 (== 1 << yuv_shift).
+        constexpr float B2YF = 0.114f;
+        constexpr float G2YF = 0.587f;
+        constexpr float R2YF = 0.299f;
+
+        // to YCbCr
+        constexpr float YCBF = 0.564f; // == 1/2/(1-B2YF)
+        constexpr float YCRF = 0.713f; // == 1/2/(1-R2YF)
+        const     int   YCBI = 9241;  // == YCBF*16384
+        const     int   YCRI = 11682; // == YCRF*16384
+        // to YUV
+        constexpr float B2UF = 0.492f;
+        constexpr float R2VF = 0.877f;
+        const     int   B2UI = 8061;  // == B2UF*16384
+        const     int   R2VI = 14369; // == R2VF*16384
+        // from YUV
+        constexpr float U2BF = 2.032f;
+        constexpr float U2GF = -0.395f;
+        constexpr float V2GF = -0.581f;
+        constexpr float V2RF = 1.140f;
+        const     int   U2BI = 33292; // == U2BF*16384
+        const     int   U2GI = -6472; // == U2GF*16384
+        const     int   V2GI = -9519; // == V2GF*16384
+        const     int   V2RI = 18678; // == V2RF*16384
+        // from YCrCb
+        constexpr float CB2BF = 1.773f;
+        constexpr float CB2GF = -0.344f;
+        constexpr float CR2GF = -0.714f;
+        constexpr float CR2RF = 1.403f;
+        const     int   CB2BI = 29049;  // == CB2BF*16384
+        const     int   CB2GI = -5636;  // == CB2GF*16384
+        const     int   CR2GI = -11698; // == CR2GF*16384
+        const     int   CR2RI = 22987;  // == CR2RF*16384
+
+        enum
+        {
+            yuv_shift  = 14,   // fixed-point shift paired with the *I constants and R2Y/G2Y/B2Y
+            xyz_shift  = 12,   // not referenced in this part of the file
+            gray_shift = 15,   // fixed-point shift paired with RY15/GY15/BY15
+            R2Y        = 4899, // == R2YF*16384, rounded
+            G2Y        = 9617, // == G2YF*16384, rounded
+            B2Y        = 1868, // == B2YF*16384, rounded; R2Y + G2Y + B2Y == 16384
+            RY15 =  9798, // == R2YF*32768 + 0.5
+            GY15 = 19235, // == G2YF*32768 + 0.5
+            BY15 =  3735, // ~= B2YF*32768 (3735.55); with this value RY15 + GY15 + BY15 == 32768
+            BLOCK_SIZE = 256 // not referenced in this part of the file
+        };
+    }
+
+////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
+
+    namespace color_detail
+    {
+        // Copies the colour channels of an scn-channel pixel into a dcn-channel pixel:
+        // dst.x comes from source channel bidx, dst.z from source channel bidx^2, and
+        // the green channel (y) stays where it is.
+        template <typename T, int scn, int dcn, int bidx> struct RGB2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type dst_type;
+
+                dst_type result;
+                result.z = (&src.x)[bidx ^ 2];
+                result.y = src.y;
+                result.x = (&src.x)[bidx];
+
+                // Forward the source alpha; a 3-channel source yields the channel maximum and
+                // a 3-channel destination ignores the value.
+                setAlpha(result, getAlpha<T>(src));
+
+                return result;
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
+        };
+
+        // Packed 8-bit, 4-channel specialization with bidx == 2: exchanges byte 0 and byte 2
+        // of the 32-bit pixel and leaves byte 1 and byte 3 in place.
+        template <> struct RGB2RGB<uchar, 4, 4, 2> : unary_function<uint, uint>
+        {
+            __device__ uint operator()(uint src) const
+            {
+                const uint untouched  = src & 0xff00ff00u;     // bytes 1 and 3
+                const uint byte0_up   = (src & 0xffu) << 16;   // byte 0 -> byte 2
+                const uint byte2_down = (src >> 16) & 0xffu;   // byte 2 -> byte 0
+
+                return untouched | byte0_up | byte2_down;
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB() {}
+            __host__ __device__ __forceinline__ RGB2RGB(const RGB2RGB&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::RGB2RGB<T, scn, dcn, bidx> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
+
+    namespace color_detail
+    {
+        // Packs 8-bit colour channels into one 16-bit pixel; specialized on the green bit count.
+        template <int green_bits, int bidx> struct RGB2RGB5x5Converter;
+
+        // 565 packing: bits 0-4 take the top five bits of channel bidx, bits 5-10 the top six
+        // bits of green and bits 11-15 the top five bits of channel bidx^2.
+        template <int bidx> struct RGB2RGB5x5Converter<6, bidx>
+        {
+            static __device__ __forceinline__ ushort cvt(const uchar3& src)
+            {
+                const int low   = (&src.x)[bidx];
+                const int green = src.y;
+                const int high  = (&src.x)[bidx ^ 2];
+
+                return (ushort)((low >> 3) | ((green >> 2) << 5) | ((high >> 3) << 11));
+            }
+
+            // Same packing for a pixel stored as one 32-bit word; the top byte is not used.
+            static __device__ __forceinline__ ushort cvt(uint src)
+            {
+                const uint low   = (src >> (bidx * 8)) & 0xffu;
+                const uint green = (src >> 8) & 0xffu;
+                const uint high  = (src >> ((bidx ^ 2) * 8)) & 0xffu;
+
+                return (ushort)((low >> 3) | ((green >> 2) << 5) | ((high >> 3) << 11));
+            }
+        };
+
+        // 555 packing: bits 0-4 take the top five bits of channel bidx, bits 5-9 the top five
+        // bits of green and bits 10-14 the top five bits of channel bidx^2. The 32-bit overload
+        // additionally stores an alpha flag in bit 15.
+        template<int bidx> struct RGB2RGB5x5Converter<5, bidx>
+        {
+            // 3-channel source: bit 15 is left clear.
+            static __device__ __forceinline__ ushort cvt(const uchar3& src)
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));
+            }
+
+            // 4-channel source packed in one 32-bit word; alpha is the top byte.
+            static __device__ __forceinline__ ushort cvt(uint src)
+            {
+                uint b = 0xffu & (src >> (bidx * 8));
+                uint g = 0xffu & (src >> 8);
+                uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+                uint a = 0xffu & (src >> 24);
+                // Any non-zero alpha sets bit 15. The flag is selected with ?: rather than
+                // computed as a * 0x8000: after truncation to ushort that product keeps
+                // bit 15 only for odd alpha values, so e.g. alpha 254 would read as 0.
+                return (ushort)((b >> 3) | ((g & ~7) << 2) | ((r & ~7) << 7) | (a ? 0x8000u : 0u));
+            }
+        };
+
+        // Functor wrappers around RGB2RGB5x5Converter, selected by source channel count.
+        template <int scn, int bidx, int green_bits> struct RGB2RGB5x5;
+
+        // 3-channel source: takes a uchar3 pixel.
+        template <int bidx, int green_bits> struct RGB2RGB5x5<3, bidx, green_bits> : unary_function<uchar3, ushort>
+        {
+            __device__ __forceinline__ ushort operator()(const uchar3& src) const
+            {
+                typedef RGB2RGB5x5Converter<green_bits, bidx> converter_type;
+                return converter_type::cvt(src);
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
+        };
+
+        // 4-channel source: takes the pixel packed in one 32-bit word.
+        template <int bidx, int green_bits> struct RGB2RGB5x5<4, bidx, green_bits> : unary_function<uint, ushort>
+        {
+            __device__ __forceinline__ ushort operator()(uint src) const
+            {
+                typedef RGB2RGB5x5Converter<green_bits, bidx> converter_type;
+                return converter_type::cvt(src);
+            }
+
+            __host__ __device__ __forceinline__ RGB2RGB5x5() {}
+            __host__ __device__ __forceinline__ RGB2RGB5x5(const RGB2RGB5x5&) {}
+        };
+    }
+
+// Declares name##_traits (not a template): functor_type is
+// color_detail::RGB2RGB5x5<scn, bidx, green_bits> and create_functor() returns a
+// default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2RGB5x5<scn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // Unpacks one 16-bit pixel into 8-bit colour channels; specialized on the green bit count.
+        template <int green_bits, int bidx> struct RGB5x52RGBConverter;
+
+        // 555 unpacking: bits 0-4 go to channel bidx, bits 5-9 to green and bits 10-14 to
+        // channel bidx^2, each placed in the top five bits of its byte. Bit 15 is the alpha flag.
+        template <int bidx> struct RGB5x52RGBConverter<5, bidx>
+        {
+            // 3-channel destination: the alpha flag is ignored. The assignments rely on the
+            // implicit narrowing to uchar to drop everything above the low byte.
+            static __device__ __forceinline__ void cvt(uint src, uchar3& dst)
+            {
+                (&dst.x)[bidx] = src << 3;
+                dst.y = (src >> 2) & ~7;
+                (&dst.x)[bidx ^ 2] = (src >> 7) & ~7;
+            }
+
+            // 4-channel destination packed in one 32-bit word; alpha is the top byte.
+            static __device__ __forceinline__ void cvt(uint src, uint& dst)
+            {
+                dst = 0;
+
+                dst |= (0xffu & (src << 3)) << (bidx * 8);
+                dst |= (0xffu & ((src >> 2) & ~7)) << 8;
+                dst |= (0xffu & ((src >> 7) & ~7)) << ((bidx ^ 2) * 8);
+                // Alpha is 0xff when bit 15 is set and 0 otherwise. It is selected with ?:
+                // because ((src & 0x8000) * 0xffu) << 24 moves every set bit beyond bit 31
+                // of the 32-bit result, which would leave alpha at 0 for every input.
+                dst |= ((src & 0x8000u) ? 0xffu : 0u) << 24;
+            }
+        };
+
+        // 565 unpacking: bits 0-4 go to channel bidx, bits 5-10 to green and bits 11-15 to
+        // channel bidx^2, each placed in the top bits of its byte.
+        template <int bidx> struct RGB5x52RGBConverter<6, bidx>
+        {
+            // 3-channel destination; the casts keep only the low byte of each term.
+            static __device__ __forceinline__ void cvt(uint src, uchar3& dst)
+            {
+                const uchar low   = (uchar)(src << 3);
+                const uchar green = (uchar)((src >> 3) & ~3);
+                const uchar high  = (uchar)((src >> 8) & ~7);
+
+                (&dst.x)[bidx] = low;
+                dst.y = green;
+                (&dst.x)[bidx ^ 2] = high;
+            }
+
+            // 4-channel destination packed in one 32-bit word; alpha is always 0xff.
+            static __device__ __forceinline__ void cvt(uint src, uint& dst)
+            {
+                const uint low   = (src << 3) & 0xffu;
+                const uint green = (src >> 3) & 0xfcu;
+                const uint high  = (src >> 8) & 0xf8u;
+
+                dst = (0xffu << 24)
+                    | (low << (bidx * 8))
+                    | (green << 8)
+                    | (high << ((bidx ^ 2) * 8));
+            }
+        };
+
+        // Functor wrappers around RGB5x52RGBConverter, selected by destination channel count.
+        template <int dcn, int bidx, int green_bits> struct RGB5x52RGB;
+
+        // 3-channel destination: produces a uchar3 pixel.
+        template <int bidx, int green_bits> struct RGB5x52RGB<3, bidx, green_bits> : unary_function<ushort, uchar3>
+        {
+            __device__ __forceinline__ uchar3 operator()(ushort src) const
+            {
+                typedef RGB5x52RGBConverter<green_bits, bidx> converter_type;
+
+                uchar3 unpacked;
+                converter_type::cvt(src, unpacked);
+                return unpacked;
+            }
+
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
+        };
+
+        // 4-channel destination: produces the pixel packed in one 32-bit word.
+        template <int bidx, int green_bits> struct RGB5x52RGB<4, bidx, green_bits> : unary_function<ushort, uint>
+        {
+            __device__ __forceinline__ uint operator()(ushort src) const
+            {
+                typedef RGB5x52RGBConverter<green_bits, bidx> converter_type;
+
+                uint unpacked;
+                converter_type::cvt(src, unpacked);
+                return unpacked;
+            }
+
+            __host__ __device__ __forceinline__ RGB5x52RGB() {}
+            __host__ __device__ __forceinline__ RGB5x52RGB(const RGB5x52RGB&) {}
+        };
+    }
+
+// Declares name##_traits (not a template): functor_type is
+// color_detail::RGB5x52RGB<dcn, bidx, green_bits> and create_functor() returns a
+// default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB5x52RGB<dcn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////// Grayscale to Color ////////////////////////////////
+
+    namespace color_detail
+    {
+        // Expands one gray value into a dcn-channel pixel by replicating it into x, y and z;
+        // a 4-channel destination gets the channel maximum as alpha.
+        template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type dst_type;
+
+                dst_type pixel;
+                pixel.x = src;
+                pixel.y = src;
+                pixel.z = src;
+                setAlpha(pixel, ColorChannel<T>::max());
+
+                return pixel;
+            }
+
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
+        };
+
+        // Packed 8-bit, 4-channel specialization: ORs the gray value into the three low bytes
+        // of a 32-bit word whose top (alpha) byte is 0xff.
+        template <> struct Gray2RGB<uchar, 4> : unary_function<uchar, uint>
+        {
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint opaque = 0xffu << 24;
+                return opaque | (src << 16) | (src << 8) | src;
+            }
+
+            __host__ __device__ __forceinline__ Gray2RGB() {}
+            __host__ __device__ __forceinline__ Gray2RGB(const Gray2RGB&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::Gray2RGB<T, dcn> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB<T, dcn> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // Packs one gray value into a 16-bit colour pixel; specialized on the green bit count.
+        template <int green_bits> struct Gray2RGB5x5Converter;
+
+        // 565: the gray value is reduced to 5, 6 and 5 bits and placed at bits 0, 5 and 11.
+        template <> struct Gray2RGB5x5Converter<6>
+        {
+            static __device__ __forceinline__ ushort cvt(uint t)
+            {
+                const uint low   = t >> 3;
+                const uint green = (t >> 2) << 5;
+                const uint high  = (t >> 3) << 11;
+
+                return (ushort)(low | green | high);
+            }
+        };
+
+        // 555: the gray value is reduced by three bits and replicated at bits 0, 5 and 10.
+        template <> struct Gray2RGB5x5Converter<5>
+        {
+            static __device__ __forceinline__ ushort cvt(uint t)
+            {
+                const uint level = t >> 3;
+                return (ushort)(level | (level << 5) | (level << 10));
+            }
+        };
+
+        // Functor wrapper that forwards to Gray2RGB5x5Converter<green_bits>::cvt.
+        template <int green_bits> struct Gray2RGB5x5 : unary_function<uchar, ushort>
+        {
+            __device__ __forceinline__ ushort operator()(uint src) const
+            {
+                typedef Gray2RGB5x5Converter<green_bits> converter_type;
+                return converter_type::cvt(src);
+            }
+
+            __host__ __device__ __forceinline__ Gray2RGB5x5() {}
+            __host__ __device__ __forceinline__ Gray2RGB5x5(const Gray2RGB5x5&) {}
+        };
+    }
+
+// Declares name##_traits (not a template): functor_type is
+// color_detail::Gray2RGB5x5<green_bits> and create_functor() returns a
+// default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Gray2RGB5x5<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////// Color to Grayscale ////////////////////////////////
+
+    namespace color_detail
+    {
+        // Converts one packed 16-bit colour pixel to an 8-bit gray value using the
+        // BY15/GY15/RY15 weights and a rounding shift by gray_shift.
+        template <int green_bits> struct RGB5x52GrayConverter;
+
+        // 565 layout: fields at bits 0-4, 5-10 and 11-15.
+        template <> struct RGB5x52GrayConverter<6>
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                const uint low   = (t << 3) & 0xf8;
+                const uint green = (t >> 3) & 0xfc;
+                const uint high  = (t >> 8) & 0xf8;
+
+                return (uchar)CV_DESCALE(low * BY15 + green * GY15 + high * RY15, gray_shift);
+            }
+        };
+
+        // 555 layout: fields at bits 0-4, 5-9 and 10-14; bit 15 does not contribute.
+        template <> struct RGB5x52GrayConverter<5>
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                const uint low   = (t << 3) & 0xf8;
+                const uint green = (t >> 2) & 0xf8;
+                const uint high  = (t >> 7) & 0xf8;
+
+                return (uchar)CV_DESCALE(low * BY15 + green * GY15 + high * RY15, gray_shift);
+            }
+        };
+
+        // Functor wrapper that forwards to RGB5x52GrayConverter<green_bits>::cvt.
+        template <int green_bits> struct RGB5x52Gray : unary_function<ushort, uchar>
+        {
+            __device__ __forceinline__ uchar operator()(uint src) const
+            {
+                typedef RGB5x52GrayConverter<green_bits> converter_type;
+                return converter_type::cvt(src);
+            }
+
+            __host__ __device__ __forceinline__ RGB5x52Gray() {}
+            __host__ __device__ __forceinline__ RGB5x52Gray(const RGB5x52Gray&) {}
+        };
+    }
+
+// Declares name##_traits (not a template): functor_type is
+// color_detail::RGB5x52Gray<green_bits> and create_functor() returns a
+// default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \
+    struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB5x52Gray<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // Integer gray conversion from a channel array: weights src[bidx], src[1] and
+        // src[bidx^2] by BY15, GY15 and RY15, then applies a rounding shift by gray_shift.
+        // The sum is cast to unsigned before CV_DESCALE, so the shift is performed on an
+        // unsigned value.
+        template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)
+        {
+            return (T)CV_DESCALE((unsigned)(src[bidx] * BY15 + src[1] * GY15 + src[bidx^2] * RY15), gray_shift);
+        }
+
+        // Same conversion for an 8-bit pixel packed in one 32-bit word: the channels are the
+        // bytes at positions bidx, 1 and bidx^2; the top byte is not used.
+        template <int bidx> static __device__ __forceinline__ uchar RGB2GrayConvert(uint src)
+        {
+            uint b = 0xffu & (src >> (bidx * 8));
+            uint g = 0xffu & (src >> 8);
+            uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+            return CV_DESCALE((uint)(b * BY15 + g * GY15 + r * RY15), gray_shift);
+        }
+
+        // Floating-point conversion: plain weighted sum with the B2YF/G2YF/R2YF factors.
+        template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src)
+        {
+            return src[bidx] * B2YF + src[1] * G2YF + src[bidx^2] * R2YF;
+        }
+
+        // Functor converting one scn-channel pixel to a gray value of type T by passing the
+        // address of its first channel to RGB2GrayConvert<bidx>.
+        template <typename T, int scn, int bidx> struct RGB2Gray : unary_function<typename TypeVec<T, scn>::vec_type, T>
+        {
+            __device__ __forceinline__ T operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                const T gray = RGB2GrayConvert<bidx>(&src.x);
+                return gray;
+            }
+
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
+        };
+
+        // 8-bit, 4-channel specialization: the pixel arrives packed in one 32-bit word.
+        template <int bidx> struct RGB2Gray<uchar, 4, bidx> : unary_function<uint, uchar>
+        {
+            __device__ __forceinline__ uchar operator()(uint src) const
+            {
+                const uchar gray = RGB2GrayConvert<bidx>(src);
+                return gray;
+            }
+
+            __host__ __device__ __forceinline__ RGB2Gray() {}
+            __host__ __device__ __forceinline__ RGB2Gray(const RGB2Gray&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::RGB2Gray<T, scn, bidx> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2Gray<T, scn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
+
+    namespace color_detail
+    {
+        // RGB -> YUV coefficient tables: entries [0..2] are the luma weights in B, G, R order,
+        // [3] is the B2U factor and [4] the R2V factor. The _i table holds the same factors as
+        // fixed-point integers scaled by 1 << yuv_shift.
+        __constant__ float c_RGB2YUVCoeffs_f[5] = { B2YF, G2YF, R2YF, B2UF, R2VF };
+        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, B2UI, R2VI };
+
+        // Integer path. Y is the weighted sum of the three source channels; the two chroma
+        // terms are (channel - Y) * factor plus delta, where delta is the mid-scale value of T
+        // in fixed point. All three results are saturated to T and written to dst.x/y/z.
+        // NOTE(review): src[0] is weighted by table entry [bidx^2] and src[2] by entry [bidx],
+        // and the locals are named Cr/Cb although they use the B2U ([3]) and R2V ([4])
+        // factors - confirm the bidx convention the callers use for this converter.
+        template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst)
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift);
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift);
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift);
+
+            dst.x = saturate_cast<T>(Y);
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+
+        // Floating-point path: same formulas with the _f table and an offset of
+        // ColorChannel<float>::half(); no saturation is applied.
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst)
+        {
+            dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();
+            dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        // Functor converting one scn-channel RGB pixel to a dcn-channel YUV pixel via
+        // RGB2YUVConvert<bidx>. Only x, y and z of the result are written; for dcn == 4
+        // the w component is not assigned here.
+        template <typename T, int scn, int dcn, int bidx> struct RGB2YUV
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type dst_type;
+
+                dst_type yuv;
+                RGB2YUVConvert<bidx>(&src.x, yuv);
+
+                return yuv;
+            }
+
+            __host__ __device__ __forceinline__ RGB2YUV() {}
+            __host__ __device__ __forceinline__ RGB2YUV(const RGB2YUV&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::RGB2YUV<T, scn, dcn, bidx> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2YUV<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // YUV -> RGB coefficient tables in the order U2B, U2G, V2G, V2R. The _i table holds
+        // the factors as fixed-point integers scaled by 1 << yuv_shift. Both arrays are
+        // declared with five elements but only four are listed, so the last one is zero; it
+        // is not read by the converters below.
+        __constant__ float c_YUV2RGBCoeffs_f[5] = { U2BF, U2GF, V2GF, V2RF };
+        __constant__ int   c_YUV2RGBCoeffs_i[5] = { U2BI, U2GI, V2GI, V2RI };
+
+        // Integer path: each output channel is src.x plus a rounded fixed-point combination of
+        // the chroma components (src.y, src.z) after subtracting the mid-scale value of D.
+        // The results are saturated to D and stored at dst[bidx], dst[1] and dst[bidx^2].
+        // NOTE(review): the value named b is built from src.z with table entry [3] (V2R) and
+        // the value named r from src.y with entry [0] (U2B) - confirm the bidx convention
+        // the callers use for this converter.
+        template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst)
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);
+
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2]
+                                             + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);
+
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b);
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+
+        // Packed 8-bit path: the three low bytes of src are the input components; the result
+        // is a 32-bit word with the saturated channels at byte positions bidx, 1 and bidx^2
+        // and 0xff in the top byte.
+        template <int bidx> static __device__ uint YUV2RGBConvert(uint src)
+        {
+            const int x = 0xff & (src);
+            const int y = 0xff & (src >> 8);
+            const int z = 0xff & (src >> 16);
+
+            const int b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift);
+
+            const int g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[2]
+                                         + (y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);
+
+            const int r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);
+
+            uint dst = 0xffu << 24;
+
+            dst |= saturate_cast<uchar>(b) << (bidx * 8);
+            dst |= saturate_cast<uchar>(g) << 8;
+            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);
+
+            return dst;
+        }
+
+        // Floating-point path: same formulas with the _f table; no rounding or saturation.
+        template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst)
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];
+
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2]
+                     + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];
+
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];
+        }
+
+        // Functor converting one scn-channel YUV pixel to a dcn-channel RGB pixel via
+        // YUV2RGBConvert<bidx>; a 4-channel result gets the channel maximum as alpha.
+        template <typename T, int scn, int dcn, int bidx> struct YUV2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type dst_type;
+
+                dst_type rgb;
+                YUV2RGBConvert<bidx>(src, &rgb.x);
+                setAlpha(rgb, ColorChannel<T>::max());
+
+                return rgb;
+            }
+
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
+        };
+
+        // 8-bit, 4-channel specialization: source and result are pixels packed in 32-bit words.
+        template <int bidx> struct YUV2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __device__ __forceinline__ uint operator ()(uint src) const
+            {
+                const uint packed = YUV2RGBConvert<bidx>(src);
+                return packed;
+            }
+
+            __host__ __device__ __forceinline__ YUV2RGB() {}
+            __host__ __device__ __forceinline__ YUV2RGB(const YUV2RGB&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::YUV2RGB<T, scn, dcn, bidx> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::YUV2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
+
+    namespace color_detail
+    {
+        // RGB -> YCrCb coefficient tables: entries [0..2] are the luma weights in R, G, B
+        // order, [3] is the Cr factor and [4] the Cb factor. The _i table holds the same
+        // factors as fixed-point integers scaled by 1 << yuv_shift.
+        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {R2YF, G2YF, B2YF, YCRF, YCBF};
+        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, YCRI, YCBI};
+
+        // Integer path. Y is the weighted sum of the three source channels (src[0] uses table
+        // entry [bidx^2], src[2] uses entry [bidx]); Cr is derived from src[bidx^2] and Cb from
+        // src[bidx], each offset by delta, the mid-scale value of T in fixed point. The
+        // results are saturated to T and written to dst.x/y/z.
+        template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst)
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift);
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);
+
+            dst.x = saturate_cast<T>(Y);
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+
+        // Packed 8-bit path: the channels are the three low bytes of src, and the result packs
+        // Y, Cr and Cb into bytes 0, 1 and 2 with the top byte left at 0. The extracted bytes
+        // are unsigned, so the differences against Y are evaluated in unsigned (modular)
+        // arithmetic before being stored in an int.
+        template <int bidx> static __device__ uint RGB2YCrCbConvert(uint src)
+        {
+            const int delta = ColorChannel<uchar>::half() * (1 << yuv_shift);
+
+            const int Y = CV_DESCALE((0xffu & src) * c_RGB2YCrCbCoeffs_i[bidx^2] + (0xffu & (src >> 8)) * c_RGB2YCrCbCoeffs_i[1] + (0xffu & (src >> 16)) * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift);
+            const int Cr = CV_DESCALE(((0xffu & (src >> ((bidx ^ 2) * 8))) - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);
+            const int Cb = CV_DESCALE(((0xffu & (src >> (bidx * 8))) - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);
+
+            uint dst = 0;
+
+            dst |= saturate_cast<uchar>(Y);
+            dst |= saturate_cast<uchar>(Cr) << 8;
+            dst |= saturate_cast<uchar>(Cb) << 16;
+
+            return dst;
+        }
+
+        // Floating-point path: same formulas with the _f table and an offset of
+        // ColorChannel<float>::half(); no saturation is applied.
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst)
+        {
+            dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half();
+            dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        // Functor converting one scn-channel RGB pixel to a dcn-channel YCrCb pixel via
+        // RGB2YCrCbConvert<bidx>. Only x, y and z of the result are written; for dcn == 4
+        // the w component is not assigned here.
+        template <typename T, int scn, int dcn, int bidx> struct RGB2YCrCb
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<T, dcn>::vec_type dst_type;
+
+                dst_type ycrcb;
+                RGB2YCrCbConvert<bidx>(&src.x, ycrcb);
+
+                return ycrcb;
+            }
+
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
+        };
+
+        // 8-bit, 4-channel specialization: source and result are pixels packed in 32-bit words.
+        template <int bidx> struct RGB2YCrCb<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __device__ __forceinline__ uint operator ()(uint src) const
+            {
+                const uint packed = RGB2YCrCbConvert<bidx>(src);
+                return packed;
+            }
+
+            __host__ __device__ __forceinline__ RGB2YCrCb() {}
+            __host__ __device__ __forceinline__ RGB2YCrCb(const RGB2YCrCb&) {}
+        };
+    }
+
+// Declares name##_traits<T>: functor_type is color_detail::RGB2YCrCb<T, scn, dcn, bidx> and
+// create_functor() returns a default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2YCrCb<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // YCrCb -> RGB coefficient tables in the order CR2R, CR2G, CB2G, CB2B. The _i table
+        // holds the factors as fixed-point integers scaled by 1 << yuv_shift. Both arrays are
+        // declared with five elements but only four are listed, so the last one is zero; it
+        // is not read by the converters below.
+        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {CR2RF, CR2GF, CB2GF, CB2BF};
+        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {CR2RI, CR2GI, CB2GI, CB2BI};
+
+        // Integer path: b is built from src.z with the CB2B factor, r from src.y with the CR2R
+        // factor and g from both, each after subtracting the mid-scale value of D and applying
+        // a rounding shift by yuv_shift. The results are saturated to D and stored at
+        // dst[bidx], dst[1] and dst[bidx^2].
+        template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst)
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b);
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+
+        // Packed 8-bit path: the three low bytes of src are the input components; the result
+        // is a 32-bit word with the saturated channels at byte positions bidx, 1 and bidx^2
+        // and 0xff in the top byte.
+        template <int bidx> static __device__ uint YCrCb2RGBConvert(uint src)
+        {
+            const int x = 0xff & (src);
+            const int y = 0xff & (src >> 8);
+            const int z = 0xff & (src >> 16);
+
+            const int b = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift);
+            const int g = x + CV_DESCALE((z - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[2] + (y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);
+            const int r = x + CV_DESCALE((y - ColorChannel<uchar>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);
+
+            uint dst = 0xffu << 24;
+
+            dst |= saturate_cast<uchar>(b) << (bidx * 8);
+            dst |= saturate_cast<uchar>(g) << 8;
+            dst |= saturate_cast<uchar>(r) << ((bidx ^ 2) * 8);
+
+            return dst;
+        }
+
+        // Floating-point YCrCb -> RGB conversion.
+        // Each output is src.x plus the chroma terms (src.y / src.z minus
+        // ColorChannel<float>::half()) weighted by the float coefficient table.
+        // Writes dst[bidx] (coefficient [3]), dst[1] (coefficients [2] and [1])
+        // and dst[bidx^2] (coefficient [0]); no saturation is applied.
+        // NOTE(review): src is re-read after each store, so the statement order
+        // matters if a caller ever passes aliasing src/dst — confirm before reordering.
+        template <int bidx, typename T> __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst)
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];
+        }
+
+        // Per-pixel functor: takes a TypeVec<T, scn> pixel, runs YCrCb2RGBConvert
+        // with the given bidx into a TypeVec<T, dcn> pixel, then calls setAlpha
+        // on the result with ColorChannel<T>::max().
+        template <typename T, int scn, int dcn, int bidx> struct YCrCb2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator ()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+
+                YCrCb2RGBConvert<bidx>(src, &rgb.x);
+                setAlpha(rgb, ColorChannel<T>::max());
+
+                return rgb;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx> struct YCrCb2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ YCrCb2RGB() {}
+            __host__ __device__ __forceinline__ YCrCb2RGB(const YCrCb2RGB&) {}
+
+            __device__ __forceinline__ uint operator ()(uint src) const
+            {
+                const uint packed = YCrCb2RGBConvert<bidx>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates `name##_traits<T>`: a traits struct whose `functor_type` is
+// color_detail::YCrCb2RGB<T, scn, dcn, bidx> and whose static
+// create_functor() returns a default-constructed functor of that type.
+#define OPENCV_CUDA_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::YCrCb2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
+
+    namespace color_detail
+    {
+        // RGB -> XYZ 3x3 matrix stored row-major: elements [0..2] produce X,
+        // [3..5] produce Y and [6..8] produce Z; within a row the weights apply
+        // to red, green, blue in that order (see the converters below).
+        // The integer table holds the float values scaled by 4096 and rounded,
+        // for use with CV_DESCALE(..., xyz_shift).
+        __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };
+        __constant__ int   c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };
+
+        // Fixed-point RGB -> XYZ for integer channel types.
+        // src[bidx^2], src[1] and src[bidx] are weighted with one row of the
+        // integer matrix per output, descaled by xyz_shift and saturated to T.
+        // NOTE(review): dst.z is stored before dst.x / dst.y while src is still
+        // being read; keep this order unless src/dst aliasing is ruled out.
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)
+        {
+            dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));
+            dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));
+            dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));
+        }
+
+        // Packed 8-bit RGB -> XYZ. Blue is taken from byte bidx, green from
+        // byte 1 and red from byte (bidx ^ 2) of src. The result holds X, Y, Z
+        // in bytes 0, 1, 2 and leaves the top byte zero.
+        template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)
+        {
+            const uint r = 0xffu & (src >> ((bidx ^ 2) * 8));
+            const uint g = 0xffu & (src >> 8);
+            const uint b = 0xffu & (src >> (bidx * 8));
+
+            // One matrix row per output component, descaled and saturated to 8 bits.
+            const uint x = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[0] + g * c_RGB2XYZ_D65i[1] + b * c_RGB2XYZ_D65i[2], xyz_shift));
+            const uint y = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[3] + g * c_RGB2XYZ_D65i[4] + b * c_RGB2XYZ_D65i[5], xyz_shift));
+            const uint z = saturate_cast<uchar>(CV_DESCALE(r * c_RGB2XYZ_D65i[6] + g * c_RGB2XYZ_D65i[7] + b * c_RGB2XYZ_D65i[8], xyz_shift));
+
+            return x | (y << 8) | (z << 16);
+        }
+
+        // Floating-point RGB -> XYZ: each of dst.x, dst.y, dst.z is the dot
+        // product of (src[bidx^2], src[1], src[bidx]) with one row of the float
+        // matrix. No clamping is applied.
+        // NOTE(review): src is re-read after each store to dst; keep the order
+        // unless src/dst aliasing is ruled out.
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)
+        {
+            dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];
+            dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];
+            dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];
+        }
+
+        // Per-pixel functor: converts a TypeVec<T, scn> pixel to a
+        // TypeVec<T, dcn> pixel via RGB2XYZConvert<bidx>. No alpha is written.
+        template <typename T, int scn, int dcn, int bidx> struct RGB2XYZ
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type xyz;
+
+                RGB2XYZConvert<bidx>(&src.x, xyz);
+
+                return xyz;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx> struct RGB2XYZ<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2XYZ() {}
+            __host__ __device__ __forceinline__ RGB2XYZ(const RGB2XYZ&) {}
+
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint packed = RGB2XYZConvert<bidx>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates `name##_traits<T>`: a traits struct whose `functor_type` is
+// color_detail::RGB2XYZ<T, scn, dcn, bidx> and whose static
+// create_functor() returns a default-constructed functor of that type.
+#define OPENCV_CUDA_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2XYZ<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // XYZ -> RGB 3x3 matrix stored row-major: elements [0..2] produce red,
+        // [3..5] green and [6..8] blue from (X, Y, Z) (see the converters below).
+        // The integer table holds the float values scaled by 4096 and rounded,
+        // for use with CV_DESCALE(..., xyz_shift).
+        __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };
+        __constant__ int   c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };
+
+        // Fixed-point XYZ -> RGB for integer channel types.
+        // (src.x, src.y, src.z) is multiplied by one row of the integer matrix
+        // per output, descaled by xyz_shift and saturated to D. Rows 0, 1, 2 go
+        // to dst[bidx^2], dst[1] and dst[bidx] respectively.
+        // NOTE(review): src is re-read after each store to dst; keep the order
+        // unless src/dst aliasing is ruled out.
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)
+        {
+            dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift));
+            dst[1]      = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));
+            dst[bidx]   = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));
+        }
+
+        // Packed 8-bit XYZ -> RGB. X, Y, Z are read from bytes 0, 1, 2 of src.
+        // The result carries blue at byte bidx, green at byte 1, red at byte
+        // (bidx ^ 2) and 0xff in the top byte.
+        template <int bidx> static __device__ __forceinline__ uint XYZ2RGBConvert(uint src)
+        {
+            const int x = 0xff & src;
+            const int y = 0xff & (src >> 8);
+            const int z = 0xff & (src >> 16);
+
+            // One matrix row per output channel, descaled and saturated to 8 bits.
+            const uint red   = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[0] + y * c_XYZ2sRGB_D65i[1] + z * c_XYZ2sRGB_D65i[2], xyz_shift));
+            const uint green = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[3] + y * c_XYZ2sRGB_D65i[4] + z * c_XYZ2sRGB_D65i[5], xyz_shift));
+            const uint blue  = saturate_cast<uchar>(CV_DESCALE(x * c_XYZ2sRGB_D65i[6] + y * c_XYZ2sRGB_D65i[7] + z * c_XYZ2sRGB_D65i[8], xyz_shift));
+
+            return (0xffu << 24) | (blue << (bidx * 8)) | (green << 8) | (red << ((bidx ^ 2) * 8));
+        }
+
+        // Floating-point XYZ -> RGB: rows 0, 1, 2 of the float matrix applied to
+        // (src.x, src.y, src.z) are written to dst[bidx^2], dst[1] and dst[bidx].
+        // No clamping is applied.
+        // NOTE(review): src is re-read after each store to dst; keep the order
+        // unless src/dst aliasing is ruled out.
+        template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)
+        {
+            dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];
+            dst[1]      = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];
+            dst[bidx]   = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];
+        }
+
+        // Per-pixel functor: converts a TypeVec<T, scn> pixel to a
+        // TypeVec<T, dcn> pixel via XYZ2RGBConvert<bidx>, then calls setAlpha
+        // on the result with ColorChannel<T>::max().
+        template <typename T, int scn, int dcn, int bidx> struct XYZ2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+
+                XYZ2RGBConvert<bidx>(src, &rgb.x);
+                setAlpha(rgb, ColorChannel<T>::max());
+
+                return rgb;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx> struct XYZ2RGB<uchar, 4, 4, bidx> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ XYZ2RGB() {}
+            __host__ __device__ __forceinline__ XYZ2RGB(const XYZ2RGB&) {}
+
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint packed = XYZ2RGBConvert<bidx>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates `name##_traits<T>`: a traits struct whose `functor_type` is
+// color_detail::XYZ2RGB<T, scn, dcn, bidx> and whose static
+// create_functor() returns a default-constructed functor of that type.
+#define OPENCV_CUDA_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::XYZ2RGB<T, scn, dcn, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ int c_HsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};
+        __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};
+        __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};
+
+        // Integer RGB -> HSV for 8-bit channels, written without branches.
+        // src[bidx] is blue, src[1] green, src[bidx^2] red. hr is the hue range:
+        // the 180 division table is used when hr == 180, the 256 table otherwise.
+        // Writes hue to dst.x, saturation to dst.y and value (max channel) to dst.z.
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst)
+        {
+            // Number of fractional bits used by the division tables.
+            const int hsv_shift = 12;
+            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;
+
+            int b = src[bidx], g = src[1], r = src[bidx^2];
+            int h, s, v = b;
+            int vmin = b, diff;
+            int vr, vg;
+
+            // v = max(b, g, r), vmin = min(b, g, r).
+            v = ::max(v, g);
+            v = ::max(v, r);
+            vmin = ::min(vmin, g);
+            vmin = ::min(vmin, r);
+
+            diff = v - vmin;
+            // All-ones masks (-1) when the maximum equals r / g, zero otherwise.
+            vr = (v == r) * -1;
+            vg = (v == g) * -1;
+
+            // Saturation: diff scaled by the table entry for v, rounded to nearest.
+            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;
+            // Select the hue numerator with the masks: (g - b) when r is the max,
+            // else (b - r + 2*diff) when g is the max, else (r - g + 4*diff).
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
+            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+            // Wrap negative hues into the [0, hr) range.
+            h += (h < 0) * hr;
+
+            dst.x = saturate_cast<uchar>(h);
+            dst.y = (uchar)s;
+            dst.z = (uchar)v;
+        }
+
+        // Packed 8-bit RGB -> HSV, written without branches.
+        // Blue is read from byte bidx, green from byte 1, red from byte (bidx ^ 2).
+        // The result holds hue, saturation, value in bytes 0, 1, 2; the top byte
+        // is left zero. hr selects the division table as in the uchar* overload.
+        template <int bidx, int hr> static __device__ uint RGB2HSVConvert(uint src)
+        {
+            // Number of fractional bits used by the division tables.
+            const int hsv_shift = 12;
+            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256;
+
+            const int b = 0xff & (src >> (bidx * 8));
+            const int g = 0xff & (src >> 8);
+            const int r = 0xff & (src >> ((bidx ^ 2) * 8));
+
+            int h, s, v = b;
+            int vmin = b, diff;
+            int vr, vg;
+
+            // v = max(b, g, r), vmin = min(b, g, r).
+            v = ::max(v, g);
+            v = ::max(v, r);
+            vmin = ::min(vmin, g);
+            vmin = ::min(vmin, r);
+
+            diff = v - vmin;
+            // All-ones masks (-1) when the maximum equals r / g, zero otherwise.
+            vr = (v == r) * -1;
+            vg = (v == g) * -1;
+
+            // Saturation: diff scaled by the table entry for v, rounded to nearest.
+            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;
+            // Select the hue numerator with the masks: (g - b) when r is the max,
+            // else (b - r + 2*diff) when g is the max, else (r - g + 4*diff).
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
+            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
+            // Wrap negative hues into the [0, hr) range.
+            h += (h < 0) * hr;
+
+            uint dst = 0;
+
+            dst |= saturate_cast<uchar>(h);
+            dst |= (0xffu & s) << 8;
+            dst |= (0xffu & v) << 16;
+
+            return dst;
+        }
+
+        // Floating-point RGB -> HSV.
+        // src[bidx] is blue, src[1] green, src[bidx^2] red. The hue is computed
+        // in degrees and scaled by hr / 360 before being written to dst.x;
+        // saturation goes to dst.y and the maximum channel to dst.z.
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst)
+        {
+            const float hscale = hr * (1.f / 360.f);
+
+            // Read all inputs up front.
+            const float b = src[bidx];
+            const float g = src[1];
+            const float r = src[bidx^2];
+
+            float v = r;
+            float vmin = r;
+            v = fmax(v, g);
+            v = fmax(v, b);
+            vmin = fmin(vmin, g);
+            vmin = fmin(vmin, b);
+
+            float diff = v - vmin;
+            // Epsilon in the denominators avoids division by zero.
+            const float s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon());
+            diff = (float)(60. / (diff + numeric_limits<float>::epsilon()));
+
+            // Exactly one of the three predicates is non-zero, selecting the hue sextant.
+            float h = (v == r) * (g - b) * diff;
+            h += (v != r && v == g) * ((b - r) * diff + 120.f);
+            h += (v != r && v != g) * ((r - g) * diff + 240.f);
+            // Wrap negative hues by a full turn.
+            h += (h < 0) * 360.f;
+
+            dst.x = h * hscale;
+            dst.y = s;
+            dst.z = v;
+        }
+
+        // Per-pixel functor: converts a TypeVec<T, scn> pixel to a
+        // TypeVec<T, dcn> pixel via RGB2HSVConvert<bidx, hr>. No alpha is written.
+        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HSV
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type hsv;
+
+                RGB2HSVConvert<bidx, hr>(&src.x, hsv);
+
+                return hsv;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx, int hr> struct RGB2HSV<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2HSV() {}
+            __host__ __device__ __forceinline__ RGB2HSV(const RGB2HSV&) {}
+
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint packed = RGB2HSVConvert<bidx, hr>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates two traits templates for an RGB -> HSV conversion:
+//   name##_traits<T>      -> RGB2HSV<T, scn, dcn, bidx, 180>
+//   name##_full_traits<T> -> RGB2HSV<T, scn, dcn, bidx, 256>
+// plus explicit specializations of both for float, which use a hue range of 360.
+// Each struct exposes `functor_type` and a static create_functor() that returns
+// a default-constructed functor.
+#define OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<T, scn, dcn, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HSV<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // For each of the six hue sectors, the indices into the 4-entry `tab`
+        // array built by HSV2RGBConvert that supply blue, green and red (in
+        // that order).
+        __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };
+
+        // Floating-point HSV -> RGB.
+        // src.x is hue (in units where hr is a full turn), src.y saturation and
+        // src.z value. Writes blue to dst[bidx], green to dst[1] and red to
+        // dst[bidx^2]. When saturation is zero all three channels equal value.
+        // All of src is read before dst is written, so callers in this file
+        // pass a buffer that aliases src.
+        template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)
+        {
+            // Maps the hue range [0, hr) onto six sectors [0, 6).
+            const float hscale = 6.f / hr;
+
+            float h = src.x, s = src.y, v = src.z;
+            float b = v, g = v, r = v;
+
+            if (s != 0)
+            {
+                h *= hscale;
+
+                // Wrap the hue into [0, 6).
+                if( h < 0 )
+                    do h += 6; while( h < 0 );
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 );
+
+                // Integer sector (rounded down) and fractional position inside it.
+                int sector = __float2int_rd(h);
+                h -= sector;
+
+                // Guard against a sector outside 0..5 (e.g. h rounding up to
+                // exactly 6 after the wrap) so the table lookup stays in bounds.
+                if ( (unsigned)sector >= 6u )
+                {
+                    sector = 0;
+                    h = 0.f;
+                }
+
+                // Candidate channel values; the sector table picks three of them.
+                float tab[4];
+                tab[0] = v;
+                tab[1] = v * (1.f - s);
+                tab[2] = v * (1.f - s * h);
+                tab[3] = v * (1.f - s * (1.f - h));
+
+                b = tab[c_HsvSectorData[sector][0]];
+                g = tab[c_HsvSectorData[sector][1]];
+                r = tab[c_HsvSectorData[sector][2]];
+            }
+
+            dst[bidx] = b;
+            dst[1] = g;
+            dst[bidx^2] = r;
+        }
+
+        // 8-bit HSV -> RGB: hue is passed through unchanged, saturation and
+        // value are scaled by 1/255, the float converter runs in place on the
+        // temporary, and the three results are scaled by 255 and saturated.
+        template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst)
+        {
+            float3 px;
+
+            px.x = src.x;
+            px.y = src.y * (1.f / 255.f);
+            px.z = src.z * (1.f / 255.f);
+
+            // In-place conversion: the float overload reads px fully before writing.
+            HSV2RGBConvert<bidx, hr>(px, &px.x);
+
+            dst[0] = saturate_cast<uchar>(px.x * 255.f);
+            dst[1] = saturate_cast<uchar>(px.y * 255.f);
+            dst[2] = saturate_cast<uchar>(px.z * 255.f);
+        }
+
+        // Packed 8-bit HSV -> RGB: hue, saturation, value are read from bytes
+        // 0, 1, 2 of src. The converted channels are written back to bytes
+        // 0, 1, 2 in the order produced by the float converter, with 0xff in
+        // the top byte.
+        template <int bidx, int hr> static __device__ uint HSV2RGBConvert(uint src)
+        {
+            float3 px;
+
+            px.x = src & 0xff;
+            px.y = ((src >> 8) & 0xff) * (1.f/255.f);
+            px.z = ((src >> 16) & 0xff) * (1.f/255.f);
+
+            // In-place conversion: the float overload reads px fully before writing.
+            HSV2RGBConvert<bidx, hr>(px, &px.x);
+
+            uint packed = 0xffu << 24;
+
+            packed |= saturate_cast<uchar>(px.x * 255.f);
+            packed |= saturate_cast<uchar>(px.y * 255.f) << 8;
+            packed |= saturate_cast<uchar>(px.z * 255.f) << 16;
+
+            return packed;
+        }
+
+        // Per-pixel functor: converts a TypeVec<T, scn> pixel to a
+        // TypeVec<T, dcn> pixel via HSV2RGBConvert<bidx, hr>, then calls
+        // setAlpha on the result with ColorChannel<T>::max().
+        template <typename T, int scn, int dcn, int bidx, int hr> struct HSV2RGB
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type rgb;
+
+                HSV2RGBConvert<bidx, hr>(src, &rgb.x);
+                setAlpha(rgb, ColorChannel<T>::max());
+
+                return rgb;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx, int hr> struct HSV2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ HSV2RGB() {}
+            __host__ __device__ __forceinline__ HSV2RGB(const HSV2RGB&) {}
+
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint packed = HSV2RGBConvert<bidx, hr>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates two traits templates for an HSV -> RGB conversion:
+//   name##_traits<T>      -> HSV2RGB<T, scn, dcn, bidx, 180>
+//   name##_full_traits<T> -> HSV2RGB<T, scn, dcn, bidx, 255>
+// plus explicit specializations of both for float, which use a hue range of 360.
+// Each struct exposes `functor_type` and a static create_functor() that returns
+// a default-constructed functor.
+// NOTE(review): the full-range hue divisor here is 255, whereas
+// OPENCV_CUDA_IMPLEMENT_RGB2HSV_TRAITS uses 256 for the forward direction —
+// confirm the asymmetry is intended before changing either value.
+#define OPENCV_CUDA_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::HSV2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
+
+    namespace color_detail
+    {
+        // Floating-point RGB -> HLS.
+        // src[bidx] is blue, src[1] green, src[bidx^2] red. Writes hue (degrees
+        // scaled by hr / 360) to dst.x, lightness to dst.y and saturation to
+        // dst.z. Hue and saturation stay 0 when max - min does not exceed
+        // float epsilon. All inputs are read before dst is written.
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)
+        {
+            const float hscale = hr * (1.f / 360.f);
+
+            const float b = src[bidx];
+            const float g = src[1];
+            const float r = src[bidx^2];
+
+            float hue = 0.f;
+            float sat = 0.f;
+
+            float vmax = r;
+            float vmin = r;
+            vmax = fmax(vmax, g);
+            vmax = fmax(vmax, b);
+            vmin = fmin(vmin, g);
+            vmin = fmin(vmin, b);
+
+            float diff = vmax - vmin;
+            const float light = (vmax + vmin) * 0.5f;
+
+            if (diff > numeric_limits<float>::epsilon())
+            {
+                // Branch-free choice between the two saturation formulas.
+                sat = (light < 0.5f) * diff / (vmax + vmin);
+                sat += (light >= 0.5f) * diff / (2.0f - vmax - vmin);
+
+                diff = 60.f / diff;
+
+                // Exactly one predicate is non-zero, selecting the hue sextant.
+                hue  = (vmax == r) * (g - b) * diff;
+                hue += (vmax != r && vmax == g) * ((b - r) * diff + 120.f);
+                hue += (vmax != r && vmax != g) * ((r - g) * diff + 240.f);
+                // Wrap negative hues by a full turn.
+                hue += (hue < 0.f) * 360.f;
+            }
+
+            dst.x = hue * hscale;
+            dst.y = light;
+            dst.z = sat;
+        }
+
+        // 8-bit RGB -> HLS: the three source bytes are scaled by 1/255, the
+        // float converter runs in place on the temporary, then hue is saturated
+        // as-is while lightness and saturation are scaled by 255 first.
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst)
+        {
+            float3 px;
+
+            px.x = src[0] * (1.f / 255.f);
+            px.y = src[1] * (1.f / 255.f);
+            px.z = src[2] * (1.f / 255.f);
+
+            // In-place conversion: the float overload reads its input before writing.
+            RGB2HLSConvert<bidx, hr>(&px.x, px);
+
+            dst.x = saturate_cast<uchar>(px.x);
+            dst.y = saturate_cast<uchar>(px.y*255.f);
+            dst.z = saturate_cast<uchar>(px.z*255.f);
+        }
+
+        // Packed 8-bit RGB -> HLS: bytes 0, 1, 2 of src are scaled by 1/255 and
+        // converted in place by the float overload. The result holds hue,
+        // lightness, saturation in bytes 0, 1, 2 and 0xff in the top byte.
+        template <int bidx, int hr> static __device__ uint RGB2HLSConvert(uint src)
+        {
+            float3 px;
+
+            px.x = (0xff & src) * (1.f / 255.f);
+            px.y = (0xff & (src >> 8)) * (1.f / 255.f);
+            px.z = (0xff & (src >> 16)) * (1.f / 255.f);
+
+            // In-place conversion: the float overload reads its input before writing.
+            RGB2HLSConvert<bidx, hr>(&px.x, px);
+
+            uint packed = 0xffu << 24;
+
+            packed |= saturate_cast<uchar>(px.x);
+            packed |= saturate_cast<uchar>(px.y * 255.f) << 8;
+            packed |= saturate_cast<uchar>(px.z * 255.f) << 16;
+
+            return packed;
+        }
+
+        // Per-pixel functor: converts a TypeVec<T, scn> pixel to a
+        // TypeVec<T, dcn> pixel via RGB2HLSConvert<bidx, hr>.
+        template <typename T, int scn, int dcn, int bidx, int hr> struct RGB2HLS
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
+
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type hls;
+
+                RGB2HLSConvert<bidx, hr>(&src.x, hls);
+
+                return hls;
+            }
+        };
+
+        // Specialization for 4-channel uchar input and output: the pixel is
+        // handled as one packed uint and delegated to the uint overload.
+        template <int bidx, int hr> struct RGB2HLS<uchar, 4, 4, bidx, hr> : unary_function<uint, uint>
+        {
+            __host__ __device__ __forceinline__ RGB2HLS() {}
+            __host__ __device__ __forceinline__ RGB2HLS(const RGB2HLS&) {}
+
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                const uint packed = RGB2HLSConvert<bidx, hr>(src);
+                return packed;
+            }
+        };
+    }
+
+// Generates two traits templates for an RGB -> HLS conversion:
+//   name##_traits<T>      -> RGB2HLS<T, scn, dcn, bidx, 180>
+//   name##_full_traits<T> -> RGB2HLS<T, scn, dcn, bidx, 256>
+// plus explicit specializations of both for float, which use a hue range of 360.
+// Each struct exposes `functor_type` and a static create_functor() that returns
+// a default-constructed functor.
+#define OPENCV_CUDA_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<T, scn, dcn, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2HLS<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        // For each of the six hue sectors, the indices into the 4-entry `tab`
+        // array built by HLS2RGBConvert that supply blue, green and red (in
+        // that order).
+        __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };
+
+        // Floating-point HLS -> RGB.
+        // src.x is hue (in units where hr is a full turn), src.y lightness and
+        // src.z saturation. Writes blue to dst[bidx], green to dst[1] and red
+        // to dst[bidx^2]. When saturation is zero all three channels equal
+        // lightness. All of src is read before dst is written, so callers in
+        // this file pass a buffer that aliases src.
+        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)
+        {
+            // Maps the hue range [0, hr) onto six sectors [0, 6).
+            const float hscale = 6.0f / hr;
+
+            float h = src.x, l = src.y, s = src.z;
+            float b = l, g = l, r = l;
+
+            if (s != 0)
+            {
+                // Branch-free selection of the upper bound p2 depending on lightness.
+                float p2  = (l <= 0.5f) * l * (1 + s);
+                      p2 += (l > 0.5f) * (l + s - l * s);
+                float p1 = 2 * l - p2;
+
+                h *= hscale;
+
+                // Wrap the hue into [0, 6).
+                if( h < 0 )
+                    do h += 6; while( h < 0 );
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 );
+
+                // Integer sector (rounded down) and fractional position inside it.
+                int sector = __float2int_rd(h);
+                h -= sector;
+
+                // A tiny negative hue can round to exactly 6.0f after `h += 6`,
+                // which would index one past the end of c_HlsSectorData. Treat
+                // any out-of-range sector as the start of sector 0, exactly as
+                // HSV2RGBConvert does.
+                if ( (unsigned)sector >= 6u )
+                {
+                    sector = 0;
+                    h = 0.f;
+                }
+
+                // Candidate channel values; the sector table picks three of them.
+                float tab[4];
+                tab[0] = p2;
+                tab[1] = p1;
+                tab[2] = p1 + (p2 - p1) * (1 - h);
+                tab[3] = p1 + (p2 - p1) * h;
+
+                b = tab[c_HlsSectorData[sector][0]];
+                g = tab[c_HlsSectorData[sector][1]];
+                r = tab[c_HlsSectorData[sector][2]];
+            }
+
+            dst[bidx] = b;
+            dst[1] = g;
+            dst[bidx^2] = r;
+        }
+
+        // 8-bit HLS -> RGB: hue is passed through unchanged, lightness and
+        // saturation are scaled by 1/255, the float converter runs in place on
+        // the temporary, and the three results are scaled by 255 and saturated.
+        template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst)
+        {
+            float3 px;
+
+            px.x = src.x;
+            px.y = src.y * (1.f / 255.f);
+            px.z = src.z * (1.f / 255.f);
+
+            // In-place conversion: the float overload reads px fully before writing.
+            HLS2RGBConvert<bidx, hr>(px, &px.x);
+
+            dst[0] = saturate_cast<uchar>(px.x * 255.f);
+            dst[1] = saturate_cast<uchar>(px.y * 255.f);
+            dst[2] = saturate_cast<uchar>(px.z * 255.f);
+        }
+
+        // Packed 8-bit HLS -> RGB: hue, lightness, saturation are read from
+        // bytes 0, 1, 2 of src. The converted channels are written back to
+        // bytes 0, 1, 2 in the order produced by the float converter, with
+        // 0xff in the top byte.
+        template <int bidx, int hr> static __device__ uint HLS2RGBConvert(uint src)
+        {
+            float3 px;
+
+            px.x = 0xff & src;
+            px.y = (0xff & (src >> 8)) * (1.f / 255.f);
+            px.z = (0xff & (src >> 16)) * (1.f / 255.f);
+
+            // In-place conversion: the float overload reads px fully before writing.
+            HLS2RGBConvert<bidx, hr>(px, &px.x);
+
+            uint packed = 0xffu << 24;
+
+            packed |= saturate_cast<uchar>(px.x * 255.f);
+            packed |= saturate_cast<uchar>(px.y * 255.f) << 8;
+            packed |= saturate_cast<uchar>(px.z * 255.f) << 16;
+
+            return packed;
+        }
+
+        template <typename T, int scn, int dcn, int bidx, int hr> struct HLS2RGB // Per-pixel functor wrapping HLS2RGBConvert: scn/dcn are the source/destination channel counts, bidx and hr are forwarded unchanged.
+            : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
+            {
+                typename TypeVec<T, dcn>::vec_type dst;
+                // &dst.x is handed over as a pointer to the first of three consecutive channel values.
+                HLS2RGBConvert<bidx, hr>(src, &dst.x);
+                setAlpha(dst, ColorChannel<T>::max()); // setAlpha is defined elsewhere; presumably writes the alpha channel only when dst has one - TODO confirm
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ HLS2RGB() {} // the functor is stateless: both constructors are empty
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
+        };
+
+        template <int bidx, int hr> struct HLS2RGB<uchar, 4, 4, bidx, hr> : unary_function<uint, uint> // Specialization for 4-channel uchar in and out: the whole pixel travels as one packed uint.
+        {
+            __device__ __forceinline__ uint operator()(uint src) const
+            {
+                return HLS2RGBConvert<bidx, hr>(src); // the packed overload sets the top byte of the result to 0xff itself
+            }
+            __host__ __device__ __forceinline__ HLS2RGB() {} // stateless, like the primary template
+            __host__ __device__ __forceinline__ HLS2RGB(const HLS2RGB&) {}
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) /* Defines name_traits<T> and name_full_traits<T>; each exposes functor_type (an HLS2RGB instantiation) and create_functor(), which returns a default-constructed functor. The variants differ only in the hue range passed as hr. */ \
+    template <typename T> struct name ## _traits /* generic T: hr = 180 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T, "full" variant: hr = 255 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<T, scn, dcn, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hr = 360 */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float, "full" variant: also hr = 360, i.e. identical to name_traits<float> */ \
+    { \
+        typedef ::cv::cuda::device::color_detail::HLS2RGB<float, scn, dcn, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> Lab /////////////////////////////////////
+
+    namespace color_detail
+    {
+        enum // Compile-time constants for the RGB <-> Lab code that follows.
+        {
+            LAB_CBRT_TAB_SIZE = 1024,
+            GAMMA_TAB_SIZE = 1024,
+            lab_shift = xyz_shift, // xyz_shift is defined elsewhere in the file
+            gamma_shift = 3, // the 8-bit path works on channel values scaled by (1 << gamma_shift), see LabCbrt_b
+            lab_shift2 = (lab_shift + gamma_shift), // fixed-point shift of the values returned by LabCbrt_b
+            LAB_CBRT_TAB_SIZE_B = (256 * 3 / 2 * (1 << gamma_shift)) // evaluates to 3072 with gamma_shift = 3
+        };
+
+        __constant__ ushort c_sRGBGammaTab_b[] = {0,1,1,2,2,3,4,4,5,6,6,7,8,8,9,10,11,11,12,13,14,15,16,17,19,20,21,22,24,25,26,28,29,31,33,34,36,38,40,41,43,45,47,49,51,54,56,58,60,63,65,68,70,73,75,78,81,83,86,89,92,95,98,101,105,108,111,115,118,121,125,129,132,136,140,144,147,151,155,160,164,168,172,176,181,185,190,194,199,204,209,213,218,223,228,233,239,244,249,255,260,265,271,277,282,288,294,300,306,312,318,324,331,337,343,350,356,363,370,376,383,390,397,404,411,418,426,433,440,448,455,463,471,478,486,494,502,510,518,527,535,543,552,560,569,578,586,595,604,613,622,631,641,650,659,669,678,688,698,707,717,727,737,747,757,768,778,788,799,809,820,831,842,852,863,875,886,897,908,920,931,943,954,966,978,990,1002,1014,1026,1038,1050,1063,1075,1088,1101,1113,1126,1139,1152,1165,1178,1192,1205,1218,1232,1245,1259,1273,1287,1301,1315,1329,1343,1357,1372,1386,1401,1415,1430,1445,1460,1475,1490,1505,1521,1536,1551,1567,1583,1598,1614,1630,1646,1662,1678,1695,1711,1728,1744,1761,1778,1794,1811,1828,1846,1863,1880,1897,1915,1933,1950,1968,1986,2004,2022,2040}; // lookup table used by the srgb path of RGB2LabConvert_b, indexed by a channel value; entries run from 0 to 2040 (= 255 << 3), the same scale the non-srgb path gets from "<<= 3"
+
+        __device__ __forceinline__ int LabCbrt_b(int i) // Fixed-point helper for the 8-bit Lab path: returns the piecewise function below, scaled by (1 << lab_shift2).
+        {
+            float x = i * (1.f / (255.f * (1 << gamma_shift))); // i == 255 << gamma_shift maps to x == 1
+            return (1 << lab_shift2) * (x < 0.008856f ? x * 7.787f + 0.13793103448275862f : ::cbrtf(x)); // linear segment below 0.008856 (the offset is 16/116), cube root otherwise; the float result is converted to int on return
+        }
+
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LabConvert_b(const T& src, D& dst)
+        { // Integer-only conversion of one pixel to Lab; writes saturated uchar values L, a, b to dst.x, dst.y, dst.z.
+            const int Lscale = (116 * 255 + 50) / 100; // 116 * 255 / 100, rounded
+            const int Lshift = -((16 * 255 * (1 << lab_shift2) + 50) / 100); // -16 * 255 / 100 at the lab_shift2 fixed-point scale, rounded
+            // blueIdx == 0 means blue is in src.x and red in src.z; any other value swaps the two.
+            int B = blueIdx == 0 ? src.x : src.z;
+            int G = src.y;
+            int R = blueIdx == 0 ? src.z : src.x;
+            // Both branches leave the channels on the same 0..2040 scale: table lookup when srgb is set, plain shift by 3 otherwise.
+            if (srgb)
+            {
+                B = c_sRGBGammaTab_b[B];
+                G = c_sRGBGammaTab_b[G];
+                R = c_sRGBGammaTab_b[R];
+            }
+            else
+            {
+                B <<= 3;
+                G <<= 3;
+                R <<= 3;
+            }
+            // Integer weight rows (each row sums to 4096); CV_DESCALE is defined elsewhere - presumably a rounding right shift by lab_shift, TODO confirm.
+            int fX = LabCbrt_b(CV_DESCALE(B * 778 + G * 1541 + R * 1777, lab_shift));
+            int fY = LabCbrt_b(CV_DESCALE(B * 296 + G * 2929 + R * 871, lab_shift));
+            int fZ = LabCbrt_b(CV_DESCALE(B * 3575 + G * 448 + R * 73, lab_shift));
+            // fX, fY, fZ carry a (1 << lab_shift2) scale; a and b get an offset of 128 before descaling.
+            int L = CV_DESCALE(Lscale * fY + Lshift, lab_shift2);
+            int a = CV_DESCALE(500 * (fX - fY) + 128 * (1 << lab_shift2), lab_shift2);
+            int b = CV_DESCALE(200 * (fY - fZ) + 128 * (1 << lab_shift2), lab_shift2);
+
+            dst.x = saturate_cast<uchar>(L);
+            dst.y = saturate_cast<uchar>(a);
+            dst.z = saturate_cast<uchar>(b);
+        }
+
+        __device__ __forceinline__ float splineInterpolate(float x, const float* tab, int n) // Evaluates a piecewise cubic stored in tab (4 coefficients per segment) at position x.
+        {
+            int ix = ::min(::max(int(x), 0), n-1); // segment index: integer part of x, clamped to [0, n-1]
+            x -= ix; // offset of x from the start of the chosen segment (outside [0, 1) when the index was clamped)
+            tab += ix * 4; // advance to the 4 coefficients of that segment
+            return ((tab[3] * x + tab[2]) * x + tab[1]) * x + tab[0]; // Horner form of tab[0] + tab[1]*x + tab[2]*x^2 + tab[3]*x^3
+        }
+
+        __constant__ float c_sRGBGammaTab[] = {0,7.55853e-05,0.,-7.51331e-13,7.55853e-05,7.55853e-05,-2.25399e-12,3.75665e-12,0.000151171,7.55853e-05,9.01597e-12,-6.99932e-12,0.000226756,7.55853e-05,-1.1982e-11,2.41277e-12,0.000302341,7.55853e-05,-4.74369e-12,1.19001e-11,0.000377927,7.55853e-05,3.09568e-11,-2.09095e-11,0.000453512,7.55853e-05,-3.17718e-11,1.35303e-11,0.000529097,7.55853e-05,8.81905e-12,-4.10782e-12,0.000604683,7.55853e-05,-3.50439e-12,2.90097e-12,0.000680268,7.55853e-05,5.19852e-12,-7.49607e-12,0.000755853,7.55853e-05,-1.72897e-11,2.70833e-11,0.000831439,7.55854e-05,6.39602e-11,-4.26295e-11,0.000907024,7.55854e-05,-6.39282e-11,2.70193e-11,0.000982609,7.55853e-05,1.71298e-11,-7.24017e-12,0.00105819,7.55853e-05,-4.59077e-12,1.94137e-12,0.00113378,7.55853e-05,1.23333e-12,-5.25291e-13,0.00120937,7.55853e-05,-3.42545e-13,1.59799e-13,0.00128495,7.55853e-05,1.36852e-13,-1.13904e-13,0.00136054,7.55853e-05,-2.04861e-13,2.95818e-13,0.00143612,7.55853e-05,6.82594e-13,-1.06937e-12,0.00151171,7.55853e-05,-2.52551e-12,3.98166e-12,0.00158729,7.55853e-05,9.41946e-12,-1.48573e-11,0.00166288,7.55853e-05,-3.51523e-11,5.54474e-11,0.00173846,7.55854e-05,1.3119e-10,-9.0517e-11,0.00181405,7.55854e-05,-1.40361e-10,7.37899e-11,0.00188963,7.55853e-05,8.10085e-11,-8.82272e-11,0.00196522,7.55852e-05,-1.83673e-10,1.62704e-10,0.0020408,7.55853e-05,3.04438e-10,-2.13341e-10,0.00211639,7.55853e-05,-3.35586e-10,2.25e-10,0.00219197,7.55853e-05,3.39414e-10,-2.20997e-10,0.00226756,7.55853e-05,-3.23576e-10,1.93326e-10,0.00234315,7.55853e-05,2.564e-10,-8.66446e-11,0.00241873,7.55855e-05,-3.53328e-12,-7.9578e-11,0.00249432,7.55853e-05,-2.42267e-10,1.72126e-10,0.0025699,7.55853e-05,2.74111e-10,-1.43265e-10,0.00264549,7.55854e-05,-1.55683e-10,-6.47292e-11,0.00272107,7.55849e-05,-3.4987e-10,8.67842e-10,0.00279666,7.55868e-05,2.25366e-09,-3.8723e-09,0.00287224,7.55797e-05,-9.36325e-09,1.5087e-08,0.00294783,7.56063e-05,3.58978e-08,-5.69415e-08,0.00302341,7.55072e-05,-1.34927e-07,2.13144e-07,0
.003099,7.58768e-05,5.04507e-07,1.38713e-07,0.00317552,7.7302e-05,9.20646e-07,-1.55186e-07,0.00325359,7.86777e-05,4.55087e-07,4.26813e-08,0.00333276,7.97159e-05,5.83131e-07,-1.06495e-08,0.00341305,8.08502e-05,5.51182e-07,3.87467e-09,0.00349446,8.19642e-05,5.62806e-07,-1.92586e-10,0.00357698,8.30892e-05,5.62228e-07,1.0866e-09,0.00366063,8.4217e-05,5.65488e-07,5.02818e-10,0.00374542,8.53494e-05,5.66997e-07,8.60211e-10,0.00383133,8.6486e-05,5.69577e-07,7.13044e-10,0.00391839,8.76273e-05,5.71716e-07,4.78527e-10,0.00400659,8.87722e-05,5.73152e-07,1.09818e-09,0.00409594,8.99218e-05,5.76447e-07,2.50964e-10,0.00418644,9.10754e-05,5.772e-07,1.15762e-09,0.00427809,9.22333e-05,5.80672e-07,2.40865e-10,0.0043709,9.33954e-05,5.81395e-07,1.13854e-09,0.00446488,9.45616e-05,5.84811e-07,3.27267e-10,0.00456003,9.57322e-05,5.85792e-07,8.1197e-10,0.00465635,9.69062e-05,5.88228e-07,6.15823e-10,0.00475384,9.80845e-05,5.90076e-07,9.15747e-10,0.00485252,9.92674e-05,5.92823e-07,3.778e-10,0.00495238,0.000100454,5.93956e-07,8.32623e-10,0.00505343,0.000101645,5.96454e-07,4.82695e-10,0.00515567,0.000102839,5.97902e-07,9.61904e-10,0.00525911,0.000104038,6.00788e-07,3.26281e-10,0.00536375,0.00010524,6.01767e-07,9.926e-10,0.00546959,0.000106447,6.04745e-07,3.59933e-10,0.00557664,0.000107657,6.05824e-07,8.2728e-10,0.0056849,0.000108871,6.08306e-07,5.21898e-10,0.00579438,0.00011009,6.09872e-07,8.10492e-10,0.00590508,0.000111312,6.12303e-07,4.27046e-10,0.00601701,0.000112538,6.13585e-07,7.40878e-10,0.00613016,0.000113767,6.15807e-07,8.00469e-10,0.00624454,0.000115001,6.18209e-07,2.48178e-10,0.00636016,0.000116238,6.18953e-07,1.00073e-09,0.00647702,0.000117479,6.21955e-07,4.05654e-10,0.00659512,0.000118724,6.23172e-07,6.36192e-10,0.00671447,0.000119973,6.25081e-07,7.74927e-10,0.00683507,0.000121225,6.27406e-07,4.54975e-10,0.00695692,0.000122481,6.28771e-07,6.64841e-10,0.00708003,0.000123741,6.30765e-07,6.10972e-10,0.00720441,0.000125004,6.32598e-07,6.16543e-10,0.00733004,0.000126271,6.34448e-07,6.48204
e-10,0.00745695,0.000127542,6.36392e-07,5.15835e-10,0.00758513,0.000128816,6.3794e-07,5.48103e-10,0.00771458,0.000130094,6.39584e-07,1.01706e-09,0.00784532,0.000131376,6.42635e-07,4.0283e-11,0.00797734,0.000132661,6.42756e-07,6.84471e-10,0.00811064,0.000133949,6.4481e-07,9.47144e-10,0.00824524,0.000135241,6.47651e-07,1.83472e-10,0.00838112,0.000136537,6.48201e-07,1.11296e-09,0.00851831,0.000137837,6.5154e-07,2.13163e-11,0.0086568,0.00013914,6.51604e-07,6.64462e-10,0.00879659,0.000140445,6.53598e-07,1.04613e-09,0.00893769,0.000141756,6.56736e-07,-1.92377e-10,0.0090801,0.000143069,6.56159e-07,1.58601e-09,0.00922383,0.000144386,6.60917e-07,-5.63754e-10,0.00936888,0.000145706,6.59226e-07,1.60033e-09,0.00951524,0.000147029,6.64027e-07,-2.49543e-10,0.00966294,0.000148356,6.63278e-07,1.26043e-09,0.00981196,0.000149687,6.67059e-07,-1.35572e-10,0.00996231,0.00015102,6.66653e-07,1.14458e-09,0.010114,0.000152357,6.70086e-07,2.13864e-10,0.010267,0.000153698,6.70728e-07,7.93856e-10,0.0104214,0.000155042,6.73109e-07,3.36077e-10,0.0105771,0.000156389,6.74118e-07,6.55765e-10,0.0107342,0.000157739,6.76085e-07,7.66211e-10,0.0108926,0.000159094,6.78384e-07,4.66116e-12,0.0110524,0.000160451,6.78398e-07,1.07775e-09,0.0112135,0.000161811,6.81631e-07,3.41023e-10,0.011376,0.000163175,6.82654e-07,3.5205e-10,0.0115398,0.000164541,6.8371e-07,1.04473e-09,0.0117051,0.000165912,6.86844e-07,1.25757e-10,0.0118717,0.000167286,6.87222e-07,3.14818e-10,0.0120396,0.000168661,6.88166e-07,1.40886e-09,0.012209,0.000170042,6.92393e-07,-3.62244e-10,0.0123797,0.000171425,6.91306e-07,9.71397e-10,0.0125518,0.000172811,6.9422e-07,2.02003e-10,0.0127253,0.0001742,6.94826e-07,1.01448e-09,0.0129002,0.000175593,6.97869e-07,3.96653e-10,0.0130765,0.00017699,6.99059e-07,1.92927e-10,0.0132542,0.000178388,6.99638e-07,6.94305e-10,0.0134333,0.00017979,7.01721e-07,7.55108e-10,0.0136138,0.000181195,7.03986e-07,1.05918e-11,0.0137957,0.000182603,7.04018e-07,1.06513e-09,0.013979,0.000184015,7.07214e-07,3.85512e-10,0.0141637,0.0
0018543,7.0837e-07,1.86769e-10,0.0143499,0.000186848,7.0893e-07,7.30116e-10,0.0145374,0.000188268,7.11121e-07,6.17983e-10,0.0147264,0.000189692,7.12975e-07,5.23282e-10,0.0149168,0.000191119,7.14545e-07,8.28398e-11,0.0151087,0.000192549,7.14793e-07,1.0081e-09,0.0153019,0.000193981,7.17817e-07,5.41244e-10,0.0154966,0.000195418,7.19441e-07,-3.7907e-10,0.0156928,0.000196856,7.18304e-07,1.90641e-09,0.0158903,0.000198298,7.24023e-07,-7.27387e-10,0.0160893,0.000199744,7.21841e-07,1.00317e-09,0.0162898,0.000201191,7.24851e-07,4.39949e-10,0.0164917,0.000202642,7.2617e-07,9.6234e-10,0.0166951,0.000204097,7.29057e-07,-5.64019e-10,0.0168999,0.000205554,7.27365e-07,1.29374e-09,0.0171062,0.000207012,7.31247e-07,9.77025e-10,0.017314,0.000208478,7.34178e-07,-1.47651e-09,0.0175232,0.000209942,7.29748e-07,3.06636e-09,0.0177338,0.00021141,7.38947e-07,-1.47573e-09,0.017946,0.000212884,7.3452e-07,9.7386e-10,0.0181596,0.000214356,7.37442e-07,1.30562e-09,0.0183747,0.000215835,7.41358e-07,-6.08376e-10,0.0185913,0.000217315,7.39533e-07,1.12785e-09,0.0188093,0.000218798,7.42917e-07,-1.77711e-10,0.0190289,0.000220283,7.42384e-07,1.44562e-09,0.0192499,0.000221772,7.46721e-07,-1.68825e-11,0.0194724,0.000223266,7.4667e-07,4.84533e-10,0.0196964,0.000224761,7.48124e-07,-5.85298e-11,0.0199219,0.000226257,7.47948e-07,1.61217e-09,0.0201489,0.000227757,7.52785e-07,-8.02136e-10,0.0203775,0.00022926,7.50378e-07,1.59637e-09,0.0206075,0.000230766,7.55167e-07,4.47168e-12,0.020839,0.000232276,7.55181e-07,2.48387e-10,0.021072,0.000233787,7.55926e-07,8.6474e-10,0.0213066,0.000235302,7.5852e-07,1.78299e-11,0.0215426,0.000236819,7.58573e-07,9.26567e-10,0.0217802,0.000238339,7.61353e-07,1.34529e-12,0.0220193,0.000239862,7.61357e-07,9.30659e-10,0.0222599,0.000241387,7.64149e-07,1.34529e-12,0.0225021,0.000242915,7.64153e-07,9.26567e-10,0.0227458,0.000244447,7.66933e-07,1.76215e-11,0.022991,0.00024598,7.66986e-07,8.65536e-10,0.0232377,0.000247517,7.69582e-07,2.45677e-10,0.023486,0.000249057,7.70319e-07,1.44193e-11,
0.0237358,0.000250598,7.70363e-07,1.55918e-09,0.0239872,0.000252143,7.7504e-07,-6.63173e-10,0.0242401,0.000253691,7.73051e-07,1.09357e-09,0.0244946,0.000255241,7.76331e-07,1.41919e-11,0.0247506,0.000256793,7.76374e-07,7.12248e-10,0.0250082,0.000258348,7.78511e-07,8.62049e-10,0.0252673,0.000259908,7.81097e-07,-4.35061e-10,0.025528,0.000261469,7.79792e-07,8.7825e-10,0.0257902,0.000263031,7.82426e-07,6.47181e-10,0.0260541,0.000264598,7.84368e-07,2.58448e-10,0.0263194,0.000266167,7.85143e-07,1.81558e-10,0.0265864,0.000267738,7.85688e-07,8.78041e-10,0.0268549,0.000269312,7.88322e-07,3.15102e-11,0.027125,0.000270889,7.88417e-07,8.58525e-10,0.0273967,0.000272468,7.90992e-07,2.59812e-10,0.02767,0.000274051,7.91772e-07,-3.5224e-11,0.0279448,0.000275634,7.91666e-07,1.74377e-09,0.0282212,0.000277223,7.96897e-07,-1.35196e-09,0.0284992,0.000278813,7.92841e-07,1.80141e-09,0.0287788,0.000280404,7.98246e-07,-2.65629e-10,0.0290601,0.000281999,7.97449e-07,1.12374e-09,0.0293428,0.000283598,8.0082e-07,-5.04106e-10,0.0296272,0.000285198,7.99308e-07,8.92764e-10,0.0299132,0.000286799,8.01986e-07,6.58379e-10,0.0302008,0.000288405,8.03961e-07,1.98971e-10,0.0304901,0.000290014,8.04558e-07,4.08382e-10,0.0307809,0.000291624,8.05783e-07,3.01839e-11,0.0310733,0.000293236,8.05874e-07,1.33343e-09,0.0313673,0.000294851,8.09874e-07,2.2419e-10,0.031663,0.000296472,8.10547e-07,-3.67606e-10,0.0319603,0.000298092,8.09444e-07,1.24624e-09,0.0322592,0.000299714,8.13182e-07,-8.92025e-10,0.0325597,0.000301338,8.10506e-07,2.32183e-09,0.0328619,0.000302966,8.17472e-07,-9.44719e-10,0.0331657,0.000304598,8.14638e-07,1.45703e-09,0.0334711,0.000306232,8.19009e-07,-1.15805e-09,0.0337781,0.000307866,8.15535e-07,3.17507e-09,0.0340868,0.000309507,8.2506e-07,-4.09161e-09,0.0343971,0.000311145,8.12785e-07,5.74079e-09,0.0347091,0.000312788,8.30007e-07,-3.97034e-09,0.0350227,0.000314436,8.18096e-07,2.68985e-09,0.035338,0.00031608,8.26166e-07,6.61676e-10,0.0356549,0.000317734,8.28151e-07,-1.61123e-09,0.0359734,0.000319386,
8.23317e-07,2.05786e-09,0.0362936,0.000321038,8.29491e-07,8.30388e-10,0.0366155,0.0003227,8.31982e-07,-1.65424e-09,0.036939,0.000324359,8.27019e-07,2.06129e-09,0.0372642,0.000326019,8.33203e-07,8.59719e-10,0.0375911,0.000327688,8.35782e-07,-1.77488e-09,0.0379196,0.000329354,8.30458e-07,2.51464e-09,0.0382498,0.000331023,8.38002e-07,-8.33135e-10,0.0385817,0.000332696,8.35502e-07,8.17825e-10,0.0389152,0.00033437,8.37956e-07,1.28718e-09,0.0392504,0.00033605,8.41817e-07,-2.2413e-09,0.0395873,0.000337727,8.35093e-07,3.95265e-09,0.0399258,0.000339409,8.46951e-07,-2.39332e-09,0.0402661,0.000341095,8.39771e-07,1.89533e-09,0.040608,0.000342781,8.45457e-07,-1.46271e-09,0.0409517,0.000344467,8.41069e-07,3.95554e-09,0.041297,0.000346161,8.52936e-07,-3.18369e-09,0.041644,0.000347857,8.43385e-07,1.32873e-09,0.0419927,0.000349548,8.47371e-07,1.59402e-09,0.0423431,0.000351248,8.52153e-07,-2.54336e-10,0.0426952,0.000352951,8.5139e-07,-5.76676e-10,0.043049,0.000354652,8.4966e-07,2.56114e-09,0.0434045,0.000356359,8.57343e-07,-2.21744e-09,0.0437617,0.000358067,8.50691e-07,2.58344e-09,0.0441206,0.000359776,8.58441e-07,-6.65826e-10,0.0444813,0.000361491,8.56444e-07,7.99218e-11,0.0448436,0.000363204,8.56684e-07,3.46063e-10,0.0452077,0.000364919,8.57722e-07,2.26116e-09,0.0455734,0.000366641,8.64505e-07,-1.94005e-09,0.045941,0.000368364,8.58685e-07,1.77384e-09,0.0463102,0.000370087,8.64007e-07,-1.43005e-09,0.0466811,0.000371811,8.59717e-07,3.94634e-09,0.0470538,0.000373542,8.71556e-07,-3.17946e-09,0.0474282,0.000375276,8.62017e-07,1.32104e-09,0.0478043,0.000377003,8.6598e-07,1.62045e-09,0.0481822,0.00037874,8.70842e-07,-3.52297e-10,0.0485618,0.000380481,8.69785e-07,-2.11211e-10,0.0489432,0.00038222,8.69151e-07,1.19716e-09,0.0493263,0.000383962,8.72743e-07,-8.52026e-10,0.0497111,0.000385705,8.70187e-07,2.21092e-09,0.0500977,0.000387452,8.76819e-07,-5.41339e-10,0.050486,0.000389204,8.75195e-07,-4.5361e-11,0.0508761,0.000390954,8.75059e-07,7.22669e-10,0.0512679,0.000392706,8.77227e-07,8.79936e-
10,0.0516615,0.000394463,8.79867e-07,-5.17048e-10,0.0520568,0.000396222,8.78316e-07,1.18833e-09,0.0524539,0.000397982,8.81881e-07,-5.11022e-10,0.0528528,0.000399744,8.80348e-07,8.55683e-10,0.0532534,0.000401507,8.82915e-07,8.13562e-10,0.0536558,0.000403276,8.85356e-07,-3.84603e-10,0.05406,0.000405045,8.84202e-07,7.24962e-10,0.0544659,0.000406816,8.86377e-07,1.20986e-09,0.0548736,0.000408592,8.90006e-07,-1.83896e-09,0.0552831,0.000410367,8.84489e-07,2.42071e-09,0.0556944,0.000412143,8.91751e-07,-3.93413e-10,0.0561074,0.000413925,8.90571e-07,-8.46967e-10,0.0565222,0.000415704,8.8803e-07,3.78122e-09,0.0569388,0.000417491,8.99374e-07,-3.1021e-09,0.0573572,0.000419281,8.90068e-07,1.17658e-09,0.0577774,0.000421064,8.93597e-07,2.12117e-09,0.0581993,0.000422858,8.99961e-07,-2.21068e-09,0.0586231,0.000424651,8.93329e-07,2.9961e-09,0.0590486,0.000426447,9.02317e-07,-2.32311e-09,0.059476,0.000428244,8.95348e-07,2.57122e-09,0.0599051,0.000430043,9.03062e-07,-5.11098e-10,0.0603361,0.000431847,9.01528e-07,-5.27166e-10,0.0607688,0.000433649,8.99947e-07,2.61984e-09,0.0612034,0.000435457,9.07806e-07,-2.50141e-09,0.0616397,0.000437265,9.00302e-07,3.66045e-09,0.0620779,0.000439076,9.11283e-07,-4.68977e-09,0.0625179,0.000440885,8.97214e-07,7.64783e-09,0.0629597,0.000442702,9.20158e-07,-7.27499e-09,0.0634033,0.000444521,8.98333e-07,6.55113e-09,0.0638487,0.000446337,9.17986e-07,-4.02844e-09,0.0642959,0.000448161,9.05901e-07,2.11196e-09,0.064745,0.000449979,9.12236e-07,3.03125e-09,0.0651959,0.000451813,9.2133e-07,-6.78648e-09,0.0656486,0.000453635,9.00971e-07,9.21375e-09,0.0661032,0.000455464,9.28612e-07,-7.71684e-09,0.0665596,0.000457299,9.05462e-07,6.7522e-09,0.0670178,0.00045913,9.25718e-07,-4.3907e-09,0.0674778,0.000460968,9.12546e-07,3.36e-09,0.0679397,0.000462803,9.22626e-07,-1.59876e-09,0.0684034,0.000464644,9.1783e-07,3.0351e-09,0.068869,0.000466488,9.26935e-07,-3.09101e-09,0.0693364,0.000468333,9.17662e-07,1.8785e-09,0.0698057,0.000470174,9.23298e-07,3.02733e-09,0.0702768,0.00047
203,9.3238e-07,-6.53722e-09,0.0707497,0.000473875,9.12768e-07,8.22054e-09,0.0712245,0.000475725,9.37429e-07,-3.99325e-09,0.0717012,0.000477588,9.2545e-07,3.01839e-10,0.0721797,0.00047944,9.26355e-07,2.78597e-09,0.0726601,0.000481301,9.34713e-07,-3.99507e-09,0.0731423,0.000483158,9.22728e-07,5.7435e-09,0.0736264,0.000485021,9.39958e-07,-4.07776e-09,0.0741123,0.000486888,9.27725e-07,3.11695e-09,0.0746002,0.000488753,9.37076e-07,-9.39394e-10,0.0750898,0.000490625,9.34258e-07,6.4055e-10,0.0755814,0.000492495,9.3618e-07,-1.62265e-09,0.0760748,0.000494363,9.31312e-07,5.84995e-09,0.0765701,0.000496243,9.48861e-07,-6.87601e-09,0.0770673,0.00049812,9.28233e-07,6.75296e-09,0.0775664,0.000499997,9.48492e-07,-5.23467e-09,0.0780673,0.000501878,9.32788e-07,6.73523e-09,0.0785701,0.000503764,9.52994e-07,-6.80514e-09,0.0790748,0.000505649,9.32578e-07,5.5842e-09,0.0795814,0.000507531,9.49331e-07,-6.30583e-10,0.0800899,0.000509428,9.47439e-07,-3.0618e-09,0.0806003,0.000511314,9.38254e-07,5.4273e-09,0.0811125,0.000513206,9.54536e-07,-3.74627e-09,0.0816267,0.000515104,9.43297e-07,2.10713e-09,0.0821427,0.000516997,9.49618e-07,2.76839e-09,0.0826607,0.000518905,9.57924e-07,-5.73006e-09,0.0831805,0.000520803,9.40733e-07,5.25072e-09,0.0837023,0.0005227,9.56486e-07,-3.71718e-10,0.084226,0.000524612,9.5537e-07,-3.76404e-09,0.0847515,0.000526512,9.44078e-07,7.97735e-09,0.085279,0.000528424,9.6801e-07,-5.79367e-09,0.0858084,0.000530343,9.50629e-07,2.96268e-10,0.0863397,0.000532245,9.51518e-07,4.6086e-09,0.0868729,0.000534162,9.65344e-07,-3.82947e-09,0.087408,0.000536081,9.53856e-07,3.25861e-09,0.087945,0.000537998,9.63631e-07,-1.7543e-09,0.088484,0.00053992,9.58368e-07,3.75849e-09,0.0890249,0.000541848,9.69644e-07,-5.82891e-09,0.0895677,0.00054377,9.52157e-07,4.65593e-09,0.0901124,0.000545688,9.66125e-07,2.10643e-09,0.0906591,0.000547627,9.72444e-07,-5.63099e-09,0.0912077,0.000549555,9.55551e-07,5.51627e-09,0.0917582,0.000551483,9.721e-07,-1.53292e-09,0.0923106,0.000553422,9.67501e-07,6.15311e-1
0,0.092865,0.000555359,9.69347e-07,-9.28291e-10,0.0934213,0.000557295,9.66562e-07,3.09774e-09,0.0939796,0.000559237,9.75856e-07,-4.01186e-09,0.0945398,0.000561177,9.6382e-07,5.49892e-09,0.095102,0.000563121,9.80317e-07,-3.08258e-09,0.0956661,0.000565073,9.71069e-07,-6.19176e-10,0.0962321,0.000567013,9.69212e-07,5.55932e-09,0.0968001,0.000568968,9.8589e-07,-6.71704e-09,0.09737,0.00057092,9.65738e-07,6.40762e-09,0.0979419,0.00057287,9.84961e-07,-4.0122e-09,0.0985158,0.000574828,9.72925e-07,2.19059e-09,0.0990916,0.000576781,9.79496e-07,2.70048e-09,0.0996693,0.000578748,9.87598e-07,-5.54193e-09,0.100249,0.000580706,9.70972e-07,4.56597e-09,0.100831,0.000582662,9.8467e-07,2.17923e-09,0.101414,0.000584638,9.91208e-07,-5.83232e-09,0.102,0.000586603,9.73711e-07,6.24884e-09,0.102588,0.000588569,9.92457e-07,-4.26178e-09,0.103177,0.000590541,9.79672e-07,3.34781e-09,0.103769,0.00059251,9.89715e-07,-1.67904e-09,0.104362,0.000594485,9.84678e-07,3.36839e-09,0.104958,0.000596464,9.94783e-07,-4.34397e-09,0.105555,0.000598441,9.81751e-07,6.55696e-09,0.106155,0.000600424,1.00142e-06,-6.98272e-09,0.106756,0.000602406,9.80474e-07,6.4728e-09,0.107359,0.000604386,9.99893e-07,-4.00742e-09,0.107965,0.000606374,9.8787e-07,2.10654e-09,0.108572,0.000608356,9.9419e-07,3.0318e-09,0.109181,0.000610353,1.00329e-06,-6.7832e-09,0.109793,0.00061234,9.82936e-07,9.1998e-09,0.110406,0.000614333,1.01054e-06,-7.6642e-09,0.111021,0.000616331,9.87543e-07,6.55579e-09,0.111639,0.000618326,1.00721e-06,-3.65791e-09,0.112258,0.000620329,9.96236e-07,6.25467e-10,0.112879,0.000622324,9.98113e-07,1.15593e-09,0.113503,0.000624323,1.00158e-06,2.20158e-09,0.114128,0.000626333,1.00819e-06,-2.51191e-09,0.114755,0.000628342,1.00065e-06,3.95517e-10,0.115385,0.000630345,1.00184e-06,9.29807e-10,0.116016,0.000632351,1.00463e-06,3.33599e-09,0.116649,0.00063437,1.01463e-06,-6.82329e-09,0.117285,0.000636379,9.94163e-07,9.05595e-09,0.117922,0.000638395,1.02133e-06,-7.04862e-09,0.118562,0.000640416,1.00019e-06,4.23737e-09,0.119203,
0.000642429,1.0129e-06,-2.45033e-09,0.119847,0.000644448,1.00555e-06,5.56395e-09,0.120492,0.000646475,1.02224e-06,-4.9043e-09,0.121139,0.000648505,1.00753e-06,-8.47952e-10,0.121789,0.000650518,1.00498e-06,8.29622e-09,0.122441,0.000652553,1.02987e-06,-9.98538e-09,0.123094,0.000654582,9.99914e-07,9.2936e-09,0.12375,0.00065661,1.02779e-06,-4.83707e-09,0.124407,0.000658651,1.01328e-06,2.60411e-09,0.125067,0.000660685,1.0211e-06,-5.57945e-09,0.125729,0.000662711,1.00436e-06,1.22631e-08,0.126392,0.000664756,1.04115e-06,-1.36704e-08,0.127058,0.000666798,1.00014e-06,1.26161e-08,0.127726,0.000668836,1.03798e-06,-6.99155e-09,0.128396,0.000670891,1.01701e-06,4.48836e-10,0.129068,0.000672926,1.01836e-06,5.19606e-09,0.129742,0.000674978,1.03394e-06,-6.3319e-09,0.130418,0.000677027,1.01495e-06,5.2305e-09,0.131096,0.000679073,1.03064e-06,3.11123e-10,0.131776,0.000681135,1.03157e-06,-6.47511e-09,0.132458,0.000683179,1.01215e-06,1.06882e-08,0.133142,0.000685235,1.04421e-06,-6.47519e-09,0.133829,0.000687304,1.02479e-06,3.11237e-10,0.134517,0.000689355,1.02572e-06,5.23035e-09,0.135207,0.000691422,1.04141e-06,-6.3316e-09,0.1359,0.000693486,1.02242e-06,5.19484e-09,0.136594,0.000695546,1.038e-06,4.53497e-10,0.137291,0.000697623,1.03936e-06,-7.00891e-09,0.137989,0.000699681,1.01834e-06,1.2681e-08,0.13869,0.000701756,1.05638e-06,-1.39128e-08,0.139393,0.000703827,1.01464e-06,1.31679e-08,0.140098,0.000705896,1.05414e-06,-8.95659e-09,0.140805,0.000707977,1.02727e-06,7.75742e-09,0.141514,0.000710055,1.05055e-06,-7.17182e-09,0.142225,0.000712135,1.02903e-06,6.02862e-09,0.142938,0.000714211,1.04712e-06,-2.04163e-09,0.143653,0.000716299,1.04099e-06,2.13792e-09,0.144371,0.000718387,1.04741e-06,-6.51009e-09,0.14509,0.000720462,1.02787e-06,9.00123e-09,0.145812,0.000722545,1.05488e-06,3.07523e-10,0.146535,0.000724656,1.0558e-06,-1.02312e-08,0.147261,0.000726737,1.02511e-06,1.0815e-08,0.147989,0.000728819,1.05755e-06,-3.22681e-09,0.148719,0.000730925,1.04787e-06,2.09244e-09,0.14945,0.000733027,1.05415
e-06,-5.143e-09,0.150185,0.00073512,1.03872e-06,3.57844e-09,0.150921,0.000737208,1.04946e-06,5.73027e-09,0.151659,0.000739324,1.06665e-06,-1.15983e-08,0.152399,0.000741423,1.03185e-06,1.08605e-08,0.153142,0.000743519,1.06443e-06,-2.04106e-09,0.153886,0.000745642,1.05831e-06,-2.69642e-09,0.154633,0.00074775,1.05022e-06,-2.07425e-09,0.155382,0.000749844,1.044e-06,1.09934e-08,0.156133,0.000751965,1.07698e-06,-1.20972e-08,0.156886,0.000754083,1.04069e-06,7.59288e-09,0.157641,0.000756187,1.06347e-06,-3.37305e-09,0.158398,0.000758304,1.05335e-06,5.89921e-09,0.159158,0.000760428,1.07104e-06,-5.32248e-09,0.159919,0.000762554,1.05508e-06,4.8927e-10,0.160683,0.000764666,1.05654e-06,3.36547e-09,0.161448,0.000766789,1.06664e-06,9.50081e-10,0.162216,0.000768925,1.06949e-06,-7.16568e-09,0.162986,0.000771043,1.04799e-06,1.28114e-08,0.163758,0.000773177,1.08643e-06,-1.42774e-08,0.164533,0.000775307,1.0436e-06,1.44956e-08,0.165309,0.000777438,1.08708e-06,-1.39025e-08,0.166087,0.00077957,1.04538e-06,1.13118e-08,0.166868,0.000781695,1.07931e-06,-1.54224e-09,0.167651,0.000783849,1.07468e-06,-5.14312e-09,0.168436,0.000785983,1.05925e-06,7.21381e-09,0.169223,0.000788123,1.0809e-06,-8.81096e-09,0.170012,0.000790259,1.05446e-06,1.31289e-08,0.170803,0.000792407,1.09385e-06,-1.39022e-08,0.171597,0.000794553,1.05214e-06,1.26775e-08,0.172392,0.000796695,1.09018e-06,-7.00557e-09,0.17319,0.000798855,1.06916e-06,4.43796e-10,0.17399,0.000800994,1.07049e-06,5.23031e-09,0.174792,0.000803151,1.08618e-06,-6.46397e-09,0.175596,0.000805304,1.06679e-06,5.72444e-09,0.176403,0.000807455,1.08396e-06,-1.53254e-09,0.177211,0.000809618,1.07937e-06,4.05673e-10,0.178022,0.000811778,1.08058e-06,-9.01916e-11,0.178835,0.000813939,1.08031e-06,-4.49821e-11,0.17965,0.000816099,1.08018e-06,2.70234e-10,0.180467,0.00081826,1.08099e-06,-1.03603e-09,0.181286,0.000820419,1.07788e-06,3.87392e-09,0.182108,0.000822587,1.0895e-06,4.41522e-10,0.182932,0.000824767,1.09083e-06,-5.63997e-09,0.183758,0.000826932,1.07391e-06,7.21707e
-09,0.184586,0.000829101,1.09556e-06,-8.32718e-09,0.185416,0.000831267,1.07058e-06,1.11907e-08,0.186248,0.000833442,1.10415e-06,-6.63336e-09,0.187083,0.00083563,1.08425e-06,4.41484e-10,0.187919,0.0008378,1.08557e-06,4.86754e-09,0.188758,0.000839986,1.10017e-06,-5.01041e-09,0.189599,0.000842171,1.08514e-06,2.72811e-10,0.190443,0.000844342,1.08596e-06,3.91916e-09,0.191288,0.000846526,1.09772e-06,-1.04819e-09,0.192136,0.000848718,1.09457e-06,2.73531e-10,0.192985,0.000850908,1.0954e-06,-4.58916e-11,0.193837,0.000853099,1.09526e-06,-9.01158e-11,0.194692,0.000855289,1.09499e-06,4.06506e-10,0.195548,0.00085748,1.09621e-06,-1.53595e-09,0.196407,0.000859668,1.0916e-06,5.73717e-09,0.197267,0.000861869,1.10881e-06,-6.51164e-09,0.19813,0.000864067,1.08928e-06,5.40831e-09,0.198995,0.000866261,1.1055e-06,-2.20401e-10,0.199863,0.000868472,1.10484e-06,-4.52652e-09,0.200732,0.000870668,1.09126e-06,3.42508e-09,0.201604,0.000872861,1.10153e-06,5.72762e-09,0.202478,0.000875081,1.11872e-06,-1.14344e-08,0.203354,0.000877284,1.08441e-06,1.02076e-08,0.204233,0.000879484,1.11504e-06,4.06355e-10,0.205113,0.000881715,1.11626e-06,-1.18329e-08,0.205996,0.000883912,1.08076e-06,1.71227e-08,0.206881,0.000886125,1.13213e-06,-1.19546e-08,0.207768,0.000888353,1.09626e-06,8.93465e-10,0.208658,0.000890548,1.09894e-06,8.38062e-09,0.209549,0.000892771,1.12408e-06,-4.61353e-09,0.210443,0.000895006,1.11024e-06,-4.82756e-09,0.211339,0.000897212,1.09576e-06,9.02245e-09,0.212238,0.00089943,1.12283e-06,-1.45997e-09,0.213138,0.000901672,1.11845e-06,-3.18255e-09,0.214041,0.000903899,1.1089e-06,-7.11073e-10,0.214946,0.000906115,1.10677e-06,6.02692e-09,0.215853,0.000908346,1.12485e-06,-8.49548e-09,0.216763,0.00091057,1.09936e-06,1.30537e-08,0.217675,0.000912808,1.13852e-06,-1.3917e-08,0.218588,0.000915044,1.09677e-06,1.28121e-08,0.219505,0.000917276,1.13521e-06,-7.5288e-09,0.220423,0.000919523,1.11262e-06,2.40205e-09,0.221344,0.000921756,1.11983e-06,-2.07941e-09,0.222267,0.000923989,1.11359e-06,5.91551e-09,0.22319
2,0.000926234,1.13134e-06,-6.68149e-09,0.224119,0.000928477,1.11129e-06,5.90929e-09,0.225049,0.000930717,1.12902e-06,-2.05436e-09,0.22598,0.000932969,1.12286e-06,2.30807e-09,0.226915,0.000935222,1.12978e-06,-7.17796e-09,0.227851,0.00093746,1.10825e-06,1.15028e-08,0.228789,0.000939711,1.14276e-06,-9.03083e-09,0.22973,0.000941969,1.11566e-06,9.71932e-09,0.230673,0.00094423,1.14482e-06,-1.49452e-08,0.231619,0.000946474,1.09998e-06,2.02591e-08,0.232566,0.000948735,1.16076e-06,-2.13879e-08,0.233516,0.000950993,1.0966e-06,2.05888e-08,0.234468,0.000953247,1.15837e-06,-1.62642e-08,0.235423,0.000955515,1.10957e-06,1.46658e-08,0.236379,0.000957779,1.15357e-06,-1.25966e-08,0.237338,0.000960048,1.11578e-06,5.91793e-09,0.238299,0.000962297,1.13353e-06,3.82602e-09,0.239263,0.000964576,1.14501e-06,-6.3208e-09,0.240229,0.000966847,1.12605e-06,6.55613e-09,0.241197,0.000969119,1.14572e-06,-5.00268e-09,0.242167,0.000971395,1.13071e-06,-1.44659e-09,0.243139,0.000973652,1.12637e-06,1.07891e-08,0.244114,0.000975937,1.15874e-06,-1.19073e-08,0.245091,0.000978219,1.12302e-06,7.03782e-09,0.246071,0.000980486,1.14413e-06,-1.34276e-09,0.247052,0.00098277,1.1401e-06,-1.66669e-09,0.248036,0.000985046,1.1351e-06,8.00935e-09,0.249022,0.00098734,1.15913e-06,-1.54694e-08,0.250011,0.000989612,1.11272e-06,2.4066e-08,0.251002,0.000991909,1.18492e-06,-2.11901e-08,0.251995,0.000994215,1.12135e-06,1.08973e-09,0.25299,0.000996461,1.12462e-06,1.68311e-08,0.253988,0.000998761,1.17511e-06,-8.8094e-09,0.254987,0.00100109,1.14868e-06,-1.13958e-08,0.25599,0.00100335,1.1145e-06,2.45902e-08,0.256994,0.00100565,1.18827e-06,-2.73603e-08,0.258001,0.00100795,1.10618e-06,2.52464e-08,0.25901,0.00101023,1.18192e-06,-1.40207e-08,0.260021,0.00101256,1.13986e-06,1.03387e-09,0.261035,0.00101484,1.14296e-06,9.8853e-09,0.262051,0.00101715,1.17262e-06,-1.07726e-08,0.263069,0.00101947,1.1403e-06,3.40272e-09,0.26409,0.00102176,1.15051e-06,-2.83827e-09,0.265113,0.00102405,1.142e-06,7.95039e-09,0.266138,0.00102636,1.16585e-06,8.390
47e-10,0.267166,0.00102869,1.16836e-06,-1.13066e-08,0.268196,0.00103099,1.13444e-06,1.4585e-08,0.269228,0.00103331,1.1782e-06,-1.72314e-08,0.270262,0.00103561,1.1265e-06,2.45382e-08,0.271299,0.00103794,1.20012e-06,-2.13166e-08,0.272338,0.00104028,1.13617e-06,1.12364e-09,0.273379,0.00104255,1.13954e-06,1.68221e-08,0.274423,0.00104488,1.19001e-06,-8.80736e-09,0.275469,0.00104723,1.16358e-06,-1.13948e-08,0.276518,0.00104953,1.1294e-06,2.45839e-08,0.277568,0.00105186,1.20315e-06,-2.73361e-08,0.278621,0.00105418,1.12114e-06,2.51559e-08,0.279677,0.0010565,1.19661e-06,-1.36832e-08,0.280734,0.00105885,1.15556e-06,-2.25706e-10,0.281794,0.00106116,1.15488e-06,1.45862e-08,0.282857,0.00106352,1.19864e-06,-2.83167e-08,0.283921,0.00106583,1.11369e-06,3.90759e-08,0.284988,0.00106817,1.23092e-06,-3.85801e-08,0.286058,0.00107052,1.11518e-06,2.58375e-08,0.287129,0.00107283,1.19269e-06,-5.16498e-09,0.288203,0.0010752,1.1772e-06,-5.17768e-09,0.28928,0.00107754,1.16167e-06,-3.92671e-09,0.290358,0.00107985,1.14988e-06,2.08846e-08,0.29144,0.00108221,1.21254e-06,-2.00072e-08,0.292523,0.00108458,1.15252e-06,-4.60659e-10,0.293609,0.00108688,1.15114e-06,2.18499e-08,0.294697,0.00108925,1.21669e-06,-2.73343e-08,0.295787,0.0010916,1.13468e-06,2.78826e-08,0.29688,0.00109395,1.21833e-06,-2.45915e-08,0.297975,0.00109632,1.14456e-06,1.08787e-08,0.299073,0.00109864,1.17719e-06,1.08788e-08,0.300172,0.00110102,1.20983e-06,-2.45915e-08,0.301275,0.00110337,1.13605e-06,2.78828e-08,0.302379,0.00110573,1.2197e-06,-2.73348e-08,0.303486,0.00110808,1.1377e-06,2.18518e-08,0.304595,0.00111042,1.20325e-06,-4.67556e-10,0.305707,0.00111283,1.20185e-06,-1.99816e-08,0.306821,0.00111517,1.14191e-06,2.07891e-08,0.307937,0.00111752,1.20427e-06,-3.57026e-09,0.309056,0.00111992,1.19356e-06,-6.50797e-09,0.310177,0.00112228,1.17404e-06,-2.00165e-10,0.3113,0.00112463,1.17344e-06,7.30874e-09,0.312426,0.001127,1.19536e-06,7.67424e-10,0.313554,0.00112939,1.19767e-06,-1.03784e-08,0.314685,0.00113176,1.16653e-06,1.09437e-08,0.315
818,0.00113412,1.19936e-06,-3.59406e-09,0.316953,0.00113651,1.18858e-06,3.43251e-09,0.318091,0.0011389,1.19888e-06,-1.0136e-08,0.319231,0.00114127,1.16847e-06,7.30915e-09,0.320374,0.00114363,1.1904e-06,1.07018e-08,0.321518,0.00114604,1.2225e-06,-2.03137e-08,0.322666,0.00114842,1.16156e-06,1.09484e-08,0.323815,0.00115078,1.19441e-06,6.32224e-09,0.324967,0.00115319,1.21337e-06,-6.43509e-09,0.326122,0.00115559,1.19407e-06,-1.03842e-08,0.327278,0.00115795,1.16291e-06,1.81697e-08,0.328438,0.00116033,1.21742e-06,-2.6901e-09,0.329599,0.00116276,1.20935e-06,-7.40939e-09,0.330763,0.00116515,1.18713e-06,2.52533e-09,0.331929,0.00116754,1.1947e-06,-2.69191e-09,0.333098,0.00116992,1.18663e-06,8.24218e-09,0.334269,0.00117232,1.21135e-06,-4.74377e-10,0.335443,0.00117474,1.20993e-06,-6.34471e-09,0.336619,0.00117714,1.1909e-06,-3.94922e-09,0.337797,0.00117951,1.17905e-06,2.21417e-08,0.338978,0.00118193,1.24547e-06,-2.50128e-08,0.340161,0.00118435,1.17043e-06,1.8305e-08,0.341346,0.00118674,1.22535e-06,-1.84048e-08,0.342534,0.00118914,1.17013e-06,2.55121e-08,0.343725,0.00119156,1.24667e-06,-2.40389e-08,0.344917,0.00119398,1.17455e-06,1.10389e-08,0.346113,0.00119636,1.20767e-06,9.68574e-09,0.34731,0.0011988,1.23673e-06,-1.99797e-08,0.34851,0.00120122,1.17679e-06,1.06284e-08,0.349713,0.0012036,1.20867e-06,7.26868e-09,0.350917,0.00120604,1.23048e-06,-9.90072e-09,0.352125,0.00120847,1.20078e-06,2.53177e-09,0.353334,0.00121088,1.20837e-06,-2.26199e-10,0.354546,0.0012133,1.20769e-06,-1.62705e-09,0.355761,0.00121571,1.20281e-06,6.73435e-09,0.356978,0.00121813,1.22302e-06,4.49207e-09,0.358197,0.00122059,1.23649e-06,-2.47027e-08,0.359419,0.00122299,1.16238e-06,3.47142e-08,0.360643,0.00122542,1.26653e-06,-2.47472e-08,0.36187,0.00122788,1.19229e-06,4.66965e-09,0.363099,0.00123028,1.20629e-06,6.06872e-09,0.36433,0.00123271,1.2245e-06,8.57729e-10,0.365564,0.00123516,1.22707e-06,-9.49952e-09,0.366801,0.00123759,1.19858e-06,7.33792e-09,0.36804,0.00124001,1.22059e-06,9.95025e-09,0.369281,0.00124248,1
.25044e-06,-1.73366e-08,0.370525,0.00124493,1.19843e-06,-2.08464e-10,0.371771,0.00124732,1.1978e-06,1.81704e-08,0.373019,0.00124977,1.25232e-06,-1.28683e-08,0.37427,0.00125224,1.21371e-06,3.50042e-09,0.375524,0.00125468,1.22421e-06,-1.1335e-09,0.37678,0.00125712,1.22081e-06,1.03345e-09,0.378038,0.00125957,1.22391e-06,-3.00023e-09,0.379299,0.00126201,1.21491e-06,1.09676e-08,0.380562,0.00126447,1.24781e-06,-1.10676e-08,0.381828,0.00126693,1.21461e-06,3.50042e-09,0.383096,0.00126937,1.22511e-06,-2.93403e-09,0.384366,0.00127181,1.21631e-06,8.23574e-09,0.385639,0.00127427,1.24102e-06,-2.06607e-10,0.386915,0.00127675,1.2404e-06,-7.40935e-09,0.388193,0.00127921,1.21817e-06,4.1761e-11,0.389473,0.00128165,1.21829e-06,7.24223e-09,0.390756,0.0012841,1.24002e-06,7.91564e-10,0.392042,0.00128659,1.2424e-06,-1.04086e-08,0.393329,0.00128904,1.21117e-06,1.10405e-08,0.39462,0.0012915,1.24429e-06,-3.951e-09,0.395912,0.00129397,1.23244e-06,4.7634e-09,0.397208,0.00129645,1.24673e-06,-1.51025e-08,0.398505,0.0012989,1.20142e-06,2.58443e-08,0.399805,0.00130138,1.27895e-06,-2.86702e-08,0.401108,0.00130385,1.19294e-06,2.92318e-08,0.402413,0.00130632,1.28064e-06,-2.86524e-08,0.403721,0.0013088,1.19468e-06,2.57731e-08,0.405031,0.00131127,1.272e-06,-1.48355e-08,0.406343,0.00131377,1.2275e-06,3.76652e-09,0.407658,0.00131623,1.23879e-06,-2.30784e-10,0.408976,0.00131871,1.2381e-06,-2.84331e-09,0.410296,0.00132118,1.22957e-06,1.16041e-08,0.411618,0.00132367,1.26438e-06,-1.37708e-08,0.412943,0.00132616,1.22307e-06,1.36768e-08,0.41427,0.00132865,1.2641e-06,-1.1134e-08,0.4156,0.00133114,1.2307e-06,1.05714e-09,0.416933,0.00133361,1.23387e-06,6.90538e-09,0.418267,0.00133609,1.25459e-06,1.12372e-09,0.419605,0.00133861,1.25796e-06,-1.14002e-08,0.420945,0.00134109,1.22376e-06,1.46747e-08,0.422287,0.00134358,1.26778e-06,-1.7496e-08,0.423632,0.00134606,1.21529e-06,2.5507e-08,0.424979,0.00134857,1.29182e-06,-2.49272e-08,0.426329,0.00135108,1.21703e-06,1.45972e-08,0.427681,0.00135356,1.26083e-06,-3.65935e-09,0
.429036,0.00135607,1.24985e-06,4.00178e-11,0.430393,0.00135857,1.24997e-06,3.49917e-09,0.431753,0.00136108,1.26047e-06,-1.40366e-08,0.433116,0.00136356,1.21836e-06,2.28448e-08,0.43448,0.00136606,1.28689e-06,-1.77378e-08,0.435848,0.00136858,1.23368e-06,1.83043e-08,0.437218,0.0013711,1.28859e-06,-2.56769e-08,0.43859,0.0013736,1.21156e-06,2.47987e-08,0.439965,0.0013761,1.28595e-06,-1.39133e-08,0.441342,0.00137863,1.24421e-06,1.05202e-09,0.442722,0.00138112,1.24737e-06,9.70507e-09,0.444104,0.00138365,1.27649e-06,-1.00698e-08,0.445489,0.00138617,1.24628e-06,7.72123e-10,0.446877,0.00138867,1.24859e-06,6.98132e-09,0.448267,0.00139118,1.26954e-06,1.10477e-09,0.449659,0.00139373,1.27285e-06,-1.14003e-08,0.451054,0.00139624,1.23865e-06,1.4694e-08,0.452452,0.00139876,1.28273e-06,-1.75734e-08,0.453852,0.00140127,1.23001e-06,2.5797e-08,0.455254,0.00140381,1.3074e-06,-2.60097e-08,0.456659,0.00140635,1.22937e-06,1.86371e-08,0.458067,0.00140886,1.28529e-06,-1.8736e-08,0.459477,0.00141137,1.22908e-06,2.65048e-08,0.46089,0.00141391,1.30859e-06,-2.76784e-08,0.462305,0.00141645,1.22556e-06,2.46043e-08,0.463722,0.00141897,1.29937e-06,-1.11341e-08,0.465143,0.00142154,1.26597e-06,-9.87033e-09,0.466565,0.00142404,1.23636e-06,2.08131e-08,0.467991,0.00142657,1.2988e-06,-1.37773e-08,0.469419,0.00142913,1.25746e-06,4.49378e-09,0.470849,0.00143166,1.27094e-06,-4.19781e-09,0.472282,0.00143419,1.25835e-06,1.22975e-08,0.473717,0.00143674,1.29524e-06,-1.51902e-08,0.475155,0.00143929,1.24967e-06,1.86608e-08,0.476596,0.00144184,1.30566e-06,-2.96506e-08,0.478039,0.00144436,1.2167e-06,4.03368e-08,0.479485,0.00144692,1.33771e-06,-4.22896e-08,0.480933,0.00144947,1.21085e-06,3.94148e-08,0.482384,0.00145201,1.32909e-06,-2.59626e-08,0.483837,0.00145459,1.2512e-06,4.83124e-09,0.485293,0.0014571,1.2657e-06,6.63757e-09,0.486751,0.00145966,1.28561e-06,-1.57911e-09,0.488212,0.00146222,1.28087e-06,-3.21468e-10,0.489676,0.00146478,1.27991e-06,2.86517e-09,0.491142,0.00146735,1.2885e-06,-1.11392e-08,0.49261,0.001469
89,1.25508e-06,1.18893e-08,0.494081,0.00147244,1.29075e-06,-6.61574e-09,0.495555,0.001475,1.27091e-06,1.45736e-08,0.497031,0.00147759,1.31463e-06,-2.18759e-08,0.49851,0.00148015,1.249e-06,1.33252e-08,0.499992,0.00148269,1.28897e-06,-1.62277e-09,0.501476,0.00148526,1.28411e-06,-6.83421e-09,0.502962,0.00148781,1.2636e-06,2.89596e-08,0.504451,0.00149042,1.35048e-06,-4.93997e-08,0.505943,0.00149298,1.20228e-06,4.94299e-08,0.507437,0.00149553,1.35057e-06,-2.91107e-08,0.508934,0.00149814,1.26324e-06,7.40848e-09,0.510434,0.00150069,1.28547e-06,-5.23187e-10,0.511936,0.00150326,1.2839e-06,-5.31585e-09,0.51344,0.00150581,1.26795e-06,2.17866e-08,0.514947,0.00150841,1.33331e-06,-2.22257e-08,0.516457,0.00151101,1.26663e-06,7.51178e-09,0.517969,0.00151357,1.28917e-06,-7.82128e-09,0.519484,0.00151613,1.2657e-06,2.37733e-08,0.521002,0.00151873,1.33702e-06,-2.76674e-08,0.522522,0.00152132,1.25402e-06,2.72917e-08,0.524044,0.00152391,1.3359e-06,-2.18949e-08,0.525569,0.00152652,1.27021e-06,6.83372e-10,0.527097,0.00152906,1.27226e-06,1.91613e-08,0.528628,0.00153166,1.32974e-06,-1.77241e-08,0.53016,0.00153427,1.27657e-06,-7.86963e-09,0.531696,0.0015368,1.25296e-06,4.92027e-08,0.533234,0.00153945,1.40057e-06,-6.9732e-08,0.534775,0.00154204,1.19138e-06,5.09114e-08,0.536318,0.00154458,1.34411e-06,-1.4704e-08,0.537864,0.00154722,1.3e-06,7.9048e-09,0.539413,0.00154984,1.32371e-06,-1.69152e-08,0.540964,0.00155244,1.27297e-06,1.51355e-10,0.542517,0.00155499,1.27342e-06,1.63099e-08,0.544074,0.00155758,1.32235e-06,-5.78647e-09,0.545633,0.00156021,1.30499e-06,6.83599e-09,0.547194,0.00156284,1.3255e-06,-2.15575e-08,0.548758,0.00156543,1.26083e-06,1.97892e-08,0.550325,0.00156801,1.32019e-06,2.00525e-09,0.551894,0.00157065,1.32621e-06,-2.78103e-08,0.553466,0.00157322,1.24278e-06,4.96314e-08,0.555041,0.00157586,1.39167e-06,-5.1506e-08,0.556618,0.00157849,1.23716e-06,3.71835e-08,0.558198,0.00158107,1.34871e-06,-3.76233e-08,0.55978,0.00158366,1.23584e-06,5.37052e-08,0.561365,0.00158629,1.39695e-06,-5.79
884e-08,0.562953,0.00158891,1.22299e-06,5.90392e-08,0.564543,0.00159153,1.4001e-06,-5.89592e-08,0.566136,0.00159416,1.22323e-06,5.7588e-08,0.567731,0.00159678,1.39599e-06,-5.21835e-08,0.569329,0.00159941,1.23944e-06,3.19369e-08,0.57093,0.00160199,1.33525e-06,-1.59594e-08,0.572533,0.00160461,1.28737e-06,3.19006e-08,0.574139,0.00160728,1.38307e-06,-5.20383e-08,0.575748,0.00160989,1.22696e-06,5.70431e-08,0.577359,0.00161251,1.39809e-06,-5.69247e-08,0.578973,0.00161514,1.22731e-06,5.14463e-08,0.580589,0.00161775,1.38165e-06,-2.9651e-08,0.582208,0.00162042,1.2927e-06,7.55339e-09,0.58383,0.00162303,1.31536e-06,-5.62636e-10,0.585455,0.00162566,1.31367e-06,-5.30281e-09,0.587081,0.00162827,1.29776e-06,2.17738e-08,0.588711,0.00163093,1.36309e-06,-2.21875e-08,0.590343,0.00163359,1.29652e-06,7.37164e-09,0.591978,0.00163621,1.31864e-06,-7.29907e-09,0.593616,0.00163882,1.29674e-06,2.18247e-08,0.595256,0.00164148,1.36221e-06,-2.03952e-08,0.596899,0.00164414,1.30103e-06,1.51241e-10,0.598544,0.00164675,1.30148e-06,1.97902e-08,0.600192,0.00164941,1.36085e-06,-1.97074e-08,0.601843,0.00165207,1.30173e-06,-5.65175e-10,0.603496,0.00165467,1.30004e-06,2.1968e-08,0.605152,0.00165734,1.36594e-06,-2.77024e-08,0.606811,0.00165999,1.28283e-06,2.92369e-08,0.608472,0.00166264,1.37054e-06,-2.96407e-08,0.610136,0.00166529,1.28162e-06,2.97215e-08,0.611803,0.00166795,1.37079e-06,-2.96408e-08,0.613472,0.0016706,1.28186e-06,2.92371e-08,0.615144,0.00167325,1.36957e-06,-2.77031e-08,0.616819,0.00167591,1.28647e-06,2.19708e-08,0.618496,0.00167855,1.35238e-06,-5.75407e-10,0.620176,0.00168125,1.35065e-06,-1.9669e-08,0.621858,0.00168389,1.29164e-06,1.96468e-08,0.623544,0.00168653,1.35058e-06,6.86403e-10,0.625232,0.00168924,1.35264e-06,-2.23924e-08,0.626922,0.00169187,1.28547e-06,2.92788e-08,0.628615,0.00169453,1.3733e-06,-3.51181e-08,0.630311,0.00169717,1.26795e-06,5.15889e-08,0.63201,0.00169987,1.42272e-06,-5.2028e-08,0.633711,0.00170255,1.26663e-06,3.73139e-08,0.635415,0.0017052,1.37857e-06,-3.76227e-08,0.
637121,0.00170784,1.2657e-06,5.35722e-08,0.63883,0.00171054,1.42642e-06,-5.74567e-08,0.640542,0.00171322,1.25405e-06,5.70456e-08,0.642257,0.0017159,1.42519e-06,-5.15163e-08,0.643974,0.00171859,1.27064e-06,2.98103e-08,0.645694,0.00172122,1.36007e-06,-8.12016e-09,0.647417,0.00172392,1.33571e-06,2.67039e-09,0.649142,0.0017266,1.34372e-06,-2.56152e-09,0.65087,0.00172928,1.33604e-06,7.57571e-09,0.6526,0.00173197,1.35876e-06,-2.77413e-08,0.654334,0.00173461,1.27554e-06,4.3785e-08,0.65607,0.00173729,1.40689e-06,-2.81896e-08,0.657808,0.00174002,1.32233e-06,9.36893e-09,0.65955,0.00174269,1.35043e-06,-9.28617e-09,0.661294,0.00174536,1.32257e-06,2.77757e-08,0.66304,0.00174809,1.4059e-06,-4.2212e-08,0.66479,0.00175078,1.27926e-06,2.1863e-08,0.666542,0.0017534,1.34485e-06,1.43648e-08,0.668297,0.00175613,1.38795e-06,-1.97177e-08,0.670054,0.00175885,1.3288e-06,4.90115e-09,0.671814,0.00176152,1.3435e-06,1.13232e-10,0.673577,0.00176421,1.34384e-06,-5.3542e-09,0.675343,0.00176688,1.32778e-06,2.13035e-08,0.677111,0.0017696,1.39169e-06,-2.02553e-08,0.678882,0.00177232,1.33092e-06,1.13005e-10,0.680656,0.00177499,1.33126e-06,1.98031e-08,0.682432,0.00177771,1.39067e-06,-1.97211e-08,0.684211,0.00178043,1.33151e-06,-5.2349e-10,0.685993,0.00178309,1.32994e-06,2.18151e-08,0.687777,0.00178582,1.39538e-06,-2.71325e-08,0.689564,0.00178853,1.31398e-06,2.71101e-08,0.691354,0.00179124,1.39531e-06,-2.17035e-08,0.693147,0.00179396,1.3302e-06,9.92865e-11,0.694942,0.00179662,1.3305e-06,2.13063e-08,0.69674,0.00179935,1.39442e-06,-2.57198e-08,0.698541,0.00180206,1.31726e-06,2.19682e-08,0.700344,0.00180476,1.38317e-06,-2.54852e-09,0.70215,0.00180752,1.37552e-06,-1.17741e-08,0.703959,0.00181023,1.3402e-06,-9.95999e-09,0.705771,0.00181288,1.31032e-06,5.16141e-08,0.707585,0.00181566,1.46516e-06,-7.72869e-08,0.709402,0.00181836,1.2333e-06,7.87197e-08,0.711222,0.00182106,1.46946e-06,-5.87781e-08,0.713044,0.00182382,1.29312e-06,3.71834e-08,0.714869,0.00182652,1.40467e-06,-3.03511e-08,0.716697,0.00182924,1.31362
e-06,2.46161e-08,0.718528,0.00183194,1.38747e-06,-8.5087e-09,0.720361,0.00183469,1.36194e-06,9.41892e-09,0.722197,0.00183744,1.3902e-06,-2.91671e-08,0.724036,0.00184014,1.3027e-06,4.76448e-08,0.725878,0.00184288,1.44563e-06,-4.22028e-08,0.727722,0.00184565,1.31902e-06,1.95682e-09,0.729569,0.00184829,1.3249e-06,3.43754e-08,0.731419,0.00185104,1.42802e-06,-2.0249e-08,0.733271,0.00185384,1.36727e-06,-1.29838e-08,0.735126,0.00185654,1.32832e-06,1.25794e-08,0.736984,0.00185923,1.36606e-06,2.22711e-08,0.738845,0.00186203,1.43287e-06,-4.20594e-08,0.740708,0.00186477,1.3067e-06,2.67571e-08,0.742574,0.00186746,1.38697e-06,-5.36424e-09,0.744443,0.00187022,1.37087e-06,-5.30023e-09,0.746315,0.00187295,1.35497e-06,2.65653e-08,0.748189,0.00187574,1.43467e-06,-4.13564e-08,0.750066,0.00187848,1.3106e-06,1.9651e-08,0.751946,0.00188116,1.36955e-06,2.23572e-08,0.753828,0.00188397,1.43663e-06,-4.9475e-08,0.755714,0.00188669,1.2882e-06,5.63335e-08,0.757602,0.00188944,1.4572e-06,-5.66499e-08,0.759493,0.00189218,1.28725e-06,5.10567e-08,0.761386,0.00189491,1.44042e-06,-2.83677e-08,0.763283,0.00189771,1.35532e-06,2.80962e-09,0.765182,0.00190042,1.36375e-06,1.71293e-08,0.767083,0.0019032,1.41513e-06,-1.17221e-08,0.768988,0.001906,1.37997e-06,-2.98453e-08,0.770895,0.00190867,1.29043e-06,7.14987e-08,0.772805,0.00191146,1.50493e-06,-7.73354e-08,0.774718,0.00191424,1.27292e-06,5.90292e-08,0.776634,0.00191697,1.45001e-06,-3.9572e-08,0.778552,0.00191975,1.33129e-06,3.9654e-08,0.780473,0.00192253,1.45026e-06,-5.94395e-08,0.782397,0.00192525,1.27194e-06,7.88945e-08,0.784324,0.00192803,1.50862e-06,-7.73249e-08,0.786253,0.00193082,1.27665e-06,5.15913e-08,0.788185,0.00193352,1.43142e-06,-9.83099e-09,0.79012,0.00193636,1.40193e-06,-1.22672e-08,0.792058,0.00193912,1.36513e-06,-7.05275e-10,0.793999,0.00194185,1.36301e-06,1.50883e-08,0.795942,0.00194462,1.40828e-06,-4.33147e-11,0.797888,0.00194744,1.40815e-06,-1.49151e-08,0.799837,0.00195021,1.3634e-06,9.93244e-11,0.801788,0.00195294,1.3637e-06,1.45179e-08
,0.803743,0.00195571,1.40725e-06,1.43363e-09,0.8057,0.00195853,1.41155e-06,-2.02525e-08,0.80766,0.00196129,1.35079e-06,1.99718e-08,0.809622,0.00196405,1.41071e-06,-3.01649e-11,0.811588,0.00196687,1.41062e-06,-1.9851e-08,0.813556,0.00196964,1.35107e-06,1.98296e-08,0.815527,0.0019724,1.41056e-06,1.37485e-10,0.817501,0.00197522,1.41097e-06,-2.03796e-08,0.819477,0.00197798,1.34983e-06,2.17763e-08,0.821457,0.00198074,1.41516e-06,-7.12085e-09,0.823439,0.00198355,1.3938e-06,6.70707e-09,0.825424,0.00198636,1.41392e-06,-1.97074e-08,0.827412,0.00198913,1.35479e-06,1.25179e-08,0.829402,0.00199188,1.39235e-06,2.92405e-08,0.831396,0.00199475,1.48007e-06,-6.98755e-08,0.833392,0.0019975,1.27044e-06,7.14477e-08,0.835391,0.00200026,1.48479e-06,-3.71014e-08,0.837392,0.00200311,1.37348e-06,1.73533e-08,0.839397,0.00200591,1.42554e-06,-3.23118e-08,0.841404,0.00200867,1.32861e-06,5.2289e-08,0.843414,0.00201148,1.48547e-06,-5.76348e-08,0.845427,0.00201428,1.31257e-06,5.9041e-08,0.847443,0.00201708,1.48969e-06,-5.93197e-08,0.849461,0.00201988,1.31173e-06,5.90289e-08,0.851482,0.00202268,1.48882e-06,-5.75864e-08,0.853507,0.00202549,1.31606e-06,5.21075e-08,0.855533,0.00202828,1.47238e-06,-3.16344e-08,0.857563,0.00203113,1.37748e-06,1.48257e-08,0.859596,0.00203393,1.42196e-06,-2.76684e-08,0.861631,0.00203669,1.33895e-06,3.62433e-08,0.863669,0.00203947,1.44768e-06,1.90463e-09,0.86571,0.00204237,1.45339e-06,-4.38617e-08,0.867754,0.00204515,1.32181e-06,5.43328e-08,0.8698,0.00204796,1.48481e-06,-5.42603e-08,0.87185,0.00205076,1.32203e-06,4.34989e-08,0.873902,0.00205354,1.45252e-06,-5.26029e-10,0.875957,0.00205644,1.45095e-06,-4.13949e-08,0.878015,0.00205922,1.32676e-06,4.68962e-08,0.880075,0.00206201,1.46745e-06,-2.69807e-08,0.882139,0.00206487,1.38651e-06,1.42181e-09,0.884205,0.00206764,1.39077e-06,2.12935e-08,0.886274,0.00207049,1.45465e-06,-2.69912e-08,0.888346,0.00207332,1.37368e-06,2.70664e-08,0.890421,0.00207615,1.45488e-06,-2.16698e-08,0.892498,0.00207899,1.38987e-06,8.14756e-12,0.894579,0.
00208177,1.38989e-06,2.16371e-08,0.896662,0.00208462,1.45481e-06,-2.6952e-08,0.898748,0.00208744,1.37395e-06,2.65663e-08,0.900837,0.00209027,1.45365e-06,-1.97084e-08,0.902928,0.00209312,1.39452e-06,-7.33731e-09,0.905023,0.00209589,1.37251e-06,4.90578e-08,0.90712,0.00209878,1.51968e-06,-6.96845e-08,0.90922,0.00210161,1.31063e-06,5.08664e-08,0.911323,0.00210438,1.46323e-06,-1.45717e-08,0.913429,0.00210727,1.41952e-06,7.42038e-09,0.915538,0.00211013,1.44178e-06,-1.51097e-08,0.917649,0.00211297,1.39645e-06,-6.58618e-09,0.919764,0.00211574,1.37669e-06,4.14545e-08,0.921881,0.00211862,1.50105e-06,-4.00222e-08,0.924001,0.0021215,1.38099e-06,-5.7518e-10,0.926124,0.00212426,1.37926e-06,4.23229e-08,0.92825,0.00212714,1.50623e-06,-4.9507e-08,0.930378,0.00213001,1.35771e-06,3.64958e-08,0.93251,0.00213283,1.4672e-06,-3.68713e-08,0.934644,0.00213566,1.35658e-06,5.13848e-08,0.936781,0.00213852,1.51074e-06,-4.94585e-08,0.938921,0.0021414,1.36236e-06,2.72399e-08,0.941064,0.0021442,1.44408e-06,1.0372e-10,0.943209,0.00214709,1.44439e-06,-2.76547e-08,0.945358,0.0021499,1.36143e-06,5.09106e-08,0.947509,0.00215277,1.51416e-06,-5.67784e-08,0.949663,0.00215563,1.34382e-06,5.69935e-08,0.95182,0.00215849,1.5148e-06,-5.19861e-08,0.95398,0.00216136,1.35885e-06,3.17417e-08,0.956143,0.00216418,1.45407e-06,-1.53758e-08,0.958309,0.00216704,1.40794e-06,2.97615e-08,0.960477,0.00216994,1.49723e-06,-4.40657e-08,0.962649,0.00217281,1.36503e-06,2.72919e-08,0.964823,0.00217562,1.44691e-06,-5.49729e-09,0.967,0.0021785,1.43041e-06,-5.30273e-09,0.96918,0.00218134,1.41451e-06,2.67084e-08,0.971363,0.00218425,1.49463e-06,-4.19265e-08,0.973548,0.00218711,1.36885e-06,2.17881e-08,0.975737,0.00218992,1.43422e-06,1.43789e-08,0.977928,0.00219283,1.47735e-06,-1.96989e-08,0.980122,0.00219572,1.41826e-06,4.81221e-09,0.98232,0.00219857,1.43269e-06,4.50048e-10,0.98452,0.00220144,1.43404e-06,-6.61237e-09,0.986722,0.00220429,1.41421e-06,2.59993e-08,0.988928,0.0022072,1.4922e-06,-3.77803e-08,0.991137,0.00221007,1.37886e-06,5
.9127e-09,0.993348,0.00221284,1.3966e-06,1.33339e-07,0.995563,0.00221604,1.79662e-06,-5.98872e-07,0.99778,0.00222015,0.,0.};
+
+        // Converts a single floating-point pixel from RGB/BGR to CIE L*a*b*.
+        //
+        //   srgb    - when true, every channel is first remapped through the c_sRGBGammaTab
+        //             spline table before the colour transform is applied.
+        //   blueIdx - 0 means blue is read from src.x and red from src.z; any other value
+        //             swaps the two. Green is always src.y.
+        //   src     - input pixel; only .x/.y/.z are read (presumably normalised to [0, 1],
+        //             since each channel is scaled by GAMMA_TAB_SIZE to index the table -
+        //             TODO confirm against callers).
+        //   dst     - output pixel; .x/.y/.z receive L, a and b. No other member is written.
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void RGB2LabConvert_f(const T& src, D& dst)
+        {
+            const float oneThird  = 1.0f / 3.0f;
+            const float linOffset = 16.0f / 116.0f;
+            const float threshold = 0.008856f;
+            const float linSlope  = 7.787f;
+
+            // Pick the channels according to the compile-time channel order.
+            float blue;
+            float green = src.y;
+            float red;
+            if (blueIdx == 0)
+            {
+                blue = src.x;
+                red  = src.z;
+            }
+            else
+            {
+                blue = src.z;
+                red  = src.x;
+            }
+
+            if (srgb)
+            {
+                blue  = splineInterpolate(blue  * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                green = splineInterpolate(green * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                red   = splineInterpolate(red   * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+            }
+
+            // Linear RGB -> XYZ. NOTE(review): the X and Z rows look like the sRGB/D65 matrix
+            // already divided by the reference white, so no separate normalisation follows.
+            const float xn = blue * 0.189828f + green * 0.376219f + red * 0.433953f;
+            const float yn = blue * 0.072169f + green * 0.715160f + red * 0.212671f;
+            const float zn = blue * 0.872766f + green * 0.109477f + red * 0.017758f;
+
+            // Cube root above the threshold, linear segment at or below it.
+            const float fx = xn > threshold ? ::powf(xn, oneThird) : (linSlope * xn + linOffset);
+            const float fy = yn > threshold ? ::powf(yn, oneThird) : (linSlope * yn + linOffset);
+            const float fz = zn > threshold ? ::powf(zn, oneThird) : (linSlope * zn + linOffset);
+
+            // Lightness uses the same threshold on Y, with its own linear segment.
+            float lightness;
+            if (yn > threshold)
+                lightness = 116.f * fy - 16.f;
+            else
+                lightness = 903.3f * yn;
+
+            dst.x = lightness;
+            dst.y = 500.f * (fx - fy);
+            dst.z = 200.f * (fy - fz);
+        }
+
+        // Per-pixel RGB/BGR -> Lab functor, declared here without a body; the element type
+        // selects one of the partial specialisations.
+        //   scn / dcn - channel counts used to pick the source / destination vector types.
+        //   srgb, blueIdx - forwarded unchanged to the conversion routine.
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct RGB2Lab;
+
+        // uchar specialisation: each call converts one pixel through RGB2LabConvert_b.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Lab<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            // The functor holds no state, so both constructors are empty.
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
+
+            // Returns the Lab pixel computed from src.
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<uchar, dcn>::vec_type lab_type;
+
+                lab_type lab;
+                RGB2LabConvert_b<srgb, blueIdx>(src, lab);
+                return lab;
+            }
+        };
+        // float specialisation of the RGB/BGR -> Lab functor: each call converts one pixel
+        // through RGB2LabConvert_f.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Lab<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            // The functor holds no state, so both constructors are empty.
+            __host__ __device__ __forceinline__ RGB2Lab() {}
+            __host__ __device__ __forceinline__ RGB2Lab(const RGB2Lab&) {}
+
+            // Returns the Lab pixel computed from src.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typedef typename TypeVec<float, dcn>::vec_type lab_type;
+
+                lab_type lab;
+                RGB2LabConvert_f<srgb, blueIdx>(src, lab);
+                return lab;
+            }
+        };
+    }
+
+// Defines a traits template named `<name>_traits<T>` for one RGB->Lab conversion variant.
+//   name           - prefix pasted in front of `_traits` to form the struct name.
+//   scn, dcn       - source / destination channel counts passed to RGB2Lab.
+//   srgb, blueIdx  - passed through unchanged as RGB2Lab template arguments.
+// The generated struct exposes `functor_type` (the matching color_detail::RGB2Lab
+// instantiation for element type T) and a static `create_functor()` that returns a
+// default-constructed instance of it.
+#define OPENCV_CUDA_IMPLEMENT_RGB2Lab_TRAITS(name, scn, dcn, srgb, blueIdx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2Lab<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        __constant__ float c_sRGBInvGammaTab[] = {0,0.0126255,0.,-8.33961e-06,0.0126172,0.0126005,-2.50188e-05,4.1698e-05,0.0252344,0.0126756,0.000100075,-0.000158451,0.0378516,0.0124004,-0.000375277,-0.000207393,0.0496693,0.0110276,-0.000997456,0.00016837,0.0598678,0.00953783,-0.000492346,2.07235e-05,0.068934,0.00861531,-0.000430176,3.62876e-05,0.0771554,0.00786382,-0.000321313,1.87625e-05,0.0847167,0.00727748,-0.000265025,1.53594e-05,0.0917445,0.00679351,-0.000218947,1.10545e-05,0.0983301,0.00638877,-0.000185784,8.66984e-06,0.104542,0.00604322,-0.000159774,6.82996e-06,0.110432,0.00574416,-0.000139284,5.51008e-06,0.116042,0.00548212,-0.000122754,4.52322e-06,0.121406,0.00525018,-0.000109184,3.75557e-06,0.126551,0.00504308,-9.79177e-05,3.17134e-06,0.131499,0.00485676,-8.84037e-05,2.68469e-06,0.13627,0.004688,-8.03496e-05,2.31725e-06,0.14088,0.00453426,-7.33978e-05,2.00868e-06,0.145343,0.00439349,-6.73718e-05,1.74775e-06,0.149671,0.00426399,-6.21286e-05,1.53547e-06,0.153875,0.00414434,-5.75222e-05,1.364e-06,0.157963,0.00403338,-5.34301e-05,1.20416e-06,0.161944,0.00393014,-4.98177e-05,1.09114e-06,0.165825,0.00383377,-4.65443e-05,9.57987e-07,0.169613,0.00374356,-4.36703e-05,8.88359e-07,0.173314,0.00365888,-4.10052e-05,7.7849e-07,0.176933,0.00357921,-3.86697e-05,7.36254e-07,0.180474,0.00350408,-3.6461e-05,6.42534e-07,0.183942,0.00343308,-3.45334e-05,6.12614e-07,0.187342,0.00336586,-3.26955e-05,5.42894e-07,0.190675,0.00330209,-3.10669e-05,5.08967e-07,0.193947,0.00324149,-2.954e-05,4.75977e-07,0.197159,0.00318383,-2.8112e-05,4.18343e-07,0.200315,0.00312887,-2.6857e-05,4.13651e-07,0.203418,0.00307639,-2.5616e-05,3.70847e-07,0.206469,0.00302627,-2.45035e-05,3.3813e-07,0.209471,0.00297828,-2.34891e-05,3.32999e-07,0.212426,0.0029323,-2.24901e-05,2.96826e-07,0.215336,0.00288821,-2.15996e-05,2.82736e-07,0.218203,0.00284586,-2.07514e-05,2.70961e-07,0.221029,0.00280517,-1.99385e-05,2.42744e-07,0.223814,0.00276602,-1.92103e-05,2.33277e-07,0.226561,0.0027283,-1.85105e-05,2.2486e-07
,0.229271,0.00269195,-1.78359e-05,2.08383e-07,0.231945,0.00265691,-1.72108e-05,1.93305e-07,0.234585,0.00262307,-1.66308e-05,1.80687e-07,0.237192,0.00259035,-1.60888e-05,1.86632e-07,0.239766,0.00255873,-1.55289e-05,1.60569e-07,0.24231,0.00252815,-1.50472e-05,1.54566e-07,0.244823,0.00249852,-1.45835e-05,1.59939e-07,0.247307,0.00246983,-1.41037e-05,1.29549e-07,0.249763,0.00244202,-1.3715e-05,1.41429e-07,0.252191,0.00241501,-1.32907e-05,1.39198e-07,0.254593,0.00238885,-1.28731e-05,1.06444e-07,0.256969,0.00236342,-1.25538e-05,1.2048e-07,0.25932,0.00233867,-1.21924e-05,1.26892e-07,0.261647,0.00231467,-1.18117e-05,8.72084e-08,0.26395,0.00229131,-1.15501e-05,1.20323e-07,0.26623,0.00226857,-1.11891e-05,8.71514e-08,0.268487,0.00224645,-1.09276e-05,9.73165e-08,0.270723,0.00222489,-1.06357e-05,8.98259e-08,0.272937,0.00220389,-1.03662e-05,7.98218e-08,0.275131,0.00218339,-1.01267e-05,9.75254e-08,0.277304,0.00216343,-9.83416e-06,6.65195e-08,0.279458,0.00214396,-9.63461e-06,8.34313e-08,0.281592,0.00212494,-9.38431e-06,7.65919e-08,0.283708,0.00210641,-9.15454e-06,5.7236e-08,0.285805,0.00208827,-8.98283e-06,8.18939e-08,0.287885,0.00207055,-8.73715e-06,6.2224e-08,0.289946,0.00205326,-8.55047e-06,5.66388e-08,0.291991,0.00203633,-8.38056e-06,6.88491e-08,0.294019,0.00201978,-8.17401e-06,5.53955e-08,0.296031,0.00200359,-8.00782e-06,6.71971e-08,0.298027,0.00198778,-7.80623e-06,3.34439e-08,0.300007,0.00197227,-7.7059e-06,6.7248e-08,0.301971,0.00195706,-7.50416e-06,5.51915e-08,0.303921,0.00194221,-7.33858e-06,3.98124e-08,0.305856,0.00192766,-7.21915e-06,5.37795e-08,0.307776,0.00191338,-7.05781e-06,4.30919e-08,0.309683,0.00189939,-6.92853e-06,4.20744e-08,0.311575,0.00188566,-6.80231e-06,5.68321e-08,0.313454,0.00187223,-6.63181e-06,2.86195e-08,0.31532,0.00185905,-6.54595e-06,3.73075e-08,0.317172,0.00184607,-6.43403e-06,6.05684e-08,0.319012,0.00183338,-6.25233e-06,1.84426e-08,0.320839,0.00182094,-6.197e-06,4.44757e-08,0.322654,0.00180867,-6.06357e-06,4.20729e-08,0.324456,0.00179667,-5.93735e-06
,2.56511e-08,0.326247,0.00178488,-5.8604e-06,3.41368e-08,0.328026,0.00177326,-5.75799e-06,4.64177e-08,0.329794,0.00176188,-5.61874e-06,1.86107e-08,0.33155,0.0017507,-5.5629e-06,2.81511e-08,0.333295,0.00173966,-5.47845e-06,4.75987e-08,0.335029,0.00172884,-5.33565e-06,1.98726e-08,0.336753,0.00171823,-5.27604e-06,2.19226e-08,0.338466,0.00170775,-5.21027e-06,4.14483e-08,0.340169,0.00169745,-5.08592e-06,2.09017e-08,0.341861,0.00168734,-5.02322e-06,2.39561e-08,0.343543,0.00167737,-4.95135e-06,3.22852e-08,0.345216,0.00166756,-4.85449e-06,2.57173e-08,0.346878,0.00165793,-4.77734e-06,1.38569e-08,0.348532,0.00164841,-4.73577e-06,3.80634e-08,0.350175,0.00163906,-4.62158e-06,1.27043e-08,0.35181,0.00162985,-4.58347e-06,3.03279e-08,0.353435,0.00162078,-4.49249e-06,1.49961e-08,0.355051,0.00161184,-4.4475e-06,2.88977e-08,0.356659,0.00160303,-4.3608e-06,1.84241e-08,0.358257,0.00159436,-4.30553e-06,1.6616e-08,0.359848,0.0015858,-4.25568e-06,3.43218e-08,0.361429,0.00157739,-4.15272e-06,-4.89172e-09,0.363002,0.00156907,-4.16739e-06,4.48498e-08,0.364567,0.00156087,-4.03284e-06,4.30676e-09,0.366124,0.00155282,-4.01992e-06,2.73303e-08,0.367673,0.00154486,-3.93793e-06,5.58036e-09,0.369214,0.001537,-3.92119e-06,3.97554e-08,0.370747,0.00152928,-3.80193e-06,-1.55904e-08,0.372272,0.00152163,-3.8487e-06,5.24081e-08,0.37379,0.00151409,-3.69147e-06,-1.52272e-08,0.375301,0.00150666,-3.73715e-06,3.83028e-08,0.376804,0.0014993,-3.62225e-06,1.10278e-08,0.378299,0.00149209,-3.58916e-06,6.99326e-09,0.379788,0.00148493,-3.56818e-06,2.06038e-08,0.381269,0.00147786,-3.50637e-06,2.98009e-08,0.382744,0.00147093,-3.41697e-06,-2.05978e-08,0.384211,0.00146404,-3.47876e-06,5.25899e-08,0.385672,0.00145724,-3.32099e-06,-1.09471e-08,0.387126,0.00145056,-3.35383e-06,2.10009e-08,0.388573,0.00144392,-3.29083e-06,1.63501e-08,0.390014,0.00143739,-3.24178e-06,3.00641e-09,0.391448,0.00143091,-3.23276e-06,3.12282e-08,0.392875,0.00142454,-3.13908e-06,-8.70932e-09,0.394297,0.00141824,-3.16521e-06,3.34114e-08,0.395712,0.0014
1201,-3.06497e-06,-5.72754e-09,0.397121,0.00140586,-3.08215e-06,1.9301e-08,0.398524,0.00139975,-3.02425e-06,1.7931e-08,0.39992,0.00139376,-2.97046e-06,-1.61822e-09,0.401311,0.00138781,-2.97531e-06,1.83442e-08,0.402696,0.00138192,-2.92028e-06,1.76485e-08,0.404075,0.00137613,-2.86733e-06,4.68617e-10,0.405448,0.00137039,-2.86593e-06,1.02794e-08,0.406816,0.00136469,-2.83509e-06,1.80179e-08,0.408178,0.00135908,-2.78104e-06,7.05594e-09,0.409534,0.00135354,-2.75987e-06,1.33633e-08,0.410885,0.00134806,-2.71978e-06,-9.04568e-10,0.41223,0.00134261,-2.72249e-06,2.0057e-08,0.41357,0.00133723,-2.66232e-06,1.00841e-08,0.414905,0.00133194,-2.63207e-06,-7.88835e-10,0.416234,0.00132667,-2.63444e-06,2.28734e-08,0.417558,0.00132147,-2.56582e-06,-1.29785e-09,0.418877,0.00131633,-2.56971e-06,1.21205e-08,0.420191,0.00131123,-2.53335e-06,1.24202e-08,0.421499,0.0013062,-2.49609e-06,-2.19681e-09,0.422803,0.0013012,-2.50268e-06,2.61696e-08,0.424102,0.00129628,-2.42417e-06,-1.30747e-08,0.425396,0.00129139,-2.46339e-06,2.6129e-08,0.426685,0.00128654,-2.38501e-06,-2.03454e-09,0.427969,0.00128176,-2.39111e-06,1.18115e-08,0.429248,0.00127702,-2.35567e-06,1.43932e-08,0.430523,0.00127235,-2.31249e-06,-9.77965e-09,0.431793,0.00126769,-2.34183e-06,2.47253e-08,0.433058,0.00126308,-2.26766e-06,2.85278e-10,0.434319,0.00125855,-2.2668e-06,3.93614e-09,0.435575,0.00125403,-2.25499e-06,1.37722e-08,0.436827,0.00124956,-2.21368e-06,5.79803e-10,0.438074,0.00124513,-2.21194e-06,1.37112e-08,0.439317,0.00124075,-2.1708e-06,4.17973e-09,0.440556,0.00123642,-2.15826e-06,-6.27703e-10,0.44179,0.0012321,-2.16015e-06,2.81332e-08,0.44302,0.00122787,-2.07575e-06,-2.24985e-08,0.444246,0.00122365,-2.14324e-06,3.20586e-08,0.445467,0.00121946,-2.04707e-06,-1.6329e-08,0.446685,0.00121532,-2.09605e-06,3.32573e-08,0.447898,0.00121122,-1.99628e-06,-2.72927e-08,0.449107,0.00120715,-2.07816e-06,4.6111e-08,0.450312,0.00120313,-1.93983e-06,-3.79416e-08,0.451514,0.00119914,-2.05365e-06,4.60507e-08,0.452711,0.00119517,-1.9155e-06,-2.70
52e-08,0.453904,0.00119126,-1.99666e-06,3.23551e-08,0.455093,0.00118736,-1.89959e-06,-1.29613e-08,0.456279,0.00118352,-1.93848e-06,1.94905e-08,0.45746,0.0011797,-1.88e-06,-5.39588e-09,0.458638,0.00117593,-1.89619e-06,2.09282e-09,0.459812,0.00117214,-1.88991e-06,2.68267e-08,0.460982,0.00116844,-1.80943e-06,-1.99925e-08,0.462149,0.00116476,-1.86941e-06,2.3341e-08,0.463312,0.00116109,-1.79939e-06,-1.37674e-08,0.464471,0.00115745,-1.84069e-06,3.17287e-08,0.465627,0.00115387,-1.7455e-06,-2.37407e-08,0.466779,0.00115031,-1.81673e-06,3.34315e-08,0.467927,0.00114677,-1.71643e-06,-2.05786e-08,0.469073,0.00114328,-1.77817e-06,1.90802e-08,0.470214,0.00113978,-1.72093e-06,3.86247e-09,0.471352,0.00113635,-1.70934e-06,-4.72759e-09,0.472487,0.00113292,-1.72352e-06,1.50478e-08,0.473618,0.00112951,-1.67838e-06,4.14108e-09,0.474746,0.00112617,-1.66595e-06,-1.80986e-09,0.47587,0.00112283,-1.67138e-06,3.09816e-09,0.476991,0.0011195,-1.66209e-06,1.92198e-08,0.478109,0.00111623,-1.60443e-06,-2.03726e-08,0.479224,0.00111296,-1.66555e-06,3.2468e-08,0.480335,0.00110973,-1.56814e-06,-2.00922e-08,0.481443,0.00110653,-1.62842e-06,1.80983e-08,0.482548,0.00110333,-1.57413e-06,7.30362e-09,0.48365,0.0011002,-1.55221e-06,-1.75107e-08,0.484749,0.00109705,-1.60475e-06,3.29373e-08,0.485844,0.00109393,-1.50594e-06,-2.48315e-08,0.486937,0.00109085,-1.58043e-06,3.65865e-08,0.488026,0.0010878,-1.47067e-06,-3.21078e-08,0.489112,0.00108476,-1.56699e-06,3.22397e-08,0.490195,0.00108172,-1.47027e-06,-7.44391e-09,0.491276,0.00107876,-1.49261e-06,-2.46428e-09,0.492353,0.00107577,-1.5e-06,1.73011e-08,0.493427,0.00107282,-1.4481e-06,-7.13552e-09,0.494499,0.0010699,-1.4695e-06,1.1241e-08,0.495567,0.001067,-1.43578e-06,-8.02637e-09,0.496633,0.0010641,-1.45986e-06,2.08645e-08,0.497695,0.00106124,-1.39726e-06,-1.58271e-08,0.498755,0.0010584,-1.44475e-06,1.26415e-08,0.499812,0.00105555,-1.40682e-06,2.48655e-08,0.500866,0.00105281,-1.33222e-06,-5.24988e-08,0.501918,0.00104999,-1.48972e-06,6.59206e-08,0.502966,0.00104721
,-1.29196e-06,-3.237e-08,0.504012,0.00104453,-1.38907e-06,3.95479e-09,0.505055,0.00104176,-1.3772e-06,1.65509e-08,0.506096,0.00103905,-1.32755e-06,-1.05539e-08,0.507133,0.00103637,-1.35921e-06,2.56648e-08,0.508168,0.00103373,-1.28222e-06,-3.25007e-08,0.509201,0.00103106,-1.37972e-06,4.47336e-08,0.51023,0.00102844,-1.24552e-06,-2.72245e-08,0.511258,0.00102587,-1.32719e-06,4.55952e-09,0.512282,0.00102323,-1.31352e-06,8.98645e-09,0.513304,0.00102063,-1.28656e-06,1.90992e-08,0.514323,0.00101811,-1.22926e-06,-2.57786e-08,0.51534,0.00101557,-1.30659e-06,2.44104e-08,0.516355,0.00101303,-1.23336e-06,-1.22581e-08,0.517366,0.00101053,-1.27014e-06,2.4622e-08,0.518376,0.00100806,-1.19627e-06,-2.66253e-08,0.519383,0.00100559,-1.27615e-06,2.22744e-08,0.520387,0.00100311,-1.20932e-06,-2.8679e-09,0.521389,0.00100068,-1.21793e-06,-1.08029e-08,0.522388,0.000998211,-1.25034e-06,4.60795e-08,0.523385,0.000995849,-1.1121e-06,-5.4306e-08,0.52438,0.000993462,-1.27502e-06,5.19354e-08,0.525372,0.000991067,-1.11921e-06,-3.42262e-08,0.526362,0.000988726,-1.22189e-06,2.53646e-08,0.52735,0.000986359,-1.14579e-06,-7.62782e-09,0.528335,0.000984044,-1.16868e-06,5.14668e-09,0.529318,0.000981722,-1.15324e-06,-1.29589e-08,0.530298,0.000979377,-1.19211e-06,4.66888e-08,0.531276,0.000977133,-1.05205e-06,-5.45868e-08,0.532252,0.000974865,-1.21581e-06,5.24495e-08,0.533226,0.000972591,-1.05846e-06,-3.60019e-08,0.534198,0.000970366,-1.16647e-06,3.19537e-08,0.535167,0.000968129,-1.07061e-06,-3.2208e-08,0.536134,0.000965891,-1.16723e-06,3.72738e-08,0.537099,0.000963668,-1.05541e-06,2.32205e-09,0.538061,0.000961564,-1.04844e-06,-4.65618e-08,0.539022,0.000959328,-1.18813e-06,6.47159e-08,0.53998,0.000957146,-9.93979e-07,-3.3488e-08,0.540936,0.000955057,-1.09444e-06,9.63166e-09,0.54189,0.000952897,-1.06555e-06,-5.03871e-09,0.542842,0.000950751,-1.08066e-06,1.05232e-08,0.543792,0.000948621,-1.04909e-06,2.25503e-08,0.544739,0.000946591,-9.81444e-07,-4.11195e-08,0.545685,0.000944504,-1.1048e-06,2.27182e-08,0.546628,0
.000942363,-1.03665e-06,9.85146e-09,0.54757,0.000940319,-1.00709e-06,-2.51938e-09,0.548509,0.000938297,-1.01465e-06,2.25858e-10,0.549446,0.000936269,-1.01397e-06,1.61598e-09,0.550381,0.000934246,-1.00913e-06,-6.68983e-09,0.551315,0.000932207,-1.0292e-06,2.51434e-08,0.552246,0.000930224,-9.53765e-07,-3.42793e-08,0.553175,0.000928214,-1.0566e-06,5.23688e-08,0.554102,0.000926258,-8.99497e-07,-5.59865e-08,0.555028,0.000924291,-1.06746e-06,5.23679e-08,0.555951,0.000922313,-9.10352e-07,-3.42763e-08,0.556872,0.00092039,-1.01318e-06,2.51326e-08,0.557792,0.000918439,-9.37783e-07,-6.64954e-09,0.558709,0.000916543,-9.57732e-07,1.46554e-09,0.559625,0.000914632,-9.53335e-07,7.87281e-10,0.560538,0.000912728,-9.50973e-07,-4.61466e-09,0.56145,0.000910812,-9.64817e-07,1.76713e-08,0.56236,0.000908935,-9.11804e-07,-6.46564e-09,0.563268,0.000907092,-9.312e-07,8.19121e-09,0.564174,0.000905255,-9.06627e-07,-2.62992e-08,0.565078,0.000903362,-9.85524e-07,3.74007e-08,0.565981,0.000901504,-8.73322e-07,-4.0942e-09,0.566882,0.000899745,-8.85605e-07,-2.1024e-08,0.56778,0.00089791,-9.48677e-07,2.85854e-08,0.568677,0.000896099,-8.62921e-07,-3.3713e-08,0.569573,0.000894272,-9.64059e-07,4.6662e-08,0.570466,0.000892484,-8.24073e-07,-3.37258e-08,0.571358,0.000890734,-9.25251e-07,2.86365e-08,0.572247,0.00088897,-8.39341e-07,-2.12155e-08,0.573135,0.000887227,-9.02988e-07,-3.37913e-09,0.574022,0.000885411,-9.13125e-07,3.47319e-08,0.574906,0.000883689,-8.08929e-07,-1.63394e-08,0.575789,0.000882022,-8.57947e-07,-2.8979e-08,0.57667,0.00088022,-9.44885e-07,7.26509e-08,0.57755,0.000878548,-7.26932e-07,-8.28106e-08,0.578427,0.000876845,-9.75364e-07,7.97774e-08,0.579303,0.000875134,-7.36032e-07,-5.74849e-08,0.580178,0.00087349,-9.08486e-07,3.09529e-08,0.58105,0.000871765,-8.15628e-07,-6.72206e-09,0.581921,0.000870114,-8.35794e-07,-4.06451e-09,0.582791,0.00086843,-8.47987e-07,2.29799e-08,0.583658,0.000866803,-7.79048e-07,-2.82503e-08,0.584524,0.00086516,-8.63799e-07,3.04167e-08,0.585388,0.000863524,-7.72548e-07
,-3.38119e-08,0.586251,0.000861877,-8.73984e-07,4.52264e-08,0.587112,0.000860265,-7.38305e-07,-2.78842e-08,0.587972,0.000858705,-8.21958e-07,6.70567e-09,0.58883,0.000857081,-8.01841e-07,1.06161e-09,0.589686,0.000855481,-7.98656e-07,-1.09521e-08,0.590541,0.00085385,-8.31512e-07,4.27468e-08,0.591394,0.000852316,-7.03272e-07,-4.08257e-08,0.592245,0.000850787,-8.25749e-07,1.34677e-09,0.593095,0.000849139,-8.21709e-07,3.54387e-08,0.593944,0.000847602,-7.15393e-07,-2.38924e-08,0.59479,0.0008461,-7.8707e-07,5.26143e-10,0.595636,0.000844527,-7.85491e-07,2.17879e-08,0.596479,0.000843021,-7.20127e-07,-2.80733e-08,0.597322,0.000841497,-8.04347e-07,3.09005e-08,0.598162,0.000839981,-7.11646e-07,-3.5924e-08,0.599002,0.00083845,-8.19418e-07,5.3191e-08,0.599839,0.000836971,-6.59845e-07,-5.76307e-08,0.600676,0.000835478,-8.32737e-07,5.81227e-08,0.60151,0.000833987,-6.58369e-07,-5.56507e-08,0.602344,0.000832503,-8.25321e-07,4.52706e-08,0.603175,0.000830988,-6.89509e-07,-6.22236e-09,0.604006,0.000829591,-7.08176e-07,-2.03811e-08,0.604834,0.000828113,-7.6932e-07,2.8142e-08,0.605662,0.000826659,-6.84894e-07,-3.25822e-08,0.606488,0.000825191,-7.8264e-07,4.25823e-08,0.607312,0.000823754,-6.54893e-07,-1.85376e-08,0.608135,0.000822389,-7.10506e-07,-2.80365e-08,0.608957,0.000820883,-7.94616e-07,7.1079e-08,0.609777,0.000819507,-5.81379e-07,-7.74655e-08,0.610596,0.000818112,-8.13775e-07,5.9969e-08,0.611413,0.000816665,-6.33868e-07,-4.32013e-08,0.612229,0.000815267,-7.63472e-07,5.32313e-08,0.613044,0.0008139,-6.03778e-07,-5.05148e-08,0.613857,0.000812541,-7.55323e-07,2.96187e-08,0.614669,0.000811119,-6.66466e-07,-8.35545e-09,0.615479,0.000809761,-6.91533e-07,3.80301e-09,0.616288,0.00080839,-6.80124e-07,-6.85666e-09,0.617096,0.000807009,-7.00694e-07,2.36237e-08,0.617903,0.000805678,-6.29822e-07,-2.80336e-08,0.618708,0.000804334,-7.13923e-07,2.8906e-08,0.619511,0.000802993,-6.27205e-07,-2.79859e-08,0.620314,0.000801655,-7.11163e-07,2.34329e-08,0.621114,0.000800303,-6.40864e-07,-6.14108e-09,0.6219
14,0.000799003,-6.59287e-07,1.13151e-09,0.622712,0.000797688,-6.55893e-07,1.61507e-09,0.62351,0.000796381,-6.51048e-07,-7.59186e-09,0.624305,0.000795056,-6.73823e-07,2.87524e-08,0.6251,0.000793794,-5.87566e-07,-4.7813e-08,0.625893,0.000792476,-7.31005e-07,4.32901e-08,0.626685,0.000791144,-6.01135e-07,-6.13814e-09,0.627475,0.000789923,-6.19549e-07,-1.87376e-08,0.628264,0.000788628,-6.75762e-07,2.14837e-08,0.629052,0.000787341,-6.11311e-07,-7.59265e-09,0.629839,0.000786095,-6.34089e-07,8.88692e-09,0.630625,0.000784854,-6.07428e-07,-2.7955e-08,0.631409,0.000783555,-6.91293e-07,4.33285e-08,0.632192,0.000782302,-5.61307e-07,-2.61497e-08,0.632973,0.000781101,-6.39757e-07,1.6658e-09,0.633754,0.000779827,-6.34759e-07,1.94866e-08,0.634533,0.000778616,-5.76299e-07,-2.00076e-08,0.635311,0.000777403,-6.36322e-07,9.39091e-10,0.636088,0.000776133,-6.33505e-07,1.62512e-08,0.636863,0.000774915,-5.84751e-07,-6.33937e-09,0.637638,0.000773726,-6.03769e-07,9.10609e-09,0.638411,0.000772546,-5.76451e-07,-3.00849e-08,0.639183,0.000771303,-6.66706e-07,5.1629e-08,0.639953,0.000770125,-5.11819e-07,-5.7222e-08,0.640723,0.000768929,-6.83485e-07,5.80497e-08,0.641491,0.000767736,-5.09336e-07,-5.57674e-08,0.642259,0.000766551,-6.76638e-07,4.58105e-08,0.643024,0.000765335,-5.39206e-07,-8.26541e-09,0.643789,0.000764231,-5.64002e-07,-1.27488e-08,0.644553,0.000763065,-6.02249e-07,-3.44168e-10,0.645315,0.00076186,-6.03281e-07,1.41254e-08,0.646077,0.000760695,-5.60905e-07,3.44727e-09,0.646837,0.000759584,-5.50563e-07,-2.79144e-08,0.647596,0.000758399,-6.34307e-07,4.86057e-08,0.648354,0.000757276,-4.88489e-07,-4.72989e-08,0.64911,0.000756158,-6.30386e-07,2.13807e-08,0.649866,0.000754961,-5.66244e-07,2.13808e-08,0.65062,0.000753893,-5.02102e-07,-4.7299e-08,0.651374,0.000752746,-6.43999e-07,4.86059e-08,0.652126,0.000751604,-4.98181e-07,-2.79154e-08,0.652877,0.000750524,-5.81927e-07,3.45089e-09,0.653627,0.000749371,-5.71575e-07,1.41119e-08,0.654376,0.00074827,-5.29239e-07,-2.93748e-10,0.655123,0.00074721,-
5.3012e-07,-1.29368e-08,0.65587,0.000746111,-5.68931e-07,-7.56355e-09,0.656616,0.000744951,-5.91621e-07,4.3191e-08,0.65736,0.000743897,-4.62048e-07,-4.59911e-08,0.658103,0.000742835,-6.00022e-07,2.15642e-08,0.658846,0.0007417,-5.35329e-07,1.93389e-08,0.659587,0.000740687,-4.77312e-07,-3.93152e-08,0.660327,0.000739615,-5.95258e-07,1.87126e-08,0.661066,0.00073848,-5.3912e-07,2.40695e-08,0.661804,0.000737474,-4.66912e-07,-5.53859e-08,0.662541,0.000736374,-6.33069e-07,7.82648e-08,0.663277,0.000735343,-3.98275e-07,-7.88593e-08,0.664012,0.00073431,-6.34853e-07,5.83585e-08,0.664745,0.000733215,-4.59777e-07,-3.53656e-08,0.665478,0.000732189,-5.65874e-07,2.34994e-08,0.66621,0.000731128,-4.95376e-07,9.72743e-10,0.66694,0.00073014,-4.92458e-07,-2.73903e-08,0.66767,0.000729073,-5.74629e-07,4.89839e-08,0.668398,0.000728071,-4.27677e-07,-4.93359e-08,0.669126,0.000727068,-5.75685e-07,2.91504e-08,0.669853,0.000726004,-4.88234e-07,-7.66109e-09,0.670578,0.000725004,-5.11217e-07,1.49392e-09,0.671303,0.000723986,-5.06735e-07,1.68533e-09,0.672026,0.000722978,-5.01679e-07,-8.23525e-09,0.672749,0.00072195,-5.26385e-07,3.12556e-08,0.67347,0.000720991,-4.32618e-07,-5.71825e-08,0.674191,0.000719954,-6.04166e-07,7.8265e-08,0.67491,0.00071898,-3.69371e-07,-7.70634e-08,0.675628,0.00071801,-6.00561e-07,5.11747e-08,0.676346,0.000716963,-4.47037e-07,-8.42615e-09,0.677062,0.000716044,-4.72315e-07,-1.747e-08,0.677778,0.000715046,-5.24725e-07,1.87015e-08,0.678493,0.000714053,-4.68621e-07,2.26856e-09,0.679206,0.000713123,-4.61815e-07,-2.77758e-08,0.679919,0.000712116,-5.45142e-07,4.92298e-08,0.68063,0.000711173,-3.97453e-07,-4.99339e-08,0.681341,0.000710228,-5.47255e-07,3.12967e-08,0.682051,0.000709228,-4.53365e-07,-1.56481e-08,0.68276,0.000708274,-5.00309e-07,3.12958e-08,0.683467,0.000707367,-4.06422e-07,-4.99303e-08,0.684174,0.000706405,-5.56213e-07,4.9216e-08,0.68488,0.00070544,-4.08565e-07,-2.77245e-08,0.685585,0.00070454,-4.91738e-07,2.07748e-09,0.686289,0.000703562,-4.85506e-07,1.94146e-08,0.686
992,0.00070265,-4.27262e-07,-2.01314e-08,0.687695,0.000701735,-4.87656e-07,1.50616e-09,0.688396,0.000700764,-4.83137e-07,1.41067e-08,0.689096,0.00069984,-4.40817e-07,1.67168e-09,0.689795,0.000698963,-4.35802e-07,-2.07934e-08,0.690494,0.000698029,-4.98182e-07,2.18972e-08,0.691192,0.000697099,-4.32491e-07,-7.19092e-09,0.691888,0.000696212,-4.54064e-07,6.86642e-09,0.692584,0.000695325,-4.33464e-07,-2.02747e-08,0.693279,0.000694397,-4.94288e-07,1.46279e-08,0.693973,0.000693452,-4.50405e-07,2.13678e-08,0.694666,0.000692616,-3.86301e-07,-4.04945e-08,0.695358,0.000691721,-5.07785e-07,2.14009e-08,0.696049,0.00069077,-4.43582e-07,1.44955e-08,0.69674,0.000689926,-4.00096e-07,-1.97783e-08,0.697429,0.000689067,-4.5943e-07,5.01296e-09,0.698118,0.000688163,-4.44392e-07,-2.73521e-10,0.698805,0.000687273,-4.45212e-07,-3.91893e-09,0.699492,0.000686371,-4.56969e-07,1.59493e-08,0.700178,0.000685505,-4.09121e-07,-2.73351e-10,0.700863,0.000684686,-4.09941e-07,-1.4856e-08,0.701548,0.000683822,-4.54509e-07,9.25979e-11,0.702231,0.000682913,-4.54231e-07,1.44855e-08,0.702913,0.000682048,-4.10775e-07,1.56992e-09,0.703595,0.000681231,-4.06065e-07,-2.07652e-08,0.704276,0.000680357,-4.68361e-07,2.18864e-08,0.704956,0.000679486,-4.02701e-07,-7.17595e-09,0.705635,0.000678659,-4.24229e-07,6.81748e-09,0.706313,0.000677831,-4.03777e-07,-2.0094e-08,0.70699,0.000676963,-4.64059e-07,1.39538e-08,0.707667,0.000676077,-4.22197e-07,2.38835e-08,0.708343,0.000675304,-3.50547e-07,-4.98831e-08,0.709018,0.000674453,-5.00196e-07,5.64395e-08,0.709692,0.000673622,-3.30878e-07,-5.66657e-08,0.710365,0.00067279,-5.00875e-07,5.1014e-08,0.711037,0.000671942,-3.47833e-07,-2.81809e-08,0.711709,0.000671161,-4.32376e-07,2.10513e-09,0.712379,0.000670303,-4.2606e-07,1.97604e-08,0.713049,0.00066951,-3.66779e-07,-2.15422e-08,0.713718,0.000668712,-4.31406e-07,6.8038e-09,0.714387,0.000667869,-4.10994e-07,-5.67295e-09,0.715054,0.00066703,-4.28013e-07,1.5888e-08,0.715721,0.000666222,-3.80349e-07,1.72576e-09,0.716387,0.000665467,-3.
75172e-07,-2.27911e-08,0.717052,0.000664648,-4.43545e-07,2.9834e-08,0.717716,0.00066385,-3.54043e-07,-3.69401e-08,0.718379,0.000663031,-4.64864e-07,5.83219e-08,0.719042,0.000662277,-2.89898e-07,-7.71382e-08,0.719704,0.000661465,-5.21313e-07,7.14171e-08,0.720365,0.000660637,-3.07061e-07,-2.97161e-08,0.721025,0.000659934,-3.96209e-07,-1.21575e-08,0.721685,0.000659105,-4.32682e-07,1.87412e-08,0.722343,0.000658296,-3.76458e-07,-3.2029e-09,0.723001,0.000657533,-3.86067e-07,-5.9296e-09,0.723659,0.000656743,-4.03856e-07,2.69213e-08,0.724315,0.000656016,-3.23092e-07,-4.21511e-08,0.724971,0.000655244,-4.49545e-07,2.24737e-08,0.725625,0.000654412,-3.82124e-07,1.18611e-08,0.726279,0.000653683,-3.46541e-07,-1.03132e-08,0.726933,0.000652959,-3.7748e-07,-3.02128e-08,0.727585,0.000652114,-4.68119e-07,7.15597e-08,0.728237,0.000651392,-2.5344e-07,-7.72119e-08,0.728888,0.000650654,-4.85075e-07,5.8474e-08,0.729538,0.000649859,-3.09654e-07,-3.74746e-08,0.730188,0.000649127,-4.22077e-07,3.18197e-08,0.730837,0.000648379,-3.26618e-07,-3.01997e-08,0.731485,0.000647635,-4.17217e-07,2.93747e-08,0.732132,0.000646888,-3.29093e-07,-2.76943e-08,0.732778,0.000646147,-4.12176e-07,2.17979e-08,0.733424,0.000645388,-3.46783e-07,1.07292e-10,0.734069,0.000644695,-3.46461e-07,-2.22271e-08,0.734713,0.000643935,-4.13142e-07,2.91963e-08,0.735357,0.000643197,-3.25553e-07,-3.49536e-08,0.736,0.000642441,-4.30414e-07,5.10133e-08,0.736642,0.000641733,-2.77374e-07,-4.98904e-08,0.737283,0.000641028,-4.27045e-07,2.93392e-08,0.737924,0.000640262,-3.39028e-07,-7.86156e-09,0.738564,0.000639561,-3.62612e-07,2.10703e-09,0.739203,0.000638842,-3.56291e-07,-5.6653e-10,0.739842,0.000638128,-3.57991e-07,1.59086e-10,0.740479,0.000637412,-3.57513e-07,-6.98321e-11,0.741116,0.000636697,-3.57723e-07,1.20214e-10,0.741753,0.000635982,-3.57362e-07,-4.10987e-10,0.742388,0.000635266,-3.58595e-07,1.5237e-09,0.743023,0.000634553,-3.54024e-07,-5.68376e-09,0.743657,0.000633828,-3.71075e-07,2.12113e-08,0.744291,0.00063315,-3.07441e-07,-1.
95569e-08,0.744924,0.000632476,-3.66112e-07,-2.58816e-09,0.745556,0.000631736,-3.73877e-07,2.99096e-08,0.746187,0.000631078,-2.84148e-07,-5.74454e-08,0.746818,0.000630337,-4.56484e-07,8.06629e-08,0.747448,0.000629666,-2.14496e-07,-8.63922e-08,0.748077,0.000628978,-4.73672e-07,8.60918e-08,0.748706,0.000628289,-2.15397e-07,-7.91613e-08,0.749334,0.000627621,-4.5288e-07,5.17393e-08,0.749961,0.00062687,-2.97663e-07,-8.58662e-09,0.750588,0.000626249,-3.23422e-07,-1.73928e-08,0.751214,0.00062555,-3.75601e-07,1.85532e-08,0.751839,0.000624855,-3.19941e-07,2.78479e-09,0.752463,0.000624223,-3.11587e-07,-2.96923e-08,0.753087,0.000623511,-4.00664e-07,5.63799e-08,0.75371,0.000622879,-2.31524e-07,-7.66179e-08,0.754333,0.000622186,-4.61378e-07,7.12778e-08,0.754955,0.000621477,-2.47545e-07,-2.96794e-08,0.755576,0.000620893,-3.36583e-07,-1.21648e-08,0.756196,0.000620183,-3.73077e-07,1.87339e-08,0.756816,0.000619493,-3.16875e-07,-3.16622e-09,0.757435,0.00061885,-3.26374e-07,-6.0691e-09,0.758054,0.000618179,-3.44581e-07,2.74426e-08,0.758672,0.000617572,-2.62254e-07,-4.40968e-08,0.759289,0.000616915,-3.94544e-07,2.97352e-08,0.759906,0.000616215,-3.05338e-07,-1.52393e-08,0.760522,0.000615559,-3.51056e-07,3.12221e-08,0.761137,0.000614951,-2.5739e-07,-5.00443e-08,0.761751,0.000614286,-4.07523e-07,4.9746e-08,0.762365,0.00061362,-2.58285e-07,-2.97303e-08,0.762979,0.000613014,-3.47476e-07,9.57079e-09,0.763591,0.000612348,-3.18764e-07,-8.55287e-09,0.764203,0.000611685,-3.44422e-07,2.46407e-08,0.764815,0.00061107,-2.705e-07,-3.04053e-08,0.765426,0.000610437,-3.61716e-07,3.73759e-08,0.766036,0.000609826,-2.49589e-07,-5.94935e-08,0.766645,0.000609149,-4.28069e-07,8.13889e-08,0.767254,0.000608537,-1.83902e-07,-8.72483e-08,0.767862,0.000607907,-4.45647e-07,8.87901e-08,0.76847,0.000607282,-1.79277e-07,-8.90983e-08,0.769077,0.000606656,-4.46572e-07,8.87892e-08,0.769683,0.000606029,-1.80204e-07,-8.72446e-08,0.770289,0.000605407,-4.41938e-07,8.13752e-08,0.770894,0.000604768,-1.97812e-07,-5.94423e-08,0.
771498,0.000604194,-3.76139e-07,3.71848e-08,0.772102,0.000603553,-2.64585e-07,-2.96922e-08,0.772705,0.000602935,-3.53661e-07,2.19793e-08,0.773308,0.000602293,-2.87723e-07,1.37955e-09,0.77391,0.000601722,-2.83585e-07,-2.74976e-08,0.774512,0.000601072,-3.66077e-07,4.9006e-08,0.775112,0.000600487,-2.19059e-07,-4.93171e-08,0.775712,0.000599901,-3.67011e-07,2.90531e-08,0.776312,0.000599254,-2.79851e-07,-7.29081e-09,0.776911,0.000598673,-3.01724e-07,1.10077e-10,0.777509,0.00059807,-3.01393e-07,6.85053e-09,0.778107,0.000597487,-2.80842e-07,-2.75123e-08,0.778704,0.000596843,-3.63379e-07,4.35939e-08,0.779301,0.000596247,-2.32597e-07,-2.7654e-08,0.779897,0.000595699,-3.15559e-07,7.41741e-09,0.780492,0.00059509,-2.93307e-07,-2.01562e-09,0.781087,0.000594497,-2.99354e-07,6.45059e-10,0.781681,0.000593901,-2.97418e-07,-5.64635e-10,0.782275,0.000593304,-2.99112e-07,1.61347e-09,0.782868,0.000592711,-2.94272e-07,-5.88926e-09,0.78346,0.000592105,-3.1194e-07,2.19436e-08,0.784052,0.000591546,-2.46109e-07,-2.22805e-08,0.784643,0.000590987,-3.1295e-07,7.57368e-09,0.785234,0.000590384,-2.90229e-07,-8.01428e-09,0.785824,0.00058978,-3.14272e-07,2.44834e-08,0.786414,0.000589225,-2.40822e-07,-3.03148e-08,0.787003,0.000588652,-3.31766e-07,3.7171e-08,0.787591,0.0005881,-2.20253e-07,-5.87646e-08,0.788179,0.000587483,-3.96547e-07,7.86782e-08,0.788766,0.000586926,-1.60512e-07,-7.71342e-08,0.789353,0.000586374,-3.91915e-07,5.10444e-08,0.789939,0.000585743,-2.38782e-07,-7.83422e-09,0.790524,0.000585242,-2.62284e-07,-1.97076e-08,0.791109,0.000584658,-3.21407e-07,2.70598e-08,0.791693,0.000584097,-2.40228e-07,-2.89269e-08,0.792277,0.000583529,-3.27008e-07,2.90431e-08,0.792861,0.000582963,-2.39879e-07,-2.76409e-08,0.793443,0.0005824,-3.22802e-07,2.1916e-08,0.794025,0.00058182,-2.57054e-07,-4.18368e-10,0.794607,0.000581305,-2.58309e-07,-2.02425e-08,0.795188,0.000580727,-3.19036e-07,2.17838e-08,0.795768,0.000580155,-2.53685e-07,-7.28814e-09,0.796348,0.000579625,-2.75549e-07,7.36871e-09,0.796928,0.00057909
6,-2.53443e-07,-2.21867e-08,0.797506,0.000578523,-3.20003e-07,2.17736e-08,0.798085,0.000577948,-2.54683e-07,-5.30296e-09,0.798662,0.000577423,-2.70592e-07,-5.61698e-10,0.799239,0.00057688,-2.72277e-07,7.54977e-09,0.799816,0.000576358,-2.49627e-07,-2.96374e-08,0.800392,0.00057577,-3.38539e-07,5.1395e-08,0.800968,0.000575247,-1.84354e-07,-5.67335e-08,0.801543,0.000574708,-3.54555e-07,5.63297e-08,0.802117,0.000574168,-1.85566e-07,-4.93759e-08,0.802691,0.000573649,-3.33693e-07,2.19646e-08,0.803264,0.000573047,-2.678e-07,2.1122e-08,0.803837,0.000572575,-2.04433e-07,-4.68482e-08,0.804409,0.000572026,-3.44978e-07,4.70613e-08,0.804981,0.000571477,-2.03794e-07,-2.21877e-08,0.805552,0.000571003,-2.70357e-07,-1.79153e-08,0.806123,0.000570408,-3.24103e-07,3.42443e-08,0.806693,0.000569863,-2.2137e-07,1.47556e-10,0.807263,0.000569421,-2.20928e-07,-3.48345e-08,0.807832,0.000568874,-3.25431e-07,1.99812e-08,0.808401,0.000568283,-2.65487e-07,1.45143e-08,0.808969,0.000567796,-2.21945e-07,-1.84338e-08,0.809536,0.000567297,-2.77246e-07,-3.83608e-10,0.810103,0.000566741,-2.78397e-07,1.99683e-08,0.81067,0.000566244,-2.18492e-07,-1.98848e-08,0.811236,0.000565747,-2.78146e-07,-3.38976e-11,0.811801,0.000565191,-2.78248e-07,2.00204e-08,0.812366,0.000564695,-2.18187e-07,-2.04429e-08,0.812931,0.000564197,-2.79516e-07,2.1467e-09,0.813495,0.000563644,-2.73076e-07,1.18561e-08,0.814058,0.000563134,-2.37507e-07,1.00334e-08,0.814621,0.000562689,-2.07407e-07,-5.19898e-08,0.815183,0.000562118,-3.63376e-07,7.87163e-08,0.815745,0.000561627,-1.27227e-07,-8.40616e-08,0.816306,0.000561121,-3.79412e-07,7.87163e-08,0.816867,0.000560598,-1.43263e-07,-5.19898e-08,0.817428,0.000560156,-2.99233e-07,1.00335e-08,0.817988,0.000559587,-2.69132e-07,1.18559e-08,0.818547,0.000559085,-2.33564e-07,2.14764e-09,0.819106,0.000558624,-2.27122e-07,-2.04464e-08,0.819664,0.000558108,-2.88461e-07,2.00334e-08,0.820222,0.000557591,-2.28361e-07,-8.24277e-11,0.820779,0.000557135,-2.28608e-07,-1.97037e-08,0.821336,0.000556618,-2.87719
e-07,1.92925e-08,0.821893,0.000556101,-2.29841e-07,2.13831e-09,0.822448,0.000555647,-2.23427e-07,-2.78458e-08,0.823004,0.000555117,-3.06964e-07,4.96402e-08,0.823559,0.000554652,-1.58043e-07,-5.15058e-08,0.824113,0.000554181,-3.12561e-07,3.71737e-08,0.824667,0.000553668,-2.0104e-07,-3.75844e-08,0.82522,0.000553153,-3.13793e-07,5.35592e-08,0.825773,0.000552686,-1.53115e-07,-5.74431e-08,0.826326,0.000552207,-3.25444e-07,5.7004e-08,0.826878,0.000551728,-1.54433e-07,-5.13635e-08,0.827429,0.000551265,-3.08523e-07,2.92406e-08,0.82798,0.000550735,-2.20801e-07,-5.99424e-09,0.828531,0.000550276,-2.38784e-07,-5.26363e-09,0.829081,0.000549782,-2.54575e-07,2.70488e-08,0.82963,0.000549354,-1.73429e-07,-4.33268e-08,0.83018,0.000548878,-3.03409e-07,2.7049e-08,0.830728,0.000548352,-2.22262e-07,-5.26461e-09,0.831276,0.000547892,-2.38056e-07,-5.99057e-09,0.831824,0.000547397,-2.56027e-07,2.92269e-08,0.832371,0.000546973,-1.68347e-07,-5.13125e-08,0.832918,0.000546482,-3.22284e-07,5.68139e-08,0.833464,0.000546008,-1.51843e-07,-5.67336e-08,0.83401,0.000545534,-3.22043e-07,5.09113e-08,0.834555,0.000545043,-1.6931e-07,-2.77022e-08,0.8351,0.000544621,-2.52416e-07,2.92924e-10,0.835644,0.000544117,-2.51537e-07,2.65305e-08,0.836188,0.000543694,-1.71946e-07,-4.68105e-08,0.836732,0.00054321,-3.12377e-07,4.15021e-08,0.837275,0.000542709,-1.87871e-07,1.13355e-11,0.837817,0.000542334,-1.87837e-07,-4.15474e-08,0.838359,0.000541833,-3.12479e-07,4.69691e-08,0.838901,0.000541349,-1.71572e-07,-2.71196e-08,0.839442,0.000540925,-2.52931e-07,1.90462e-09,0.839983,0.000540425,-2.47217e-07,1.95011e-08,0.840523,0.000539989,-1.88713e-07,-2.03045e-08,0.841063,0.00053955,-2.49627e-07,2.11216e-09,0.841602,0.000539057,-2.4329e-07,1.18558e-08,0.842141,0.000538606,-2.07723e-07,1.00691e-08,0.842679,0.000538221,-1.77516e-07,-5.21324e-08,0.843217,0.00053771,-3.33913e-07,7.92513e-08,0.843755,0.00053728,-9.6159e-08,-8.60587e-08,0.844292,0.000536829,-3.54335e-07,8.61696e-08,0.844828,0.000536379,-9.58263e-08,-7.98057e-08,0.
845364,0.000535948,-3.35243e-07,5.42394e-08,0.8459,0.00053544,-1.72525e-07,-1.79426e-08,0.846435,0.000535041,-2.26353e-07,1.75308e-08,0.84697,0.000534641,-1.73761e-07,-5.21806e-08,0.847505,0.000534137,-3.30302e-07,7.19824e-08,0.848038,0.000533692,-1.14355e-07,-5.69349e-08,0.848572,0.000533293,-2.8516e-07,3.65479e-08,0.849105,0.000532832,-1.75516e-07,-2.96519e-08,0.849638,0.000532392,-2.64472e-07,2.2455e-08,0.85017,0.000531931,-1.97107e-07,-5.63451e-10,0.850702,0.000531535,-1.98797e-07,-2.02011e-08,0.851233,0.000531077,-2.59401e-07,2.17634e-08,0.851764,0.000530623,-1.94111e-07,-7.24794e-09,0.852294,0.000530213,-2.15854e-07,7.22832e-09,0.852824,0.000529803,-1.94169e-07,-2.16653e-08,0.853354,0.00052935,-2.59165e-07,1.98283e-08,0.853883,0.000528891,-1.9968e-07,1.95678e-09,0.854412,0.000528497,-1.9381e-07,-2.76554e-08,0.85494,0.000528027,-2.76776e-07,4.90603e-08,0.855468,0.00052762,-1.29596e-07,-4.93764e-08,0.855995,0.000527213,-2.77725e-07,2.92361e-08,0.856522,0.000526745,-1.90016e-07,-7.96341e-09,0.857049,0.000526341,-2.13907e-07,2.61752e-09,0.857575,0.000525922,-2.06054e-07,-2.50665e-09,0.8581,0.000525502,-2.13574e-07,7.40906e-09,0.858626,0.000525097,-1.91347e-07,-2.71296e-08,0.859151,0.000524633,-2.72736e-07,4.15048e-08,0.859675,0.000524212,-1.48221e-07,-1.96802e-08,0.860199,0.000523856,-2.07262e-07,-2.23886e-08,0.860723,0.000523375,-2.74428e-07,4.96299e-08,0.861246,0.000522975,-1.25538e-07,-5.69216e-08,0.861769,0.000522553,-2.96303e-07,5.88473e-08,0.862291,0.000522137,-1.19761e-07,-5.92584e-08,0.862813,0.00052172,-2.97536e-07,5.8977e-08,0.863334,0.000521301,-1.20605e-07,-5.74403e-08,0.863855,0.000520888,-2.92926e-07,5.15751e-08,0.864376,0.000520457,-1.38201e-07,-2.96506e-08,0.864896,0.000520091,-2.27153e-07,7.42277e-09,0.865416,0.000519659,-2.04885e-07,-4.05057e-11,0.865936,0.00051925,-2.05006e-07,-7.26074e-09,0.866455,0.000518818,-2.26788e-07,2.90835e-08,0.866973,0.000518451,-1.39538e-07,-4.94686e-08,0.867492,0.000518024,-2.87944e-07,4.95814e-08,0.868009,0.00051759
7,-1.39199e-07,-2.96479e-08,0.868527,0.000517229,-2.28143e-07,9.40539e-09,0.869044,0.000516801,-1.99927e-07,-7.9737e-09,0.86956,0.000516378,-2.23848e-07,2.24894e-08,0.870077,0.000515997,-1.5638e-07,-2.23793e-08,0.870592,0.000515617,-2.23517e-07,7.42302e-09,0.871108,0.000515193,-2.01248e-07,-7.31283e-09,0.871623,0.000514768,-2.23187e-07,2.18283e-08,0.872137,0.000514387,-1.57702e-07,-2.03959e-08,0.872652,0.000514011,-2.1889e-07,1.50711e-10,0.873165,0.000513573,-2.18437e-07,1.97931e-08,0.873679,0.000513196,-1.59058e-07,-1.97183e-08,0.874192,0.000512819,-2.18213e-07,-5.24324e-10,0.874704,0.000512381,-2.19786e-07,2.18156e-08,0.875217,0.000512007,-1.54339e-07,-2.71336e-08,0.875728,0.000511616,-2.3574e-07,2.71141e-08,0.87624,0.000511226,-1.54398e-07,-2.17182e-08,0.876751,0.000510852,-2.19552e-07,1.54131e-10,0.877262,0.000510414,-2.1909e-07,2.11017e-08,0.877772,0.000510039,-1.55785e-07,-2.49562e-08,0.878282,0.000509652,-2.30654e-07,1.91183e-08,0.878791,0.000509248,-1.73299e-07,8.08751e-09,0.8793,0.000508926,-1.49036e-07,-5.14684e-08,0.879809,0.000508474,-3.03441e-07,7.85766e-08,0.880317,0.000508103,-6.77112e-08,-8.40242e-08,0.880825,0.000507715,-3.19784e-07,7.87063e-08,0.881333,0.000507312,-8.36649e-08,-5.19871e-08,0.88184,0.000506988,-2.39626e-07,1.00327e-08,0.882346,0.000506539,-2.09528e-07,1.18562e-08,0.882853,0.000506156,-1.73959e-07,2.14703e-09,0.883359,0.000505814,-1.67518e-07,-2.04444e-08,0.883864,0.000505418,-2.28851e-07,2.00258e-08,0.88437,0.00050502,-1.68774e-07,-5.42855e-11,0.884874,0.000504682,-1.68937e-07,-1.98087e-08,0.885379,0.000504285,-2.28363e-07,1.96842e-08,0.885883,0.000503887,-1.6931e-07,6.76342e-10,0.886387,0.000503551,-1.67281e-07,-2.23896e-08,0.88689,0.000503149,-2.3445e-07,2.92774e-08,0.887393,0.000502768,-1.46618e-07,-3.51152e-08,0.887896,0.00050237,-2.51963e-07,5.15787e-08,0.888398,0.00050202,-9.72271e-08,-5.19903e-08,0.8889,0.00050167,-2.53198e-07,3.71732e-08,0.889401,0.000501275,-1.41678e-07,-3.70978e-08,0.889902,0.00050088,-2.52972e-07,5.16132e
-08,0.890403,0.000500529,-9.81321e-08,-5.01459e-08,0.890903,0.000500183,-2.4857e-07,2.9761e-08,0.891403,0.000499775,-1.59287e-07,-9.29351e-09,0.891903,0.000499428,-1.87167e-07,7.41301e-09,0.892402,0.000499076,-1.64928e-07,-2.03585e-08,0.892901,0.000498685,-2.26004e-07,1.44165e-08,0.893399,0.000498276,-1.82754e-07,2.22974e-08,0.893898,0.000497978,-1.15862e-07,-4.40013e-08,0.894395,0.000497614,-2.47866e-07,3.44985e-08,0.894893,0.000497222,-1.44371e-07,-3.43882e-08,0.89539,0.00049683,-2.47535e-07,4.34497e-08,0.895886,0.000496465,-1.17186e-07,-2.02012e-08,0.896383,0.00049617,-1.7779e-07,-2.22497e-08,0.896879,0.000495748,-2.44539e-07,4.95952e-08,0.897374,0.000495408,-9.57532e-08,-5.69217e-08,0.89787,0.000495045,-2.66518e-07,5.88823e-08,0.898364,0.000494689,-8.98713e-08,-5.93983e-08,0.898859,0.000494331,-2.68066e-07,5.95017e-08,0.899353,0.000493973,-8.95613e-08,-5.9399e-08,0.899847,0.000493616,-2.67758e-07,5.8885e-08,0.90034,0.000493257,-9.11033e-08,-5.69317e-08,0.900833,0.000492904,-2.61898e-07,4.96326e-08,0.901326,0.000492529,-1.13001e-07,-2.23893e-08,0.901819,0.000492236,-1.80169e-07,-1.968e-08,0.902311,0.000491817,-2.39209e-07,4.15047e-08,0.902802,0.000491463,-1.14694e-07,-2.71296e-08,0.903293,0.000491152,-1.96083e-07,7.409e-09,0.903784,0.000490782,-1.73856e-07,-2.50645e-09,0.904275,0.000490427,-1.81376e-07,2.61679e-09,0.904765,0.000490072,-1.73525e-07,-7.96072e-09,0.905255,0.000489701,-1.97407e-07,2.92261e-08,0.905745,0.000489394,-1.09729e-07,-4.93389e-08,0.906234,0.000489027,-2.57746e-07,4.89204e-08,0.906723,0.000488658,-1.10985e-07,-2.71333e-08,0.907211,0.000488354,-1.92385e-07,8.30861e-12,0.907699,0.00048797,-1.9236e-07,2.71001e-08,0.908187,0.000487666,-1.1106e-07,-4.88041e-08,0.908675,0.000487298,-2.57472e-07,4.89069e-08,0.909162,0.000486929,-1.10751e-07,-2.76143e-08,0.909649,0.000486625,-1.93594e-07,1.9457e-09,0.910135,0.000486244,-1.87757e-07,1.98315e-08,0.910621,0.000485928,-1.28262e-07,-2.16671e-08,0.911107,0.000485606,-1.93264e-07,7.23216e-09,0.911592,0.0004
85241,-1.71567e-07,-7.26152e-09,0.912077,0.000484877,-1.93352e-07,2.18139e-08,0.912562,0.000484555,-1.2791e-07,-2.03895e-08,0.913047,0.000484238,-1.89078e-07,1.39494e-10,0.913531,0.000483861,-1.8866e-07,1.98315e-08,0.914014,0.000483543,-1.29165e-07,-1.98609e-08,0.914498,0.000483225,-1.88748e-07,7.39912e-12,0.914981,0.000482847,-1.88726e-07,1.98313e-08,0.915463,0.000482529,-1.29232e-07,-1.9728e-08,0.915946,0.000482212,-1.88416e-07,-5.24035e-10,0.916428,0.000481833,-1.89988e-07,2.18241e-08,0.916909,0.000481519,-1.24516e-07,-2.71679e-08,0.917391,0.000481188,-2.06019e-07,2.72427e-08,0.917872,0.000480858,-1.24291e-07,-2.21985e-08,0.918353,0.000480543,-1.90886e-07,1.94644e-09,0.918833,0.000480167,-1.85047e-07,1.44127e-08,0.919313,0.00047984,-1.41809e-07,7.39438e-12,0.919793,0.000479556,-1.41787e-07,-1.44423e-08,0.920272,0.000479229,-1.85114e-07,-1.84291e-09,0.920751,0.000478854,-1.90642e-07,2.18139e-08,0.92123,0.000478538,-1.25201e-07,-2.58081e-08,0.921708,0.00047821,-2.02625e-07,2.18139e-08,0.922186,0.00047787,-1.37183e-07,-1.84291e-09,0.922664,0.00047759,-1.42712e-07,-1.44423e-08,0.923141,0.000477262,-1.86039e-07,7.34701e-12,0.923618,0.00047689,-1.86017e-07,1.44129e-08,0.924095,0.000476561,-1.42778e-07,1.94572e-09,0.924572,0.000476281,-1.36941e-07,-2.21958e-08,0.925048,0.000475941,-2.03528e-07,2.72327e-08,0.925523,0.000475615,-1.2183e-07,-2.71304e-08,0.925999,0.00047529,-2.03221e-07,2.16843e-08,0.926474,0.000474949,-1.38168e-07,-2.16005e-12,0.926949,0.000474672,-1.38175e-07,-2.16756e-08,0.927423,0.000474331,-2.03202e-07,2.71001e-08,0.927897,0.000474006,-1.21902e-07,-2.71201e-08,0.928371,0.000473681,-2.03262e-07,2.17757e-08,0.928845,0.00047334,-1.37935e-07,-3.78028e-10,0.929318,0.000473063,-1.39069e-07,-2.02636e-08,0.929791,0.000472724,-1.9986e-07,2.18276e-08,0.930263,0.000472389,-1.34377e-07,-7.44231e-09,0.930736,0.000472098,-1.56704e-07,7.94165e-09,0.931208,0.000471809,-1.32879e-07,-2.43243e-08,0.931679,0.00047147,-2.05851e-07,2.97508e-08,0.932151,0.000471148,-1.16599e
-07,-3.50742e-08,0.932622,0.000470809,-2.21822e-07,5.09414e-08,0.933092,0.000470518,-6.89976e-08,-4.94821e-08,0.933563,0.000470232,-2.17444e-07,2.77775e-08,0.934033,0.00046988,-1.34111e-07,-2.02351e-09,0.934502,0.000469606,-1.40182e-07,-1.96835e-08,0.934972,0.000469267,-1.99232e-07,2.11529e-08,0.935441,0.000468932,-1.35774e-07,-5.32332e-09,0.93591,0.000468644,-1.51743e-07,1.40413e-10,0.936378,0.000468341,-1.51322e-07,4.76166e-09,0.936846,0.000468053,-1.37037e-07,-1.9187e-08,0.937314,0.000467721,-1.94598e-07,1.23819e-08,0.937782,0.000467369,-1.57453e-07,2.92642e-08,0.938249,0.000467142,-6.96601e-08,-6.98342e-08,0.938716,0.000466793,-2.79163e-07,7.12586e-08,0.939183,0.000466449,-6.53869e-08,-3.63863e-08,0.939649,0.000466209,-1.74546e-07,1.46818e-08,0.940115,0.000465904,-1.305e-07,-2.2341e-08,0.940581,0.000465576,-1.97523e-07,1.50774e-08,0.941046,0.000465226,-1.52291e-07,2.16359e-08,0.941511,0.000464986,-8.73832e-08,-4.20162e-08,0.941976,0.000464685,-2.13432e-07,2.72198e-08,0.942441,0.00046434,-1.31773e-07,-7.2581e-09,0.942905,0.000464055,-1.53547e-07,1.81263e-09,0.943369,0.000463753,-1.48109e-07,7.58386e-12,0.943832,0.000463457,-1.48086e-07,-1.84298e-09,0.944296,0.000463155,-1.53615e-07,7.36433e-09,0.944759,0.00046287,-1.31522e-07,-2.76143e-08,0.945221,0.000462524,-2.14365e-07,4.34883e-08,0.945684,0.000462226,-8.39003e-08,-2.71297e-08,0.946146,0.000461977,-1.65289e-07,5.42595e-09,0.946608,0.000461662,-1.49012e-07,5.42593e-09,0.947069,0.000461381,-1.32734e-07,-2.71297e-08,0.94753,0.000461034,-2.14123e-07,4.34881e-08,0.947991,0.000460736,-8.36585e-08,-2.76134e-08,0.948452,0.000460486,-1.66499e-07,7.36083e-09,0.948912,0.000460175,-1.44416e-07,-1.82993e-09,0.949372,0.000459881,-1.49906e-07,-4.11073e-11,0.949832,0.000459581,-1.50029e-07,1.99434e-09,0.950291,0.000459287,-1.44046e-07,-7.93627e-09,0.950751,0.000458975,-1.67855e-07,2.97507e-08,0.951209,0.000458728,-7.86029e-08,-5.1462e-08,0.951668,0.000458417,-2.32989e-07,5.6888e-08,0.952126,0.000458121,-6.2325e-08,-5.68806e-0
8,0.952584,0.000457826,-2.32967e-07,5.14251e-08,0.953042,0.000457514,-7.86914e-08,-2.96107e-08,0.953499,0.000457268,-1.67523e-07,7.41296e-09,0.953956,0.000456955,-1.45285e-07,-4.11262e-11,0.954413,0.000456665,-1.45408e-07,-7.24847e-09,0.95487,0.000456352,-1.67153e-07,2.9035e-08,0.955326,0.000456105,-8.00484e-08,-4.92869e-08,0.955782,0.000455797,-2.27909e-07,4.89032e-08,0.956238,0.000455488,-8.11994e-08,-2.71166e-08,0.956693,0.000455244,-1.62549e-07,-4.13678e-11,0.957148,0.000454919,-1.62673e-07,2.72821e-08,0.957603,0.000454675,-8.0827e-08,-4.94824e-08,0.958057,0.000454365,-2.29274e-07,5.14382e-08,0.958512,0.000454061,-7.49597e-08,-3.7061e-08,0.958965,0.0004538,-1.86143e-07,3.72013e-08,0.959419,0.000453539,-7.45389e-08,-5.21396e-08,0.959873,0.000453234,-2.30958e-07,5.21476e-08,0.960326,0.000452928,-7.45146e-08,-3.72416e-08,0.960778,0.000452667,-1.8624e-07,3.72143e-08,0.961231,0.000452407,-7.45967e-08,-5.20109e-08,0.961683,0.000452101,-2.30629e-07,5.16199e-08,0.962135,0.000451795,-7.57696e-08,-3.52595e-08,0.962587,0.000451538,-1.81548e-07,2.98133e-08,0.963038,0.000451264,-9.2108e-08,-2.43892e-08,0.963489,0.000451007,-1.65276e-07,8.13892e-09,0.96394,0.000450701,-1.40859e-07,-8.16647e-09,0.964391,0.000450394,-1.65358e-07,2.45269e-08,0.964841,0.000450137,-9.17775e-08,-3.03367e-08,0.965291,0.000449863,-1.82787e-07,3.7215e-08,0.965741,0.000449609,-7.11424e-08,-5.89188e-08,0.96619,0.00044929,-2.47899e-07,7.92509e-08,0.966639,0.000449032,-1.01462e-08,-7.92707e-08,0.967088,0.000448773,-2.47958e-07,5.90181e-08,0.967537,0.000448455,-7.0904e-08,-3.75925e-08,0.967985,0.0004482,-1.83681e-07,3.17471e-08,0.968433,0.000447928,-8.84401e-08,-2.97913e-08,0.968881,0.000447662,-1.77814e-07,2.78133e-08,0.969329,0.000447389,-9.4374e-08,-2.18572e-08,0.969776,0.000447135,-1.59946e-07,1.10134e-11,0.970223,0.000446815,-1.59913e-07,2.18132e-08,0.97067,0.000446561,-9.44732e-08,-2.76591e-08,0.971116,0.000446289,-1.7745e-07,2.92185e-08,0.971562,0.000446022,-8.97948e-08,-2.96104e-08,0.972008,0.00044
5753,-1.78626e-07,2.96185e-08,0.972454,0.000445485,-8.97706e-08,-2.92588e-08,0.972899,0.000445218,-1.77547e-07,2.78123e-08,0.973344,0.000444946,-9.41103e-08,-2.23856e-08,0.973789,0.000444691,-1.61267e-07,2.12559e-09,0.974233,0.000444374,-1.5489e-07,1.38833e-08,0.974678,0.000444106,-1.13241e-07,1.94591e-09,0.975122,0.000443886,-1.07403e-07,-2.16669e-08,0.975565,0.000443606,-1.72404e-07,2.5117e-08,0.976009,0.000443336,-9.70526e-08,-1.91963e-08,0.976452,0.000443085,-1.54642e-07,-7.93627e-09,0.976895,0.000442752,-1.7845e-07,5.09414e-08,0.977338,0.000442548,-2.56262e-08,-7.66201e-08,0.97778,0.000442266,-2.55486e-07,7.67249e-08,0.978222,0.000441986,-2.53118e-08,-5.14655e-08,0.978664,0.000441781,-1.79708e-07,9.92773e-09,0.979106,0.000441451,-1.49925e-07,1.17546e-08,0.979547,0.000441186,-1.14661e-07,2.65868e-09,0.979988,0.000440965,-1.06685e-07,-2.23893e-08,0.980429,0.000440684,-1.73853e-07,2.72939e-08,0.980869,0.000440419,-9.19716e-08,-2.71816e-08,0.98131,0.000440153,-1.73516e-07,2.18278e-08,0.98175,0.000439872,-1.08033e-07,-5.24833e-10,0.982189,0.000439654,-1.09607e-07,-1.97284e-08,0.982629,0.000439376,-1.68793e-07,1.98339e-08,0.983068,0.000439097,-1.09291e-07,-2.62901e-12,0.983507,0.000438879,-1.09299e-07,-1.98234e-08,0.983946,0.000438601,-1.68769e-07,1.96916e-08,0.984384,0.000438322,-1.09694e-07,6.6157e-10,0.984823,0.000438105,-1.0771e-07,-2.23379e-08,0.985261,0.000437823,-1.74723e-07,2.90855e-08,0.985698,0.00043756,-8.74669e-08,-3.43992e-08,0.986136,0.000437282,-1.90665e-07,4.89068e-08,0.986573,0.000437048,-4.39442e-08,-4.20188e-08,0.98701,0.000436834,-1.7e-07,-4.11073e-11,0.987446,0.000436494,-1.70124e-07,4.21832e-08,0.987883,0.00043628,-4.35742e-08,-4.94824e-08,0.988319,0.000436044,-1.92021e-07,3.6537e-08,0.988755,0.00043577,-8.24102e-08,-3.70611e-08,0.989191,0.000435494,-1.93593e-07,5.21026e-08,0.989626,0.000435263,-3.72855e-08,-5.21402e-08,0.990061,0.000435032,-1.93706e-07,3.7249e-08,0.990496,0.000434756,-8.19592e-08,-3.72512e-08,0.990931,0.000434481,-1.93713e-07,5
.21511e-08,0.991365,0.00043425,-3.72595e-08,-5.21439e-08,0.991799,0.000434019,-1.93691e-07,3.72152e-08,0.992233,0.000433743,-8.20456e-08,-3.71123e-08,0.992667,0.000433468,-1.93382e-07,5.16292e-08,0.9931,0.000433236,-3.84947e-08,-5.01953e-08,0.993533,0.000433008,-1.89081e-07,2.99427e-08,0.993966,0.00043272,-9.92525e-08,-9.9708e-09,0.994399,0.000432491,-1.29165e-07,9.94051e-09,0.994831,0.000432263,-9.93434e-08,-2.97912e-08,0.995263,0.000431975,-1.88717e-07,4.96198e-08,0.995695,0.000431746,-3.98578e-08,-4.94785e-08,0.996127,0.000431518,-1.88293e-07,2.9085e-08,0.996558,0.000431229,-1.01038e-07,-7.25675e-09,0.996989,0.000431005,-1.22809e-07,-5.79945e-11,0.99742,0.000430759,-1.22983e-07,7.48873e-09,0.997851,0.000430536,-1.00516e-07,-2.98969e-08,0.998281,0.000430245,-1.90207e-07,5.24942e-08,0.998711,0.000430022,-3.27246e-08,-6.08706e-08,0.999141,0.000429774,-2.15336e-07,7.17788e-08,0.999571,0.000429392,0.,0.};
+
+        // Float Lab -> RGB for one pixel; src.x/.y/.z are presumably L, a, b (per the function name).
+        // srgb: when true, every channel is also run through splineInterpolate over c_sRGBInvGammaTab.
+        // blueIdx: 0 writes B to dst.x and R to dst.z; any other value writes R to dst.x and B to dst.z.
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Lab2RGBConvert_f(const T& src, D& dst)
+        {
+            // Break points of the two piecewise mappings used below.
+            const float lThresh = 0.008856f * 903.3f;
+            const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
+
+            // Derive Y and fy from src.x: linear form at or below lThresh, cube of fy above it.
+            float fy, Y;
+            if (src.x <= lThresh)
+            {
+                Y = src.x / 903.3f;
+                fy = 7.787f * Y + 16.0f / 116.0f;
+            }
+            else
+            {
+                fy = (src.x + 16.0f) / 116.0f;
+                Y = fy * fy * fy;
+            }
+
+            // fx / fz are fy offset by src.y / 500 and src.z / 200; map each back at fThresh.
+            const float fx = src.y / 500.0f + fy;
+            const float fz = fy - src.z / 200.0f;
+            const float X = (fx <= fThresh) ? (fx - 16.0f / 116.0f) / 7.787f : fx * fx * fx;
+            const float Z = (fz <= fThresh) ? (fz - 16.0f / 116.0f) / 7.787f : fz * fz * fz;
+
+            // Fixed 3x3 matrix applied to (X, Y, Z).
+            float R = 3.079933f * X - 1.537150f * Y - 0.542782f * Z;
+            float G = -0.921235f * X + 1.875991f * Y + 0.045244f * Z;
+            float B = 0.052891f * X - 0.204043f * Y + 1.151152f * Z;
+
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+            }
+
+            // Channel order is a compile-time choice driven by blueIdx.
+            const bool blueFirst = (blueIdx == 0);
+            dst.x = blueFirst ? B : R;
+            dst.y = G;
+            dst.z = blueFirst ? R : B;
+            setAlpha(dst, ColorChannel<float>::max());
+        }
+
+        // uchar-output wrapper around Lab2RGBConvert_f: scales src.x by 100/255, shifts src.y and
+        // src.z by -128, converts in float, then stores saturate_cast<uchar>(channel * 255).
+        template <bool srgb, int blueIdx, typename T, typename D>
+        __device__ __forceinline__ void Lab2RGBConvert_b(const T& src, D& dst)
+        {
+            float3 labf, outf;
+            labf.z = src.z - 128;
+            labf.y = src.y - 128;
+            labf.x = src.x * (100.f / 255.f);
+            Lab2RGBConvert_f<srgb, blueIdx>(labf, outf);
+
+            dst.z = saturate_cast<uchar>(outf.z * 255.f);
+            dst.y = saturate_cast<uchar>(outf.y * 255.f);
+            dst.x = saturate_cast<uchar>(outf.x * 255.f);
+            setAlpha(dst, ColorChannel<uchar>::max());
+        }
+
+        // Primary template is only declared here; the specialisations supply operator().
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct Lab2RGB;
+        // uchar specialisation: hands each pixel to Lab2RGBConvert_b and returns the result.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Lab2RGB<uchar, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type converted;
+                Lab2RGBConvert_b<srgb, blueIdx>(src, converted);
+                return converted;
+            }
+        };
+        // float specialisation: hands each pixel to Lab2RGBConvert_f and returns the result.
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Lab2RGB<float, scn, dcn, srgb, blueIdx>
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            __host__ __device__ __forceinline__ Lab2RGB() {}
+            __host__ __device__ __forceinline__ Lab2RGB(const Lab2RGB&) {}
+            // Both constructors above have empty bodies; the functor carries no data members.
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type converted;
+                Lab2RGBConvert_f<srgb, blueIdx>(src, converted);
+                return converted;
+            }
+        };
+    }
+
+// Defines name##_traits<T>: functor_type aliases color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx>
+// and create_functor() returns a default-constructed functor_type.
+#define OPENCV_CUDA_IMPLEMENT_Lab2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Lab2RGB<T, scn, dcn, srgb, blueIdx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { return functor_type(); } \
+    };
+
+///////////////////////////////////// RGB <-> Luv /////////////////////////////////////
+
+    namespace color_detail
+    {
+        __constant__ float c_LabCbrtTab[] = {0.137931,0.0114066,0.,1.18859e-07,0.149338,0.011407,3.56578e-07,-5.79396e-07,0.160745,0.0114059,-1.38161e-06,2.16892e-06,0.172151,0.0114097,5.12516e-06,-8.0814e-06,0.183558,0.0113957,-1.9119e-05,3.01567e-05,0.194965,0.0114479,7.13509e-05,-0.000112545,0.206371,0.011253,-0.000266285,-0.000106493,0.217252,0.0104009,-0.000585765,7.32149e-05,0.22714,0.00944906,-0.00036612,1.21917e-05,0.236235,0.0087534,-0.000329545,2.01753e-05,0.244679,0.00815483,-0.000269019,1.24435e-05,0.252577,0.00765412,-0.000231689,1.05618e-05,0.26001,0.00722243,-0.000200003,8.26662e-06,0.267041,0.00684723,-0.000175203,6.76746e-06,0.27372,0.00651712,-0.000154901,5.61192e-06,0.280088,0.00622416,-0.000138065,4.67009e-06,0.286179,0.00596204,-0.000124055,3.99012e-06,0.292021,0.0057259,-0.000112085,3.36032e-06,0.297638,0.00551181,-0.000102004,2.95338e-06,0.30305,0.00531666,-9.31435e-05,2.52875e-06,0.308277,0.00513796,-8.55572e-05,2.22022e-06,0.313331,0.00497351,-7.88966e-05,1.97163e-06,0.318228,0.00482163,-7.29817e-05,1.7248e-06,0.322978,0.00468084,-6.78073e-05,1.55998e-06,0.327593,0.0045499,-6.31274e-05,1.36343e-06,0.332081,0.00442774,-5.90371e-05,1.27136e-06,0.336451,0.00431348,-5.5223e-05,1.09111e-06,0.34071,0.00420631,-5.19496e-05,1.0399e-06,0.344866,0.00410553,-4.88299e-05,9.18347e-07,0.348923,0.00401062,-4.60749e-05,8.29942e-07,0.352889,0.00392096,-4.35851e-05,7.98478e-07,0.356767,0.00383619,-4.11896e-05,6.84917e-07,0.360562,0.00375586,-3.91349e-05,6.63976e-07,0.36428,0.00367959,-3.7143e-05,5.93086e-07,0.367923,0.00360708,-3.53637e-05,5.6976e-07,0.371495,0.00353806,-3.36544e-05,4.95533e-07,0.375,0.00347224,-3.21678e-05,4.87951e-07,0.378441,0.00340937,-3.0704e-05,4.4349e-07,0.38182,0.00334929,-2.93735e-05,4.20297e-07,0.38514,0.0032918,-2.81126e-05,3.7872e-07,0.388404,0.00323671,-2.69764e-05,3.596e-07,0.391614,0.00318384,-2.58976e-05,3.5845e-07,0.394772,0.00313312,-2.48223e-05,2.92765e-07,0.397881,0.00308435,-2.3944e-05,3.18232e-07,0.400942,0.00303742,-2.
29893e-05,2.82046e-07,0.403957,0.00299229,-2.21432e-05,2.52315e-07,0.406927,0.00294876,-2.13862e-05,2.58416e-07,0.409855,0.00290676,-2.0611e-05,2.33939e-07,0.412741,0.00286624,-1.99092e-05,2.36342e-07,0.415587,0.00282713,-1.92001e-05,1.916e-07,0.418396,0.00278931,-1.86253e-05,2.1915e-07,0.421167,0.00275271,-1.79679e-05,1.83498e-07,0.423901,0.00271733,-1.74174e-05,1.79343e-07,0.426602,0.00268303,-1.68794e-05,1.72013e-07,0.429268,0.00264979,-1.63633e-05,1.75686e-07,0.431901,0.00261759,-1.58363e-05,1.3852e-07,0.434503,0.00258633,-1.54207e-05,1.64304e-07,0.437074,0.00255598,-1.49278e-05,1.28136e-07,0.439616,0.00252651,-1.45434e-05,1.57618e-07,0.442128,0.0024979,-1.40705e-05,1.0566e-07,0.444612,0.00247007,-1.37535e-05,1.34998e-07,0.447068,0.00244297,-1.33485e-05,1.29207e-07,0.449498,0.00241666,-1.29609e-05,9.32347e-08,0.451902,0.00239102,-1.26812e-05,1.23703e-07,0.45428,0.00236603,-1.23101e-05,9.74072e-08,0.456634,0.0023417,-1.20179e-05,1.12518e-07,0.458964,0.002318,-1.16803e-05,7.83681e-08,0.46127,0.00229488,-1.14452e-05,1.10452e-07,0.463554,0.00227232,-1.11139e-05,7.58719e-08,0.465815,0.00225032,-1.08863e-05,9.2699e-08,0.468055,0.00222882,-1.06082e-05,8.97738e-08,0.470273,0.00220788,-1.03388e-05,5.4845e-08,0.47247,0.00218736,-1.01743e-05,1.0808e-07,0.474648,0.00216734,-9.85007e-06,4.9277e-08,0.476805,0.00214779,-9.70224e-06,8.22408e-08,0.478943,0.00212863,-9.45551e-06,6.87942e-08,0.481063,0.00210993,-9.24913e-06,5.98144e-08,0.483163,0.00209161,-9.06969e-06,7.93789e-08,0.485246,0.00207371,-8.83155e-06,3.99032e-08,0.487311,0.00205616,-8.71184e-06,8.88325e-08,0.489358,0.002039,-8.44534e-06,2.20004e-08,0.491389,0.00202218,-8.37934e-06,9.13872e-08,0.493403,0.0020057,-8.10518e-06,2.96829e-08,0.495401,0.00198957,-8.01613e-06,5.81028e-08,0.497382,0.00197372,-7.84183e-06,6.5731e-08,0.499348,0.00195823,-7.64463e-06,3.66019e-08,0.501299,0.00194305,-7.53483e-06,2.62811e-08,0.503234,0.00192806,-7.45598e-06,9.66907e-08,0.505155,0.00191344,-7.16591e-06,4.18928e-09,0.507061,0.00189912
,-7.15334e-06,6.53665e-08,0.508953,0.00188501,-6.95724e-06,3.23686e-08,0.510831,0.00187119,-6.86014e-06,4.35774e-08,0.512696,0.0018576,-6.72941e-06,3.17406e-08,0.514547,0.00184424,-6.63418e-06,6.78785e-08,0.516384,0.00183117,-6.43055e-06,-5.23126e-09,0.518209,0.0018183,-6.44624e-06,7.22562e-08,0.520021,0.00180562,-6.22947e-06,1.42292e-08,0.52182,0.0017932,-6.18679e-06,4.9641e-08,0.523607,0.00178098,-6.03786e-06,2.56259e-08,0.525382,0.00176898,-5.96099e-06,2.66696e-08,0.527145,0.00175714,-5.88098e-06,4.65094e-08,0.528897,0.00174552,-5.74145e-06,2.57114e-08,0.530637,0.00173411,-5.66431e-06,2.94588e-08,0.532365,0.00172287,-5.57594e-06,3.52667e-08,0.534082,0.00171182,-5.47014e-06,8.28868e-09,0.535789,0.00170091,-5.44527e-06,5.07871e-08,0.537484,0.00169017,-5.29291e-06,2.69817e-08,0.539169,0.00167967,-5.21197e-06,2.01009e-08,0.540844,0.0016693,-5.15166e-06,1.18237e-08,0.542508,0.00165903,-5.11619e-06,5.18135e-08,0.544162,0.00164896,-4.96075e-06,1.9341e-08,0.545806,0.00163909,-4.90273e-06,-9.96867e-09,0.54744,0.00162926,-4.93263e-06,8.01382e-08,0.549064,0.00161963,-4.69222e-06,-1.25601e-08,0.550679,0.00161021,-4.7299e-06,2.97067e-08,0.552285,0.00160084,-4.64078e-06,1.29426e-08,0.553881,0.0015916,-4.60195e-06,3.77327e-08,0.555468,0.00158251,-4.48875e-06,1.49412e-08,0.557046,0.00157357,-4.44393e-06,2.17118e-08,0.558615,0.00156475,-4.3788e-06,1.74206e-08,0.560176,0.00155605,-4.32653e-06,2.78152e-08,0.561727,0.00154748,-4.24309e-06,-9.47239e-09,0.563271,0.00153896,-4.27151e-06,6.9679e-08,0.564805,0.00153063,-4.06247e-06,-3.08246e-08,0.566332,0.00152241,-4.15494e-06,5.36188e-08,0.56785,0.00151426,-3.99409e-06,-4.83594e-09,0.56936,0.00150626,-4.00859e-06,2.53293e-08,0.570863,0.00149832,-3.93261e-06,2.27286e-08,0.572357,0.00149052,-3.86442e-06,2.96541e-09,0.573844,0.0014828,-3.85552e-06,2.50147e-08,0.575323,0.00147516,-3.78048e-06,1.61842e-08,0.576794,0.00146765,-3.73193e-06,2.94582e-08,0.578258,0.00146028,-3.64355e-06,-1.48076e-08,0.579715,0.00145295,-3.68798e-06,2.97724e-08,0.
581164,0.00144566,-3.59866e-06,1.49272e-08,0.582606,0.00143851,-3.55388e-06,2.97285e-08,0.584041,0.00143149,-3.46469e-06,-1.46323e-08,0.585469,0.00142451,-3.50859e-06,2.88004e-08,0.58689,0.00141758,-3.42219e-06,1.864e-08,0.588304,0.00141079,-3.36627e-06,1.58482e-08,0.589712,0.00140411,-3.31872e-06,-2.24279e-08,0.591112,0.00139741,-3.38601e-06,7.38639e-08,0.592507,0.00139085,-3.16441e-06,-3.46088e-08,0.593894,0.00138442,-3.26824e-06,4.96675e-09,0.595275,0.0013779,-3.25334e-06,7.4346e-08,0.59665,0.00137162,-3.0303e-06,-6.39319e-08,0.598019,0.00136536,-3.2221e-06,6.21725e-08,0.599381,0.00135911,-3.03558e-06,-5.94423e-09,0.600737,0.00135302,-3.05341e-06,2.12091e-08,0.602087,0.00134697,-2.98979e-06,-1.92876e-08,0.603431,0.00134094,-3.04765e-06,5.5941e-08,0.604769,0.00133501,-2.87983e-06,-2.56622e-08,0.606101,0.00132917,-2.95681e-06,4.67078e-08,0.607427,0.0013234,-2.81669e-06,-4.19592e-08,0.608748,0.00131764,-2.94257e-06,6.15243e-08,0.610062,0.00131194,-2.75799e-06,-2.53244e-08,0.611372,0.00130635,-2.83397e-06,3.97739e-08,0.612675,0.0013008,-2.71465e-06,-1.45618e-08,0.613973,0.00129533,-2.75833e-06,1.84733e-08,0.615266,0.00128986,-2.70291e-06,2.73606e-10,0.616553,0.00128446,-2.70209e-06,4.00367e-08,0.617835,0.00127918,-2.58198e-06,-4.12113e-08,0.619111,0.00127389,-2.70561e-06,6.52039e-08,0.620383,0.00126867,-2.51e-06,-4.07901e-08,0.621649,0.00126353,-2.63237e-06,3.83516e-08,0.62291,0.00125838,-2.51732e-06,6.59315e-09,0.624166,0.00125337,-2.49754e-06,-5.11939e-09,0.625416,0.00124836,-2.5129e-06,1.38846e-08,0.626662,0.00124337,-2.47124e-06,9.18514e-09,0.627903,0.00123846,-2.44369e-06,8.97952e-09,0.629139,0.0012336,-2.41675e-06,1.45012e-08,0.63037,0.00122881,-2.37325e-06,-7.37949e-09,0.631597,0.00122404,-2.39538e-06,1.50169e-08,0.632818,0.00121929,-2.35033e-06,6.91648e-09,0.634035,0.00121461,-2.32958e-06,1.69219e-08,0.635248,0.00121,-2.27882e-06,-1.49997e-08,0.636455,0.0012054,-2.32382e-06,4.30769e-08,0.637659,0.00120088,-2.19459e-06,-3.80986e-08,0.638857,0.00119638,-2.30888
e-06,4.97134e-08,0.640051,0.00119191,-2.15974e-06,-4.15463e-08,0.641241,0.00118747,-2.28438e-06,5.68667e-08,0.642426,0.00118307,-2.11378e-06,-7.10641e-09,0.643607,0.00117882,-2.1351e-06,-2.8441e-08,0.644784,0.00117446,-2.22042e-06,6.12658e-08,0.645956,0.00117021,-2.03663e-06,-3.78083e-08,0.647124,0.00116602,-2.15005e-06,3.03627e-08,0.648288,0.00116181,-2.05896e-06,-2.40379e-08,0.649448,0.00115762,-2.13108e-06,6.57887e-08,0.650603,0.00115356,-1.93371e-06,-6.03028e-08,0.651755,0.00114951,-2.11462e-06,5.62134e-08,0.652902,0.00114545,-1.94598e-06,-4.53417e-08,0.654046,0.00114142,-2.082e-06,6.55489e-08,0.655185,0.00113745,-1.88536e-06,-3.80396e-08,0.656321,0.00113357,-1.99948e-06,2.70049e-08,0.657452,0.00112965,-1.91846e-06,-1.03755e-08,0.65858,0.00112578,-1.94959e-06,1.44973e-08,0.659704,0.00112192,-1.9061e-06,1.1991e-08,0.660824,0.00111815,-1.87012e-06,-2.85634e-09,0.66194,0.0011144,-1.87869e-06,-5.65782e-10,0.663053,0.00111064,-1.88039e-06,5.11947e-09,0.664162,0.0011069,-1.86503e-06,3.96924e-08,0.665267,0.00110328,-1.74595e-06,-4.46795e-08,0.666368,0.00109966,-1.87999e-06,1.98161e-08,0.667466,0.00109596,-1.82054e-06,2.502e-08,0.66856,0.00109239,-1.74548e-06,-6.86593e-10,0.669651,0.0010889,-1.74754e-06,-2.22739e-08,0.670738,0.00108534,-1.81437e-06,3.01776e-08,0.671821,0.0010818,-1.72383e-06,2.07732e-08,0.672902,0.00107841,-1.66151e-06,-5.36658e-08,0.673978,0.00107493,-1.82251e-06,7.46802e-08,0.675051,0.00107151,-1.59847e-06,-6.62411e-08,0.676121,0.00106811,-1.79719e-06,7.10748e-08,0.677188,0.00106473,-1.58397e-06,-3.92441e-08,0.678251,0.00106145,-1.7017e-06,2.62973e-08,0.679311,0.00105812,-1.62281e-06,-6.34035e-09,0.680367,0.00105486,-1.64183e-06,-9.36249e-10,0.68142,0.00105157,-1.64464e-06,1.00854e-08,0.68247,0.00104831,-1.61438e-06,2.01995e-08,0.683517,0.00104514,-1.55378e-06,-3.1279e-08,0.68456,0.00104194,-1.64762e-06,4.53114e-08,0.685601,0.00103878,-1.51169e-06,-3.07573e-08,0.686638,0.00103567,-1.60396e-06,1.81133e-08,0.687672,0.00103251,-1.54962e-06,1.79085e-08,0.
688703,0.00102947,-1.49589e-06,-3.01428e-08,0.689731,0.00102639,-1.58632e-06,4.30583e-08,0.690756,0.00102334,-1.45715e-06,-2.28814e-08,0.691778,0.00102036,-1.52579e-06,-1.11373e-08,0.692797,0.00101727,-1.5592e-06,6.74305e-08,0.693812,0.00101436,-1.35691e-06,-7.97709e-08,0.694825,0.0010114,-1.59622e-06,7.28391e-08,0.695835,0.00100843,-1.37771e-06,-3.27715e-08,0.696842,0.00100558,-1.47602e-06,-1.35807e-09,0.697846,0.00100262,-1.48009e-06,3.82037e-08,0.698847,0.000999775,-1.36548e-06,-3.22474e-08,0.699846,0.000996948,-1.46223e-06,3.11809e-08,0.700841,0.000994117,-1.36868e-06,-3.28714e-08,0.701834,0.000991281,-1.4673e-06,4.07001e-08,0.702824,0.000988468,-1.3452e-06,-1.07197e-08,0.703811,0.000985746,-1.37736e-06,2.17866e-09,0.704795,0.000982998,-1.37082e-06,2.00521e-09,0.705777,0.000980262,-1.3648e-06,-1.01996e-08,0.706756,0.000977502,-1.3954e-06,3.87931e-08,0.707732,0.000974827,-1.27902e-06,-2.57632e-08,0.708706,0.000972192,-1.35631e-06,4.65513e-09,0.709676,0.000969493,-1.34235e-06,7.14257e-09,0.710645,0.00096683,-1.32092e-06,2.63791e-08,0.71161,0.000964267,-1.24178e-06,-5.30543e-08,0.712573,0.000961625,-1.40095e-06,6.66289e-08,0.713533,0.000959023,-1.20106e-06,-3.46474e-08,0.714491,0.000956517,-1.305e-06,1.23559e-08,0.715446,0.000953944,-1.26793e-06,-1.47763e-08,0.716399,0.000951364,-1.31226e-06,4.67494e-08,0.717349,0.000948879,-1.17201e-06,-5.3012e-08,0.718297,0.000946376,-1.33105e-06,4.60894e-08,0.719242,0.000943852,-1.19278e-06,-1.21366e-08,0.720185,0.00094143,-1.22919e-06,2.45673e-09,0.721125,0.000938979,-1.22182e-06,2.30966e-09,0.722063,0.000936543,-1.21489e-06,-1.16954e-08,0.722998,0.000934078,-1.24998e-06,4.44718e-08,0.723931,0.000931711,-1.11656e-06,-4.69823e-08,0.724861,0.000929337,-1.25751e-06,2.4248e-08,0.725789,0.000926895,-1.18477e-06,9.5949e-09,0.726715,0.000924554,-1.15598e-06,-3.02286e-09,0.727638,0.000922233,-1.16505e-06,2.49649e-09,0.72856,0.00091991,-1.15756e-06,-6.96321e-09,0.729478,0.000917575,-1.17845e-06,2.53564e-08,0.730395,0.000915294,-1.10238e
-06,-3.48578e-08,0.731309,0.000912984,-1.20695e-06,5.44704e-08,0.732221,0.000910734,-1.04354e-06,-6.38144e-08,0.73313,0.000908455,-1.23499e-06,8.15781e-08,0.734038,0.00090623,-9.90253e-07,-8.3684e-08,0.734943,0.000903999,-1.2413e-06,7.43441e-08,0.735846,0.000901739,-1.01827e-06,-3.48787e-08,0.736746,0.000899598,-1.12291e-06,5.56596e-09,0.737645,0.000897369,-1.10621e-06,1.26148e-08,0.738541,0.000895194,-1.06837e-06,3.57935e-09,0.739435,0.000893068,-1.05763e-06,-2.69322e-08,0.740327,0.000890872,-1.13842e-06,4.45448e-08,0.741217,0.000888729,-1.00479e-06,-3.20376e-08,0.742105,0.000886623,-1.1009e-06,2.40011e-08,0.74299,0.000884493,-1.0289e-06,-4.36209e-09,0.743874,0.000882422,-1.04199e-06,-6.55268e-09,0.744755,0.000880319,-1.06164e-06,3.05728e-08,0.745634,0.000878287,-9.69926e-07,-5.61338e-08,0.746512,0.000876179,-1.13833e-06,7.4753e-08,0.747387,0.000874127,-9.14068e-07,-6.40644e-08,0.74826,0.000872106,-1.10626e-06,6.22955e-08,0.749131,0.000870081,-9.19375e-07,-6.59083e-08,0.75,0.000868044,-1.1171e-06,8.21284e-08,0.750867,0.000866056,-8.70714e-07,-8.37915e-08,0.751732,0.000864064,-1.12209e-06,7.42237e-08,0.752595,0.000862042,-8.99418e-07,-3.42894e-08,0.753456,0.00086014,-1.00229e-06,3.32955e-09,0.754315,0.000858146,-9.92297e-07,2.09712e-08,0.755173,0.000856224,-9.29384e-07,-2.76096e-08,0.756028,0.000854282,-1.01221e-06,2.98627e-08,0.756881,0.000852348,-9.22625e-07,-3.22365e-08,0.757733,0.000850406,-1.01933e-06,3.94786e-08,0.758582,0.000848485,-9.00898e-07,-6.46833e-09,0.75943,0.000846664,-9.20303e-07,-1.36052e-08,0.760275,0.000844783,-9.61119e-07,1.28447e-09,0.761119,0.000842864,-9.57266e-07,8.4674e-09,0.761961,0.000840975,-9.31864e-07,2.44506e-08,0.762801,0.000839185,-8.58512e-07,-4.6665e-08,0.763639,0.000837328,-9.98507e-07,4.30001e-08,0.764476,0.00083546,-8.69507e-07,-6.12609e-09,0.76531,0.000833703,-8.87885e-07,-1.84959e-08,0.766143,0.000831871,-9.43372e-07,2.05052e-08,0.766974,0.000830046,-8.81857e-07,-3.92026e-09,0.767803,0.000828271,-8.93618e-07,-4.82426e-09,0.76
8631,0.000826469,-9.0809e-07,2.32172e-08,0.769456,0.000824722,-8.38439e-07,-2.84401e-08,0.77028,0.00082296,-9.23759e-07,3.09386e-08,0.771102,0.000821205,-8.30943e-07,-3.57099e-08,0.771922,0.000819436,-9.38073e-07,5.22963e-08,0.772741,0.000817717,-7.81184e-07,-5.42658e-08,0.773558,0.000815992,-9.43981e-07,4.55579e-08,0.774373,0.000814241,-8.07308e-07,-8.75656e-09,0.775186,0.0008126,-8.33578e-07,-1.05315e-08,0.775998,0.000810901,-8.65172e-07,-8.72188e-09,0.776808,0.000809145,-8.91338e-07,4.54191e-08,0.777616,0.000807498,-7.5508e-07,-5.37454e-08,0.778423,0.000805827,-9.16317e-07,5.03532e-08,0.779228,0.000804145,-7.65257e-07,-2.84584e-08,0.780031,0.000802529,-8.50632e-07,3.87579e-09,0.780833,0.00080084,-8.39005e-07,1.29552e-08,0.781633,0.0007992,-8.00139e-07,3.90804e-09,0.782432,0.000797612,-7.88415e-07,-2.85874e-08,0.783228,0.000795949,-8.74177e-07,5.0837e-08,0.784023,0.000794353,-7.21666e-07,-5.55513e-08,0.784817,0.000792743,-8.8832e-07,5.21587e-08,0.785609,0.000791123,-7.31844e-07,-3.38744e-08,0.786399,0.000789558,-8.33467e-07,2.37342e-08,0.787188,0.000787962,-7.62264e-07,-1.45775e-09,0.787975,0.000786433,-7.66638e-07,-1.79034e-08,0.788761,0.000784846,-8.20348e-07,1.34665e-08,0.789545,0.000783246,-7.79948e-07,2.3642e-08,0.790327,0.000781757,-7.09022e-07,-4.84297e-08,0.791108,0.000780194,-8.54311e-07,5.08674e-08,0.791888,0.000778638,-7.01709e-07,-3.58303e-08,0.792666,0.000777127,-8.092e-07,3.28493e-08,0.793442,0.000775607,-7.10652e-07,-3.59624e-08,0.794217,0.000774078,-8.1854e-07,5.13959e-08,0.79499,0.000772595,-6.64352e-07,-5.04121e-08,0.795762,0.000771115,-8.15588e-07,3.10431e-08,0.796532,0.000769577,-7.22459e-07,-1.41557e-08,0.797301,0.00076809,-7.64926e-07,2.55795e-08,0.798069,0.000766636,-6.88187e-07,-2.85578e-08,0.798835,0.000765174,-7.73861e-07,2.90472e-08,0.799599,0.000763714,-6.86719e-07,-2.80262e-08,0.800362,0.000762256,-7.70798e-07,2.34531e-08,0.801123,0.000760785,-7.00438e-07,-6.18144e-09,0.801884,0.000759366,-7.18983e-07,1.27263e-09,0.802642,0.000757931,-
7.15165e-07,1.09101e-09,0.803399,0.000756504,-7.11892e-07,-5.63675e-09,0.804155,0.000755064,-7.28802e-07,2.14559e-08,0.80491,0.00075367,-6.64434e-07,-2.05821e-08,0.805663,0.00075228,-7.26181e-07,1.26812e-09,0.806414,0.000750831,-7.22377e-07,1.55097e-08,0.807164,0.000749433,-6.75848e-07,-3.70216e-09,0.807913,0.00074807,-6.86954e-07,-7.0105e-10,0.80866,0.000746694,-6.89057e-07,6.5063e-09,0.809406,0.000745336,-6.69538e-07,-2.53242e-08,0.810151,0.000743921,-7.45511e-07,3.51858e-08,0.810894,0.000742535,-6.39953e-07,3.79034e-09,0.811636,0.000741267,-6.28582e-07,-5.03471e-08,0.812377,0.000739858,-7.79624e-07,7.83886e-08,0.813116,0.000738534,-5.44458e-07,-8.43935e-08,0.813854,0.000737192,-7.97638e-07,8.03714e-08,0.81459,0.000735838,-5.56524e-07,-5.82784e-08,0.815325,0.00073455,-7.31359e-07,3.35329e-08,0.816059,0.000733188,-6.3076e-07,-1.62486e-08,0.816792,0.000731878,-6.79506e-07,3.14614e-08,0.817523,0.000730613,-5.85122e-07,-4.99925e-08,0.818253,0.000729293,-7.35099e-07,4.92994e-08,0.818982,0.000727971,-5.87201e-07,-2.79959e-08,0.819709,0.000726712,-6.71189e-07,3.07959e-09,0.820435,0.000725379,-6.6195e-07,1.56777e-08,0.82116,0.000724102,-6.14917e-07,-6.18564e-09,0.821883,0.000722854,-6.33474e-07,9.06488e-09,0.822606,0.000721614,-6.06279e-07,-3.00739e-08,0.823327,0.000720311,-6.96501e-07,5.16262e-08,0.824046,0.000719073,-5.41623e-07,-5.72214e-08,0.824765,0.000717818,-7.13287e-07,5.80503e-08,0.825482,0.000716566,-5.39136e-07,-5.57703e-08,0.826198,0.00071532,-7.06447e-07,4.58215e-08,0.826912,0.000714045,-5.68983e-07,-8.30636e-09,0.827626,0.000712882,-5.93902e-07,-1.25961e-08,0.828338,0.000711656,-6.3169e-07,-9.13985e-10,0.829049,0.00071039,-6.34432e-07,1.62519e-08,0.829759,0.00070917,-5.85676e-07,-4.48904e-09,0.830468,0.000707985,-5.99143e-07,1.70418e-09,0.831175,0.000706792,-5.9403e-07,-2.32768e-09,0.831881,0.000705597,-6.01014e-07,7.60648e-09,0.832586,0.000704418,-5.78194e-07,-2.80982e-08,0.83329,0.000703177,-6.62489e-07,4.51817e-08,0.833993,0.000701988,-5.26944e-07,-3.3419
2e-08,0.834694,0.000700834,-6.27201e-07,2.88904e-08,0.835394,0.000699666,-5.4053e-07,-2.25378e-08,0.836093,0.000698517,-6.08143e-07,1.65589e-09,0.836791,0.000697306,-6.03176e-07,1.59142e-08,0.837488,0.000696147,-5.55433e-07,-5.70801e-09,0.838184,0.000695019,-5.72557e-07,6.91792e-09,0.838878,0.000693895,-5.51803e-07,-2.19637e-08,0.839571,0.000692725,-6.17694e-07,2.13321e-08,0.840263,0.000691554,-5.53698e-07,-3.75996e-09,0.840954,0.000690435,-5.64978e-07,-6.29219e-09,0.841644,0.000689287,-5.83855e-07,2.89287e-08,0.842333,0.000688206,-4.97068e-07,-4.98181e-08,0.843021,0.000687062,-6.46523e-07,5.11344e-08,0.843707,0.000685922,-4.9312e-07,-3.55102e-08,0.844393,0.00068483,-5.9965e-07,3.13019e-08,0.845077,0.000683724,-5.05745e-07,-3.00925e-08,0.84576,0.000682622,-5.96022e-07,2.94636e-08,0.846442,0.000681519,-5.07631e-07,-2.81572e-08,0.847123,0.000680419,-5.92103e-07,2.35606e-08,0.847803,0.000679306,-5.21421e-07,-6.48045e-09,0.848482,0.000678243,-5.40863e-07,2.36124e-09,0.849159,0.000677169,-5.33779e-07,-2.96461e-09,0.849836,0.000676092,-5.42673e-07,9.49728e-09,0.850512,0.000675035,-5.14181e-07,-3.50245e-08,0.851186,0.000673902,-6.19254e-07,7.09959e-08,0.851859,0.000672876,-4.06267e-07,-7.01453e-08,0.852532,0.000671853,-6.16703e-07,3.07714e-08,0.853203,0.000670712,-5.24388e-07,6.66423e-09,0.853873,0.000669684,-5.04396e-07,2.17629e-09,0.854542,0.000668681,-4.97867e-07,-1.53693e-08,0.855211,0.000667639,-5.43975e-07,-3.03752e-10,0.855878,0.000666551,-5.44886e-07,1.65844e-08,0.856544,0.000665511,-4.95133e-07,-6.42907e-09,0.857209,0.000664501,-5.1442e-07,9.13195e-09,0.857873,0.0006635,-4.87024e-07,-3.00987e-08,0.858536,0.000662435,-5.7732e-07,5.16584e-08,0.859198,0.000661436,-4.22345e-07,-5.73255e-08,0.859859,0.000660419,-5.94322e-07,5.84343e-08,0.860518,0.000659406,-4.19019e-07,-5.72022e-08,0.861177,0.000658396,-5.90626e-07,5.11653e-08,0.861835,0.000657368,-4.3713e-07,-2.82495e-08,0.862492,0.000656409,-5.21878e-07,2.22788e-09,0.863148,0.000655372,-5.15195e-07,1.9338e-08,0.86380
3,0.0006544,-4.5718e-07,-1.99754e-08,0.864457,0.000653425,-5.17107e-07,9.59024e-10,0.86511,0.000652394,-5.1423e-07,1.61393e-08,0.865762,0.000651414,-4.65812e-07,-5.91149e-09,0.866413,0.000650465,-4.83546e-07,7.50665e-09,0.867063,0.00064952,-4.61026e-07,-2.4115e-08,0.867712,0.000648526,-5.33371e-07,2.93486e-08,0.86836,0.000647547,-4.45325e-07,-3.36748e-08,0.869007,0.000646555,-5.4635e-07,4.57461e-08,0.869653,0.0006456,-4.09112e-07,-3.01002e-08,0.870298,0.000644691,-4.99412e-07,1.50501e-08,0.870942,0.000643738,-4.54262e-07,-3.01002e-08,0.871585,0.000642739,-5.44563e-07,4.57461e-08,0.872228,0.000641787,-4.07324e-07,-3.36748e-08,0.872869,0.000640871,-5.08349e-07,2.93486e-08,0.873509,0.000639943,-4.20303e-07,-2.4115e-08,0.874149,0.00063903,-4.92648e-07,7.50655e-09,0.874787,0.000638067,-4.70128e-07,-5.91126e-09,0.875425,0.000637109,-4.87862e-07,1.61385e-08,0.876062,0.000636182,-4.39447e-07,9.61961e-10,0.876697,0.000635306,-4.36561e-07,-1.99863e-08,0.877332,0.000634373,-4.9652e-07,1.93785e-08,0.877966,0.000633438,-4.38384e-07,2.07697e-09,0.878599,0.000632567,-4.32153e-07,-2.76864e-08,0.879231,0.00063162,-5.15212e-07,4.90641e-08,0.879862,0.000630737,-3.6802e-07,-4.93606e-08,0.880493,0.000629852,-5.16102e-07,2.9169e-08,0.881122,0.000628908,-4.28595e-07,-7.71083e-09,0.881751,0.000628027,-4.51727e-07,1.6744e-09,0.882378,0.000627129,-4.46704e-07,1.01317e-09,0.883005,0.000626239,-4.43665e-07,-5.72703e-09,0.883631,0.000625334,-4.60846e-07,2.1895e-08,0.884255,0.000624478,-3.95161e-07,-2.22481e-08,0.88488,0.000623621,-4.61905e-07,7.4928e-09,0.885503,0.00062272,-4.39427e-07,-7.72306e-09,0.886125,0.000621818,-4.62596e-07,2.33995e-08,0.886746,0.000620963,-3.92398e-07,-2.62704e-08,0.887367,0.000620099,-4.71209e-07,2.20775e-08,0.887987,0.000619223,-4.04976e-07,-2.43496e-09,0.888605,0.000618406,-4.12281e-07,-1.23377e-08,0.889223,0.000617544,-4.49294e-07,-7.81876e-09,0.88984,0.000616622,-4.72751e-07,4.36128e-08,0.890457,0.000615807,-3.41912e-07,-4.7423e-08,0.891072,0.000614981,-4.84181e-0
7,2.68698e-08,0.891687,0.000614093,-4.03572e-07,-4.51384e-10,0.8923,0.000613285,-4.04926e-07,-2.50643e-08,0.892913,0.0006124,-4.80119e-07,4.11038e-08,0.893525,0.000611563,-3.56808e-07,-2.01414e-08,0.894136,0.000610789,-4.17232e-07,-2.01426e-08,0.894747,0.000609894,-4.7766e-07,4.11073e-08,0.895356,0.000609062,-3.54338e-07,-2.50773e-08,0.895965,0.000608278,-4.2957e-07,-4.02954e-10,0.896573,0.000607418,-4.30779e-07,2.66891e-08,0.89718,0.000606636,-3.50711e-07,-4.67489e-08,0.897786,0.000605795,-4.90958e-07,4.10972e-08,0.898391,0.000604936,-3.67666e-07,1.56948e-09,0.898996,0.000604205,-3.62958e-07,-4.73751e-08,0.8996,0.000603337,-5.05083e-07,6.87214e-08,0.900202,0.000602533,-2.98919e-07,-4.86966e-08,0.900805,0.000601789,-4.45009e-07,6.85589e-09,0.901406,0.00060092,-4.24441e-07,2.1273e-08,0.902007,0.000600135,-3.60622e-07,-3.23434e-08,0.902606,0.000599317,-4.57652e-07,4.84959e-08,0.903205,0.000598547,-3.12164e-07,-4.24309e-08,0.903803,0.000597795,-4.39457e-07,2.01844e-09,0.904401,0.000596922,-4.33402e-07,3.43571e-08,0.904997,0.000596159,-3.30331e-07,-2.02374e-08,0.905593,0.000595437,-3.91043e-07,-1.30123e-08,0.906188,0.000594616,-4.3008e-07,1.26819e-08,0.906782,0.000593794,-3.92034e-07,2.18894e-08,0.907376,0.000593076,-3.26366e-07,-4.06349e-08,0.907968,0.000592301,-4.4827e-07,2.1441e-08,0.90856,0.000591469,-3.83947e-07,1.44754e-08,0.909151,0.000590744,-3.40521e-07,-1.97379e-08,0.909742,0.000590004,-3.99735e-07,4.87161e-09,0.910331,0.000589219,-3.8512e-07,2.51532e-10,0.91092,0.00058845,-3.84366e-07,-5.87776e-09,0.911508,0.000587663,-4.01999e-07,2.32595e-08,0.912096,0.000586929,-3.3222e-07,-2.75554e-08,0.912682,0.000586182,-4.14887e-07,2.73573e-08,0.913268,0.000585434,-3.32815e-07,-2.22692e-08,0.913853,0.000584702,-3.99622e-07,2.11486e-09,0.914437,0.000583909,-3.93278e-07,1.38098e-08,0.915021,0.000583164,-3.51848e-07,2.25042e-09,0.915604,0.000582467,-3.45097e-07,-2.28115e-08,0.916186,0.000581708,-4.13531e-07,2.93911e-08,0.916767,0.000580969,-3.25358e-07,-3.51481e-08,0.91734
8,0.000580213,-4.30803e-07,5.15967e-08,0.917928,0.000579506,-2.76012e-07,-5.20296e-08,0.918507,0.000578798,-4.32101e-07,3.73124e-08,0.919085,0.000578046,-3.20164e-07,-3.76154e-08,0.919663,0.000577293,-4.3301e-07,5.35447e-08,0.92024,0.000576587,-2.72376e-07,-5.7354e-08,0.920816,0.000575871,-4.44438e-07,5.66621e-08,0.921391,0.000575152,-2.74452e-07,-5.00851e-08,0.921966,0.000574453,-4.24707e-07,2.4469e-08,0.92254,0.000573677,-3.513e-07,1.18138e-08,0.923114,0.000573009,-3.15859e-07,-1.21195e-08,0.923686,0.000572341,-3.52217e-07,-2.29403e-08,0.924258,0.000571568,-4.21038e-07,4.4276e-08,0.924829,0.000570859,-2.8821e-07,-3.49546e-08,0.9254,0.000570178,-3.93074e-07,3.59377e-08,0.92597,0.000569499,-2.85261e-07,-4.91915e-08,0.926539,0.000568781,-4.32835e-07,4.16189e-08,0.927107,0.00056804,-3.07979e-07,1.92523e-09,0.927675,0.00056743,-3.02203e-07,-4.93198e-08,0.928242,0.000566678,-4.50162e-07,7.61447e-08,0.928809,0.000566006,-2.21728e-07,-7.6445e-08,0.929374,0.000565333,-4.51063e-07,5.08216e-08,0.929939,0.000564583,-2.98599e-07,-7.63212e-09,0.930503,0.000563963,-3.21495e-07,-2.02931e-08,0.931067,0.000563259,-3.82374e-07,2.92001e-08,0.93163,0.000562582,-2.94774e-07,-3.69025e-08,0.932192,0.000561882,-4.05482e-07,5.88053e-08,0.932754,0.000561247,-2.29066e-07,-7.91094e-08,0.933315,0.000560552,-4.66394e-07,7.88184e-08,0.933875,0.000559856,-2.29939e-07,-5.73501e-08,0.934434,0.000559224,-4.01989e-07,3.13727e-08,0.934993,0.000558514,-3.07871e-07,-8.53611e-09,0.935551,0.000557873,-3.33479e-07,2.77175e-09,0.936109,0.000557214,-3.25164e-07,-2.55091e-09,0.936666,0.000556556,-3.32817e-07,7.43188e-09,0.937222,0.000555913,-3.10521e-07,-2.71766e-08,0.937778,0.00055521,-3.92051e-07,4.167e-08,0.938333,0.000554551,-2.67041e-07,-2.02941e-08,0.938887,0.000553956,-3.27923e-07,-2.00984e-08,0.93944,0.00055324,-3.88218e-07,4.10828e-08,0.939993,0.000552587,-2.6497e-07,-2.50237e-08,0.940546,0.000551982,-3.40041e-07,-5.92583e-10,0.941097,0.0005513,-3.41819e-07,2.7394e-08,0.941648,0.000550698,-2.59637e-0
7,-4.93788e-08,0.942199,0.000550031,-4.07773e-07,5.09119e-08,0.942748,0.000549368,-2.55038e-07,-3.50595e-08,0.943297,0.000548753,-3.60216e-07,2.97214e-08,0.943846,0.000548122,-2.71052e-07,-2.42215e-08,0.944394,0.000547507,-3.43716e-07,7.55985e-09,0.944941,0.000546842,-3.21037e-07,-6.01796e-09,0.945487,0.000546182,-3.3909e-07,1.65119e-08,0.946033,0.000545553,-2.89555e-07,-4.2498e-10,0.946578,0.000544973,-2.9083e-07,-1.4812e-08,0.947123,0.000544347,-3.35266e-07,6.83068e-11,0.947667,0.000543676,-3.35061e-07,1.45388e-08,0.94821,0.00054305,-2.91444e-07,1.38123e-09,0.948753,0.000542471,-2.87301e-07,-2.00637e-08,0.949295,0.000541836,-3.47492e-07,1.92688e-08,0.949837,0.000541199,-2.89685e-07,2.59298e-09,0.950378,0.000540628,-2.81906e-07,-2.96407e-08,0.950918,0.000539975,-3.70829e-07,5.63652e-08,0.951458,0.000539402,-2.01733e-07,-7.66107e-08,0.951997,0.000538769,-4.31565e-07,7.12638e-08,0.952535,0.00053812,-2.17774e-07,-2.96305e-08,0.953073,0.000537595,-3.06665e-07,-1.23464e-08,0.95361,0.000536945,-3.43704e-07,1.94114e-08,0.954147,0.000536316,-2.8547e-07,-5.69451e-09,0.954683,0.000535728,-3.02554e-07,3.36666e-09,0.955219,0.000535133,-2.92454e-07,-7.77208e-09,0.955753,0.000534525,-3.1577e-07,2.77216e-08,0.956288,0.000533976,-2.32605e-07,-4.35097e-08,0.956821,0.00053338,-3.63134e-07,2.7108e-08,0.957354,0.000532735,-2.8181e-07,-5.31772e-09,0.957887,0.000532156,-2.97764e-07,-5.83718e-09,0.958419,0.000531543,-3.15275e-07,2.86664e-08,0.95895,0.000530998,-2.29276e-07,-4.9224e-08,0.959481,0.000530392,-3.76948e-07,4.90201e-08,0.960011,0.000529785,-2.29887e-07,-2.76471e-08,0.96054,0.000529243,-3.12829e-07,1.96385e-09,0.961069,0.000528623,-3.06937e-07,1.97917e-08,0.961598,0.000528068,-2.47562e-07,-2.15261e-08,0.962125,0.000527508,-3.1214e-07,6.70795e-09,0.962653,0.000526904,-2.92016e-07,-5.30573e-09,0.963179,0.000526304,-3.07934e-07,1.4515e-08,0.963705,0.000525732,-2.64389e-07,6.85048e-09,0.964231,0.000525224,-2.43837e-07,-4.19169e-08,0.964756,0.00052461,-3.69588e-07,4.1608e-08,0.96528
,0.000523996,-2.44764e-07,-5.30598e-09,0.965804,0.000523491,-2.60682e-07,-2.03841e-08,0.966327,0.000522908,-3.21834e-07,2.72378e-08,0.966849,0.000522346,-2.40121e-07,-2.89625e-08,0.967371,0.000521779,-3.27008e-07,2.90075e-08,0.967893,0.000521212,-2.39986e-07,-2.74629e-08,0.968414,0.00052065,-3.22374e-07,2.12396e-08,0.968934,0.000520069,-2.58656e-07,2.10922e-09,0.969454,0.000519558,-2.52328e-07,-2.96765e-08,0.969973,0.000518964,-3.41357e-07,5.6992e-08,0.970492,0.000518452,-1.70382e-07,-7.90821e-08,0.97101,0.000517874,-4.07628e-07,8.05224e-08,0.971528,0.000517301,-1.66061e-07,-6.41937e-08,0.972045,0.000516776,-3.58642e-07,5.70429e-08,0.972561,0.00051623,-1.87513e-07,-4.47686e-08,0.973077,0.00051572,-3.21819e-07,2.82237e-09,0.973593,0.000515085,-3.13352e-07,3.34792e-08,0.974108,0.000514559,-2.12914e-07,-1.75298e-08,0.974622,0.000514081,-2.65503e-07,-2.29648e-08,0.975136,0.000513481,-3.34398e-07,4.97843e-08,0.975649,0.000512961,-1.85045e-07,-5.6963e-08,0.976162,0.00051242,-3.55934e-07,5.88585e-08,0.976674,0.000511885,-1.79359e-07,-5.92616e-08,0.977185,0.000511348,-3.57143e-07,5.89785e-08,0.977696,0.000510811,-1.80208e-07,-5.74433e-08,0.978207,0.000510278,-3.52538e-07,5.15854e-08,0.978717,0.000509728,-1.97781e-07,-2.9689e-08,0.979226,0.000509243,-2.86848e-07,7.56591e-09,0.979735,0.000508692,-2.64151e-07,-5.74649e-10,0.980244,0.000508162,-2.65875e-07,-5.26732e-09,0.980752,0.000507615,-2.81677e-07,2.16439e-08,0.981259,0.000507116,-2.16745e-07,-2.17037e-08,0.981766,0.000506618,-2.81856e-07,5.56636e-09,0.982272,0.000506071,-2.65157e-07,-5.61689e-10,0.982778,0.000505539,-2.66842e-07,-3.31963e-09,0.983283,0.000504995,-2.76801e-07,1.38402e-08,0.983788,0.000504483,-2.3528e-07,7.56339e-09,0.984292,0.000504035,-2.1259e-07,-4.40938e-08,0.984796,0.000503478,-3.44871e-07,4.96026e-08,0.985299,0.000502937,-1.96064e-07,-3.51071e-08,0.985802,0.000502439,-3.01385e-07,3.12212e-08,0.986304,0.00050193,-2.07721e-07,-3.0173e-08,0.986806,0.000501424,-2.9824e-07,2.9866e-08,0.987307,0.000500917,-
2.08642e-07,-2.96865e-08,0.987808,0.000500411,-2.97702e-07,2.92753e-08,0.988308,0.000499903,-2.09876e-07,-2.78101e-08,0.988807,0.0004994,-2.93306e-07,2.23604e-08,0.989307,0.000498881,-2.26225e-07,-2.02681e-09,0.989805,0.000498422,-2.32305e-07,-1.42531e-08,0.990303,0.000497915,-2.75065e-07,-5.65232e-10,0.990801,0.000497363,-2.76761e-07,1.65141e-08,0.991298,0.000496859,-2.27218e-07,-5.88639e-09,0.991795,0.000496387,-2.44878e-07,7.0315e-09,0.992291,0.000495918,-2.23783e-07,-2.22396e-08,0.992787,0.000495404,-2.90502e-07,2.23224e-08,0.993282,0.00049489,-2.23535e-07,-7.44543e-09,0.993776,0.000494421,-2.45871e-07,7.45924e-09,0.994271,0.000493951,-2.23493e-07,-2.23915e-08,0.994764,0.000493437,-2.90668e-07,2.25021e-08,0.995257,0.000492923,-2.23161e-07,-8.01218e-09,0.99575,0.000492453,-2.47198e-07,9.54669e-09,0.996242,0.000491987,-2.18558e-07,-3.01746e-08,0.996734,0.000491459,-3.09082e-07,5.1547e-08,0.997225,0.000490996,-1.54441e-07,-5.68039e-08,0.997716,0.000490517,-3.24853e-07,5.64594e-08,0.998206,0.000490036,-1.55474e-07,-4.98245e-08,0.998696,0.000489576,-3.04948e-07,2.36292e-08,0.999186,0.000489037,-2.3406e-07,1.49121e-08,0.999674,0.000488613,-1.89324e-07,-2.3673e-08,1.00016,0.000488164,-2.60343e-07,2.01754e-08,1.00065,0.000487704,-1.99816e-07,-5.70288e-08,1.00114,0.000487133,-3.70903e-07,8.87303e-08,1.00162,0.000486657,-1.04712e-07,-5.94737e-08,1.00211,0.000486269,-2.83133e-07,2.99553e-08,1.0026,0.000485793,-1.93267e-07,-6.03474e-08,1.00308,0.000485225,-3.74309e-07,9.2225e-08,1.00357,0.000484754,-9.76345e-08,-7.0134e-08,1.00405,0.000484348,-3.08036e-07,6.91016e-08,1.00454,0.000483939,-1.00731e-07,-8.70633e-08,1.00502,0.000483476,-3.61921e-07,4.07328e-08,1.0055,0.000482875,-2.39723e-07,4.33413e-08,1.00599,0.000482525,-1.09699e-07,-9.48886e-08,1.00647,0.000482021,-3.94365e-07,9.77947e-08,1.00695,0.000481526,-1.00981e-07,-5.78713e-08,1.00743,0.00048115,-2.74595e-07,1.44814e-08,1.00791,0.000480645,-2.31151e-07,-5.42665e-11,1.00839,0.000480182,-2.31314e-07,-1.42643e-08,1.0088
7,0.000479677,-2.74106e-07,5.71115e-08,1.00935,0.0004793,-1.02772e-07,-9.49724e-08,1.00983,0.000478809,-3.87689e-07,8.43596e-08,1.01031,0.000478287,-1.3461e-07,-4.04755e-09,1.01079,0.000478006,-1.46753e-07,-6.81694e-08,1.01127,0.000477508,-3.51261e-07,3.83067e-08,1.01174,0.00047692,-2.36341e-07,3.41521e-08,1.01222,0.00047655,-1.33885e-07,-5.57058e-08,1.0127,0.000476115,-3.01002e-07,6.94616e-08,1.01317,0.000475721,-9.26174e-08,-1.02931e-07,1.01365,0.000475227,-4.01412e-07,1.03846e-07,1.01412,0.000474736,-8.98751e-08,-7.40321e-08,1.0146,0.000474334,-3.11971e-07,7.30735e-08,1.01507,0.00047393,-9.27508e-08,-9.90527e-08,1.01554,0.000473447,-3.89909e-07,8.47188e-08,1.01602,0.000472921,-1.35753e-07,-1.40381e-09,1.01649,0.000472645,-1.39964e-07,-7.91035e-08,1.01696,0.000472128,-3.77275e-07,7.93993e-08,1.01744,0.000471612,-1.39077e-07,-7.52607e-11,1.01791,0.000471334,-1.39302e-07,-7.90983e-08,1.01838,0.000470818,-3.76597e-07,7.80499e-08,1.01885,0.000470299,-1.42448e-07,5.31733e-09,1.01932,0.00047003,-1.26496e-07,-9.93193e-08,1.01979,0.000469479,-4.24453e-07,1.53541e-07,1.02026,0.00046909,3.617e-08,-1.57217e-07,1.02073,0.000468691,-4.35482e-07,1.177e-07,1.02119,0.000468173,-8.23808e-08,-7.51659e-08,1.02166,0.000467783,-3.07878e-07,6.37538e-08,1.02213,0.000467358,-1.16617e-07,-6.064e-08,1.0226,0.000466943,-2.98537e-07,5.9597e-08,1.02306,0.000466525,-1.19746e-07,-5.85386e-08,1.02353,0.00046611,-2.95362e-07,5.53482e-08,1.024,0.000465685,-1.29317e-07,-4.36449e-08,1.02446,0.000465296,-2.60252e-07,2.20268e-11,1.02493,0.000464775,-2.60186e-07,4.35568e-08,1.02539,0.000464386,-1.29516e-07,-5.50398e-08,1.02586,0.000463961,-2.94635e-07,5.73932e-08,1.02632,0.000463544,-1.22456e-07,-5.53236e-08,1.02678,0.000463133,-2.88426e-07,4.46921e-08,1.02725,0.000462691,-1.5435e-07,-4.23534e-09,1.02771,0.000462369,-1.67056e-07,-2.77507e-08,1.02817,0.000461952,-2.50308e-07,-3.97101e-09,1.02863,0.000461439,-2.62221e-07,4.36348e-08,1.02909,0.000461046,-1.31317e-07,-5.13589e-08,1.02955,0.000460629,-2.853
94e-07,4.25913e-08,1.03001,0.000460186,-1.5762e-07,2.0285e-10,1.03047,0.000459871,-1.57011e-07,-4.34027e-08,1.03093,0.000459427,-2.87219e-07,5.41987e-08,1.03139,0.000459015,-1.24623e-07,-5.4183e-08,1.03185,0.000458604,-2.87172e-07,4.33239e-08,1.03231,0.000458159,-1.572e-07,9.65817e-11,1.03277,0.000457845,-1.56911e-07,-4.37103e-08,1.03323,0.0004574,-2.88041e-07,5.55351e-08,1.03368,0.000456991,-1.21436e-07,-5.9221e-08,1.03414,0.00045657,-2.99099e-07,6.21394e-08,1.0346,0.000456158,-1.1268e-07,-7.01275e-08,1.03505,0.000455723,-3.23063e-07,9.91614e-08,1.03551,0.000455374,-2.55788e-08,-8.80996e-08,1.03596,0.000455058,-2.89878e-07,1.48184e-08,1.03642,0.000454523,-2.45422e-07,2.88258e-08,1.03687,0.000454119,-1.58945e-07,-1.09125e-08,1.03733,0.000453768,-1.91682e-07,1.48241e-08,1.03778,0.000453429,-1.4721e-07,-4.83838e-08,1.03823,0.00045299,-2.92361e-07,5.95019e-08,1.03869,0.000452584,-1.13856e-07,-7.04146e-08,1.03914,0.000452145,-3.25099e-07,1.02947e-07,1.03959,0.000451803,-1.62583e-08,-1.02955e-07,1.04004,0.000451462,-3.25123e-07,7.04544e-08,1.04049,0.000451023,-1.1376e-07,-5.96534e-08,1.04094,0.000450616,-2.9272e-07,4.89499e-08,1.04139,0.000450178,-1.45871e-07,-1.69369e-08,1.04184,0.000449835,-1.96681e-07,1.87977e-08,1.04229,0.000449498,-1.40288e-07,-5.82539e-08,1.04274,0.000449043,-3.1505e-07,9.50087e-08,1.04319,0.000448698,-3.00238e-08,-8.33623e-08,1.04364,0.000448388,-2.80111e-07,2.20363e-11,1.04409,0.000447828,-2.80045e-07,8.32742e-08,1.04454,0.000447517,-3.02221e-08,-9.47002e-08,1.04498,0.000447173,-3.14323e-07,5.7108e-08,1.04543,0.000446716,-1.42999e-07,-1.45225e-08,1.04588,0.000446386,-1.86566e-07,9.82022e-10,1.04632,0.000446016,-1.8362e-07,1.05944e-08,1.04677,0.00044568,-1.51837e-07,-4.33597e-08,1.04721,0.000445247,-2.81916e-07,4.36352e-08,1.04766,0.000444814,-1.51011e-07,-1.19717e-08,1.0481,0.000444476,-1.86926e-07,4.25158e-09,1.04855,0.000444115,-1.74171e-07,-5.03461e-09,1.04899,0.000443751,-1.89275e-07,1.58868e-08,1.04944,0.00044342,-1.41614e-07,-5.85127e-08,1.
04988,0.000442961,-3.17152e-07,9.89548e-08,1.05032,0.000442624,-2.0288e-08,-9.88878e-08,1.05076,0.000442287,-3.16951e-07,5.81779e-08,1.05121,0.000441827,-1.42418e-07,-1.46144e-08,1.05165,0.000441499,-1.86261e-07,2.79892e-10,1.05209,0.000441127,-1.85421e-07,1.34949e-08,1.05253,0.000440797,-1.44937e-07,-5.42594e-08,1.05297,0.000440344,-3.07715e-07,8.43335e-08,1.05341,0.000439982,-5.47146e-08,-4.46558e-08,1.05385,0.000439738,-1.88682e-07,-2.49193e-08,1.05429,0.000439286,-2.6344e-07,2.5124e-08,1.05473,0.000438835,-1.88068e-07,4.36328e-08,1.05517,0.000438589,-5.71699e-08,-8.04459e-08,1.05561,0.000438234,-2.98508e-07,3.97324e-08,1.05605,0.000437756,-1.79311e-07,4.07258e-08,1.05648,0.000437519,-5.71332e-08,-8.34263e-08,1.05692,0.000437155,-3.07412e-07,5.45608e-08,1.05736,0.000436704,-1.4373e-07,-1.56078e-08,1.05779,0.000436369,-1.90553e-07,7.87043e-09,1.05823,0.000436012,-1.66942e-07,-1.58739e-08,1.05867,0.00043563,-2.14563e-07,5.56251e-08,1.0591,0.000435368,-4.76881e-08,-8.74172e-08,1.05954,0.000435011,-3.0994e-07,5.56251e-08,1.05997,0.000434558,-1.43064e-07,-1.58739e-08,1.06041,0.000434224,-1.90686e-07,7.87042e-09,1.06084,0.000433866,-1.67075e-07,-1.56078e-08,1.06127,0.000433485,-2.13898e-07,5.45609e-08,1.06171,0.000433221,-5.02157e-08,-8.34263e-08,1.06214,0.00043287,-3.00495e-07,4.07258e-08,1.06257,0.000432391,-1.78317e-07,3.97325e-08,1.063,0.000432154,-5.91198e-08,-8.04464e-08,1.06344,0.000431794,-3.00459e-07,4.36347e-08,1.06387,0.000431324,-1.69555e-07,2.5117e-08,1.0643,0.000431061,-9.42041e-08,-2.48934e-08,1.06473,0.000430798,-1.68884e-07,-4.47527e-08,1.06516,0.000430326,-3.03142e-07,8.46951e-08,1.06559,0.000429973,-4.90573e-08,-5.56089e-08,1.06602,0.000429708,-2.15884e-07,1.85314e-08,1.06645,0.000429332,-1.6029e-07,-1.85166e-08,1.06688,0.000428956,-2.1584e-07,5.5535e-08,1.06731,0.000428691,-4.92347e-08,-8.44142e-08,1.06774,0.000428339,-3.02477e-07,4.37032e-08,1.06816,0.000427865,-1.71368e-07,2.88107e-08,1.06859,0.000427609,-8.49356e-08,-3.97367e-08,1.06902,0.0004273
2,-2.04146e-07,1.09267e-08,1.06945,0.000426945,-1.71365e-07,-3.97023e-09,1.06987,0.00042659,-1.83276e-07,4.9542e-09,1.0703,0.000426238,-1.68414e-07,-1.58466e-08,1.07073,0.000425854,-2.15953e-07,5.84321e-08,1.07115,0.000425597,-4.0657e-08,-9.86725e-08,1.07158,0.00042522,-3.36674e-07,9.78392e-08,1.072,0.00042484,-4.31568e-08,-5.42658e-08,1.07243,0.000424591,-2.05954e-07,1.45377e-11,1.07285,0.000424179,-2.0591e-07,5.42076e-08,1.07328,0.00042393,-4.32877e-08,-9.76357e-08,1.0737,0.00042355,-3.36195e-07,9.79165e-08,1.07412,0.000423172,-4.24451e-08,-5.56118e-08,1.07455,0.00042292,-2.09281e-07,5.32143e-09,1.07497,0.000422518,-1.93316e-07,3.43261e-08,1.07539,0.000422234,-9.0338e-08,-2.34165e-08,1.07581,0.000421983,-1.60588e-07,-5.98692e-08,1.07623,0.000421482,-3.40195e-07,1.43684e-07,1.07666,0.000421233,9.08574e-08,-1.5724e-07,1.07708,0.000420943,-3.80862e-07,1.27647e-07,1.0775,0.000420564,2.0791e-09,-1.1493e-07,1.07792,0.000420223,-3.4271e-07,9.36534e-08,1.07834,0.000419819,-6.17499e-08,-2.12653e-08,1.07876,0.000419632,-1.25546e-07,-8.59219e-09,1.07918,0.000419355,-1.51322e-07,-6.35752e-08,1.0796,0.000418861,-3.42048e-07,1.43684e-07,1.08002,0.000418608,8.90034e-08,-1.53532e-07,1.08043,0.000418326,-3.71593e-07,1.12817e-07,1.08085,0.000417921,-3.31414e-08,-5.93184e-08,1.08127,0.000417677,-2.11097e-07,5.24697e-09,1.08169,0.00041727,-1.95356e-07,3.83305e-08,1.0821,0.000416995,-8.03642e-08,-3.93597e-08,1.08252,0.000416716,-1.98443e-07,-1.0094e-10,1.08294,0.000416319,-1.98746e-07,3.97635e-08,1.08335,0.00041604,-7.94557e-08,-3.97437e-08,1.08377,0.000415762,-1.98687e-07,1.94215e-12,1.08419,0.000415365,-1.98681e-07,3.97359e-08,1.0846,0.000415087,-7.94732e-08,-3.97362e-08,1.08502,0.000414809,-1.98682e-07,-4.31063e-13,1.08543,0.000414411,-1.98683e-07,3.97379e-08,1.08584,0.000414133,-7.94694e-08,-3.97418e-08,1.08626,0.000413855,-1.98695e-07,2.00563e-11,1.08667,0.000413458,-1.98635e-07,3.96616e-08,1.08709,0.000413179,-7.965e-08,-3.9457e-08,1.0875,0.000412902,-1.98021e-07,-1.04281e-09,1.
08791,0.000412502,-2.01149e-07,4.36282e-08,1.08832,0.000412231,-7.02648e-08,-5.42608e-08,1.08874,0.000411928,-2.33047e-07,5.42057e-08,1.08915,0.000411624,-7.04301e-08,-4.33527e-08,1.08956,0.000411353,-2.00488e-07,-4.07378e-12,1.08997,0.000410952,-2.005e-07,4.3369e-08,1.09038,0.000410681,-7.03934e-08,-5.42627e-08,1.09079,0.000410378,-2.33182e-07,5.44726e-08,1.0912,0.000410075,-6.97637e-08,-4.44186e-08,1.09161,0.000409802,-2.03019e-07,3.99235e-09,1.09202,0.000409408,-1.91042e-07,2.84491e-08,1.09243,0.000409111,-1.05695e-07,1.42043e-09,1.09284,0.000408904,-1.01434e-07,-3.41308e-08,1.09325,0.000408599,-2.03826e-07,1.58937e-08,1.09366,0.000408239,-1.56145e-07,-2.94438e-08,1.09406,0.000407838,-2.44476e-07,1.01881e-07,1.09447,0.000407655,6.11676e-08,-1.39663e-07,1.09488,0.000407358,-3.57822e-07,9.91432e-08,1.09529,0.00040694,-6.03921e-08,-1.84912e-08,1.09569,0.000406764,-1.15866e-07,-2.51785e-08,1.0961,0.000406457,-1.91401e-07,-4.03115e-12,1.09651,0.000406074,-1.91413e-07,2.51947e-08,1.09691,0.000405767,-1.15829e-07,1.84346e-08,1.09732,0.00040559,-6.05254e-08,-9.89332e-08,1.09772,0.000405172,-3.57325e-07,1.3888e-07,1.09813,0.000404874,5.93136e-08,-9.8957e-08,1.09853,0.000404696,-2.37557e-07,1.853e-08,1.09894,0.000404277,-1.81968e-07,2.48372e-08,1.09934,0.000403987,-1.07456e-07,1.33047e-09,1.09975,0.000403776,-1.03465e-07,-3.01591e-08,1.10015,0.000403479,-1.93942e-07,9.66054e-11,1.10055,0.000403091,-1.93652e-07,2.97727e-08,1.10096,0.000402793,-1.04334e-07,2.19273e-11,1.10136,0.000402585,-1.04268e-07,-2.98604e-08,1.10176,0.000402287,-1.93849e-07,2.10325e-10,1.10216,0.0004019,-1.93218e-07,2.90191e-08,1.10256,0.0004016,-1.06161e-07,2.92264e-09,1.10297,0.000401397,-9.73931e-08,-4.07096e-08,1.10337,0.00040108,-2.19522e-07,4.07067e-08,1.10377,0.000400763,-9.7402e-08,-2.90783e-09,1.10417,0.000400559,-1.06126e-07,-2.90754e-08,1.10457,0.00040026,-1.93352e-07,9.00021e-14,1.10497,0.000399873,-1.93351e-07,2.9075e-08,1.10537,0.000399574,-1.06126e-07,2.90902e-09,1.10577,0.00039937,-9.739
92e-08,-4.07111e-08,1.10617,0.000399053,-2.19533e-07,4.07262e-08,1.10657,0.000398736,-9.73541e-08,-2.98424e-09,1.10697,0.000398533,-1.06307e-07,-2.87892e-08,1.10736,0.000398234,-1.92674e-07,-1.06824e-09,1.10776,0.000397845,-1.95879e-07,3.30622e-08,1.10816,0.000397552,-9.66926e-08,-1.19712e-08,1.10856,0.000397323,-1.32606e-07,1.48225e-08,1.10895,0.000397102,-8.81387e-08,-4.73187e-08,1.10935,0.000396784,-2.30095e-07,5.52429e-08,1.10975,0.00039649,-6.4366e-08,-5.44437e-08,1.11014,0.000396198,-2.27697e-07,4.33226e-08,1.11054,0.000395872,-9.77293e-08,3.62656e-10,1.11094,0.000395678,-9.66414e-08,-4.47732e-08,1.11133,0.00039535,-2.30961e-07,5.95208e-08,1.11173,0.000395067,-5.23985e-08,-7.41008e-08,1.11212,0.00039474,-2.74701e-07,1.17673e-07,1.11252,0.000394543,7.83181e-08,-1.58172e-07,1.11291,0.000394225,-3.96199e-07,1.57389e-07,1.1133,0.000393905,7.59679e-08,-1.13756e-07,1.1137,0.000393716,-2.653e-07,5.92165e-08,1.11409,0.000393363,-8.76507e-08,-3.90074e-09,1.11449,0.000393176,-9.93529e-08,-4.36136e-08,1.11488,0.000392846,-2.30194e-07,5.91457e-08,1.11527,0.000392563,-5.27564e-08,-7.376e-08,1.11566,0.000392237,-2.74037e-07,1.16685e-07,1.11606,0.000392039,7.60189e-08,-1.54562e-07,1.11645,0.000391727,-3.87667e-07,1.43935e-07,1.11684,0.000391384,4.4137e-08,-6.35487e-08,1.11723,0.000391281,-1.46509e-07,-8.94896e-09,1.11762,0.000390961,-1.73356e-07,-1.98647e-08,1.11801,0.000390555,-2.3295e-07,8.8408e-08,1.1184,0.000390354,3.22736e-08,-9.53486e-08,1.11879,0.000390133,-2.53772e-07,5.45677e-08,1.11918,0.000389789,-9.0069e-08,-3.71296e-09,1.11957,0.000389598,-1.01208e-07,-3.97159e-08,1.11996,0.000389276,-2.20355e-07,4.33671e-08,1.12035,0.000388966,-9.02542e-08,-1.45431e-08,1.12074,0.000388741,-1.33883e-07,1.48052e-08,1.12113,0.000388518,-8.94678e-08,-4.46778e-08,1.12152,0.000388205,-2.23501e-07,4.46966e-08,1.12191,0.000387892,-8.94114e-08,-1.48992e-08,1.12229,0.000387669,-1.34109e-07,1.49003e-08,1.12268,0.000387445,-8.94082e-08,-4.47019e-08,1.12307,0.000387132,-2.23514e-07,4.4698e-
08,1.12345,0.000386819,-8.942e-08,-1.48806e-08,1.12384,0.000386596,-1.34062e-07,1.48245e-08,1.12423,0.000386372,-8.95885e-08,-4.44172e-08,1.12461,0.00038606,-2.2284e-07,4.36351e-08,1.125,0.000385745,-9.19348e-08,-1.09139e-08,1.12539,0.000385528,-1.24677e-07,2.05584e-11,1.12577,0.000385279,-1.24615e-07,1.08317e-08,1.12616,0.000385062,-9.21198e-08,-4.33473e-08,1.12654,0.000384748,-2.22162e-07,4.33481e-08,1.12693,0.000384434,-9.21174e-08,-1.08356e-08,1.12731,0.000384217,-1.24624e-07,-5.50907e-12,1.12769,0.000383968,-1.24641e-07,1.08577e-08,1.12808,0.000383751,-9.20679e-08,-4.34252e-08,1.12846,0.000383437,-2.22343e-07,4.36337e-08,1.12884,0.000383123,-9.14422e-08,-1.19005e-08,1.12923,0.000382904,-1.27144e-07,3.96813e-09,1.12961,0.000382662,-1.15239e-07,-3.97207e-09,1.12999,0.000382419,-1.27155e-07,1.19201e-08,1.13038,0.000382201,-9.1395e-08,-4.37085e-08,1.13076,0.000381887,-2.2252e-07,4.37046e-08,1.13114,0.000381573,-9.14068e-08,-1.19005e-08,1.13152,0.000381355,-1.27108e-07,3.89734e-09,1.1319,0.000381112,-1.15416e-07,-3.68887e-09,1.13228,0.00038087,-1.26483e-07,1.08582e-08,1.13266,0.00038065,-9.39083e-08,-3.97438e-08,1.13304,0.000380343,-2.1314e-07,2.89076e-08,1.13342,0.000380003,-1.26417e-07,4.33225e-08,1.1338,0.00037988,3.55072e-09,-8.29883e-08,1.13418,0.000379638,-2.45414e-07,5.0212e-08,1.13456,0.000379298,-9.47781e-08,1.34964e-09,1.13494,0.000379113,-9.07292e-08,-5.56105e-08,1.13532,0.000378764,-2.57561e-07,1.01883e-07,1.1357,0.000378555,4.80889e-08,-1.13504e-07,1.13608,0.000378311,-2.92423e-07,1.13713e-07,1.13646,0.000378067,4.87176e-08,-1.02931e-07,1.13683,0.000377856,-2.60076e-07,5.95923e-08,1.13721,0.000377514,-8.12988e-08,-1.62288e-08,1.13759,0.000377303,-1.29985e-07,5.32278e-09,1.13797,0.000377059,-1.14017e-07,-5.06237e-09,1.13834,0.000376816,-1.29204e-07,1.49267e-08,1.13872,0.000376602,-8.44237e-08,-5.46444e-08,1.1391,0.000376269,-2.48357e-07,8.44417e-08,1.13947,0.000376026,4.96815e-09,-4.47039e-08,1.13985,0.000375902,-1.29143e-07,-2.48355e-08,1.14023,0.000375
569,-2.0365e-07,2.48368e-08,1.1406,0.000375236,-1.2914e-07,4.46977e-08,1.14098,0.000375112,4.95341e-09,-8.44184e-08,1.14135,0.000374869,-2.48302e-07,5.45572e-08,1.14173,0.000374536,-8.463e-08,-1.46013e-08,1.1421,0.000374323,-1.28434e-07,3.8478e-09,1.14247,0.000374077,-1.1689e-07,-7.89941e-10,1.14285,0.000373841,-1.1926e-07,-6.88042e-10,1.14322,0.0003736,-1.21324e-07,3.54213e-09,1.1436,0.000373368,-1.10698e-07,-1.34805e-08,1.14397,0.000373107,-1.51139e-07,5.03798e-08,1.14434,0.000372767,0.,0.};
+
+        template <bool srgb, int blueIdx, typename T, typename D> // srgb: pass channels through c_sRGBGammaTab first; blueIdx == 0: src.x is blue, otherwise src.z is blue
+        __device__ __forceinline__ void RGB2LuvConvert_f(const T& src, D& dst) // float RGB -> Luv: reads src.x/y/z, writes L, u, v into dst.x/y/z
+        {
+            const float _d = 1.f / (0.950456f + 15 + 1.088754f * 3); // same X + 15*Y + 3*Z form as the per-pixel denominator below, at X = 0.950456, Y = 1, Z = 1.088754 (presumably the reference white -- TODO confirm)
+            const float _un = 13 * (4 * 0.950456f * _d); // 4*X/denominator for those same constants, pre-multiplied by 13
+            const float _vn = 13 * (9 * _d); // 9*Y/denominator for those same constants (Y = 1), pre-multiplied by 13
+            // Select the channels according to blueIdx (a compile-time template argument).
+            float B = blueIdx == 0 ? src.x : src.z;
+            float G = src.y;
+            float R = blueIdx == 0 ? src.z : src.x;
+            // Optional step: each channel is scaled by GAMMA_TAB_SIZE and replaced by splineInterpolate over c_sRGBGammaTab (presumably sRGB -> linear; the table is defined outside this block).
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBGammaTab, GAMMA_TAB_SIZE);
+            }
+            // Fixed 3x3 linear combination of R, G, B giving X, Y, Z.
+            float X = R * 0.412453f + G * 0.357580f + B * 0.180423f;
+            float Y = R * 0.212671f + G * 0.715160f + B * 0.072169f;
+            float Z = R * 0.019334f + G * 0.119193f + B * 0.950227f;
+            // L starts as a spline lookup in c_LabCbrtTab (presumably a cube-root table -- TODO confirm) and is then mapped to 116*t - 16.
+            float L = splineInterpolate(Y * (LAB_CBRT_TAB_SIZE / 1.5f), c_LabCbrtTab, LAB_CBRT_TAB_SIZE);
+            L = 116.f * L - 16.f;
+            // d carries the factor 13 (4 * 13), matching the 13 folded into _un and _vn; fmaxf with epsilon keeps the division finite when X + 15*Y + 3*Z is 0.
+            const float d = (4 * 13) / ::fmaxf(X + 15 * Y + 3 * Z, numeric_limits<float>::epsilon());
+            float u = L * (X * d - _un); // equals 13 * L * (4*X/denominator - 4*0.950456*_d)
+            float v = L * ((9 * 0.25f) * Y * d - _vn); // (9 * 0.25) * (4 * 13) = 13 * 9, so this equals 13 * L * (9*Y/denominator - 9*_d)
+            // Output order is L, u, v.
+            dst.x = L;
+            dst.y = u;
+            dst.z = v;
+        }
+
+        template <bool srgb, int blueIdx, typename T, typename D> // srgb and blueIdx are forwarded unchanged to RGB2LuvConvert_f
+        __device__ __forceinline__ void RGB2LuvConvert_b(const T& src, D& dst) // 8-bit RGB -> Luv: reads src.x/y/z, writes dst.x/y/z as uchar
+        {
+            float3 srcf, dstf;
+            // Scale the three input channels by 1/255 before the float conversion.
+            srcf.x = src.x * (1.f / 255.f);
+            srcf.y = src.y * (1.f / 255.f);
+            srcf.z = src.z * (1.f / 255.f);
+            // The float helper fills dstf.x/y/z with L, u, v.
+            RGB2LuvConvert_f<srgb, blueIdx>(srcf, dstf);
+            // Re-quantise with saturation; only dst.x/y/z are written in this function.
+            dst.x = saturate_cast<uchar>(dstf.x * 2.55f); // 2.55 = 255/100
+            dst.y = saturate_cast<uchar>(dstf.y * 0.72033898305084743f + 96.525423728813564f); // numerically (u + 134) * 255/354
+            dst.z = saturate_cast<uchar>(dstf.z * 0.9732824427480916f + 136.259541984732824f); // numerically (v + 140) * 255/262
+        }
+
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct RGB2Luv; // primary template is only declared; the definitions are the partial specializations for uchar and float
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Luv<uchar, scn, dcn, srgb, blueIdx> // uchar specialization: functor from an scn-channel uchar vector to a dcn-channel uchar vector
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type dst; // declared without an initializer
+                // Delegates to the 8-bit helper. NOTE(review): that helper assigns only dst.x/y/z, so a fourth member (if vec_type has one) is not written on this path -- confirm callers do not rely on it.
+                RGB2LuvConvert_b<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB2Luv() {} // empty default constructor, callable from host and device
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {} // empty copy constructor: the functor has no data members to copy
+        };
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct RGB2Luv<float, scn, dcn, srgb, blueIdx> // float specialization: functor from an scn-channel float vector to a dcn-channel float vector
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type dst; // declared without an initializer
+                // Delegates to the float helper. NOTE(review): that helper assigns only dst.x/y/z, so a fourth member (if vec_type has one) is not written on this path -- confirm callers do not rely on it.
+                RGB2LuvConvert_f<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ RGB2Luv() {} // empty default constructor, callable from host and device
+            __host__ __device__ __forceinline__ RGB2Luv(const RGB2Luv&) {} // empty copy constructor: the functor has no data members to copy
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_RGB2Luv_TRAITS(name, scn, dcn, srgb, blueIdx) /* expands to a template struct called name followed by _traits */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::RGB2Luv<T, scn, dcn, srgb, blueIdx> functor_type; /* RGB2Luv functor for element type T with the macro's channel counts and flags */ \
+        static __host__ __device__ __forceinline__ functor_type create_functor() /* returns a default-constructed functor_type */ \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace color_detail
+    {
+        template <bool srgb, int blueIdx, typename T, typename D> // srgb: pass the result through c_sRGBInvGammaTab; blueIdx == 0: blue goes to dst.x, otherwise to dst.z
+        __device__ __forceinline__ void Luv2RGBConvert_f(const T& src, D& dst) // float Luv -> RGB: reads L, u, v from src.x/y/z, writes dst.x/y/z and sets alpha via setAlpha
+        {
+            const float _d = 1.f / (0.950456f + 15 + 1.088754f * 3); // 1 / (X + 15*Y + 3*Z) at X = 0.950456, Y = 1, Z = 1.088754 (presumably the reference white -- TODO confirm)
+            const float _un = 4 * 0.950456f * _d; // no factor 13 here; the 1/13 is applied through d below
+            const float _vn = 9 * _d;
+            // src.x/y/z are read as L, u, v.
+            float L = src.x;
+            float u = src.y;
+            float v = src.z;
+            // Y = ((L + 16) / 116)^3, applied for every L (this block has no separate branch for small L).
+            float Y = (L + 16.f) * (1.f / 116.f);
+            Y = Y * Y * Y;
+            // NOTE(review): d divides by L with no guard, so L == 0 produces inf or NaN below -- confirm callers never pass L == 0 or that this is acceptable.
+            float d = (1.f / 13.f) / L;
+            u = u * d + _un; // u / (13 * L) + _un
+            v = v * d + _vn; // v / (13 * L) + _vn
+            // iv = 1/v is likewise unguarded; X and Z are then rebuilt from the shifted u, v and Y.
+            float iv = 1.f / v;
+            float X = 2.25f * u * Y * iv;
+            float Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
+            // Fixed 3x3 linear combination of X, Y, Z giving B, G, R.
+            float B = 0.055648f * X - 0.204043f * Y + 1.057311f * Z;
+            float G = -0.969256f * X + 1.875991f * Y + 0.041556f * Z;
+            float R = 3.240479f * X - 1.537150f * Y - 0.498535f * Z;
+            // Optional step: each channel is scaled by GAMMA_TAB_SIZE and replaced by splineInterpolate over c_sRGBInvGammaTab (presumably linear -> sRGB; the table is defined outside this block).
+            if (srgb)
+            {
+                B = splineInterpolate(B * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                G = splineInterpolate(G * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+                R = splineInterpolate(R * GAMMA_TAB_SIZE, c_sRGBInvGammaTab, GAMMA_TAB_SIZE);
+            }
+            // Write the channels in the order selected by blueIdx.
+            dst.x = blueIdx == 0 ? B : R;
+            dst.y = G;
+            dst.z = blueIdx == 0 ? R : B;
+            setAlpha(dst, ColorChannel<float>::max()); // setAlpha and ColorChannel are defined outside this block
+        }
+
+        template <bool srgb, int blueIdx, typename T, typename D> // srgb and blueIdx are forwarded unchanged to Luv2RGBConvert_f
+        __device__ __forceinline__ void Luv2RGBConvert_b(const T& src, D& dst) // 8-bit Luv -> RGB: reads src.x/y/z, writes dst.x/y/z as uchar and sets alpha via setAlpha
+        {
+            float3 srcf, dstf;
+            // Rescale the 8-bit channels to the float ranges handed to the float helper.
+            srcf.x = src.x * (100.f / 255.f); // 0..255 -> 0..100
+            srcf.y = src.y * 1.388235294117647f - 134.f; // numerically src.y * 354/255 - 134, i.e. 0..255 -> -134..220
+            srcf.z = src.z * 1.027450980392157f - 140.f; // numerically src.z * 262/255 - 140, i.e. 0..255 -> -140..122
+            // The float helper fills dstf.x/y/z with the output channels.
+            Luv2RGBConvert_f<srgb, blueIdx>(srcf, dstf);
+            // Scale by 255 and saturate to uchar.
+            dst.x = saturate_cast<uchar>(dstf.x * 255.f);
+            dst.y = saturate_cast<uchar>(dstf.y * 255.f);
+            dst.z = saturate_cast<uchar>(dstf.z * 255.f);
+            setAlpha(dst, ColorChannel<uchar>::max()); // setAlpha and ColorChannel are defined outside this block
+        }
+
+        template <typename T, int scn, int dcn, bool srgb, int blueIdx> struct Luv2RGB; // primary template is only declared; the definitions are the partial specializations for uchar and float
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Luv2RGB<uchar, scn, dcn, srgb, blueIdx> // uchar specialization: functor from an scn-channel uchar vector to a dcn-channel uchar vector
+            : unary_function<typename TypeVec<uchar, scn>::vec_type, typename TypeVec<uchar, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<uchar, dcn>::vec_type operator ()(const typename TypeVec<uchar, scn>::vec_type& src) const
+            {
+                typename TypeVec<uchar, dcn>::vec_type dst; // declared without an initializer; filled by the helper call below
+                // Delegates to the 8-bit helper, which writes dst.x/y/z and calls setAlpha on dst.
+                Luv2RGBConvert_b<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ Luv2RGB() {} // empty default constructor, callable from host and device
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {} // empty copy constructor: the functor has no data members to copy
+        };
+        template <int scn, int dcn, bool srgb, int blueIdx>
+        struct Luv2RGB<float, scn, dcn, srgb, blueIdx> // float specialization: functor from an scn-channel float vector to a dcn-channel float vector
+            : unary_function<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type>
+        {
+            __device__ __forceinline__ typename TypeVec<float, dcn>::vec_type operator ()(const typename TypeVec<float, scn>::vec_type& src) const
+            {
+                typename TypeVec<float, dcn>::vec_type dst; // declared without an initializer; filled by the helper call below
+                // Delegates to the float helper, which writes dst.x/y/z and calls setAlpha on dst.
+                Luv2RGBConvert_f<srgb, blueIdx>(src, dst);
+
+                return dst;
+            }
+            __host__ __device__ __forceinline__ Luv2RGB() {} // empty default constructor, callable from host and device
+            __host__ __device__ __forceinline__ Luv2RGB(const Luv2RGB&) {} // empty copy constructor: the functor has no data members to copy
+        };
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_Luv2RGB_TRAITS(name, scn, dcn, srgb, blueIdx) /* expands to a template struct called name followed by _traits */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef ::cv::cuda::device::color_detail::Luv2RGB<T, scn, dcn, srgb, blueIdx> functor_type; /* Luv2RGB functor for element type T with the macro's channel counts and flags */ \
+        static __host__ __device__ __forceinline__ functor_type create_functor() /* returns a default-constructed functor_type */ \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    #undef CV_DESCALE
+
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_COLOR_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce.hpp
new file mode 100644
index 0000000..8af20b0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce.hpp
@@ -0,0 +1,365 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_detail
+    {
+        template <typename Elem> struct GetType; // maps a pointer or reference type to the type it refers to
+        template <typename Elem> struct GetType<Elem*>
+        {
+            typedef Elem type;
+        };
+        template <typename Elem> struct GetType<volatile Elem*> // the volatile qualifier is dropped as well
+        {
+            typedef Elem type;
+        };
+        template <typename Elem> struct GetType<Elem&>
+        {
+            typedef Elem type;
+        };
+
+        template <unsigned int I, unsigned int N>
+        struct For // compile-time loop over tuple elements: each member handles element I, then recurses into For<I + 1, N>
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val); // element I: store the value at slot tid of its buffer
+
+                For<I + 1, N>::loadToSmem(smem, val, tid);
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(val) = thrust::get<I>(smem)[tid]; // element I: read slot tid back into the value
+
+                For<I + 1, N>::loadFromSmem(smem, val, tid);
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta]; // partner value at slot tid + delta
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg); // combine, keep in val and write back to slot tid
+
+                For<I + 1, N>::merge(smem, val, tid, delta, op);
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width); // partner value obtained through shfl_down
+                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::mergeShfl(val, delta, width, op);
+            }
+        };
+        template <unsigned int N>
+        struct For<N, N> // recursion terminator (I == N): every member is an empty no-op
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+        };
+
+        template <typename T> // single-value overload: writes val to slot tid of smem
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            smem[tid] = val;
+        }
+        template <typename T> // single-value overload: reads slot tid of smem into val
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            val = smem[tid];
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                       const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                       unsigned int tid)
+        { // tuple overload: applies For<...>::loadToSmem to every element, count taken from the smem tuple
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                         const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                         unsigned int tid)
+        { // tuple overload: applies For<...>::loadFromSmem to every element, count taken from the smem tuple
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
+        }
+
+        template <typename T, class Op> // single-value merge step through smem
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta]; // partner value, copied out of the volatile buffer first
+            smem[tid] = val = op(val, reg); // combined result is kept in val and written back to slot tid
+        }
+        template <typename T, class Op> // single-value merge step through shfl_down (no buffer involved)
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width); // partner value returned by shfl_down for this delta/width
+            val = op(val, reg);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                              unsigned int tid,
+                                              unsigned int delta,
+                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        { // tuple overload: element I of smem/val is merged with element I of op; count taken from the smem tuple
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
+        }
+        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                  unsigned int delta,
+                                                  unsigned int width,
+                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        { // tuple overload: element I of val is merged with element I of op; count taken from the val tuple
+            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
+        }
+
+        template <unsigned int N> struct Generic // reduction through smem with halving offsets; steps are selected at compile time from N
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                loadToSmem(smem, val, tid); // every thread first stores its own value at slot tid
+                if (N >= 32)
+                    __syncthreads(); // barrier is skipped when N < 32
+
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(smem, val, tid, 1024, op);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(smem, val, tid, 512, op);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(smem, val, tid, 256, op);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(smem, val, tid, 128, op);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(smem, val, tid, 64, op);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(smem, val, tid, 32, op);
+                } // no barrier from here on; NOTE(review): presumably relies on these threads running in lock-step as one warp — confirm
+
+                if (tid < 16) // NOTE(review): runs for every N and reads slot tid + 16; offsets are fixed powers of two, so e.g. N == 96 never merges slots 64..95 — confirm intended N
+                {
+                    merge(smem, val, tid, 16, op);
+                    merge(smem, val, tid, 8, op);
+                    merge(smem, val, tid, 4, op);
+                    merge(smem, val, tid, 2, op);
+                    merge(smem, val, tid, 1, op);
+                }
+            }
+        };
+
+        template <unsigned int I, typename Pointer, typename Reference, class Op>
+        struct Unroll // compile-time unrolled sequence of merge steps with offsets I, I / 2, I / 4, ... down to 1
+        {
+            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+            {
+                mergeShfl(val, I, N, op); // offset I, with N passed as the shuffle width
+                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            }
+            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                merge(smem, val, tid, I, op); // offset I through the smem buffer
+                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            }
+        };
+        template <typename Pointer, typename Reference, class Op>
+        struct Unroll<0, Pointer, Reference, Op> // terminator: offset 0 does nothing
+        {
+            static __device__ void loopShfl(Reference, Op, unsigned int)
+            {
+            }
+            static __device__ void loop(Pointer, Reference, unsigned int, Op)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized // reduction of N values with no __syncthreads(); chosen by Dispatcher for power-of-two N <= 32
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 // shuffle path: smem and tid are not needed
+                CV_UNUSED(smem);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            #else // buffer path: stage values in smem, then the first N / 2 threads merge with offsets N / 2 .. 1
+                loadToSmem(smem, val, tid);
+
+                if (tid < N / 2)
+                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32 // two-stage reduction: 32-wide groups first, then the N / 32 partial results
+        {
+            enum { M = N / 32 }; // number of 32-wide groups; used as the size of the second stage
+
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 // stage 1, shuffle path
+                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32); // lane 0 stores its group's result at slot tid / 32
+            #else // stage 1, buffer path
+                loadToSmem(smem, val, tid);
+
+                if (laneId < 16)
+                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
+
+                __syncthreads(); // all stage-1 merges finish before slots are overwritten with group results
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #endif
+
+                __syncthreads(); // group results are stored before any thread reads them back
+
+                loadFromSmem(smem, val, tid);
+
+                if (tid < 32) // stage 2: the first 32 threads reduce the M group results
+                {
+                #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
+                #else
+                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+                #endif
+                }
+            }
+        };
+
+        template <bool Cond, class Then, class Else> struct StaticIf; // compile-time selection between two types
+        template <class Then, class Else> struct StaticIf<true, Then, Else>
+        {
+            typedef Then type;
+        };
+        template <class Then, class Else> struct StaticIf<false, Then, Else>
+        {
+            typedef Else type;
+        };
+
+        template <unsigned int N> struct IsPowerOf2 // value is 1 when N is a non-zero power of two, 0 otherwise
+        {
+            enum { value = (N != 0 && (N & (N - 1)) == 0) };
+        };
+
+        template <unsigned int N> struct Dispatcher // picks the reduction implementation for N at compile time
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value, // power of two, at most 32 -> WarpOptimized
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value, // power of two, at most 1024 -> GenericOptimized32
+                    GenericOptimized32<N>,
+                    Generic<N> // everything else
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce_key_val.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce_key_val.hpp
new file mode 100644
index 0000000..df37c17
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/reduce_key_val.hpp
@@ -0,0 +1,502 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_key_val_detail
+    {
+        template <typename Elem> struct GetType; // maps a pointer or reference type to the type it refers to
+        template <typename Elem> struct GetType<Elem*>
+        {
+            typedef Elem type;
+        };
+        template <typename Elem> struct GetType<volatile Elem*> // the volatile qualifier is dropped as well
+        {
+            typedef Elem type;
+        };
+        template <typename Elem> struct GetType<Elem&>
+        {
+            typedef Elem type;
+        };
+
+        template <unsigned int I, unsigned int N>
+        struct For // compile-time loop over tuple elements: each member handles element I, then recurses into For<I + 1, N>
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(data); // element I: store the value at slot tid of its buffer
+
+                For<I + 1, N>::loadToSmem(smem, data, tid);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(data) = thrust::get<I>(smem)[tid]; // element I: read slot tid back into the value
+
+                For<I + 1, N>::loadFromSmem(smem, data, tid);
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
+            {
+                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width); // element I: replace with the partner's value
+
+                For<I + 1, N>::copyShfl(val, delta, width);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
+            {
+                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta]; // element I: take slot tid + delta, keep it and write it to slot tid
+
+                For<I + 1, N>::copy(svals, val, tid, delta);
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
+            { // Both shuffles are issued before the data-dependent branch so that every calling lane takes part in them.
+                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
+                typename GetType<typename thrust::tuple_element<I, ValReferenceTuple>::type>::type valReg = shfl_down(thrust::get<I>(val), delta, width);
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) // partner key wins the comparison: adopt its key and value
+                {
+                    thrust::get<I>(key) = reg;
+                    thrust::get<I>(val) = valReg;
+                }
+
+                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
+                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
+                                         const CmpTuple& cmp,
+                                         unsigned int tid, unsigned int delta)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta]; // partner key at slot tid + delta
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key))) // partner key wins the comparison: adopt its key and value
+                {
+                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
+                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+                }
+
+                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
+            }
+        };
+        template <unsigned int N>
+        struct For<N, N> // recursion terminator (I == N): every member is an empty no-op
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
+            {
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
+            {
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
+                                         const ValPointerTuple&, const ValReferenceTuple&,
+                                         const CmpTuple&,
+                                         unsigned int, unsigned int)
+            {
+            }
+        };
+
+        //////////////////////////////////////////////////////
+        // loadToSmem
+
+        template <typename T> // single-value overload: writes data to slot tid of smem
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            smem[tid] = data;
+        }
+        template <typename T> // single-value overload: reads slot tid of smem into data
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            data = smem[tid];
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                   unsigned int tid)
+        { // tuple overload: applies For<...>::loadToSmem to every element, count taken from the smem tuple
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                     unsigned int tid)
+        { // tuple overload: applies For<...>::loadFromSmem to every element, count taken from the smem tuple
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
+        }
+
+        //////////////////////////////////////////////////////
+        // copyVals
+
+        template <typename V> // single-value overload: replaces val with the value shfl_down returns for delta/width
+        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
+        {
+            val = shfl_down(val, delta, width);
+        }
+        template <typename V> // single-value overload: takes slot tid + delta, keeps it in val and writes it to slot tid
+        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
+        {
+            svals[tid] = val = svals[tid + delta];
+        }
+        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                     unsigned int delta,
+                                                     int width)
+        { // tuple overload: applies For<...>::copyShfl to every element of val
+            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, unsigned int delta)
+        { // tuple overload: applies For<...>::copy to every element, count taken from the svals tuple
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
+        }
+
+        //////////////////////////////////////////////////////
+        // merge
+
+        template <typename K, typename V, class Cmp> // key/value merge step through shfl_down: adopt the partner's pair when cmp(partnerKey, key) holds
+        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+            V valReg = shfl_down(val, delta, width); // fetched before the branch so every calling lane takes part in the shuffle
+            if (cmp(reg, key))
+            {
+                key = reg;
+                val = valReg;
+            }
+        }
+        template <typename K, typename V, class Cmp> // key/value merge step through the skeys/svals buffers
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta]; // partner key at slot tid + delta
+
+            if (cmp(reg, key)) // partner key wins the comparison: adopt its key and value
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename K,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const Cmp& cmp,
+                                                  unsigned int delta, int width)
+        { // single key with a tuple of values, merged through shfl_down
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key)) // partner key wins the comparison: adopt its key and values
+            {
+                key = reg;
+                copyValsShfl(val, delta, width); // NOTE(review): this shuffle is issued only by lanes that took the branch — confirm the source lane is guaranteed to participate
+            }
+        }
+        template <typename K,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
+        { // single key with a tuple of values, merged through the skeys/svals buffers
+            K reg = skeys[tid + delta]; // partner key at slot tid + delta
+
+            if (cmp(reg, key)) // partner key wins the comparison: adopt its key and values
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                                  unsigned int delta, int width)
+        { // tuple of keys with a tuple of values: element I of key/val is merged using element I of cmp; count taken from the key tuple
+            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
+        }
+        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        { // tuple of keys with a tuple of values, merged through buffers; the element count is taken from the svals tuple
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
+        }
+
+        //////////////////////////////////////////////////////
+        // Generic
+
+        template <unsigned int N> struct Generic // Fallback reduction path: Dispatcher selects it when neither optimized variant applies.
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                loadToSmem(skeys, key, tid); // helper defined outside this chunk; presumably stores key at slot tid of skeys -- confirm
+                loadValsToSmem(svals, val, tid); // same for the value(s); helper defined outside this chunk
+                if (N >= 32)
+                    __syncthreads(); // barrier before other threads' slots are read by merge()
+
+                if (N >= 2048) // each stage below is kept or dropped at compile time depending on N
+                {
+                    if (tid < 1024)
+                        merge(skeys, key, svals, val, cmp, tid, 1024); // merge slot tid with slot tid + 1024
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(skeys, key, svals, val, cmp, tid, 512); // merge slot tid with slot tid + 512
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(skeys, key, svals, val, cmp, tid, 256); // merge slot tid with slot tid + 256
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(skeys, key, svals, val, cmp, tid, 128); // merge slot tid with slot tid + 128
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(skeys, key, svals, val, cmp, tid, 64); // merge slot tid with slot tid + 64
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(skeys, key, svals, val, cmp, tid, 32); // note: this stage is not followed by a barrier
+                }
+
+                if (tid < 16) // NOTE(review): the five steps below run without any barrier; presumably relies on threads 0..31 executing as one warp -- confirm for the target architecture
+                {
+                    merge(skeys, key, svals, val, cmp, tid, 16);
+                    merge(skeys, key, svals, val, cmp, tid, 8);
+                    merge(skeys, key, svals, val, cmp, tid, 4);
+                    merge(skeys, key, svals, val, cmp, tid, 2);
+                    merge(skeys, key, svals, val, cmp, tid, 1);
+                }
+            }
+        };
+
+        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll // Compile-time loop: performs one merge step at offset I, then continues with Unroll<I / 2, ...>.
+        {
+            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
+            {
+                mergeShfl(key, val, cmp, I, N); // I is passed as delta, N as width
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            }
+            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                merge(skeys, key, svals, val, cmp, tid, I); // I is passed as delta
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            }
+        };
+        template <class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll<0, KP, KR, VP, VR, Cmp> // I == 0 ends the recursion: both members are empty.
+        {
+            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
+            {
+            }
+            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized // Reduction path Dispatcher selects for power-of-two N <= 32.
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {   // The "#if 0" branch (loopShfl-based) is compiled out; only the #else branch is built.
+            #if 0 // __CUDA_ARCH__ >= 300
+                CV_UNUSED(skeys);
+                CV_UNUSED(svals);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            #else
+                loadToSmem(skeys, key, tid); // helper defined outside this chunk; presumably stores key at slot tid -- confirm
+                loadToSmem(svals, val, tid);
+
+                if (tid < N / 2) // no barrier is issued on this path; the merge steps run at offsets N/2, N/4, ..., 1
+                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32 // Reduction path Dispatcher selects for power-of-two N in (32, 1024].
+        {
+            enum { M = N / 32 }; // number of 32-thread groups in N; used as the size of the second reduction stage
+
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {   // All three "#if 0" branches (shuffle-based) are compiled out; only the #else branches are built.
+                const unsigned int laneId = Warp::laneId();
+
+            #if 0 // __CUDA_ARCH__ >= 300
+                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #else
+                loadToSmem(skeys, key, tid); // stage 1 input; helper defined outside this chunk
+                loadToSmem(svals, val, tid);
+
+                if (laneId < 16) // stage 1: merge steps at offsets 16, 8, 4, 2, 1 relative to tid
+                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+
+                __syncthreads(); // stage 1 must finish before slots tid / 32 are overwritten below
+
+                if (laneId == 0) // lane 0 of each group republishes its key/value at slot tid / 32
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #endif
+
+                __syncthreads(); // make the per-group results visible before stage 2 reads them
+
+                loadFromSmem(skeys, key, tid); // every thread reloads its key from slot tid; values are not reloaded on the active path
+
+                if (tid < 32) // stage 2: merge steps at offsets M/2, M/4, ..., 1 over the per-group results
+                {
+                #if 0 // __CUDA_ARCH__ >= 300
+                    loadFromSmem(svals, val, tid);
+
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
+                #else
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+                #endif
+                }
+            }
+        };
+
+        template <bool val, class T1, class T2> struct StaticIf; // Compile-time type selection: ::type is T1 when val is true, T2 otherwise.
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type; // val == true selects the first type
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type; // val == false selects the second type
+        };
+
+        template <unsigned int N> struct IsPowerOf2 // value is nonzero exactly when N is a power of two.
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) }; // N & (N - 1) clears the lowest set bit; zero is excluded explicitly
+        };
+
+        template <unsigned int N> struct Dispatcher // Chooses the reduction implementation for N at compile time and exposes it as ::reductor.
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value, // power-of-two N up to 32 -> WarpOptimized
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value, // other power-of-two N up to 1024 -> GenericOptimized32
+                    GenericOptimized32<N>,
+                    Generic<N> // everything else -> Generic
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/transform_detail.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/transform_detail.hpp
new file mode 100644
index 0000000..1919848
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/transform_detail.hpp
@@ -0,0 +1,392 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+#include "../functional.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace transform_detail
+    {
+        //! Read Write Traits
+
+        template <typename T, typename D, int shift> struct UnaryReadWriteTraits // Vector types used by the unary transformSmart kernel.
+        {
+            typedef typename TypeVec<T, shift>::vec_type read_type; // TypeVec's vec_type for source element T and count shift
+            typedef typename TypeVec<D, shift>::vec_type write_type; // TypeVec's vec_type for destination element D and count shift
+        };
+
+        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits // Vector types used by the binary transformSmart kernel.
+        {
+            typedef typename TypeVec<T1, shift>::vec_type read_type1; // TypeVec's vec_type for the first source element type
+            typedef typename TypeVec<T2, shift>::vec_type read_type2; // TypeVec's vec_type for the second source element type
+            typedef typename TypeVec<D, shift>::vec_type write_type; // TypeVec's vec_type for the destination element type
+        };
+
+        //! Transform kernels
+
+        template <int shift> struct OpUnroller; // Applies an operation component-wise to a shift-element vector, honouring a per-element mask.
+        template <> struct OpUnroller<1> // One component: .x only. op is taken by non-const reference here.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // dst.x is left untouched when the mask rejects (y, x_shifted)
+                    dst.x = op(src.x);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // binary form: combines the .x components of both sources
+                    dst.x = op(src1.x, src2.x);
+            }
+        };
+        template <> struct OpUnroller<2> // Two components: .x and .y map to columns x_shifted and x_shifted + 1. op is taken by non-const reference here.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // each component is masked independently; rejected components keep their old dst value
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // binary form: combines matching components of both sources
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+            }
+        };
+        template <> struct OpUnroller<3> // Three components: .x, .y, .z map to columns x_shifted .. x_shifted + 2.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // each component is masked independently; rejected components keep their old dst value
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // binary form: combines matching components of both sources
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+            }
+        };
+        template <> struct OpUnroller<4> // Four components: .x, .y, .z, .w map to columns x_shifted .. x_shifted + 3.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // each component is masked independently; rejected components keep their old dst value
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src.w);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // binary form: combines matching components of both sources
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src1.w, src2.w);
+            }
+        };
+        template <> struct OpUnroller<8> // Eight components: .a0 .. .a7 map to columns x_shifted .. x_shifted + 7.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // each component is masked independently; rejected components keep their old dst value
+                    dst.a0 = op(src.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src.a7);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) // binary form: combines matching components of both sources
+                    dst.a0 = op(src1.a0, src2.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src1.a1, src2.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src1.a2, src2.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src1.a3, src2.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src1.a4, src2.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src1.a5, src2.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src1.a6, src2.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src1.a7, src2.a7);
+            }
+        };
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
+        {   // Unary kernel in which each thread handles ft::smart_shift consecutive columns of one row.
+            typedef TransformFunctorTraits<UnOp> ft;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x; // index in units of whole vectors
+            const int y = threadIdx.y + blockIdx.y * blockDim.y; // row index
+            const int x_shifted = x * ft::smart_shift; // first element column covered by this thread
+
+            if (y < src_.rows)
+            {
+                const T* src = src_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + ft::smart_shift - 1 < src_.cols) // the whole vector lies inside the row
+                {
+                    const read_type src_n_el = ((const read_type*)src)[x]; // single vector-typed read of smart_shift elements
+                    OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                {
+                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x) // row tail: element by element up to src_.cols
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
+        {   // Unary kernel with one element per thread: dst(y, x) = op(src(y, x)) where the mask allows it.
+            const int x = blockDim.x * blockIdx.x + threadIdx.x; // column index
+            const int y = blockDim.y * blockIdx.y + threadIdx.y; // row index
+
+            if (x < src.cols && y < src.rows && mask(y, x)) // bounds are checked before the mask is evaluated
+            {
+                dst.ptr(y)[x] = op(src.ptr(y)[x]);
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
+            const Mask mask, const BinOp op)
+        {   // Binary kernel in which each thread handles ft::smart_shift consecutive columns of one row.
+            typedef TransformFunctorTraits<BinOp> ft;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x; // index in units of whole vectors
+            const int y = threadIdx.y + blockIdx.y * blockDim.y; // row index
+            const int x_shifted = x * ft::smart_shift; // first element column covered by this thread
+
+            if (y < src1_.rows) // only src1_ carries sizes; src2_ and dst_ are PtrStep without rows/cols
+            {
+                const T1* src1 = src1_.ptr(y);
+                const T2* src2 = src2_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + ft::smart_shift - 1 < src1_.cols) // the whole vector lies inside the row
+                {
+                    const read_type1 src1_n_el = ((const read_type1*)src1)[x]; // single vector-typed read per source
+                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
+
+                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                {
+                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x) // row tail: element by element up to src1_.cols
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src1[real_x], src2[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
+            const Mask mask, const BinOp op)
+        {   // Binary kernel with one element per thread: dst(y, x) = op(src1(y, x), src2(y, x)) where the mask allows it.
+            const int x = blockDim.x * blockIdx.x + threadIdx.x; // column index
+            const int y = blockDim.y * blockIdx.y + threadIdx.y; // row index
+
+            if (x < src1.cols && y < src1.rows && mask(y, x)) // bounds come from src1 only and are checked before the mask
+            {
+                const T1 src1_data = src1.ptr(y)[x];
+                const T2 src2_data = src2.ptr(y)[x];
+                dst.ptr(y)[x] = op(src1_data, src2_data);
+            }
+        }
+
+        template <bool UseSmart> struct TransformDispatcher; // Host-side launcher; UseSmart selects between the transformSimple and transformSmart kernels.
+        template<> struct TransformDispatcher<false> // Always launches transformSimple (one element per thread).
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<UnOp> ft;
+
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1); // block shape comes from the functor traits
+                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1); // one thread per source element, rounded up to whole blocks
+
+                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() ); // surface launch errors
+
+                if (stream == 0) // only the default stream (0) is synchronized here
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1); // block shape comes from the functor traits
+                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1); // grid is sized from src1 only
+
+                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() ); // surface launch errors
+
+                if (stream == 0) // only the default stream (0) is synchronized here
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        template<> struct TransformDispatcher<true> // Launches transformSmart when alignment allows, otherwise defers to TransformDispatcher<false>.
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<UnOp> ft;
+
+                CV_StaticAssert(ft::smart_shift != 1, ""); // this specialization must not be instantiated with smart_shift == 1
+
+                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {   // base pointer or row step not aligned to smart_shift elements: use the one-element-per-thread kernel
+                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
+                    return;
+                }
+
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); // block shape comes from the functor traits
+                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); // each thread covers smart_shift columns
+
+                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() ); // surface launch errors
+
+                if (stream == 0) // only the default stream (0) is synchronized here
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                CV_StaticAssert(ft::smart_shift != 1, ""); // this specialization must not be instantiated with smart_shift == 1
+
+                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
+                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {   // any of the three buffers misaligned: use the one-element-per-thread kernel
+                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
+                    return;
+                }
+
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); // block shape comes from the functor traits
+                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); // each thread covers smart_shift columns; sized from src1
+
+                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() ); // surface launch errors
+
+                if (stream == 0) // only the default stream (0) is synchronized here
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+    } // namespace transform_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/type_traits_detail.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/type_traits_detail.hpp
new file mode 100644
index 0000000..a78bd2c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/type_traits_detail.hpp
@@ -0,0 +1,191 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace type_traits_detail
+    {
+        template <bool, typename T1, typename T2> struct Select { typedef T1 type; }; // Compile-time conditional: type is T1 unless the flag is false.
+        template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; }; // flag == false selects T2
+
+        template <typename T> struct IsSignedIntergral { enum {value = 0}; }; // value is 1 only for the types specialized below (the identifier is spelled "Intergral")
+        template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
+
+        template <typename T> struct IsUnsignedIntegral { enum {value = 0}; }; // value is 1 only for the types specialized below
+        template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
+
+        template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; }; // union of the two traits above
+        template <> struct IsIntegral<char> { enum {value = 1}; }; // plain char is listed explicitly
+        template <> struct IsIntegral<bool> { enum {value = 1}; }; // bool is listed explicitly
+
+        template <typename T> struct IsFloat { enum {value = 0}; }; // value is 1 only for float and double
+        template <> struct IsFloat<float> { enum {value = 1}; };
+        template <> struct IsFloat<double> { enum {value = 1}; };
+
+        template <typename T> struct IsVec { enum {value = 0}; }; // value is 1 only for the 1/2/3/4/8-component vector types specialized below
+        template <> struct IsVec<uchar1> { enum {value = 1}; };
+        template <> struct IsVec<uchar2> { enum {value = 1}; };
+        template <> struct IsVec<uchar3> { enum {value = 1}; };
+        template <> struct IsVec<uchar4> { enum {value = 1}; };
+        template <> struct IsVec<uchar8> { enum {value = 1}; };
+        template <> struct IsVec<char1> { enum {value = 1}; };
+        template <> struct IsVec<char2> { enum {value = 1}; };
+        template <> struct IsVec<char3> { enum {value = 1}; };
+        template <> struct IsVec<char4> { enum {value = 1}; };
+        template <> struct IsVec<char8> { enum {value = 1}; };
+        template <> struct IsVec<ushort1> { enum {value = 1}; };
+        template <> struct IsVec<ushort2> { enum {value = 1}; };
+        template <> struct IsVec<ushort3> { enum {value = 1}; };
+        template <> struct IsVec<ushort4> { enum {value = 1}; };
+        template <> struct IsVec<ushort8> { enum {value = 1}; };
+        template <> struct IsVec<short1> { enum {value = 1}; };
+        template <> struct IsVec<short2> { enum {value = 1}; };
+        template <> struct IsVec<short3> { enum {value = 1}; };
+        template <> struct IsVec<short4> { enum {value = 1}; };
+        template <> struct IsVec<short8> { enum {value = 1}; };
+        template <> struct IsVec<uint1> { enum {value = 1}; };
+        template <> struct IsVec<uint2> { enum {value = 1}; };
+        template <> struct IsVec<uint3> { enum {value = 1}; };
+        template <> struct IsVec<uint4> { enum {value = 1}; };
+        template <> struct IsVec<uint8> { enum {value = 1}; };
+        template <> struct IsVec<int1> { enum {value = 1}; };
+        template <> struct IsVec<int2> { enum {value = 1}; };
+        template <> struct IsVec<int3> { enum {value = 1}; };
+        template <> struct IsVec<int4> { enum {value = 1}; };
+        template <> struct IsVec<int8> { enum {value = 1}; };
+        template <> struct IsVec<float1> { enum {value = 1}; };
+        template <> struct IsVec<float2> { enum {value = 1}; };
+        template <> struct IsVec<float3> { enum {value = 1}; };
+        template <> struct IsVec<float4> { enum {value = 1}; };
+        template <> struct IsVec<float8> { enum {value = 1}; };
+        template <> struct IsVec<double1> { enum {value = 1}; };
+        template <> struct IsVec<double2> { enum {value = 1}; };
+        template <> struct IsVec<double3> { enum {value = 1}; };
+        template <> struct IsVec<double4> { enum {value = 1}; };
+        template <> struct IsVec<double8> { enum {value = 1}; };
+
+        template <class U> struct AddParameterType { typedef const U& type; }; // default: a plain U maps to const U&
+        template <class U> struct AddParameterType<U&> { typedef U& type; }; // a reference type is kept as U&
+        template <> struct AddParameterType<void> { typedef void type; }; // void stays void
+
+        template <class U> struct ReferenceTraits // value tells whether the argument is a reference; type is the argument without the reference.
+        {
+            enum { value = false };
+            typedef U type;
+        };
+        template <class U> struct ReferenceTraits<U&> // reference case: value is true and type is the referred-to U
+        {
+            enum { value = true };
+            typedef U type;
+        };
+
+        template <class U> struct PointerTraits // value tells whether the argument is a pointer; type is the pointee, or void for non-pointers.
+        {
+            enum { value = false };
+            typedef void type;
+        };
+        template <class U> struct PointerTraits<U*> // pointer case: type is the pointee U
+        {
+            enum { value = true };
+            typedef U type;
+        };
+        template <class U> struct PointerTraits<U*&> // a reference to a pointer is treated like the pointer itself
+        {
+            enum { value = true };
+            typedef U type;
+        };
+
+        template <class U> struct UnConst // Removes a const qualifier; value is 1 when one was removed, 0 otherwise.
+        {
+            typedef U type;
+            enum { value = 0 };
+        };
+        template <class U> struct UnConst<const U> // const U becomes U
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnConst<const U&> // const U& becomes U& (the reference is kept)
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
+
+        template <class U> struct UnVolatile // Removes a volatile qualifier; value is 1 when one was removed, 0 otherwise.
+        {
+            typedef U type;
+            enum { value = 0 };
+        };
+        template <class U> struct UnVolatile<volatile U> // volatile U becomes U
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnVolatile<volatile U&> // volatile U& becomes U& (the reference is kept)
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
+    } // namespace type_traits_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/vec_distance_detail.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
new file mode 100644
index 0000000..8283a99
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+
+#include "../datamov_utils.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace vec_distance_detail
+    {
+        template <int THREAD_DIM, int N> struct UnrollVecDiffCached // recursive template: each level handles one element, then instantiates N - 1
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
+            {
+                if (ind < len) // bounds-checked variant: once ind reaches len the remaining levels are skipped
+                {
+                    T1 val1 = *vecCached++; // read one cached element and advance for the next level
+
+                    T2 val2;
+                    ForceGlob<T2>::Load(vecGlob, ind, val2); // presumably loads vecGlob[ind] into val2; ForceGlob is not defined in this header
+
+                    dist.reduceIter(val1, val2); // fold this element pair into the distance accumulator
+
+                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM); // next element is THREAD_DIM further on
+                }
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
+            {
+                T1 val1 = *vecCached++; // read one cached element and advance for the next level
+
+                T2 val2;
+                ForceGlob<T2>::Load(vecGlob, 0, val2); // offset 0: the pointer itself is advanced instead of an index
+                vecGlob += THREAD_DIM;
+
+                dist.reduceIter(val1, val2);
+
+                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist); // no length test: always runs all N levels
+            }
+        };
+        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0> // N == 0 terminates the recursion
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
+            {
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
+            {
+            }
+        };
+
+        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator; // declared only; the two bool specializations below supply calc()
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false> // LEN_EQ_MAX_LEN == false: bounds-checked path
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid); // MAX_LEN / THREAD_DIM unrolled steps, starting at index tid
+            }
+        };
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true> // LEN_EQ_MAX_LEN == true: unchecked path
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist); // len is not used here; tid is applied as a pointer offset
+            }
+        };
+    } // namespace vec_distance_detail
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/dynamic_smem.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/dynamic_smem.hpp
new file mode 100644
index 0000000..42570c6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/dynamic_smem.hpp
@@ -0,0 +1,88 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DYNAMIC_SMEM_HPP
+#define OPENCV_CUDA_DYNAMIC_SMEM_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<class T> struct DynamicSharedMem // converts implicitly to a T* that points at the extern __shared__ array
+    {
+        __device__ __forceinline__ operator T*()
+        {
+            extern __shared__ int __smem[]; // unsized extern __shared__ array; its size is not set in this header
+            return (T*)__smem; // the int array is reinterpreted as T
+        }
+
+        __device__ __forceinline__ operator const T*() const
+        {
+            extern __shared__ int __smem[]; // same symbol as in the non-const overload
+            return (T*)__smem;
+        }
+    };
+
+    // Specialization for double: declares the array as double (rather than int) to avoid unaligned memory access compile errors.
+    template<> struct DynamicSharedMem<double>
+    {
+        __device__ __forceinline__ operator double*()
+        {
+            extern __shared__ double __smem_d[]; // separate symbol from the int-typed __smem used by the primary template
+            return (double*)__smem_d;
+        }
+
+        __device__ __forceinline__ operator const double*() const
+        {
+            extern __shared__ double __smem_d[];
+            return (double*)__smem_d;
+        }
+    };
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_DYNAMIC_SMEM_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/emulation.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/emulation.hpp
new file mode 100644
index 0000000..17dc117
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/emulation.hpp
@@ -0,0 +1,269 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_EMULATION_HPP_
+#define OPENCV_CUDA_EMULATION_HPP_
+
+#include "common.hpp"
+#include "warp_reduce.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Emulation // static device helpers that choose a native intrinsic or a fallback based on __CUDA_ARCH__
+    {
+
+        static __device__ __forceinline__ int syncthreadsOr(int pred)
+        {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
+                // compilation stub only: pred is ignored and 0 is always returned on this path
+                return 0;
+#else
+                return __syncthreads_or(pred);
+#endif
+        }
+
+        template<int CTA_SIZE>
+        static __forceinline__ __device__ int Ballot(int predicate)
+        {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
+            return __ballot(predicate);
+#else
+            __shared__ volatile int cta_buffer[CTA_SIZE]; // one slot per thread, indexed by threadIdx.x below
+
+            int tid = threadIdx.x;
+            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; // each thread contributes the bit numbered by its low 5 index bits
+            return warp_reduce(cta_buffer); // combined by warp_reduce (from warp_reduce.hpp)
+#endif
+        }
+
+        struct smem
+        {
+            enum { TAG_MASK = (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U }; // sizeof << 3 is the bit width, so this keeps every bit except the top 5
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicInc(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); // thread index shifted into the top 5 bits that TAG_MASK excludes
+                do
+                {
+                    count = *address & TAG_MASK; // current counter without any tag
+                    count = tag | (count + 1); // incremented counter, marked with this thread's tag; val is not used on this path
+                    *address = count;
+                } while (*address != count); // retry until the value read back is the one this thread wrote
+
+                return (count & TAG_MASK) - 1; // the counter value before the increment
+#else
+                return ::atomicInc(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicAdd(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count;
+                unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U); // same tagging scheme as atomicInc above
+                do
+                {
+                    count = *address & TAG_MASK;
+                    count = tag | (count + val);
+                    *address = count;
+                } while (*address != count); // retry until the value read back is the one this thread wrote
+
+                return (count & TAG_MASK) - val; // the counter value before val was added
+#else
+                return ::atomicAdd(address, val);
+#endif
+            }
+
+            template<typename T>
+            static __device__ __forceinline__ T atomicMin(T* address, T val)
+            {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                T count = ::min(*address, val);
+                do
+                {
+                    *address = count;
+                } while (*address > count); // store again while a larger value is read back
+
+                return count; // on this path the computed minimum is returned
+#else
+                return ::atomicMin(address, val);
+#endif
+            }
+        }; // struct smem
+
+        struct glob
+        {
+            static __device__ __forceinline__ int atomicAdd(int* address, int val)
+            {
+                return ::atomicAdd(address, val);
+            }
+            static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
+            {
+                return ::atomicAdd(address, val);
+            }
+            static __device__ __forceinline__ float atomicAdd(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 200
+                return ::atomicAdd(address, val);
+            #else
+                int* address_as_i = (int*) address; // the float's bits are handled as an int so atomicCAS can be used
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(val + __int_as_float(assumed)));
+                } while (assumed != old); // loop until the swap was made against the value the sum was computed from
+                return __int_as_float(old);
+            #endif
+            }
+            static __device__ __forceinline__ double atomicAdd(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address; // the double's bits are handled as a 64-bit integer for atomicCAS
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val + __longlong_as_double(assumed)));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // fallback path: nothing is written to *address
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMin(int* address, int val)
+            {
+                return ::atomicMin(address, val);
+            }
+            static __device__ __forceinline__ float atomicMin(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address;
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fminf(val, __int_as_float(assumed))));
+                } while (assumed != old); // compare-and-swap retry loop, as in atomicAdd(float*) above
+                return __int_as_float(old);
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0f; // fallback path: nothing is written to *address
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMin(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address;
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // fallback path: nothing is written to *address
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMax(int* address, int val)
+            {
+                return ::atomicMax(address, val);
+            }
+            static __device__ __forceinline__ float atomicMax(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address;
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
+                } while (assumed != old); // compare-and-swap retry loop, as in atomicAdd(float*) above
+                return __int_as_float(old);
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0f; // fallback path: nothing is written to *address
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMax(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address;
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                CV_UNUSED(address);
+                CV_UNUSED(val);
+                return 0.0; // fallback path: nothing is written to *address
+            #endif
+            }
+        };
+    }; //struct Emulation
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_EMULATION_HPP_ */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/filters.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/filters.hpp
new file mode 100644
index 0000000..bf3147e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/filters.hpp
@@ -0,0 +1,293 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_FILTERS_HPP
+#define OPENCV_CUDA_FILTERS_HPP
+
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "vec_math.hpp"
+#include "type_traits.hpp"
+#include "nppdefs.h"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename Ptr2D> struct PointFilter // samples src at a single integer position per lookup
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+
+        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx); // fx and fy are accepted but not used by this filter
+            CV_UNUSED(fy);
+        }
+
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            return src(__float2int_rz(y), __float2int_rz(x)); // coordinates converted with the round-toward-zero intrinsic; src is called as (row, column)
+        }
+
+        Ptr2D src; // stored by value
+    };
+
+    template <typename Ptr2D> struct LinearFilter // blends the four samples around (y, x) with bilinear weights
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+
+        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx); // fx and fy are accepted but not used by this filter
+            CV_UNUSED(fy);
+        }
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float vector with elem_type's channel count, used as accumulator
+
+            work_type out = VecTraits<work_type>::all(0);
+
+            const int x1 = __float2int_rd(x); // round-down conversion: left/top neighbour
+            const int y1 = __float2int_rd(y);
+            if (x1 <= NPP_MIN_32S || x1 >= NPP_MAX_32S || y1 <= NPP_MIN_32S || y1 >= NPP_MAX_32S) // coordinate at an int limit; presumably avoids overflow in x1 + 1 / y1 + 1 below
+            {
+                elem_type src_reg = src(y1, x1);
+                out = out + src_reg * 1.0f; // single sample with full weight, no interpolation
+                return saturate_cast<elem_type>(out);
+            }
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+
+            elem_type src_reg = src(y1, x1);
+            out = out + src_reg * ((x2 - x) * (y2 - y)); // each sample is weighted by its distance to the opposite corner
+
+            src_reg = src(y1, x2);
+            out = out + src_reg * ((x - x1) * (y2 - y));
+
+            src_reg = src(y2, x1);
+            out = out + src_reg * ((x2 - x) * (y - y1));
+
+            src_reg = src(y2, x2);
+            out = out + src_reg * ((x - x1) * (y - y1));
+
+            return saturate_cast<elem_type>(out);
+        }
+
+        Ptr2D src; // stored by value
+    };
+
+    template <typename Ptr2D> struct CubicFilter // weighted sum of the samples within 2 units of (y, x), normalised by the weight sum
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float vector with elem_type's channel count, used as accumulator
+
+        explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
+        : src(src_)
+        {
+            CV_UNUSED(fx); // fx and fy are accepted but not used by this filter
+            CV_UNUSED(fy);
+        }
+
+        static __device__ __forceinline__ float bicubicCoeff(float x_)
+        {
+            float x = fabsf(x_); // the kernel is symmetric: only the distance matters
+            if (x <= 1.0f)
+            {
+                return x * x * (1.5f * x - 2.5f) + 1.0f; // 1.5|x|^3 - 2.5|x|^2 + 1
+            }
+            else if (x < 2.0f)
+            {
+                return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; // -0.5|x|^3 + 2.5|x|^2 - 4|x| + 2
+            }
+            else
+            {
+                return 0.0f; // no contribution at distance 2 or more
+            }
+        }
+
+        __device__ elem_type operator ()(float y, float x) const
+        {
+            const float xmin = ::ceilf(x - 2.0f); // integer-valued sample range [x - 2, x + 2]
+            const float xmax = ::floorf(x + 2.0f);
+
+            const float ymin = ::ceilf(y - 2.0f);
+            const float ymax = ::floorf(y + 2.0f);
+
+            work_type sum = VecTraits<work_type>::all(0);
+            float wsum = 0.0f;
+
+            for (float cy = ymin; cy <= ymax; cy += 1.0f)
+            {
+                for (float cx = xmin; cx <= xmax; cx += 1.0f)
+                {
+                    const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy); // separable weight: product of the x and y coefficients
+                    sum = sum + w * src(__float2int_rd(cy), __float2int_rd(cx));
+                    wsum += w;
+                }
+            }
+
+            work_type res = (!wsum)? VecTraits<work_type>::all(0) : sum / wsum; // a zero weight sum yields zeros instead of dividing by zero
+
+            return saturate_cast<elem_type>(res);
+        }
+
+        Ptr2D src; // stored by value
+    };
+    // Area filter for integer scaling: every source sample in the footprint gets the same weight 1 / (scale_x * scale_y).
+    template <typename Ptr2D> struct IntegerAreaFilter
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+
+        explicit __host__ __device__ __forceinline__ IntegerAreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
+            : src(src_), scale_x(scale_x_), scale_y(scale_y_), scale(1.f / (scale_x * scale_y)) {} // scale reads the members, which are declared (and so initialised) before it
+
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            float fsx1 = x * scale_x; // footprint of (y, x) in source coordinates: [fsx1, fsx2) x [fsy1, fsy2)
+            float fsx2 = fsx1 + scale_x;
+
+            int sx1 = __float2int_ru(fsx1); // start rounded up, end rounded down
+            int sx2 = __float2int_rd(fsx2);
+
+            float fsy1 = y * scale_y;
+            float fsy2 = fsy1 + scale_y;
+
+            int sy1 = __float2int_ru(fsy1);
+            int sy2 = __float2int_rd(fsy2);
+
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float vector with elem_type's channel count, used as accumulator
+            work_type out = VecTraits<work_type>::all(0.f);
+
+            for(int dy = sy1; dy < sy2; ++dy)
+                for(int dx = sx1; dx < sx2; ++dx)
+                {
+                    out = out + src(dy, dx) * scale; // uniform weight per sample
+                }
+
+            return saturate_cast<elem_type>(out);
+        }
+
+        Ptr2D src; // stored by value
+        float scale_x, scale_y ,scale; // scale is the precomputed per-sample weight
+    };
+
+    template <typename Ptr2D> struct AreaFilter // area average that also weights partially covered border samples by their fractional coverage
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+
+        explicit __host__ __device__ __forceinline__ AreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
+            : src(src_), scale_x(scale_x_), scale_y(scale_y_){} // width and haight (declared at the end of the struct) are not initialised here
+
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            float fsx1 = x * scale_x; // footprint of (y, x) in source coordinates: [fsx1, fsx2) x [fsy1, fsy2)
+            float fsx2 = fsx1 + scale_x;
+
+            int sx1 = __float2int_ru(fsx1); // fully covered columns are [sx1, sx2)
+            int sx2 = __float2int_rd(fsx2);
+
+            float fsy1 = y * scale_y;
+            float fsy2 = fsy1 + scale_y;
+
+            int sy1 = __float2int_ru(fsy1); // fully covered rows are [sy1, sy2)
+            int sy2 = __float2int_rd(fsy2);
+
+            float scale = 1.f / (fminf(scale_x, src.width - fsx1) * fminf(scale_y, src.height - fsy1)); // footprint extent is capped by src.width / src.height (members of Ptr2D, not of this struct)
+
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; // float vector with elem_type's channel count, used as accumulator
+            work_type out = VecTraits<work_type>::all(0.f);
+
+            for (int dy = sy1; dy < sy2; ++dy)
+            {
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src(dy, dx) * scale; // fully covered samples: full weight
+
+                if (sx1 > fsx1)
+                    out = out + src(dy, (sx1 -1) ) * ((sx1 - fsx1) * scale); // partially covered left column
+
+                if (sx2 < fsx2)
+                    out = out + src(dy, sx2) * ((fsx2 -sx2) * scale); // partially covered right column
+            }
+
+            if (sy1 > fsy1)
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src( (sy1 - 1) , dx) * ((sy1 -fsy1) * scale); // partially covered top row
+
+            if (sy2 < fsy2)
+                for (int dx = sx1; dx < sx2; ++dx)
+                    out = out + src(sy2, dx) * ((fsy2 -sy2) * scale); // partially covered bottom row
+
+            if ((sy1 > fsy1) &&  (sx1 > fsx1))
+                out = out + src( (sy1 - 1) , (sx1 - 1)) * ((sy1 -fsy1) * (sx1 -fsx1) * scale); // top-left corner: product of both fractions
+
+            if ((sy1 > fsy1) &&  (sx2 < fsx2))
+                out = out + src( (sy1 - 1) , sx2) * ((sy1 -fsy1) * (fsx2 -sx2) * scale); // top-right corner
+
+            if ((sy2 < fsy2) &&  (sx2 < fsx2))
+                out = out + src(sy2, sx2) * ((fsy2 -sy2) * (fsx2 -sx2) * scale); // bottom-right corner
+
+            if ((sy2 < fsy2) &&  (sx1 > fsx1))
+                out = out + src(sy2, (sx1 - 1)) * ((fsy2 -sy2) * (sx1 -fsx1) * scale); // bottom-left corner
+
+            return saturate_cast<elem_type>(out);
+        }
+
+        Ptr2D src; // stored by value
+        float scale_x, scale_y;
+        int width, haight; // NOTE(review): never initialised or read in this struct ("haight" looks like a typo of "height"); left as-is to keep the object layout
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_FILTERS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/funcattrib.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/funcattrib.hpp
new file mode 100644
index 0000000..f582080
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/funcattrib.hpp
@@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
+#define OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP
+
+#include <cstdio>
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<class Func>
+    void printFuncAttrib(Func& func)
+    {
+        // Debug helper: query the CUDA attributes of `func` and dump them to stdout.
+        cudaFuncAttributes attrs;
+        cudaFuncGetAttributes(&attrs, func);
+
+        printf("=== Function stats ===\n");
+        printf("Name: \n");
+        printf("sharedSizeBytes    = %zu\n", attrs.sharedSizeBytes); // size_t field: %zu, not %d
+        printf("constSizeBytes     = %zu\n", attrs.constSizeBytes);  // size_t field
+        printf("localSizeBytes     = %zu\n", attrs.localSizeBytes);  // size_t field
+        printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
+        printf("numRegs            = %d\n", attrs.numRegs);
+        printf("ptxVersion         = %d\n", attrs.ptxVersion);
+        printf("binaryVersion      = %d\n", attrs.binaryVersion);
+        printf("\n");
+        fflush(stdout); // make the dump visible immediately
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif  /* OPENCV_CUDA_DEVICE_FUNCATTRIB_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/functional.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/functional.hpp
new file mode 100644
index 0000000..9f53d87
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/functional.hpp
@@ -0,0 +1,805 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_FUNCTIONAL_HPP
+#define OPENCV_CUDA_FUNCTIONAL_HPP
+
+#include <functional>
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "type_traits.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // Function Objects
+    template<typename Argument, typename Result> struct unary_function // Base tag for one-argument functors; only exposes typedefs.
+    {
+        typedef Argument argument_type; // type of the single operand
+        typedef Result result_type;     // type the functor is declared to return
+    };
+    template<typename Argument1, typename Argument2, typename Result> struct binary_function // Base tag for two-argument functors; only exposes typedefs.
+    {
+        typedef Argument1 first_argument_type;  // type of the left operand
+        typedef Argument2 second_argument_type; // type of the right operand
+        typedef Result result_type;             // type the functor is declared to return
+    };
+
+    // Arithmetic Operations
+    template <typename T> struct plus : binary_function<T, T, T> // Binary functor: a + b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a + b;
+        }
+        __host__ __device__ __forceinline__ plus() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ plus(const plus&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct minus : binary_function<T, T, T> // Binary functor: a - b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a - b;
+        }
+        __host__ __device__ __forceinline__ minus() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ minus(const minus&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct multiplies : binary_function<T, T, T> // Binary functor: a * b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a * b;
+        }
+        __host__ __device__ __forceinline__ multiplies() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ multiplies(const multiplies&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct divides : binary_function<T, T, T> // Binary functor: a / b (no zero-divisor check is performed here).
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a / b;
+        }
+        __host__ __device__ __forceinline__ divides() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ divides(const divides&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct modulus : binary_function<T, T, T> // Binary functor: a % b (T must support operator %).
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a % b;
+        }
+        __host__ __device__ __forceinline__ modulus() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ modulus(const modulus&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct negate : unary_function<T, T> // Unary functor: -a.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
+        {
+            return -a;
+        }
+        __host__ __device__ __forceinline__ negate() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ negate(const negate&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Comparison Operations
+    template <typename T> struct equal_to : binary_function<T, T, bool> // Binary predicate: a == b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a == b;
+        }
+        __host__ __device__ __forceinline__ equal_to() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ equal_to(const equal_to&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct not_equal_to : binary_function<T, T, bool> // Binary predicate: a != b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a != b;
+        }
+        __host__ __device__ __forceinline__ not_equal_to() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ not_equal_to(const not_equal_to&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct greater : binary_function<T, T, bool> // Binary predicate: a > b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a > b;
+        }
+        __host__ __device__ __forceinline__ greater() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ greater(const greater&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct less : binary_function<T, T, bool> // Binary predicate: a < b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a < b;
+        }
+        __host__ __device__ __forceinline__ less() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ less(const less&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct greater_equal : binary_function<T, T, bool> // Binary predicate: a >= b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a >= b;
+        }
+        __host__ __device__ __forceinline__ greater_equal() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ greater_equal(const greater_equal&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct less_equal : binary_function<T, T, bool> // Binary predicate: a <= b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a <= b;
+        }
+        __host__ __device__ __forceinline__ less_equal() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ less_equal(const less_equal&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Logical Operations
+    template <typename T> struct logical_and : binary_function<T, T, bool> // Binary predicate: a && b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a && b;
+        }
+        __host__ __device__ __forceinline__ logical_and() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ logical_and(const logical_and&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct logical_or : binary_function<T, T, bool> // Binary predicate: a || b.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a,
+                                                    typename TypeTraits<T>::ParameterType b) const
+        {
+            return a || b;
+        }
+        __host__ __device__ __forceinline__ logical_or() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ logical_or(const logical_or&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct logical_not : unary_function<T, bool> // Unary predicate: !a.
+    {
+        __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
+        {
+            return !a;
+        }
+        __host__ __device__ __forceinline__ logical_not() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ logical_not(const logical_not&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Bitwise Operations
+    template <typename T> struct bit_and : binary_function<T, T, T> // Binary functor: a & b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a & b;
+        }
+        __host__ __device__ __forceinline__ bit_and() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ bit_and(const bit_and&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct bit_or : binary_function<T, T, T> // Binary functor: a | b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a | b;
+        }
+        __host__ __device__ __forceinline__ bit_or() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ bit_or(const bit_or&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct bit_xor : binary_function<T, T, T> // Binary functor: a ^ b.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a,
+                                                 typename TypeTraits<T>::ParameterType b) const
+        {
+            return a ^ b;
+        }
+        __host__ __device__ __forceinline__ bit_xor() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ bit_xor(const bit_xor&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T> struct bit_not : unary_function<T, T> // Unary functor: ~v.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
+        {
+            return ~v;
+        }
+        __host__ __device__ __forceinline__ bit_not() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ bit_not(const bit_not&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Generalized Identity Operations
+    template <typename T> struct identity : unary_function<T, T> // Unary functor that returns its argument unchanged.
+    {
+        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
+        {
+            return x; // note: returned as TypeTraits<T>::ParameterType, not as plain T
+        }
+        __host__ __device__ __forceinline__ identity() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ identity(const identity&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1> // Binary functor that returns its first argument.
+    {
+        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
+        {
+            return lhs; // rhs is intentionally ignored
+        }
+        __host__ __device__ __forceinline__ project1st() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ project1st(const project1st&) {} // copy ctor; the functor has no members to copy
+    };
+
+    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2> // Binary functor that returns its second argument.
+    {
+        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
+        {
+            return rhs; // lhs is intentionally ignored
+        }
+        __host__ __device__ __forceinline__ project2nd() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ project2nd(const project2nd&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Min/Max Operations
+
+#define OPENCV_CUDA_IMPLEMENT_MINMAX(name, type, op) /* emits the explicit specialization name<type> whose operator() forwards to op(lhs, rhs) */ \
+    template <> struct name<type> : binary_function<type, type, type> \
+    { \
+        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} /* operands are taken by value */ \
+        __host__ __device__ __forceinline__ name() {}\
+        __host__ __device__ __forceinline__ name(const name&) {}\
+    };
+
+    template <typename T> struct maximum : binary_function<T, T, T> // Generic binary functor: max(lhs, rhs) via an unqualified max call.
+    {
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return max(lhs, rhs); // unqualified call: whichever max overload is visible for T is used
+        }
+        __host__ __device__ __forceinline__ maximum() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ maximum(const maximum&) {} // copy ctor; the functor has no members to copy
+    };
+
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uchar, ::max) // explicit specializations of maximum for the built-in scalar types
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, schar, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, char, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, ushort, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, short, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, int, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, uint, ::max)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, float, ::fmax) // floating-point types go through ::fmax instead of ::max
+    OPENCV_CUDA_IMPLEMENT_MINMAX(maximum, double, ::fmax)
+
+    template <typename T> struct minimum : binary_function<T, T, T> // Generic binary functor: min(lhs, rhs) via an unqualified min call.
+    {
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
+        {
+            return min(lhs, rhs); // unqualified call: whichever min overload is visible for T is used
+        }
+        __host__ __device__ __forceinline__ minimum() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ minimum(const minimum&) {} // copy ctor; the functor has no members to copy
+    };
+
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uchar, ::min) // explicit specializations of minimum for the built-in scalar types
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, schar, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, char, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, ushort, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, short, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, int, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, uint, ::min)
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, float, ::fmin) // floating-point types go through ::fmin instead of ::min
+    OPENCV_CUDA_IMPLEMENT_MINMAX(minimum, double, ::fmin)
+
+#undef OPENCV_CUDA_IMPLEMENT_MINMAX // helper macro is local to this header
+
+    // Math functions
+
+    template <typename T> struct abs_func : unary_function<T, T> // Generic absolute-value functor; scalar types have explicit specializations below.
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
+        {
+            return abs(x); // unqualified call: whichever abs overload is visible for T is used
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char> // abs for an unsigned type is the identity.
+    {
+        __device__ __forceinline__ unsigned char operator ()(unsigned char x) const
+        {
+            return x; // unsigned value is already non-negative
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<signed char> : unary_function<signed char, signed char> // abs for signed char, computed in int.
+    {
+        __device__ __forceinline__ signed char operator ()(signed char x) const
+        {
+            return ::abs((int)x); // widened to int for ::abs, then narrowed back to signed char on return
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<char> : unary_function<char, char> // abs for plain char, computed in int.
+    {
+        __device__ __forceinline__ char operator ()(char x) const
+        {
+            return ::abs((int)x); // widened to int for ::abs, then narrowed back to char on return
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short> // abs for an unsigned type is the identity.
+    {
+        __device__ __forceinline__ unsigned short operator ()(unsigned short x) const
+        {
+            return x; // unsigned value is already non-negative
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<short> : unary_function<short, short> // abs for short, computed in int.
+    {
+        __device__ __forceinline__ short operator ()(short x) const
+        {
+            return ::abs((int)x); // widened to int for ::abs, then narrowed back to short on return
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int> // abs for an unsigned type is the identity.
+    {
+        __device__ __forceinline__ unsigned int operator ()(unsigned int x) const
+        {
+            return x; // unsigned value is already non-negative
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<int> : unary_function<int, int> // abs for int via the global ::abs.
+    {
+        __device__ __forceinline__ int operator ()(int x) const
+        {
+            return ::abs(x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<float> : unary_function<float, float> // abs for float via the single-precision ::fabsf.
+    {
+        __device__ __forceinline__ float operator ()(float x) const
+        {
+            return ::fabsf(x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+    template <> struct abs_func<double> : unary_function<double, double> // abs for double via ::fabs.
+    {
+        __device__ __forceinline__ double operator ()(double x) const
+        {
+            return ::fabs(x);
+        }
+
+        __host__ __device__ __forceinline__ abs_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ abs_func(const abs_func&) {} // copy ctor; the functor has no members to copy
+    };
+
+#define OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(name, func) /* emits unary functor name_func<T> (float result) plus a name_func<double> specialization */ \
+    template <typename T> struct name ## _func : unary_function<T, float> \
+    { \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
+        { \
+            return func ## f(v); /* generic case calls the f-suffixed function, formed by pasting 'f' onto func */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    }; \
+    template <> struct name ## _func<double> : unary_function<double, double> \
+    { \
+        __device__ __forceinline__ double operator ()(double v) const \
+        { \
+            return func(v); /* double case calls func itself and keeps a double result */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    };
+
+#define OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(name, func) /* emits binary functor name_func<T> (float result) plus a name_func<double> specialization */ \
+    template <typename T> struct name ## _func : binary_function<T, T, float> \
+    { \
+        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
+        { \
+            return func ## f(v1, v2); /* generic case calls the f-suffixed function, formed by pasting 'f' onto func */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    }; \
+    template <> struct name ## _func<double> : binary_function<double, double, double> \
+    { \
+        __device__ __forceinline__ double operator ()(double v1, double v2) const \
+        { \
+            return func(v1, v2); /* double case calls func itself and keeps a double result */ \
+        } \
+        __host__ __device__ __forceinline__ name ## _func() {} \
+        __host__ __device__ __forceinline__ name ## _func(const name ## _func&) {} \
+    };
+
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) // each line defines <name>_func, e.g. sqrt_func<T> and sqrt_func<double>
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(exp10, ::exp10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log, ::log)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log2, ::log2)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(log10, ::log10)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sin, ::sin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cos, ::cos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tan, ::tan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asin, ::asin)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acos, ::acos)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atan, ::atan)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(sinh, ::sinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(cosh, ::cosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(tanh, ::tanh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(asinh, ::asinh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(acosh, ::acosh)
+    OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR(atanh, ::atanh)
+
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(hypot, ::hypot) // two-argument variants: hypot_func, atan2_func, pow_func
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(atan2, ::atan2)
+    OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR(pow, ::pow)
+
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR
+    #undef OPENCV_CUDA_IMPLEMENT_UN_FUNCTOR_NO_DOUBLE // NOTE(review): no #define for this name is visible in this header; the #undef looks like a harmless leftover
+    #undef OPENCV_CUDA_IMPLEMENT_BIN_FUNCTOR
+
+    template<typename T> struct hypot_sqr_func : binary_function<T, T, float> // Binary functor: src1^2 + src2^2 (squared hypotenuse, no sqrt).
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
+        {
+            return src1 * src1 + src2 * src2; // NOTE(review): returned as T although the base declares result_type as float - confirm intended
+        }
+        __host__ __device__ __forceinline__ hypot_sqr_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Saturate Cast Functor
+    template <typename T, typename D> struct saturate_cast_func : unary_function<T, D> // Unary functor converting T to D through saturate_cast<D>.
+    {
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
+        {
+            return saturate_cast<D>(v);
+        }
+        __host__ __device__ __forceinline__ saturate_cast_func() {} // default ctor, usable on host and device
+        __host__ __device__ __forceinline__ saturate_cast_func(const saturate_cast_func&) {} // copy ctor; the functor has no members to copy
+    };
+
+    // Threshold Functors
+    template <typename T> struct thresh_binary_func : unary_function<T, T> // Binary threshold: maxVal where src > thresh, otherwise 0.
+    {
+        __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
+        {
+            return (src > thresh) * maxVal; // comparison result scales maxVal, so no branch is needed
+        }
+
+        __host__ __device__ __forceinline__ thresh_binary_func() {} // default ctor does not set thresh/maxVal
+        __host__ __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}
+
+        T thresh; // threshold compared against each input value
+        T maxVal; // value produced when the comparison holds
+    };
+
+    template <typename T> struct thresh_binary_inv_func : unary_function<T, T> // Inverted binary threshold: maxVal where src <= thresh, otherwise 0.
+    {
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
+        {
+            return (src <= thresh) * maxVal; // comparison result scales maxVal, so no branch is needed
+        }
+
+        __host__ __device__ __forceinline__ thresh_binary_inv_func() {} // default ctor does not set thresh/maxVal
+        __host__ __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
+            : thresh(other.thresh), maxVal(other.maxVal) {}
+
+        T thresh; // threshold compared against each input value
+        T maxVal; // value produced when the comparison holds
+    };
+
+    template <typename T> struct thresh_trunc_func : unary_function<T, T> // Truncating threshold: minimum of src and thresh.
+    {
+        explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);} // maxVal_ is accepted but ignored
+
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
+        {
+            return minimum<T>()(src, thresh); // delegates to the minimum<T> functor from this header
+        }
+
+        __host__ __device__ __forceinline__ thresh_trunc_func() {} // default ctor does not set thresh
+        __host__ __device__ __forceinline__ thresh_trunc_func(const thresh_trunc_func& other)
+            : thresh(other.thresh) {}
+
+        T thresh; // upper bound applied to each input value
+    };
+
+    template <typename T> struct thresh_to_zero_func : unary_function<T, T> // To-zero threshold: src where src > thresh, otherwise 0.
+    {
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);} // maxVal_ is accepted but ignored
+
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
+        {
+            return (src > thresh) * src; // comparison result keeps or zeroes src, so no branch is needed
+        }
+
+        __host__ __device__ __forceinline__ thresh_to_zero_func() {} // default ctor does not set thresh
+       __host__  __device__ __forceinline__ thresh_to_zero_func(const thresh_to_zero_func& other)
+            : thresh(other.thresh) {}
+
+        T thresh; // threshold compared against each input value
+    };
+
+    template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T> // Inverted to-zero threshold: src where src <= thresh, otherwise 0.
+    {
+        explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {CV_UNUSED(maxVal_);} // maxVal_ is accepted but ignored
+
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
+        {
+            return (src <= thresh) * src; // comparison result keeps or zeroes src, so no branch is needed
+        }
+
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func() {} // default ctor does not set thresh
+        __host__ __device__ __forceinline__ thresh_to_zero_inv_func(const thresh_to_zero_inv_func& other)
+            : thresh(other.thresh) {}
+
+        T thresh; // threshold compared against each input value
+    };
+
+    // Function Object Adaptors
+    template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool> // Adaptor returning the logical negation of a unary predicate.
+    {
+      explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {} // stores a copy of p
+
+      __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
+      {
+          return !pred(x);
+      }
+
+      __host__ __device__ __forceinline__ unary_negate() {} // pred is default-constructed
+      __host__ __device__ __forceinline__ unary_negate(const unary_negate& other) : pred(other.pred) {}
+
+      Predicate pred; // wrapped predicate, held by value
+    };
+
+    template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred) // Helper that deduces Predicate and wraps pred in unary_negate.
+    {
+        return unary_negate<Predicate>(pred);
+    }
+
+    template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool> // Adaptor returning the logical negation of a binary predicate.
+    {
+        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {} // stores a copy of p
+
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
+                                                   typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
+        {
+            return !pred(x,y);
+        }
+
+        __host__ __device__ __forceinline__ binary_negate() {} // pred is default-constructed
+        __host__ __device__ __forceinline__ binary_negate(const binary_negate& other) : pred(other.pred) {}
+
+        Predicate pred; // wrapped predicate, held by value
+    };
+
+    template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred) // Helper that deduces BinaryPredicate and wraps pred in binary_negate.
+    {
+        return binary_negate<BinaryPredicate>(pred);
+    }
+
+    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> // Adaptor fixing the first argument of a binary functor.
+    {
+        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} // copies both the functor and the bound value
+
+        __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
+        {
+            return op(arg1, a); // bound value goes first, caller's value second
+        }
+
+        __host__ __device__ __forceinline__ binder1st() {} // default ctor does not set arg1 explicitly
+        __host__ __device__ __forceinline__ binder1st(const binder1st& other) : op(other.op), arg1(other.arg1) {}
+
+        Op op; // wrapped binary functor, held by value
+        typename Op::first_argument_type arg1; // value bound as the first argument
+    };
+
+    template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
+    { // Helper that builds a binder1st<Op> from op and a value of any type T.
+        return binder1st<Op>(op, typename Op::first_argument_type(x)); // x is explicitly converted to Op's first argument type
+    }
+
+    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
+    { // Adapter functor: turns binary operation Op into a unary one by fixing its second argument.
+        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} // copies both the operation and the bound value
+
+        __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const // device-only, unlike the constructors
+        {
+            return op(a, arg2); // the caller's argument goes first, the bound value second
+        }
+
+        __host__ __device__ __forceinline__ binder2nd() {} // op and arg2 are default-initialized
+        __host__ __device__ __forceinline__ binder2nd(const binder2nd& other) : op(other.op), arg2(other.arg2) {}
+
+        Op op; // the wrapped binary operation
+        typename Op::second_argument_type arg2; // value supplied as the second argument on every call
+    };
+
+    template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
+    { // Helper that builds a binder2nd<Op> from op and a value of any type T.
+        return binder2nd<Op>(op, typename Op::second_argument_type(x)); // x is explicitly converted to Op's second argument type
+    }
+
+    // Functor Traits
+    template <typename F> struct IsUnaryFunction
+    { // Compile-time trait: value is nonzero when an F argument selects the unary_function<T, D> overload of check.
+        typedef char Yes; // sizeof(Yes) == 1
+        struct No {Yes a[2];}; // at least two bytes, so the two return types differ in size
+
+        template <typename T, typename D> static Yes check(unary_function<T, D>); // viable only when T and D can be deduced from F
+        static No check(...); // fallback for every other F
+
+        static F makeF(); // declaration only; referenced inside sizeof below and never called
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; // sizeof is unevaluated: only the chosen overload's return type matters
+    };
+
+    template <typename F> struct IsBinaryFunction
+    { // Compile-time trait: value is nonzero when an F argument selects the binary_function<T1, T2, D> overload of check.
+        typedef char Yes; // sizeof(Yes) == 1
+        struct No {Yes a[2];}; // at least two bytes, so the two return types differ in size
+
+        template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>); // viable only when T1, T2 and D can be deduced from F
+        static No check(...); // fallback for every other F
+
+        static F makeF(); // declaration only; referenced inside sizeof below and never called
+
+        enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; // sizeof is unevaluated: only the chosen overload's return type matters
+    };
+
+    namespace functional_detail
+    { // Helpers that derive a default "shift" constant for a functor from the sizes of its argument and result types.
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; }; // default for any other destination size
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; }; // 1-byte destination
+        template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; }; // 2-byte destination
+
+        template <typename T, typename D> struct DefaultUnaryShift
+        { // NOTE(review): only sizeof(D) changes the result; shift is presumably elements handled per thread - confirm against the transform kernels
+            enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
+        };
+
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; }; // default for any other destination size
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; }; // 1-byte destination
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; }; // 2-byte destination
+
+        template <typename T1, typename T2, typename D> struct DefaultBinaryShift
+        { // binary counterpart of DefaultUnaryShift; again only sizeof(D) changes the result
+            enum { shift = BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+        };
+
+        template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher; // declared only; both bool values are specialized below
+        template <typename Func> struct ShiftDispatcher<Func, true>
+        { // Func was detected as a unary_function: use its argument_type / result_type
+            enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
+        };
+        template <typename Func> struct ShiftDispatcher<Func, false>
+        { // anything not detected as unary is treated as binary and must provide first/second_argument_type
+            enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
+        };
+    }
+
+    template <typename Func> struct DefaultTransformShift
+    { // Public entry point for the default shift of Func; the unary/binary choice is made by ShiftDispatcher.
+        enum { shift = functional_detail::ShiftDispatcher<Func>::shift };
+    };
+
+    template <typename Func> struct DefaultTransformFunctorTraits
+    { // Default tuning constants for a functor; NOTE(review): presumably consumed by the transform kernels as block sizes - confirm there.
+        enum { simple_block_dim_x = 16 };
+        enum { simple_block_dim_y = 16 };
+
+        enum { smart_block_dim_x = 16 };
+        enum { smart_block_dim_y = 16 };
+        enum { smart_shift = DefaultTransformShift<Func>::shift }; // derived from the functor's argument/result sizes
+    };
+
+    template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {}; // primary template: inherits every default; specialize it to override
+
+#define OPENCV_CUDA_TRANSFORM_FUNCTOR_TRAITS(type) \
+    template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type > // expands to the head of an explicit specialization; the user writes the { ... }; body
+}}} // namespace cv { namespace cuda { namespace cudev
+
+//! @endcond
+
+#endif // OPENCV_CUDA_FUNCTIONAL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/limits.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/limits.hpp
new file mode 100644
index 0000000..7e15ed6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/limits.hpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_LIMITS_HPP
+#define OPENCV_CUDA_LIMITS_HPP
+
+#include <limits.h>
+#include <float.h>
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+template <class T> struct numeric_limits; // primary template is declared only; just the specializations below are defined here
+
+template <> struct numeric_limits<bool>
+{ // min()/max() are device-only (no __host__ qualifier); the same holds for every specialization below
+    __device__ __forceinline__ static bool min() { return false; }
+    __device__ __forceinline__ static bool max() { return true;  }
+    static const bool is_signed = false;
+};
+
+template <> struct numeric_limits<signed char>
+{ // bounds come from the <limits.h> macros
+    __device__ __forceinline__ static signed char min() { return SCHAR_MIN; }
+    __device__ __forceinline__ static signed char max() { return SCHAR_MAX; }
+    static const bool is_signed = true;
+};
+
+template <> struct numeric_limits<unsigned char>
+{
+    __device__ __forceinline__ static unsigned char min() { return 0; }
+    __device__ __forceinline__ static unsigned char max() { return UCHAR_MAX; }
+    static const bool is_signed = false;
+};
+
+template <> struct numeric_limits<short>
+{
+    __device__ __forceinline__ static short min() { return SHRT_MIN; }
+    __device__ __forceinline__ static short max() { return SHRT_MAX; }
+    static const bool is_signed = true;
+};
+
+template <> struct numeric_limits<unsigned short>
+{
+    __device__ __forceinline__ static unsigned short min() { return 0; }
+    __device__ __forceinline__ static unsigned short max() { return USHRT_MAX; }
+    static const bool is_signed = false;
+};
+
+template <> struct numeric_limits<int>
+{
+    __device__ __forceinline__ static int min() { return INT_MIN; }
+    __device__ __forceinline__ static int max() { return INT_MAX; }
+    static const bool is_signed = true;
+};
+
+template <> struct numeric_limits<unsigned int>
+{
+    __device__ __forceinline__ static unsigned int min() { return 0; }
+    __device__ __forceinline__ static unsigned int max() { return UINT_MAX; }
+    static const bool is_signed = false;
+};
+
+template <> struct numeric_limits<float>
+{ // bounds come from the <float.h> macros; only the floating-point specializations provide epsilon()
+    __device__ __forceinline__ static float min() { return FLT_MIN; } // smallest positive normalized float, not the most negative value
+    __device__ __forceinline__ static float max() { return FLT_MAX; }
+    __device__ __forceinline__ static float epsilon() { return FLT_EPSILON; }
+    static const bool is_signed = true;
+};
+
+template <> struct numeric_limits<double>
+{
+    __device__ __forceinline__ static double min() { return DBL_MIN; } // smallest positive normalized double, not the most negative value
+    __device__ __forceinline__ static double max() { return DBL_MAX; }
+    __device__ __forceinline__ static double epsilon() { return DBL_EPSILON; }
+    static const bool is_signed = true;
+};
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_LIMITS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/reduce.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/reduce.hpp
new file mode 100644
index 0000000..5de3650
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/reduce.hpp
@@ -0,0 +1,209 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_REDUCE_HPP
+#define OPENCV_CUDA_REDUCE_HPP
+
+#ifndef THRUST_DEBUG // eliminate -Wundef warning
+#define THRUST_DEBUG 0
+#endif
+
+#include <thrust/tuple.h>
+#include "detail/reduce.hpp"
+#include "detail/reduce_key_val.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <int N, typename T, class Op>
+    __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
+    { // thin wrapper: forwards to the reductor type selected by reduce_detail::Dispatcher<N> (not defined in this header)
+        reduce_detail::Dispatcher<N>::reductor::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
+    }
+    template <int N,
+              typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+              typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+              class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+    __device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                           unsigned int tid,
+                                           const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+    { // tuple variant: same forwarding, with smem, val and op each packed as a thrust::tuple
+        reduce_detail::Dispatcher<N>::reductor::template reduce<
+                const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>&,
+                const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>&,
+                const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>&>(smem, val, tid, op);
+    }
+
+    template <unsigned int N, typename K, typename V, class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
+    { // single key / single value; NOTE(review): N is unsigned int here but int in reduce() above
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+    template <unsigned int N,
+              typename K,
+              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+              class Cmp>
+    __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
+                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, const Cmp& cmp)
+    { // single key with a tuple of values and one comparator
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<volatile K*, K&,
+                const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+                const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+                const Cmp&>(skeys, key, svals, val, tid, cmp);
+    }
+    template <unsigned int N,
+              typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+    __device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                                 const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                 const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid,
+                                                 const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
+    { // tuples of keys, values and comparators
+        reduce_key_val_detail::Dispatcher<N>::reductor::template reduce<
+                const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>&,
+                const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>&,
+                const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>&,
+                const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>&,
+                const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>&
+                >(skeys, key, svals, val, tid, cmp);
+    }
+
+    // smem_tuple: overloads for 1 to 10 pointers; each casts every pointer to volatile and returns them packed in one thrust::tuple
+
+    template <typename T0>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*>
+    smem_tuple(T0* t0)
+    {
+        return thrust::make_tuple((volatile T0*) t0);
+    }
+
+    template <typename T0, typename T1>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*>
+    smem_tuple(T0* t0, T1* t1)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
+    }
+
+    template <typename T0, typename T1, typename T2>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
+    smem_tuple(T0* t0, T1* t1, T2* t2)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
+    }
+
+    template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+    __device__ __forceinline__
+    thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
+    smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
+    {
+        return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9);
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_REDUCE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/saturate_cast.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/saturate_cast.hpp
new file mode 100644
index 0000000..c3a3d1c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/saturate_cast.hpp
@@ -0,0 +1,292 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_SATURATE_CAST_HPP
+#define OPENCV_CUDA_SATURATE_CAST_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } // generic fallbacks: plain _Tp(v) conversion, no clamping
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
+    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } // the explicit specializations below replace these for specific (destination, source) pairs
+
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
+    { // ---- destination uchar ----
+        uint res = 0;
+        int vi = v; // widened copy, because the operand is bound with the 32-bit "r" constraint
+        asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi)); // PTX convert with .sat: result is clamped to the destination range
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); // "h" binds a 16-bit operand
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
+    {
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v)); // .rni: round to nearest integer, then saturate
+        return res;
+    }
+    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 // f64 conversion is emitted only when compiling device code for arch >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
+    #else // otherwise narrow to float and reuse the float specialization
+        return saturate_cast<uchar>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
+    { // ---- destination schar ----
+        uint res = 0;
+        uint vi = v; // widened copy for the 32-bit "r" operand
+        asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
+    {
+        uint res = 0;
+        asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
+    {
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+        uint res = 0;
+        asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
+        return res;
+    #else // fall back to the float specialization
+        return saturate_cast<schar>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
+    { // ---- destination ushort: results are written through the 16-bit "=h" constraint ----
+        ushort res = 0;
+        int vi = v; // widened copy for the 32-bit "r" operand
+        asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
+    {
+        ushort res = 0;
+        asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
+    {
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+        ushort res = 0;
+        asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
+    #else // fall back to the float specialization
+        return saturate_cast<ushort>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
+    { // ---- destination short ----
+        short res = 0;
+        asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
+    {
+        short res = 0;
+        asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
+    {
+        short res = 0;
+        asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
+    {
+        short res = 0;
+        asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+        short res = 0;
+        asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
+        return res;
+    #else // fall back to the float specialization
+        return saturate_cast<short>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
+    { // ---- destination int ----
+        int res = 0;
+        asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
+    {
+        return __float2int_rn(v); // NOTE(review): no explicit clamp here; relies on the intrinsic's out-of-range behaviour - confirm in the CUDA math API docs
+    }
+    template<> __device__ __forceinline__ int saturate_cast<int>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+        return __double2int_rn(v);
+    #else // fall back to the float specialization
+        return saturate_cast<int>((float)v);
+    #endif
+    }
+
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
+    { // ---- destination uint ----
+        uint res = 0;
+        int vi = v; // widened copy for the 32-bit "r" operand
+        asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
+    {
+        uint res = 0;
+        asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v));
+        return res;
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
+    {
+        return __float2uint_rn(v); // NOTE(review): no explicit clamp here; relies on the intrinsic's out-of-range behaviour - confirm in the CUDA math API docs
+    }
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
+        return __double2uint_rn(v);
+    #else // fall back to the float specialization
+        return saturate_cast<uint>((float)v);
+    #endif
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_SATURATE_CAST_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/scan.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/scan.hpp
new file mode 100644
index 0000000..e128fb0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/scan.hpp
@@ -0,0 +1,258 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_SCAN_HPP
+#define OPENCV_CUDA_SCAN_HPP
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/utility.hpp"
+#include "opencv2/core/cuda/warp.hpp"
+#include "opencv2/core/cuda/warp_shuffle.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 }; // INCLUSIVE: a slot's result counts its own element; EXCLUSIVE: it stops at the preceding slot
+
+    template <ScanKind Kind, typename T, typename F> struct WarpScan
+    { // In-place scan over groups of 32 consecutive slots of a volatile buffer, combining elements with functor F.
+        __device__ __forceinline__ WarpScan() {}
+        __device__ __forceinline__ WarpScan(const WarpScan& other) { CV_UNUSED(other); }
+        // Scans ptr in place with strides 1, 2, 4, 8, 16 and returns the result for slot idx.
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = idx & 31; // position of idx inside its group of 32 slots
+            F op;
+            // Every step is guarded by the lane, so idx - stride never reaches below the first slot of the group.
+            if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+            // NOTE(review): no barrier separates the steps; presumably this relies on the 32 threads of a warp running in lockstep - confirm.
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0; // exclusive: the predecessor's inclusive value, or 0 for the first lane
+        }
+        // Maps a thread id to its slot in ptr; this layout uses the thread id unchanged.
+        __device__ __forceinline__ unsigned int index(const unsigned int tid)
+        {
+            return tid;
+        }
+        // Empty hook: this layout has nothing to initialise.
+        __device__ __forceinline__ void init(volatile T *ptr){}
+
+        static const int warp_offset      = 0; // base slot that BlockScan adds to the warp number when storing per-warp totals
+
+        typedef WarpScan<INCLUSIVE, T, F>  merge; // scan type BlockScan instantiates to combine the per-warp totals
+    };
+
+    template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
+    { // Variant of WarpScan whose scan steps carry no lane comparisons; index() leaves 16 extra slots in front of each warp's data.
+        __device__ __forceinline__ WarpScanNoComp() {}
+        __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { CV_UNUSED(other); }
+        // Scans ptr in place with strides 1, 2, 4, 8, 16 and returns the result for slot idx.
+        __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+        {
+            const unsigned int lane = threadIdx.x & 31; // lane is taken from the thread id here, not from idx
+            F op;
+            // Unconditional steps: they read up to 16 slots below idx, i.e. into the slots index() skips over.
+            ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+            ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+            ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+            // NOTE(review): presumably the caller fills the skipped slots with a neutral value for F before scanning - confirm.
+            if( Kind == INCLUSIVE )
+                return ptr [idx];
+            else
+                return (lane > 0) ? ptr [idx - 1] : 0; // exclusive: the predecessor's inclusive value, or 0 for the first lane
+        }
+        // Maps a thread id to its slot: warp number times the per-warp stride, plus 16 skipped slots, plus the lane.
+        __device__ __forceinline__ unsigned int index(const unsigned int tid)
+        {
+            return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
+        }
+        // Zeroes one slot per calling thread, addressed by the raw thread id (not by index()).
+        __device__ __forceinline__ void init(volatile T *ptr)
+        {
+            ptr[threadIdx.x] = 0;
+        }
+
+        static const int warp_smem_stride = 32 + 16 + 1; // slots reserved per warp, as used by index()
+        static const int warp_offset      = 16; // base slot that BlockScan adds to the warp number when storing per-warp totals
+        static const int warp_log         = 5;  // tid >> 5 yields the warp number
+        static const int warp_mask        = 31; // tid & 31 yields the lane
+
+        typedef WarpScanNoComp<INCLUSIVE, T, F> merge; // scan type BlockScan instantiates to combine the per-warp totals
+    };
+
+    template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
+    { // Block-wide scan assembled from per-warp scans of type Sc plus a second scan (Sc::merge) over the per-warp totals.
+        __device__ __forceinline__ BlockScan() {}
+        __device__ __forceinline__ BlockScan(const BlockScan& other) { CV_UNUSED(other); }
+        // Scans the buffer ptr in place and returns the calling thread's result; each phase ends with a __syncthreads() barrier.
+        __device__ __forceinline__ T operator()(volatile T *ptr)
+        {
+            const unsigned int tid  = threadIdx.x;
+            const unsigned int lane = tid & warp_mask;
+            const unsigned int warp = tid >> warp_log;
+
+            Sc scan;
+            typename Sc::merge merge_scan;
+            const unsigned int idx = scan.index(tid); // slot of this thread in the layout chosen by Sc
+            // Phase 1: every warp scans its own segment.
+            T val = scan(ptr, idx);
+            __syncthreads ();
+            // Phase 2: the first warp runs the layout's init hook.
+            if( warp == 0)
+                scan.init(ptr);
+            __syncthreads ();
+            // Phase 3: lane 31 of each warp stores that warp's total at slot warp_offset + warp.
+            if( lane == 31 )
+                ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+            __syncthreads ();
+            // Phase 4: the first warp scans the stored totals with the Sc::merge scan.
+            if( warp == 0 )
+                merge_scan(ptr, idx);
+            __syncthreads();
+            // Phase 5: warps after the first add the scanned total of the preceding warp.
+            if ( warp > 0)
+                val = ptr [scan.warp_offset + warp - 1] + val; // NOTE(review): combined with operator+, not with F - confirm F is only meant for addition-like use
+            __syncthreads ();
+            // Phase 6: write the final value back to this thread's slot.
+            ptr[idx] = val;
+            __syncthreads ();
+
+            return val ;
+        }
+
+        static const int warp_log  = 5;  // tid >> 5 yields the warp number
+        static const int warp_mask = 31; // tid & 31 yields the lane
+    };
+
+    template <typename T>
+    __device__ T warpScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
+    { // Inclusive additive scan of idata across a warp; returns the running sum for the calling thread.
+    #if __CUDA_ARCH__ >= 300
+        const unsigned int laneId = cv::cuda::device::Warp::laneId();
+
+        // scan built on the shuffle helpers; s_Data and tid are not used on this path
+        #pragma unroll
+        for (int i = 1; i <= (OPENCV_CUDA_WARP_SIZE / 2); i *= 2)
+        {
+            const T n = cv::cuda::device::shfl_up(idata, i); // value fetched through shfl_up with delta i
+            if (laneId >= i) // lanes below i skip the add for this step
+                  idata += n;
+        }
+
+        return idata;
+    #else
+        unsigned int pos = 2 * tid - (tid & (OPENCV_CUDA_WARP_SIZE - 1)); // two warp-sized stretches of s_Data per warp; pos starts in the first one
+        s_Data[pos] = 0; // the first stretch is zero-filled so the unguarded reads below add nothing
+        pos += OPENCV_CUDA_WARP_SIZE;
+        s_Data[pos] = idata;
+        // Unguarded strides 1..16: NOTE(review) the literals presume a warp size of 32 and lockstep execution without barriers - confirm.
+        s_Data[pos] += s_Data[pos - 1];
+        s_Data[pos] += s_Data[pos - 2];
+        s_Data[pos] += s_Data[pos - 4];
+        s_Data[pos] += s_Data[pos - 8];
+        s_Data[pos] += s_Data[pos - 16];
+
+        return s_Data[pos];
+    #endif
+    }
+
+    template <typename T>
+    __device__ __forceinline__ T warpScanExclusive(T idata, volatile T* s_Data, unsigned int tid)
+    { // Exclusive additive scan across a warp, derived from the inclusive scan.
+        return warpScanInclusive(idata, s_Data, tid) - idata; // inclusive sum minus the caller's own contribution
+    }
+
+    template <int tiNumScanThreads, typename T>
+    __device__ T blockScanInclusive(T idata, volatile T* s_Data, unsigned int tid)
+    { // Inclusive additive scan of idata over tiNumScanThreads threads, using s_Data as scratch; returns the calling thread's running sum.
+        if (tiNumScanThreads > OPENCV_CUDA_WARP_SIZE)
+        {
+            //Bottom-level inclusive warp scan
+            T warpResult = warpScanInclusive(idata, s_Data, tid);
+
+            //Save top elements of each warp for exclusive warp scan
+            //sync to wait for warp scans to complete (because s_Data is being overwritten)
+            __syncthreads();
+            if ((tid & (OPENCV_CUDA_WARP_SIZE - 1)) == (OPENCV_CUDA_WARP_SIZE - 1)) // last lane of each warp
+            {
+                s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE] = warpResult; // slot index is the warp number
+            }
+
+            //wait for the per-warp totals to be stored
+            __syncthreads();
+
+            if (tid < (tiNumScanThreads / OPENCV_CUDA_WARP_SIZE) ) // one thread per stored warp total
+            {
+                //grab top warp elements
+                T val = s_Data[tid];
+                //calculate exclusive scan and write back to shared memory
+                s_Data[tid] = warpScanExclusive(val, s_Data, tid);
+            }
+
+            //return updated warp scans with exclusive scan results
+            __syncthreads();
+
+            return warpResult + s_Data[tid >> OPENCV_CUDA_LOG_WARP_SIZE]; // own warp's running sum plus the total of all preceding warps
+        }
+        else
+        {
+            return warpScanInclusive(idata, s_Data, tid); // a single warp suffices, no cross-warp merge
+        }
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_SCAN_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/simd_functions.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/simd_functions.hpp
new file mode 100644
index 0000000..3d8c2e0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/simd_functions.hpp
@@ -0,0 +1,869 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ *   Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ *   Neither the name of NVIDIA Corporation nor the names of its contributors
+ *   may be used to endorse or promote products derived from this software
+ *   without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef OPENCV_CUDA_SIMD_FUNCTIONS_HPP
+#define OPENCV_CUDA_SIMD_FUNCTIONS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    // 2: operations on two unsigned 16-bit halfwords packed into one 32-bit word
+
+    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
+    { // Adds a and b as two independent unsigned 16-bit halfwords packed in 32 bits.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #elif __CUDA_ARCH__ >= 200
+        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1); the previous r is fed back in as the last operand
+    #else // portable path; NOTE(review): it wraps each halfword on overflow while the asm forms carry .sat - confirm the difference is acceptable
+        unsigned int s;
+        s = a ^ b;          // sum bits
+        r = a + b;          // actual sum
+        s = s ^ r;          // determine carry-ins for each bit position
+        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
+        r = r - s;          // subtract out carry-out from low word
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
+    { // Subtracts b from a as two independent unsigned 16-bit halfwords packed in 32 bits.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #elif __CUDA_ARCH__ >= 200
+        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1); the previous r is fed back in as the last operand
+    #else // portable path; NOTE(review): it wraps each halfword on underflow while the asm forms carry .sat - confirm the difference is acceptable
+        unsigned int s;
+        s = a ^ b;          // difference bits without borrows
+        r = a - b;          // actual difference
+        s = s ^ r;          // determine borrow-ins for each bit position
+        s = s & 0x00010000; // borrow to high word
+        r = r + s;          // compensate for borrow from low word
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
+    { // Absolute difference |a - b| computed separately for the two unsigned 16-bit halfwords.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #elif __CUDA_ARCH__ >= 200
+        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1); the previous r is fed back in as the last operand
+    #else // portable path: max minus min per halfword, so no borrow can cross the halfword boundary
+        unsigned int s, t, u, v;
+        s = a & 0x0000ffff; // extract low halfword
+        r = b & 0x0000ffff; // extract low halfword
+        u = ::max(r, s);    // maximum of low halfwords
+        v = ::min(r, s);    // minimum of low halfwords
+        s = a & 0xffff0000; // extract high halfword
+        r = b & 0xffff0000; // extract high halfword
+        t = ::max(r, s);    // maximum of high halfwords
+        s = ::min(r, s);    // minimum of high halfwords
+        r = u | t;          // maximum of both halfwords
+        s = v | s;          // minimum of both halfwords
+        r = r - s;          // |a - b| = max(a,b) - min(a,b);
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
+    {
+        // Average of a and b per unsigned 16-bit halfword, rounded down.
+        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
+        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
+        const unsigned int shared = a & b;    // bits set in both operands
+        const unsigned int differing = a ^ b; // bits set in exactly one operand
+
+        // Clearing bit 0 of each halfword before the shift keeps the lowest bit
+        // of the high halfword from sliding into the low halfword.
+        const unsigned int halved = (differing & 0xfffefffe) >> 1;
+
+        return shared + halved;
+    }
+
+    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
+    { // Average of a and b per unsigned 16-bit halfword, rounded up: (a + b + 1) / 2.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #else
+        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
+        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
+        unsigned int s;
+        s = a ^ b;
+        r = a | b;
+        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
+        s = s >> 1;
+        r = r - s;
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
+    { // Per-halfword equality test: each 16-bit halfword of the result is 1 where a and b match, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r & ~c;         // msb = 1, if r was 0x0000
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
+    { // Per-halfword equality mask: each 16-bit halfword of the result is 0xffff where a and b match, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vseteq2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r & ~c;         // msb = 1, if r was 0x0000
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a >= b test: each 16-bit halfword of the result is 1 where it holds, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, in place
+        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a >= b mask: each 16-bit halfword of the result is 0xffff where it holds, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetge2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, in place
+        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a > b test: each 16-bit halfword of the result is 1 where it holds, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, in place
+        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80008000; // msbs = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a > b mask: each 16-bit halfword of the result is 0xffff where it holds, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetgt2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(b)); // b = ~b, in place
+        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80008000; // msbs = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a <= b test: each 16-bit halfword of the result is 1 where it holds, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a, in place
+        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a <= b mask: each 16-bit halfword of the result is 0xffff where it holds, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetle2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a, in place
+        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a < b test: each 16-bit halfword of the result is 1 where it holds, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a, in place
+        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
+    { // Per-halfword unsigned a < b mask: each 16-bit halfword of the result is 0xffff where it holds, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetlt2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(a)); // a = ~a, in place
+        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80008000; // msb = carry-outs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
+    { // Per-halfword inequality test: each 16-bit halfword of the result is 1 where a and b differ, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares both halfwords
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r | c;          // msb = 1, if r was not 0x0000
+        c = c & 0x80008000; // extract msbs
+        r = c >> 15;        // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
+    { // Per-halfword inequality mask: each 16-bit halfword of the result is 0xffff where a and b differ, else 0x0000.
+        unsigned int r, c;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vsetne2(a, b);  // 0/1 flag per halfword
+        c = r << 16;        // convert bool
+        r = c - r;          //  into mask
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x0000 if a == b
+        c = r | 0x80008000; // set msbs, to catch carry out
+        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
+        c = r | c;          // msb = 1, if r was not 0x0000
+        c = c & 0x80008000; // extract msbs
+        r = c >> 15;        // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
+    { // Maximum of a and b taken separately for the two unsigned 16-bit halfwords.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1); the previous r is fed back in as the last operand
+    #else // portable path: compare the masked halves one at a time, then recombine
+        unsigned int s, t, u;
+        r = a & 0x0000ffff; // extract low halfword
+        s = b & 0x0000ffff; // extract low halfword
+        t = ::max(r, s);    // maximum of low halfwords
+        r = a & 0xffff0000; // extract high halfword
+        s = b & 0xffff0000; // extract high halfword
+        u = ::max(r, s);    // maximum of high halfwords
+        r = t | u;          // combine halfword maximums
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
+    { // Minimum of a and b taken separately for the two unsigned 16-bit halfwords.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles both halfwords
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // low halfword (.h0)
+        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // high halfword (.h1); the previous r is fed back in as the last operand
+    #else // portable path: compare the masked halves one at a time, then recombine
+        unsigned int s, t, u;
+        r = a & 0x0000ffff; // extract low halfword
+        s = b & 0x0000ffff; // extract low halfword
+        t = ::min(r, s);    // minimum of low halfwords
+        r = a & 0xffff0000; // extract high halfword
+        s = b & 0xffff0000; // extract high halfword
+        u = ::min(r, s);    // minimum of high halfwords
+        r = t | u;          // combine halfword minimums
+    #endif
+
+        return r;
+    }
+
+    // 4: operations on four unsigned 8-bit bytes packed into one 32-bit word
+
+    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
+    { // Adds a and b as four independent unsigned 8-bit bytes packed in 32 bits.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles all four bytes
+    #elif __CUDA_ARCH__ >= 200
+        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 0
+        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 1; the previous r is fed back in as the last operand
+        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 2
+        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 3
+    #else // portable path; NOTE(review): it wraps each byte on overflow while the asm forms carry .sat - confirm the difference is acceptable
+        unsigned int s, t;
+        s = a ^ b;          // sum bits
+        r = a & 0x7f7f7f7f; // clear msbs
+        t = b & 0x7f7f7f7f; // clear msbs
+        s = s & 0x80808080; // msb sum bits
+        r = r + t;          // add without msbs, record carry-out in msbs
+        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
+    #endif /* __CUDA_ARCH__ >= 300 */
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
+    { // Subtracts b from a as four independent unsigned 8-bit bytes packed in 32 bits.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm forms
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles all four bytes
+    #elif __CUDA_ARCH__ >= 200
+        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 0
+        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 1; the previous r is fed back in as the last operand
+        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 2
+        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // byte 3
+    #else // portable path; NOTE(review): it wraps each byte on underflow while the asm forms carry .sat - confirm the difference is acceptable
+        unsigned int s, t;
+        s = a ^ ~b;         // inverted sum bits
+        r = a | 0x80808080; // set msbs
+        t = b & 0x7f7f7f7f; // clear msbs
+        s = s & 0x80808080; // inverted msb sum bits
+        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
+        r = r ^ s;          // combine inverted msb sum bits and borrows
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
+    {
+        // Average of a and b per unsigned 8-bit byte, rounded down.
+        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
+        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
+        const unsigned int shared = a & b;    // bits set in both operands
+        const unsigned int differing = a ^ b; // bits set in exactly one operand
+
+        // Clearing bit 0 of each byte before the shift keeps the lowest bit
+        // of one byte from sliding into the byte below it.
+        const unsigned int halved = (differing & 0xfefefefe) >> 1;
+
+        return shared + halved;
+    }
+
+    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
+    { // Average of a and b per unsigned 8-bit byte, rounded up: (a + b + 1) / 2.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction handles all four bytes
+    #else
+        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
+        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
+        unsigned int c;
+        c = a ^ b;
+        r = a | b;
+        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
+        c = c >> 1;
+        r = r - c;
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
+    { // Per-byte equality test: each 8-bit byte of the result is 1 where a and b match, else 0.
+        unsigned int r = 0; // zero-initialised: r is also passed as the last input operand of the asm form
+
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r)); // one instruction compares all four bytes
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r & ~c;         // msb = 1, if r was 0x00
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
+    { // Per-byte equality mask: each 8-bit byte of the result is 0xff where a and b match, else 0x00.
+        unsigned int r, t;
+
+    #if __CUDA_ARCH__ >= 300
+        r = vseteq4(a, b);  // 0/1 flag per byte
+        t = r << 8;         // convert bool
+        r = t - r;          //  to mask
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        t = a ^ b;          // 0x00 if a == b
+        r = t | 0x80808080; // set msbs, to catch carry out
+        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
+        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
+        r = t & ~r;         // msb = 1, if t was 0x00
+        t = r >> 7;         // build mask
+        t = r - t;          //  from
+        r = t | r;          //   msbs
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned "a <= b" test: each result byte is 0x01 where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a));
+        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-byte unsigned "a <= b" mask: each result byte is 0xff where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        r = vsetle4(a, b);
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(a));
+        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
+        c = c & 0x80808080; // msbs = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned "a < b" test: each result byte is 0x01 where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(a));
+        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-byte unsigned "a < b" mask: each result byte is 0xff where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        r = vsetlt4(a, b);
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(a));
+        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
+        c = c & 0x80808080; // msbs = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned "a >= b" test: each result byte is 0x01 where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b));
+        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
+    {
+        unsigned int r, s;
+        // Per-byte unsigned "a >= b" mask: each result byte is 0xff where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        r = vsetge4(a, b);
+        s = r << 8;         // convert bool
+        r = s - r;          //  to mask
+    #else
+        asm ("not.b32 %0,%0;" : "+r"(b));
+        r = vavrg4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
+        r = r & 0x80808080; // msb = carry-outs
+        s = r >> 7;         // build mask
+        s = r - s;          //  from
+        r = s | r;          //   msbs
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned "a > b" test: each result byte is 0x01 where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int c;
+        asm("not.b32 %0, %0;" : "+r"(b));
+        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-byte unsigned "a > b" mask: each result byte is 0xff where it holds, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        r = vsetgt4(a, b);
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else
+        asm("not.b32 %0, %0;" : "+r"(b));
+        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
+        c = c & 0x80808080; // msb = carry-outs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte inequality test: each result byte is 0x01 where the bytes of a and b differ, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        unsigned int c;
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r | c;          // msb = 1, if r was not 0x00
+        c = c & 0x80808080; // extract msbs
+        r = c >> 7;         // convert to bool
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
+    {
+        unsigned int r, c;
+        // Per-byte inequality mask: each result byte is 0xff where the bytes of a and b differ, else 0x00.
+    #if __CUDA_ARCH__ >= 300
+        r = vsetne4(a, b);
+        c = r << 8;         // convert bool
+        r = c - r;          //  to mask
+    #else
+        // inspired by Alan Mycroft's null-byte detection algorithm:
+        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
+        r = a ^ b;          // 0x00 if a == b
+        c = r | 0x80808080; // set msbs, to catch carry out
+        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
+        c = r | c;          // msb = 1, if r was not 0x00
+        c = c & 0x80808080; // extract msbs
+        r = c >> 7;         // convert
+        r = c - r;          //  msbs to
+        r = c | r;          //   mask
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte absolute difference |a - b| of the four unsigned bytes packed in a and b.
+    #if __CUDA_ARCH__ >= 300
+        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200
+        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int s;
+        s = vcmpge4(a, b);  // mask = 0xff if a >= b
+        r = a ^ b;          //
+        s = (r &  s) ^ b;   // select a when a >= b, else select b => max(a,b)
+        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
+        r = s - r;          // |a - b| = max(a,b) - min(a,b);
+    #endif
+
+        return r;
+    }
+
+    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned maximum of the four bytes packed in a and b.
+    #if __CUDA_ARCH__ >= 300
+        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int s;
+        s = vcmpge4(a, b);  // mask = 0xff if a >= b
+        r = a & s;          // select a when a >= b
+        s = b & ~s;         // select b when a < b
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;           // byte-wise unsigned maximum
+    }
+
+    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
+    {
+        unsigned int r = 0;
+        // Per-byte unsigned minimum of the four bytes packed in a and b.
+    #if __CUDA_ARCH__ >= 300
+        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #elif __CUDA_ARCH__ >= 200
+        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
+    #else
+        unsigned int s;
+        s = vcmpge4(b, a);  // mask = 0xff if b >= a
+        r = a & s;          // select a when b >= a
+        s = b & ~s;         // select b when b < a
+        r = r | s;          // combine byte selections
+    #endif
+
+        return r;
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_SIMD_FUNCTIONS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/transform.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/transform.hpp
new file mode 100644
index 0000000..42aa6ea
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/transform.hpp
@@ -0,0 +1,75 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TRANSFORM_HPP
+#define OPENCV_CUDA_TRANSFORM_HPP
+
+#include "common.hpp"
+#include "utility.hpp"
+#include "detail/transform_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T, typename D, typename UnOp, typename Mask>
+    static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
+    {   // Unary transform entry point: forwards src, dst, op, mask and stream to a dispatcher picked at compile time.
+        typedef TransformFunctorTraits<UnOp> ft; // traits of the functor; only smart_shift is read here
+        transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream); // flag is true only for single-channel T and D with smart_shift != 1; the two dispatcher variants live in detail/transform_detail.hpp
+    }
+
+    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+    static inline void transform(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
+    {   // Binary transform entry point: forwards src1, src2, dst, op, mask and stream to a dispatcher picked at compile time.
+        typedef TransformFunctorTraits<BinOp> ft; // traits of the functor; only smart_shift is read here
+        transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream); // flag is true only for single-channel T1, T2 and D with smart_shift != 1
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TRANSFORM_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/type_traits.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/type_traits.hpp
new file mode 100644
index 0000000..8b7a3fd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/type_traits.hpp
@@ -0,0 +1,90 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TYPE_TRAITS_HPP
+#define OPENCV_CUDA_TYPE_TRAITS_HPP
+
+#include "detail/type_traits_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T> struct IsSimpleParameter
+    {   // value is nonzero when the type_traits_detail helpers report T as integral, as floating point, or (after ReferenceTraits is applied) as a pointer.
+        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
+            type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
+    };
+
+    template <typename T> struct TypeTraits
+    {   // Compile-time facts about T, assembled from the helper templates in type_traits_detail.
+        typedef typename type_traits_detail::UnConst<T>::type                                                NonConstType;
+        typedef typename type_traits_detail::UnVolatile<T>::type                                             NonVolatileType;
+        typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;
+        typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type                            PointeeType;
+        typedef typename type_traits_detail::ReferenceTraits<T>::type                                        ReferredType;
+        // isConst / isVolatile reuse the ::value flags of the UnConst / UnVolatile helpers applied to T.
+        enum { isConst          = type_traits_detail::UnConst<T>::value };
+        enum { isVolatile       = type_traits_detail::UnVolatile<T>::value };
+        // Reference and pointer checks are made on UnqualifiedType; isPointer additionally goes through ReferenceTraits first.
+        enum { isReference      = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
+        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
+        // Arithmetic / vector classification of UnqualifiedType ("IsSignedIntergral" is the helper's actual spelling).
+        enum { isUnsignedInt    = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isSignedInt      = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
+        enum { isIntegral       = type_traits_detail::IsIntegral<UnqualifiedType>::value };
+        enum { isFloat          = type_traits_detail::IsFloat<UnqualifiedType>::value };
+        enum { isArith          = isIntegral || isFloat };
+        enum { isVec            = type_traits_detail::IsVec<UnqualifiedType>::value };
+        // ParameterType: Select chooses between T and AddParameterType<T>::type based on IsSimpleParameter (presumably T when it is true - confirm in the detail header).
+        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
+            T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
+    };
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TYPE_TRAITS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/utility.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/utility.hpp
new file mode 100644
index 0000000..7f5db48
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/utility.hpp
@@ -0,0 +1,230 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_UTILITY_HPP
+#define OPENCV_CUDA_UTILITY_HPP
+
+#include "saturate_cast.hpp"
+#include "datamov_utils.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct CV_EXPORTS ThrustAllocator
+    {   // Abstract allocator interface over uchar buffers; concrete allocators implement allocate() and deallocate().
+        typedef uchar value_type;
+        virtual ~ThrustAllocator();
+        virtual __device__ __host__ uchar* allocate(size_t numBytes) = 0; // pure virtual: returns a buffer for numBytes bytes
+        virtual __device__ __host__ void deallocate(uchar* ptr, size_t numBytes) = 0; // pure virtual: releases ptr
+        static ThrustAllocator& getAllocator(); // defined elsewhere; presumably returns the allocator installed via setAllocator - TODO confirm
+        static void setAllocator(ThrustAllocator* allocator); // defined elsewhere
+    };
+    #define OPENCV_CUDA_LOG_WARP_SIZE        (5) // log2 of the warp size
+    #define OPENCV_CUDA_WARP_SIZE            (1 << OPENCV_CUDA_LOG_WARP_SIZE) // 1 << 5 = 32
+    #define OPENCV_CUDA_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
+    #define OPENCV_CUDA_MEM_BANKS            (1 << OPENCV_CUDA_LOG_MEM_BANKS) // 32 or 16, following the log2 value above
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // swap
+
+    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
+    {   // Exchanges the values of a and b through one temporary copy (T must be copyable and assignable).
+        const T temp = a;
+        a = b;
+        b = temp;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Mask Reader
+
+    struct SingleMask
+    {
+        explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
+        __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
+        // Mask predicate: true when the mask element at row y, column x is nonzero.
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            return mask.ptr(y)[x] != 0;
+        }
+        // The wrapped mask; stored by value.
+        PtrStepb mask;
+    };
+
+    struct SingleMaskChannels
+    {
+        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
+        : mask(mask_), channels(channels_) {}
+        __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
+            :mask(mask_.mask), channels(mask_.channels){}
+        // Mask predicate: true when the mask element at row y, column x / channels (integer division) is nonzero.
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            return mask.ptr(y)[x / channels] != 0;
+        }
+        // presumably x counts interleaved channel elements and the mask has one element per pixel - TODO confirm against callers
+        PtrStepb mask;
+        int channels;
+    };
+
+    struct MaskCollection
+    {
+        explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_)
+            : maskCollection(maskCollection_) {}
+        // NOTE: the constructor above initializes only maskCollection; curMask is assigned by next(), setMask() or the copy below.
+        __device__ __forceinline__ MaskCollection(const MaskCollection& masks_)
+            : maskCollection(masks_.maskCollection), curMask(masks_.curMask){}
+        // next() makes the mask under the cursor current and then advances the cursor, so a later setMask(z) indexes from the advanced position.
+        __device__ __forceinline__ void next()
+        {
+            curMask = *maskCollection++;
+        }
+        __device__ __forceinline__ void setMask(int z)
+        {
+            curMask = maskCollection[z];
+        }
+        // Mask predicate: true when the current mask has a null data pointer, or when its element at row y, column x is nonzero.
+        __device__ __forceinline__ bool operator()(int y, int x) const
+        {
+            uchar val;
+            return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
+        }
+        // Cursor into the caller-provided mask array, and the mask currently being tested.
+        const PtrStepb* maskCollection;
+        PtrStepb curMask;
+    };
+
+    struct WithOutMask
+    {
+        __host__ __device__ __forceinline__ WithOutMask(){}
+        __host__ __device__ __forceinline__ WithOutMask(const WithOutMask&){}
+        // Pass-everything mask: next() and setMask() do nothing, and every predicate below returns true.
+        __device__ __forceinline__ void next() const
+        {
+        }
+        __device__ __forceinline__ void setMask(int) const
+        {
+        }
+
+        __device__ __forceinline__ bool operator()(int, int) const
+        {
+            return true;
+        }
+
+        __device__ __forceinline__ bool operator()(int, int, int) const
+        {
+            return true;
+        }
+
+        static __device__ __forceinline__ bool check(int, int)
+        {
+            return true;
+        }
+
+        static __device__ __forceinline__ bool check(int, int, int)
+        {
+            return true;
+        }
+    };
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Solve linear system
+
+    // solve 2x2 linear system Ax=b
+    template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
+    {
+        T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
+        // Cramer's rule; an exactly-zero determinant returns false and leaves x untouched.
+        if (det != 0)
+        {
+            double invdet = 1.0 / det;
+            // Each numerator is the determinant of A with one column replaced by b; results are scaled in double, then saturated to T.
+            x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
+
+            x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
+
+            return true;
+        }
+
+        return false;
+    }
+
+    // solve 3x3 linear system Ax=b
+    template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
+    {
+        T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
+              - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+              + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
+        // Cramer's rule; an exactly-zero determinant returns false and leaves x untouched.
+        if (det != 0)
+        {
+            double invdet = 1.0 / det;
+            // x[0]: determinant of A with column 0 replaced by b, scaled by 1/det and saturated to T.
+            x[0] = saturate_cast<T>(invdet *
+                (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
+                 A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
+                 A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));
+            // x[1]: same with column 1 replaced by b.
+            x[1] = saturate_cast<T>(invdet *
+                (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
+                 b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
+                 A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));
+            // x[2]: same with column 2 replaced by b.
+            x[2] = saturate_cast<T>(invdet *
+                (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
+                 A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
+                 b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
+
+            return true;
+        }
+
+        return false;
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_UTILITY_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_distance.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_distance.hpp
new file mode 100644
index 0000000..ef6e510
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_distance.hpp
@@ -0,0 +1,232 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_DISTANCE_HPP
+#define OPENCV_CUDA_VEC_DISTANCE_HPP
+
+#include "reduce.hpp"
+#include "functional.hpp"
+#include "detail/vec_distance_detail.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <typename T> struct L1Dist
+    {
+        typedef int value_type;
+        typedef int result_type;
+        // Generic L1 accumulator: works on int values whatever T is (float has its own specialization).
+        __device__ __forceinline__ L1Dist() : mySum(0) {}
+        // Adds |val1 - val2| to the running sum using the __sad intrinsic.
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum = __sad(val1, val2, mySum);
+        }
+        // Combines the per-thread sums with plus<int> through reduce<THREAD_DIM>, using smem as scratch.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
+        }
+        // Reading the object as int yields the accumulated sum.
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum;
+    };
+    template <> struct L1Dist<float>
+    {
+        typedef float value_type;
+        typedef float result_type;
+        // float specialization of the L1 accumulator.
+        __device__ __forceinline__ L1Dist() : mySum(0.0f) {}
+        // Adds |val1 - val2| to the running sum.
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            mySum += ::fabs(val1 - val2);
+        }
+        // Combines the per-thread sums with plus<float> through reduce<THREAD_DIM>, using smem as scratch.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
+        }
+        // Reading the object as float yields the accumulated sum.
+        __device__ __forceinline__ operator float() const
+        {
+            return mySum;
+        }
+
+        float mySum;
+    };
+
+    struct L2Dist
+    {
+        typedef float value_type;
+        typedef float result_type;
+        // Euclidean-distance accumulator: keeps the sum of squared differences; the square root is taken on read-out.
+        __device__ __forceinline__ L2Dist() : mySum(0.0f) {}
+        // Adds (val1 - val2)^2 to the running sum.
+        __device__ __forceinline__ void reduceIter(float val1, float val2)
+        {
+            float reg = val1 - val2;
+            mySum += reg * reg;
+        }
+        // Combines the per-thread sums with plus<float> through reduce<THREAD_DIM>, using smem as scratch.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
+        }
+        // Reading the object as float yields sqrtf of the accumulated squared sum.
+        __device__ __forceinline__ operator float() const
+        {
+            return sqrtf(mySum);
+        }
+
+        float mySum;
+    };
+
+    struct HammingDist
+    {
+        typedef int value_type;
+        typedef int result_type;
+        // Hamming-distance accumulator: counts differing bits between pairs of int values.
+        __device__ __forceinline__ HammingDist() : mySum(0) {}
+        // Adds the population count of val1 XOR val2 to the running sum.
+        __device__ __forceinline__ void reduceIter(int val1, int val2)
+        {
+            mySum += __popc(val1 ^ val2);
+        }
+        // Combines the per-thread sums with plus<int> through reduce<THREAD_DIM>, using smem as scratch.
+        template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
+        {
+            reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
+        }
+        // Reading the object as int yields the accumulated bit count.
+        __device__ __forceinline__ operator int() const
+        {
+            return mySum;
+        }
+
+        int mySum;
+    };
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
+    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        for (int i = tid; i < len; i += THREAD_DIM) // this thread handles elements tid, tid + THREAD_DIM, ... below len
+        {
+            T1 val1;
+            ForceGlob<T1>::Load(vec1, i, val1);
+
+            T2 val2;
+            ForceGlob<T2>::Load(vec2, i, val2);
+
+            dist.reduceIter(val1, val2); // fold this element pair into the thread's partial distance
+        }
+        // reduceAll is a member template of the dependent type Dist, so standard C++ requires the 'template' disambiguator.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
+    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
+    {
+        vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
+        // reduceAll is a member template of the dependent type Dist, so standard C++ requires the 'template' disambiguator.
+        dist.template reduceAll<THREAD_DIM>(smem, tid);
+    }
+
+    // calc distance between two vectors in global memory
+    template <int THREAD_DIM, typename T1> struct VecDiffGlobal
+    {
+        explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
+        {
+            vec1 = vec1_; // only the pointer is kept; the four unnamed defaulted parameters are ignored
+        }
+        // Accumulates the distance between the stored vec1 and vec2 by delegating to calcVecDiffGlobal.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
+        }
+        // NOTE(review): the ignored constructor parameters presumably mirror VecDiffCachedRegister's constructor - confirm against callers.
+        const T1* vec1;
+    };
+
+    // calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
+    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
+    {
+        template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
+        {
+            if (glob_tid < len)
+                smem[glob_tid] = vec1[glob_tid]; // each thread with glob_tid < len stages one element of vec1 into smem
+            __syncthreads();
+            // Copy this thread's slice (elements tid, tid + THREAD_DIM, ... below MAX_LEN) from smem into the member array.
+            U* vec1ValsPtr = vec1Vals;
+            // NOTE(review): the loop bound is MAX_LEN, not len, so slots at or beyond len are read without being written here - confirm callers guarantee this is safe.
+            #pragma unroll
+            for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
+                *vec1ValsPtr++ = smem[i];
+            // Second barrier: presumably lets callers reuse smem afterwards - TODO confirm.
+            __syncthreads();
+        }
+        // Accumulates the distance between the cached slice and vec2 by delegating to calcVecDiffCached.
+        template <typename T2, typename Dist>
+        __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
+        {
+            calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
+        }
+        // Per-thread copy of this thread's share of vec1: MAX_LEN / THREAD_DIM elements.
+        U vec1Vals[MAX_LEN / THREAD_DIM];
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_DISTANCE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_math.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_math.hpp
new file mode 100644
index 0000000..80b1303
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_math.hpp
@@ -0,0 +1,923 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VECMATH_HPP
+#define OPENCV_CUDA_VECMATH_HPP
+
+#include "vec_traits.hpp"
+#include "saturate_cast.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+
+// saturate_cast
+
+namespace vec_math_detail
+{
+    // SatCastHelper<cn, VecD>::cast(src) builds a VecD by applying saturate_cast to the
+    // first cn components (x, y, z, w) of src. Only cn = 1..4 are specialized.
+    template <int cn, typename VecD> struct SatCastHelper;
+
+    // One component: x.
+    template <typename VecD> struct SatCastHelper<1, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef VecTraits<VecD> DstTraits;
+            typedef typename DstTraits::elem_type Elem;
+            return DstTraits::make(saturate_cast<Elem>(src.x));
+        }
+    };
+
+    // Two components: x, y.
+    template <typename VecD> struct SatCastHelper<2, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef VecTraits<VecD> DstTraits;
+            typedef typename DstTraits::elem_type Elem;
+            return DstTraits::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y));
+        }
+    };
+
+    // Three components: x, y, z.
+    template <typename VecD> struct SatCastHelper<3, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef VecTraits<VecD> DstTraits;
+            typedef typename DstTraits::elem_type Elem;
+            return DstTraits::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y), saturate_cast<Elem>(src.z));
+        }
+    };
+
+    // Four components: x, y, z, w.
+    template <typename VecD> struct SatCastHelper<4, VecD>
+    {
+        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& src)
+        {
+            typedef VecTraits<VecD> DstTraits;
+            typedef typename DstTraits::elem_type Elem;
+            return DstTraits::make(saturate_cast<Elem>(src.x), saturate_cast<Elem>(src.y), saturate_cast<Elem>(src.z), saturate_cast<Elem>(src.w));
+        }
+    };
+
+    // Entry point: selects the specialization from the channel count of the destination
+    // type (VecTraits<VecD>::cn) and converts src with it.
+    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_helper(const VecS& src)
+    {
+        typedef SatCastHelper<VecTraits<VecD>::cn, VecD> Impl;
+        return Impl::cast(src);
+    }
+}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+template<typename T> static __device__ __forceinline__ T saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_helper<T>(v);}
+
+// unary operators
+
+// Generates a component-wise unary operator for the 1-, 2-, 3- and 4-channel vectors of
+// one element type.
+//   OP    - operator token applied to every component
+//   SRC_T - element type name of the operand; the channel count is pasted on (SRC_T ## 1 .. SRC_T ## 4)
+//   DST_T - element type name of the result, pasted the same way and built via VecTraits<...>::make
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(OP, SRC_T, DST_T) \
+    __device__ __forceinline__ DST_T ## 1 operator OP(const SRC_T ## 1 & v) \
+    { \
+        return VecTraits<DST_T ## 1>::make(OP (v.x)); \
+    } \
+    __device__ __forceinline__ DST_T ## 2 operator OP(const SRC_T ## 2 & v) \
+    { \
+        return VecTraits<DST_T ## 2>::make(OP (v.x), OP (v.y)); \
+    } \
+    __device__ __forceinline__ DST_T ## 3 operator OP(const SRC_T ## 3 & v) \
+    { \
+        return VecTraits<DST_T ## 3>::make(OP (v.x), OP (v.y), OP (v.z)); \
+    } \
+    __device__ __forceinline__ DST_T ## 4 operator OP(const SRC_T ## 4 & v) \
+    { \
+        return VecTraits<DST_T ## 4>::make(OP (v.x), OP (v.y), OP (v.z), OP (v.w)); \
+    }
+
+// Negation: char, short, int, float and double vectors; result keeps the element type.
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, float, float)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(-, double, double)
+
+// Logical not: every element type; result is always a uchar vector.
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, char, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, ushort, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, short, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, int, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, uint, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, float, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(!, double, uchar)
+
+// Bitwise complement: integer element types only; result keeps the element type.
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, int, int)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_OP
+
+// unary functions
+
+// Generates a component-wise unary function for the 1-, 2-, 3- and 4-channel vectors of
+// one element type.
+//   NAME  - name of the generated overloads
+//   FN    - scalar function applied to every component
+//   SRC_T - element type name of the argument; the channel count is pasted on (SRC_T ## 1 .. SRC_T ## 4)
+//   DST_T - element type name of the result, pasted the same way and built via VecTraits<...>::make
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FN, SRC_T, DST_T) \
+    __device__ __forceinline__ DST_T ## 1 NAME(const SRC_T ## 1 & v) \
+    { \
+        return VecTraits<DST_T ## 1>::make(FN (v.x)); \
+    } \
+    __device__ __forceinline__ DST_T ## 2 NAME(const SRC_T ## 2 & v) \
+    { \
+        return VecTraits<DST_T ## 2>::make(FN (v.x), FN (v.y)); \
+    } \
+    __device__ __forceinline__ DST_T ## 3 NAME(const SRC_T ## 3 & v) \
+    { \
+        return VecTraits<DST_T ## 3>::make(FN (v.x), FN (v.y), FN (v.z)); \
+    } \
+    __device__ __forceinline__ DST_T ## 4 NAME(const SRC_T ## 4 & v) \
+    { \
+        return VecTraits<DST_T ## 4>::make(FN (v.x), FN (v.y), FN (v.z), FN (v.w)); \
+    }
+
+// abs is provided here for float vectors only.
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
+
+// Instantiates NAME for every element type: uchar, char, ushort, short, int, uint and
+// float vectors go through FLOAT_FN and yield float vectors; double vectors go through
+// DOUBLE_FN and yield double vectors.
+#define CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(NAME, FLOAT_FN, DOUBLE_FN) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, uchar, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, char, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, ushort, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, short, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, int, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, uint, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, FLOAT_FN, float, float) \
+    CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(NAME, DOUBLE_FN, double, double)
+
+// square root
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(sqrt, ::sqrtf, ::sqrt)
+
+// exponentials
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(exp, ::expf, ::exp)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(exp2, ::exp2f, ::exp2)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(exp10, ::exp10f, ::exp10)
+
+// logarithms
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(log, ::logf, ::log)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(log2, ::log2f, ::log2)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(log10, ::log10f, ::log10)
+
+// trigonometric
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(sin, ::sinf, ::sin)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(cos, ::cosf, ::cos)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(tan, ::tanf, ::tan)
+
+// inverse trigonometric
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(asin, ::asinf, ::asin)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(acos, ::acosf, ::acos)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(atan, ::atanf, ::atan)
+
+// hyperbolic
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(sinh, ::sinhf, ::sinh)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(cosh, ::coshf, ::cosh)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(tanh, ::tanhf, ::tanh)
+
+// inverse hyperbolic
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(asinh, ::asinhf, ::asinh)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(acosh, ::acoshf, ::acosh)
+
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES(atanh, ::atanhf, ::atanh)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC_ALL_TYPES
+#undef CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC
+
+// binary operators (vec & vec)
+
+// Generates a component-wise binary operator between two vectors of the same type, for
+// the 1-, 2-, 3- and 4-channel vectors of one element type.
+//   OP    - operator token applied component by component
+//   SRC_T - element type name of both operands; the channel count is pasted on (SRC_T ## 1 .. SRC_T ## 4)
+//   DST_T - element type name of the result, pasted the same way and built via VecTraits<...>::make
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, SRC_T, DST_T) \
+    __device__ __forceinline__ DST_T ## 1 operator OP(const SRC_T ## 1 & lhs, const SRC_T ## 1 & rhs) \
+    { \
+        return VecTraits<DST_T ## 1>::make(lhs.x OP rhs.x); \
+    } \
+    __device__ __forceinline__ DST_T ## 2 operator OP(const SRC_T ## 2 & lhs, const SRC_T ## 2 & rhs) \
+    { \
+        return VecTraits<DST_T ## 2>::make(lhs.x OP rhs.x, lhs.y OP rhs.y); \
+    } \
+    __device__ __forceinline__ DST_T ## 3 operator OP(const SRC_T ## 3 & lhs, const SRC_T ## 3 & rhs) \
+    { \
+        return VecTraits<DST_T ## 3>::make(lhs.x OP rhs.x, lhs.y OP rhs.y, lhs.z OP rhs.z); \
+    } \
+    __device__ __forceinline__ DST_T ## 4 operator OP(const SRC_T ## 4 & lhs, const SRC_T ## 4 & rhs) \
+    { \
+        return VecTraits<DST_T ## 4>::make(lhs.x OP rhs.x, lhs.y OP rhs.y, lhs.z OP rhs.z, lhs.w OP rhs.w); \
+    }
+
+// Arithmetic operators: uchar, char, ushort and short vectors produce int vectors;
+// int, uint, float and double vectors keep their element type.
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM(OP) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uchar, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, char, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, ushort, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, short, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, int, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uint, uint) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, float, float) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, double, double)
+
+// Comparison and logical operators: every element type; result is always a uchar vector.
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(OP) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uchar, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, char, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, ushort, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, short, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, int, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uint, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, float, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, double, uchar)
+
+// Bitwise operators: integer element types only; result keeps the element type.
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_BITWISE(OP) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uchar, uchar) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, char, char) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, ushort, ushort) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, short, short) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, int, int) \
+    CV_CUDEV_IMPLEMENT_VEC_BINARY_OP(OP, uint, uint)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM(+)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM(-)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM(*)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM(/)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(==)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(!=)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(>)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(<)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(>=)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(<=)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(&&)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE(||)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_BITWISE(&)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_BITWISE(|)
+
+CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_BITWISE(^)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_BITWISE
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_PREDICATE
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP_ARITHM
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_OP
+
+// binary operators (vec & scalar)
+
+// Generates a binary operator between a vector and a scalar, in both operand orders
+// (vector OP scalar and scalar OP vector), for the 1-, 2-, 3- and 4-channel vectors of
+// one element type. The scalar is combined with every component.
+//   OP       - operator token
+//   VEC_T    - element type name of the vector operand; the channel count is pasted on (VEC_T ## 1 .. VEC_T ## 4)
+//   SCALAR_T - type of the scalar operand, taken by value
+//   DST_T    - element type name of the result, pasted the same way and built via VecTraits<...>::make
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(OP, VEC_T, SCALAR_T, DST_T) \
+    __device__ __forceinline__ DST_T ## 1 operator OP(const VEC_T ## 1 & v, SCALAR_T k) \
+    { \
+        return VecTraits<DST_T ## 1>::make(v.x OP k); \
+    } \
+    __device__ __forceinline__ DST_T ## 1 operator OP(SCALAR_T k, const VEC_T ## 1 & v) \
+    { \
+        return VecTraits<DST_T ## 1>::make(k OP v.x); \
+    } \
+    __device__ __forceinline__ DST_T ## 2 operator OP(const VEC_T ## 2 & v, SCALAR_T k) \
+    { \
+        return VecTraits<DST_T ## 2>::make(v.x OP k, v.y OP k); \
+    } \
+    __device__ __forceinline__ DST_T ## 2 operator OP(SCALAR_T k, const VEC_T ## 2 & v) \
+    { \
+        return VecTraits<DST_T ## 2>::make(k OP v.x, k OP v.y); \
+    } \
+    __device__ __forceinline__ DST_T ## 3 operator OP(const VEC_T ## 3 & v, SCALAR_T k) \
+    { \
+        return VecTraits<DST_T ## 3>::make(v.x OP k, v.y OP k, v.z OP k); \
+    } \
+    __device__ __forceinline__ DST_T ## 3 operator OP(SCALAR_T k, const VEC_T ## 3 & v) \
+    { \
+        return VecTraits<DST_T ## 3>::make(k OP v.x, k OP v.y, k OP v.z); \
+    } \
+    __device__ __forceinline__ DST_T ## 4 operator OP(const VEC_T ## 4 & v, SCALAR_T k) \
+    { \
+        return VecTraits<DST_T ## 4>::make(v.x OP k, v.y OP k, v.z OP k, v.w OP k); \
+    } \
+    __device__ __forceinline__ DST_T ## 4 operator OP(SCALAR_T k, const VEC_T ## 4 & v) \
+    { \
+        return VecTraits<DST_T ## 4>::make(k OP v.x, k OP v.y, k OP v.z, k OP v.w); \
+    }
+
+// Instantiations of the vector/scalar operators. Macro arguments are, in order:
+// operator, vector element type, scalar type, result element type. In every group below,
+// the result element type equals the scalar type.
+
+// addition
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(+, double, double, double)
+
+// subtraction
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(-, double, double, double)
+
+// multiplication
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(*, double, double, double)
+
+// division
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(/, double, double, double)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(==, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(!=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(>=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(<=, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&&, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, char, char, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, ushort, ushort, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, short, short, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, int, int, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, uint, uint, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, float, float, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(||, double, double, uchar)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(&, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(|, uint, uint, uint)
+
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP(^, uint, uint, uint)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_OP
+
+// binary function (vec & vec): func_name(a, b) applies func to corresponding components of two equally sized vectors.
+// One __device__ overload is generated per width (1 to 4 components of input_type); the result is the output_type vector of the same width.
+#define CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(func_name, func, input_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func (a.x, b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func (a.x, b.x), func (a.y, b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func (a.x, b.x), func (a.y, b.y), func (a.z, b.z), func (a.w, b.w)); \
+    }
+// max: integer vectors use ::max; float vectors use ::fmaxf and double vectors ::fmax. Output type equals input type.
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::max, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmaxf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(max, ::fmax, double, double)
+// min: integer vectors use ::min; float vectors use ::fminf and double vectors ::fmin. Output type equals input type.
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uchar, uchar)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, char, char)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, ushort, ushort)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, short, short)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, uint, uint)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::min, int, int)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fminf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(min, ::fmin, double, double)
+// hypot: every input type except double goes through ::hypotf and yields a float vector; double uses ::hypot and yields double.
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypotf, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(hypot, ::hypot, double, double)
+// atan2: every input type except double goes through ::atan2f and yields a float vector; double uses ::atan2 and yields double.
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uchar, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, char, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, ushort, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, short, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, uint, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, int, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2f, float, float)
+CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC(atan2, ::atan2, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_VEC_BINARY_FUNC
+
+// binary function (vec & scalar): func_name(vec, s) and func_name(s, vec) apply func between every component and the scalar s.
+// Both operands are cast to output_type before func is called; overloads cover 1 to 4 components of input_type, in both argument orders.
+#define CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(func_name, func, input_type, scalar_type, output_type) \
+    __device__ __forceinline__ output_type ## 1 func_name(const input_type ## 1 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) a.x, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 1 func_name(scalar_type s, const input_type ## 1 & b) \
+    { \
+        return VecTraits<output_type ## 1>::make(func ((output_type) s, (output_type) b.x)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(const input_type ## 2 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 2 func_name(scalar_type s, const input_type ## 2 & b) \
+    { \
+        return VecTraits<output_type ## 2>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(const input_type ## 3 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 3 func_name(scalar_type s, const input_type ## 3 & b) \
+    { \
+        return VecTraits<output_type ## 3>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(const input_type ## 4 & a, scalar_type s) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) a.x, (output_type) s), func ((output_type) a.y, (output_type) s), func ((output_type) a.z, (output_type) s), func ((output_type) a.w, (output_type) s)); \
+    } \
+    __device__ __forceinline__ output_type ## 4 func_name(scalar_type s, const input_type ## 4 & b) \
+    { \
+        return VecTraits<output_type ## 4>::make(func ((output_type) s, (output_type) b.x), func ((output_type) s, (output_type) b.y), func ((output_type) s, (output_type) b.z), func ((output_type) s, (output_type) b.w)); \
+    }
+// max: an integer scalar of the vector's own type keeps that type (::max); a float scalar yields float (::fmaxf), a double scalar yields double (::fmax).
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::max, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmaxf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(max, ::fmax, double, double, double)
+// min: an integer scalar of the vector's own type keeps that type (::min); a float scalar yields float (::fminf), a double scalar yields double (::fmin).
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uchar, uchar, uchar)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, char, char, char)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, ushort, ushort, ushort)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, short, short, short)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, uint, uint, uint)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::min, int, int, int)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fminf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(min, ::fmin, double, double, double)
+// hypot: only float and double scalars are offered; a float scalar uses ::hypotf (float result), a double scalar uses ::hypot (double result).
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypotf, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(hypot, ::hypot, double, double, double)
+// atan2: only float and double scalars are offered; a float scalar uses ::atan2f (float result), a double scalar uses ::atan2 (double result).
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uchar, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uchar, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, char, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, char, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, ushort, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, ushort, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, short, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, short, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, uint, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, uint, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, int, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, int, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2f, float, float, float)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, float, double, double)
+CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC(atan2, ::atan2, double, double, double)
+
+#undef CV_CUDEV_IMPLEMENT_SCALAR_BINARY_FUNC
+
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VECMATH_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_traits.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_traits.hpp
new file mode 100644
index 0000000..b5ff281
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/vec_traits.hpp
@@ -0,0 +1,288 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_TRAITS_HPP
+#define OPENCV_CUDA_VEC_TRAITS_HPP
+
+#include "common.hpp"
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template<typename T, int N> struct TypeVec; // primary template is declared only; each usable (T, N) pair is an explicit specialisation further down
+
+    struct __align__(8) uchar8 // eight uchar lanes a0..a7, declared with __align__(8)
+    {
+        uchar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
+    { // host/device factory: returns a uchar8 whose lanes are the arguments, in order
+        uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(8) char8 // eight lanes a0..a7 stored as schar (not plain char), declared with __align__(8)
+    {
+        schar a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
+    { // host/device factory: returns a char8 whose lanes are the schar arguments, in order
+        char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) ushort8 // eight ushort lanes a0..a7, declared with __align__(16)
+    {
+        ushort a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
+    { // host/device factory: returns a ushort8 whose lanes are the arguments, in order
+        ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(16) short8 // eight short lanes a0..a7, declared with __align__(16)
+    {
+        short a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
+    { // host/device factory: returns a short8 whose lanes are the arguments, in order
+        short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) uint8 // eight uint lanes a0..a7 (a vector of uints, not an 8-bit integer), declared with __align__(32)
+    {
+        uint a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
+    { // host/device factory: returns a uint8 whose lanes are the arguments, in order
+        uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) int8 // eight int lanes a0..a7 (a vector of ints, not an 8-bit integer), declared with __align__(32)
+    {
+        int a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
+    { // host/device factory: returns an int8 whose lanes are the arguments, in order
+        int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct __align__(32) float8 // eight float lanes a0..a7, declared with __align__(32)
+    {
+        float a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
+    { // host/device factory: returns a float8 whose lanes are the arguments, in order
+        float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+    struct double8 // eight double lanes a0..a7; no __align__ qualifier is given here, unlike the other 8-lane structs in this header
+    {
+        double a0, a1, a2, a3, a4, a5, a6, a7;
+    };
+    static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
+    { // host/device factory: returns a double8 whose lanes are the arguments, in order
+        double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
+        return val;
+    }
+
+#define OPENCV_CUDA_IMPLEMENT_TYPE_VEC(type) /* TypeVec<type, N> and TypeVec<typeN, N> both name typeN (N = 1, 2, 3, 4, 8), except that TypeVec<type, 1> names the scalar type itself */ \
+    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
+    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
+    template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
+    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
+    // Instantiate the TypeVec mappings for the eight element types listed here.
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uchar)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(char)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(ushort)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(short)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(int)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(uint)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(float)
+    OPENCV_CUDA_IMPLEMENT_TYPE_VEC(double)
+
+    #undef OPENCV_CUDA_IMPLEMENT_TYPE_VEC
+
+    template<> struct TypeVec<schar, 1> { typedef schar vec_type; }; // schar: N = 1 stays the scalar schar
+    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; }; // wider schar forms reuse the char2/char3/char4/char8 vector types
+    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
+    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+    template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
+
+    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; }; // bool: N = 1 maps to the scalar uchar
+    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; }; // wider bool forms map to the uchar2/uchar3/uchar4/uchar8 vector types
+    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
+    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+    template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
+
+    template<typename T> struct VecTraits; // primary template is declared only; behaviour comes from the explicit specialisations
+    // Each specialisation exposes elem_type (lane type), cn (lane count), all(v) (v copied into every lane) and make(...) (from separate values or from a pointer to cn values).
+#define OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(type) /* specialises VecTraits for type, type1, type2, type3, type4 and type8 */ \
+    template<> struct VecTraits<type> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
+        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
+    }; \
+    template<> struct VecTraits<type ## 1> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
+    }; \
+    template<> struct VecTraits<type ## 2> \
+    { \
+        typedef type elem_type; \
+        enum {cn=2}; \
+        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
+    }; \
+    template<> struct VecTraits<type ## 3> \
+    { \
+        typedef type elem_type; \
+        enum {cn=3}; \
+        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
+    }; \
+    template<> struct VecTraits<type ## 4> \
+    { \
+        typedef type elem_type; \
+        enum {cn=4}; \
+        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
+    }; \
+    template<> struct VecTraits<type ## 8> \
+    { \
+        typedef type elem_type; \
+        enum {cn=8}; \
+        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
+    };
+    // char is absent from this list: the char, schar and char1..char8 traits are written out by hand after the #undef, with schar as the element type of char1..char8.
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uchar)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(ushort)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(short)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(int)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(uint)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(float)
+    OPENCV_CUDA_IMPLEMENT_VEC_TRAITS(double)
+
+    #undef OPENCV_CUDA_IMPLEMENT_VEC_TRAITS
+
+    template<> struct VecTraits<char> // traits for the plain char scalar: one lane, element type char
+    {
+        typedef char elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ char all(char v) {return v;} // a scalar has a single lane, so the value is returned as is
+        static __device__ __host__ __forceinline__ char make(char x) {return x;}
+        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;} // reads exactly one element through the pointer
+    };
+    template<> struct VecTraits<schar> // traits for the schar scalar: one lane, element type schar
+    {
+        typedef schar elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ schar all(schar v) {return v;} // a scalar has a single lane, so the value is returned as is
+        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
+        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;} // reads exactly one element through the pointer
+    };
+    template<> struct VecTraits<char1> // traits for the 1-lane char1 vector; the element type is schar, not char
+    {
+        typedef schar elem_type;
+        enum {cn=1};
+        static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);} // wraps v in a char1
+        static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
+        static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);} // reads v[0] only
+    };
+    template<> struct VecTraits<char2> // traits for the 2-lane char2 vector; the element type is schar, not char
+    {
+        typedef schar elem_type;
+        enum {cn=2};
+        static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);} // v copied into both lanes
+        static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
+        static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);} // reads v[0] and v[1]
+    };
+    template<> struct VecTraits<char3> // traits for the 3-lane char3 vector; the element type is schar, not char
+    {
+        typedef schar elem_type;
+        enum {cn=3};
+        static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);} // v copied into all three lanes
+        static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
+        static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);} // reads v[0] through v[2]
+    };
+    template<> struct VecTraits<char4> // traits for the 4-lane char4 vector; the element type is schar, not char
+    {
+        typedef schar elem_type;
+        enum {cn=4};
+        static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);} // v copied into all four lanes
+        static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
+        static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);} // reads v[0] through v[3]
+    };
+    template<> struct VecTraits<char8> // traits for the 8-lane char8 struct declared earlier in this header; the element type is schar
+    {
+        typedef schar elem_type;
+        enum {cn=8};
+        static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);} // v copied into all eight lanes
+        static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
+        static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} // reads v[0] through v[7]
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_TRAITS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp.hpp
new file mode 100644
index 0000000..8af7e6a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp.hpp
@@ -0,0 +1,139 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_DEVICE_WARP_HPP
+#define OPENCV_CUDA_DEVICE_WARP_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    struct Warp // static helpers in which each calling lane processes every STRIDE-th element of a range
+    {
+        enum
+        {
+            LOG_WARP_SIZE = 5,
+            WARP_SIZE     = 1 << LOG_WARP_SIZE, // 32
+            STRIDE        = WARP_SIZE // step between two elements handled by the same lane
+        };
+
+        /** \brief Returns the warp lane ID of the calling thread. */
+        static __device__ __forceinline__ unsigned int laneId()
+        {
+            unsigned int ret;
+            asm("mov.u32 %0, %%laneid;" : "=r"(ret) ); // copies the PTX %laneid special register into ret
+            return ret;
+        }
+
+        template<typename It, typename T>
+        static __device__ __forceinline__ void fill(It beg, It end, const T& value) // assigns value to the elements of [beg, end) owned by this lane
+        {
+            for(It t = beg + laneId(); t < end; t += STRIDE) // start at this lane's offset, then jump by STRIDE
+                *t = value;
+        }
+
+        template<typename InIt, typename OutIt>
+        static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out) // copies this lane's elements of [beg, end) to out; returns the advanced out
+        {
+            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE) // NOTE(review): out is not offset by laneId() here; presumably callers pass a per-lane out - confirm
+                *out = *t;
+            return out;
+        }
+
+        template<typename InIt, typename OutIt, class UnOp>
+        static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op) // writes op(*t) for this lane's elements of [beg, end); returns the advanced out
+        {
+            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE) // NOTE(review): as in copy, out is advanced by STRIDE but not offset by laneId()
+                *out = op(*t);
+            return out;
+        }
+
+        template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
+        static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op) // binary form: writes op(*t1, *t2) over two input ranges
+        {
+            unsigned int lane = laneId();
+
+            InIt1 t1 = beg1 + lane;
+            InIt2 t2 = beg2 + lane; // second range is walked in lock-step with the first; only end1 bounds the loop
+            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
+                *out = op(*t1, *t2);
+            return out;
+        }
+
+        template <class T, class BinOp>
+        static __device__ __forceinline__ T reduce(volatile T *ptr, BinOp op) // folds ptr[0..31] with op into ptr[0] and returns *ptr
+        {
+            const unsigned int lane = laneId();
+
+            if (lane < 16) // only the lower half of the lanes combine values; reads reach up to ptr[lane + 16]
+            {
+                T partial = ptr[lane];
+
+                ptr[lane] = partial = op(partial, ptr[lane + 16]); // NOTE(review): no barrier between steps; relies on volatile accesses and lanes running in lock-step - confirm for the target architecture
+                ptr[lane] = partial = op(partial, ptr[lane + 8]);
+                ptr[lane] = partial = op(partial, ptr[lane + 4]);
+                ptr[lane] = partial = op(partial, ptr[lane + 2]);
+                ptr[lane] = partial = op(partial, ptr[lane + 1]);
+            }
+
+            return *ptr; // every lane, including lanes >= 16, returns the value stored at ptr[0]
+        }
+
+        template<typename OutIt, typename T>
+        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) // element at beg + i receives value + i, for the indices owned by this lane
+        {
+            unsigned int lane = laneId();
+            value += lane; // first value written by this lane
+
+            for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
+                *t = value;
+        }
+    };
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_DEVICE_WARP_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_reduce.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_reduce.hpp
new file mode 100644
index 0000000..530303d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_reduce.hpp
@@ -0,0 +1,76 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_WARP_REDUCE_HPP__
+#define OPENCV_CUDA_WARP_REDUCE_HPP__
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    template <class T>
+    __device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x) // sums values in ptr with operator+ and returns the total stored at the slot of lane 0
+    {
+        const unsigned int lane = tid & 31; // index of thread in warp (0..31)
+
+        if (lane < 16) // only lanes 0..15 accumulate; reads reach up to ptr[tid + 16]
+        {
+            T partial = ptr[tid];
+
+            ptr[tid] = partial = partial + ptr[tid + 16]; // NOTE(review): no barrier between steps; relies on volatile accesses and lanes running in lock-step - confirm for the target architecture
+            ptr[tid] = partial = partial + ptr[tid + 8];
+            ptr[tid] = partial = partial + ptr[tid + 4];
+            ptr[tid] = partial = partial + ptr[tid + 2];
+            ptr[tid] = partial = partial + ptr[tid + 1];
+        }
+
+        return ptr[tid - lane]; // tid - lane is the index used by lane 0 of the same group of 32
+    }
+}}} // namespace cv { namespace cuda { namespace device
+
+//! @endcond
+
+#endif /* OPENCV_CUDA_WARP_REDUCE_HPP__ */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_shuffle.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_shuffle.hpp
new file mode 100644
index 0000000..0da54ae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda/warp_shuffle.hpp
@@ -0,0 +1,162 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_WARP_SHUFFLE_HPP
+#define OPENCV_CUDA_WARP_SHUFFLE_HPP
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+#if __CUDACC_VER_MAJOR__ >= 9
+#  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
+#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+#  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
+#endif
+    template <typename T>
+    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) // generic overload: forwards val straight to __shfl
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl(val, srcLane, width); // __shfl is redefined as __shfl_sync(0xFFFFFFFFU, ...) earlier in this header when __CUDACC_VER_MAJOR__ >= 9
+    #else
+        return T(); // __CUDA_ARCH__ undefined or below 300: no shuffle is issued, a value-initialized T is returned
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) // unsigned overload: shuffles the value through an int cast
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl((int) val, srcLane, width);
+    #else
+        return 0; // same fallback as the generic overload
+    #endif
+    }
+    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) // double overload: shuffles the two 32-bit halves separately
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of the double
+        int hi = __double2hiint(val); // high half of the double
+
+        lo = __shfl(lo, srcLane, width);
+        hi = __shfl(hi, srcLane, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the shuffled halves
+    #else
+        return 0.0; // same fallback as the generic overload
+    #endif
+    }
+
+    template <typename T>
+    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) // generic overload: forwards val straight to __shfl_down
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl_down(val, delta, width); // __shfl_down is redefined as __shfl_down_sync(0xFFFFFFFFU, ...) earlier in this header when __CUDACC_VER_MAJOR__ >= 9
+    #else
+        return T(); // __CUDA_ARCH__ undefined or below 300: no shuffle is issued, a value-initialized T is returned
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) // unsigned overload: shuffles the value through an int cast
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_down((int) val, delta, width);
+    #else
+        return 0; // same fallback as the generic overload
+    #endif
+    }
+    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) // double overload: shuffles the two 32-bit halves separately
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of the double
+        int hi = __double2hiint(val); // high half of the double
+
+        lo = __shfl_down(lo, delta, width);
+        hi = __shfl_down(hi, delta, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the shuffled halves
+    #else
+        return 0.0; // same fallback as the generic overload
+    #endif
+    }
+
+    template <typename T>
+    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) // generic overload: forwards val straight to __shfl_up
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return __shfl_up(val, delta, width); // __shfl_up is redefined as __shfl_up_sync(0xFFFFFFFFU, ...) earlier in this header when __CUDACC_VER_MAJOR__ >= 9
+    #else
+        return T(); // __CUDA_ARCH__ undefined or below 300: no shuffle is issued, a value-initialized T is returned
+    #endif
+    }
+    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) // unsigned overload: shuffles the value through an int cast
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        return (unsigned int) __shfl_up((int) val, delta, width);
+    #else
+        return 0; // same fallback as the generic overload
+    #endif
+    }
+    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) // double overload: shuffles the two 32-bit halves separately
+    {
+    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+        int lo = __double2loint(val); // low half of the double
+        int hi = __double2hiint(val); // high half of the double
+
+        lo = __shfl_up(lo, delta, width);
+        hi = __shfl_up(hi, delta, width);
+
+        return __hiloint2double(hi, lo); // reassemble the double from the shuffled halves
+    #else
+        return 0.0; // same fallback as the generic overload
+    #endif
+    }
+}}}
+
+#  undef __shfl
+#  undef __shfl_up
+#  undef __shfl_down
+
+//! @endcond
+
+#endif // OPENCV_CUDA_WARP_SHUFFLE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_stream_accessor.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_stream_accessor.hpp
new file mode 100644
index 0000000..deaf356
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_stream_accessor.hpp
@@ -0,0 +1,86 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP
+#define OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP
+
+#ifndef __cplusplus
+#  error cuda_stream_accessor.hpp header must be compiled as C++
+#endif
+
+/** @file cuda_stream_accessor.hpp
+ * This is the only header file that depends on the CUDA Runtime API. All other headers are independent of it.
+ */
+
+#include <cuda_runtime.h>
+#include "opencv2/core/cuda.hpp"
+
+namespace cv
+{
+    namespace cuda
+    {
+
+//! @addtogroup cudacore_struct
+//! @{
+
+        /** @brief Class that enables getting cudaStream_t from cuda::Stream (getStream) and building a cuda::Stream from a cudaStream_t (wrapStream). Declarations only; the definitions are not in this header.
+         */
+        struct StreamAccessor
+        {
+            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
+            CV_EXPORTS static Stream wrapStream(cudaStream_t stream);
+        };
+
+        /** @brief Class that enables getting cudaEvent_t from cuda::Event (getEvent) and building a cuda::Event from a cudaEvent_t (wrapEvent). Declarations only; the definitions are not in this header.
+         */
+        struct EventAccessor
+        {
+            CV_EXPORTS static cudaEvent_t getEvent(const Event& event);
+            CV_EXPORTS static Event wrapEvent(cudaEvent_t event);
+        };
+
+//! @}
+
+    }
+}
+
+#endif /* OPENCV_CORE_CUDA_STREAM_ACCESSOR_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_types.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_types.hpp
new file mode 100644
index 0000000..b33f061
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cuda_types.hpp
@@ -0,0 +1,144 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CUDA_TYPES_HPP
+#define OPENCV_CORE_CUDA_TYPES_HPP
+
+#ifndef __cplusplus
+#  error cuda_types.hpp header must be compiled as C++
+#endif
+
+#if defined(__OPENCV_BUILD) && defined(__clang__)
+#pragma clang diagnostic ignored "-Winconsistent-missing-override"
+#endif
+#if defined(__OPENCV_BUILD) && defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wsuggest-override"
+#endif
+
+/** @file
+ * @deprecated Use @ref cudev instead.
+ */
+
+//! @cond IGNORED
+
+#ifdef __CUDACC__
+    #define __CV_CUDA_HOST_DEVICE__ __host__ __device__ __forceinline__
+#else
+    #define __CV_CUDA_HOST_DEVICE__
+#endif
+
+namespace cv
+{
+    namespace cuda
+    {
+
+        // Simple lightweight structures that encapsulate information about an image on the device.
+        // They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile.
+
+        template <typename T> struct DevPtr // thin wrapper around a raw T* pointer; it never allocates or frees
+        {
+            typedef T elem_type;
+            typedef int index_type;
+
+            enum { elem_size = sizeof(elem_type) }; // size of one element in bytes
+
+            T* data; // wrapped pointer
+
+            __CV_CUDA_HOST_DEVICE__ DevPtr() : data(0) {} // default: null pointer
+            __CV_CUDA_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {} // non-explicit, so a T* converts implicitly
+
+            __CV_CUDA_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+            __CV_CUDA_HOST_DEVICE__ operator       T*()       { return data; } // implicit conversion back to the raw pointer
+            __CV_CUDA_HOST_DEVICE__ operator const T*() const { return data; }
+        };
+
+        template <typename T> struct PtrSz : public DevPtr<T> // DevPtr plus a size field
+        {
+            __CV_CUDA_HOST_DEVICE__ PtrSz() : size(0) {} // base DevPtr default-constructs to a null pointer
+            __CV_CUDA_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
+
+            size_t size; // NOTE(review): units (elements vs. bytes) are not stated in this header - confirm against callers
+        };
+
+        template <typename T> struct PtrStep : public DevPtr<T> // DevPtr plus a row stride, giving 2D (row, column) access
+        {
+            __CV_CUDA_HOST_DEVICE__ PtrStep() : step(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
+
+            size_t step; // distance between consecutive rows, in bytes (ptr() applies it to a char* below)
+
+            __CV_CUDA_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)(((DevPtr<T>*)this)->data) + y * step); } // start of row y: data advanced by y * step bytes
+            __CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)(((DevPtr<T>*)this)->data) + y * step); }
+
+            __CV_CUDA_HOST_DEVICE__       T& operator ()(int y, int x)       { return ptr(y)[x]; } // element x of row y; no bounds checking
+            __CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
+        };
+
+        template <typename T> struct PtrStepSz : public PtrStep<T> // PtrStep plus the image dimensions
+        {
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
+            __CV_CUDA_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_) // note the argument order: rows first, then cols
+                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
+
+            template <typename U> // NOTE(review): unlike the other constructors this one is not marked __CV_CUDA_HOST_DEVICE__ - confirm that is intended
+            explicit PtrStepSz(const PtrStepSz<U>& d) : PtrStep<T>((T*)d.data, d.step), cols(d.cols), rows(d.rows){} // reinterprets the data pointer as T*; step, cols and rows are copied unchanged
+
+            int cols;
+            int rows;
+        };
+
+        typedef PtrStepSz<unsigned char> PtrStepSzb;
+        typedef PtrStepSz<unsigned short> PtrStepSzus;
+        typedef PtrStepSz<float> PtrStepSzf;
+        typedef PtrStepSz<int> PtrStepSzi;
+
+        typedef PtrStep<unsigned char> PtrStepb;
+        typedef PtrStep<unsigned short> PtrStepus;
+        typedef PtrStep<float> PtrStepf;
+        typedef PtrStep<int> PtrStepi;
+
+    }
+}
+
+//! @endcond
+
+#endif /* OPENCV_CORE_CUDA_TYPES_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_dispatch.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_dispatch.h
new file mode 100644
index 0000000..ab5a67d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_dispatch.h
@@ -0,0 +1,368 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if defined __OPENCV_BUILD \
+
+#include "cv_cpu_config.h"
+#include "cv_cpu_helper.h"
+
+#ifdef CV_CPU_DISPATCH_MODE
+#define CV_CPU_OPTIMIZATION_NAMESPACE __CV_CAT(opt_, CV_CPU_DISPATCH_MODE)
+#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace __CV_CAT(opt_, CV_CPU_DISPATCH_MODE) {
+#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
+#else
+#define CV_CPU_OPTIMIZATION_NAMESPACE cpu_baseline
+#define CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN namespace cpu_baseline {
+#define CV_CPU_OPTIMIZATION_NAMESPACE_END }
+#define CV_CPU_BASELINE_MODE 1
+#endif
+
+
+#define __CV_CPU_DISPATCH_CHAIN_END(fn, args, mode, ...)  /* done */
+#define __CV_CPU_DISPATCH(fn, args, mode, ...) __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+#define __CV_CPU_DISPATCH_EXPAND(fn, args, ...) __CV_EXPAND(__CV_CPU_DISPATCH(fn, args, __VA_ARGS__))
+#define CV_CPU_DISPATCH(fn, args, ...) __CV_CPU_DISPATCH_EXPAND(fn, args, __VA_ARGS__, END) // expand macros
+
+
+#if defined CV_ENABLE_INTRINSICS \
+    && !defined CV_DISABLE_OPTIMIZATION \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */ \
+
+#ifdef CV_CPU_COMPILE_SSE2
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE3
+#  include <pmmintrin.h>
+#  define CV_SSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSSE3
+#  include <tmmintrin.h>
+#  define CV_SSSE3 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_1
+#  include <smmintrin.h>
+#  define CV_SSE4_1 1
+#endif
+#ifdef CV_CPU_COMPILE_SSE4_2
+#  include <nmmintrin.h>
+#  define CV_SSE4_2 1
+#endif
+#ifdef CV_CPU_COMPILE_POPCNT
+#  ifdef _MSC_VER
+#    include <nmmintrin.h>
+#    if defined(_M_X64)
+#      define CV_POPCNT_U64 (int)_mm_popcnt_u64
+#    endif
+#    define CV_POPCNT_U32 _mm_popcnt_u32
+#  else
+#    include <popcntintrin.h>
+#    if defined(__x86_64__)
+#      define CV_POPCNT_U64 __builtin_popcountll
+#    endif
+#    define CV_POPCNT_U32 __builtin_popcount
+#  endif
+#  define CV_POPCNT 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX
+#  include <immintrin.h>
+#  define CV_AVX 1
+#endif
+#ifdef CV_CPU_COMPILE_FP16
+#  if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
+#    include <arm_neon.h>
+#  else
+#    include <immintrin.h>
+#  endif
+#  define CV_FP16 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX2
+#  include <immintrin.h>
+#  define CV_AVX2 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX_512F
+#  include <immintrin.h>
+#  define CV_AVX_512F 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_COMMON
+#  define CV_AVX512_COMMON 1
+#  define CV_AVX_512CD 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNL
+#  define CV_AVX512_KNL 1
+#  define CV_AVX_512ER 1
+#  define CV_AVX_512PF 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_KNM
+#  define CV_AVX512_KNM 1
+#  define CV_AVX_5124FMAPS 1
+#  define CV_AVX_5124VNNIW 1
+#  define CV_AVX_512VPOPCNTDQ 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_SKX
+#  define CV_AVX512_SKX 1
+#  define CV_AVX_512VL 1
+#  define CV_AVX_512BW 1
+#  define CV_AVX_512DQ 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CNL
+#  define CV_AVX512_CNL 1
+#  define CV_AVX_512IFMA 1
+#  define CV_AVX_512VBMI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_CLX
+#  define CV_AVX512_CLX 1
+#  define CV_AVX_512VNNI 1
+#endif
+#ifdef CV_CPU_COMPILE_AVX512_ICL
+#  define CV_AVX512_ICL 1
+#  undef CV_AVX_512IFMA
+#  define CV_AVX_512IFMA 1
+#  undef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 1
+#  undef CV_AVX_512VNNI
+#  define CV_AVX_512VNNI 1
+#  define CV_AVX_512VBMI2 1
+#  define CV_AVX_512BITALG 1
+#  define CV_AVX_512VPOPCNTDQ 1
+#endif
+#ifdef CV_CPU_COMPILE_FMA3
+#  define CV_FMA3 1
+#endif
+
+#if defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#endif
+
+#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
+# include<riscv-vector.h>
+# define CV_RVV071 1
+#endif
+
+#if defined(__ARM_NEON__) || defined(__aarch64__)
+#  include <arm_neon.h>
+#endif
+
+#ifdef CV_CPU_COMPILE_VSX
+#  include <altivec.h>
+#  undef vector
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
+#endif
+
+#ifdef CV_CPU_COMPILE_VSX3
+#  define CV_VSX3 1
+#endif
+
+#ifdef CV_CPU_COMPILE_MSA
+#  include "hal/msa_macros.h"
+#  define CV_MSA 1
+#endif
+
+#ifdef __EMSCRIPTEN__
+#  define CV_WASM_SIMD 1
+#  include <wasm_simd128.h>
+#endif
+
+#if defined CV_CPU_COMPILE_RVV
+#  define CV_RVV 1
+#  include <riscv_vector.h>
+#endif
+
+#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
+
+#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
+struct VZeroUpperGuard { // scope guard: issues _mm256_zeroupper() when constructed and again when destroyed
+#ifdef __GNUC__
+    __attribute__((always_inline)) // force inlining of the constructor on GCC-compatible compilers
+#endif
+    inline VZeroUpperGuard() { _mm256_zeroupper(); }
+#ifdef __GNUC__
+    __attribute__((always_inline)) // force inlining of the destructor on GCC-compatible compilers
+#endif
+    inline ~VZeroUpperGuard() { _mm256_zeroupper(); }
+};
+#define __CV_AVX_GUARD VZeroUpperGuard __vzeroupper_guard; CV_UNUSED(__vzeroupper_guard);
+#endif
+
+#ifdef __CV_AVX_GUARD
+#define CV_AVX_GUARD __CV_AVX_GUARD
+#else
+#define CV_AVX_GUARD
+#endif
+
+#endif // __OPENCV_BUILD
+
+
+
+#if !defined __OPENCV_BUILD /* Compatibility code */ \
+    && !defined __CUDACC__ /* do not include SSE/AVX/NEON headers for NVCC compiler */
+#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
+#  include <emmintrin.h>
+#  define CV_MMX 1
+#  define CV_SSE 1
+#  define CV_SSE2 1
+#elif defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
+# include <Intrin.h>
+# include <arm_neon.h>
+# define CV_NEON 1
+#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__))
+#  include <arm_neon.h>
+#  define CV_NEON 1
+#elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+#  include <altivec.h>
+#  undef vector
+#  undef pixel
+#  undef bool
+#  define CV_VSX 1
+#endif
+
+#ifdef __F16C__
+#  include <immintrin.h>
+#  define CV_FP16 1
+#endif
+
+#endif // !__OPENCV_BUILD && !__CUDACC__ (Compatibility code)
+
+
+
+#ifndef CV_MMX
+#  define CV_MMX 0
+#endif
+#ifndef CV_SSE
+#  define CV_SSE 0
+#endif
+#ifndef CV_SSE2
+#  define CV_SSE2 0
+#endif
+#ifndef CV_SSE3
+#  define CV_SSE3 0
+#endif
+#ifndef CV_SSSE3
+#  define CV_SSSE3 0
+#endif
+#ifndef CV_SSE4_1
+#  define CV_SSE4_1 0
+#endif
+#ifndef CV_SSE4_2
+#  define CV_SSE4_2 0
+#endif
+#ifndef CV_POPCNT
+#  define CV_POPCNT 0
+#endif
+#ifndef CV_AVX
+#  define CV_AVX 0
+#endif
+#ifndef CV_FP16
+#  define CV_FP16 0
+#endif
+#ifndef CV_AVX2
+#  define CV_AVX2 0
+#endif
+#ifndef CV_FMA3
+#  define CV_FMA3 0
+#endif
+#ifndef CV_AVX_512F
+#  define CV_AVX_512F 0
+#endif
+#ifndef CV_AVX_512BW
+#  define CV_AVX_512BW 0
+#endif
+#ifndef CV_AVX_512CD
+#  define CV_AVX_512CD 0
+#endif
+#ifndef CV_AVX_512DQ
+#  define CV_AVX_512DQ 0
+#endif
+#ifndef CV_AVX_512ER
+#  define CV_AVX_512ER 0
+#endif
+#ifndef CV_AVX_512IFMA
+#  define CV_AVX_512IFMA 0
+#endif
+#define CV_AVX_512IFMA512 CV_AVX_512IFMA // deprecated
+#ifndef CV_AVX_512PF
+#  define CV_AVX_512PF 0
+#endif
+#ifndef CV_AVX_512VBMI
+#  define CV_AVX_512VBMI 0
+#endif
+#ifndef CV_AVX_512VL
+#  define CV_AVX_512VL 0
+#endif
+#ifndef CV_AVX_5124FMAPS
+#  define CV_AVX_5124FMAPS 0
+#endif
+#ifndef CV_AVX_5124VNNIW
+#  define CV_AVX_5124VNNIW 0
+#endif
+#ifndef CV_AVX_512VPOPCNTDQ
+#  define CV_AVX_512VPOPCNTDQ 0
+#endif
+#ifndef CV_AVX_512VNNI
+#  define CV_AVX_512VNNI 0
+#endif
+#ifndef CV_AVX_512VBMI2
+#  define CV_AVX_512VBMI2 0
+#endif
+#ifndef CV_AVX_512BITALG
+#  define CV_AVX_512BITALG 0
+#endif
+#ifndef CV_AVX512_COMMON
+#  define CV_AVX512_COMMON 0
+#endif
+#ifndef CV_AVX512_KNL
+#  define CV_AVX512_KNL 0
+#endif
+#ifndef CV_AVX512_KNM
+#  define CV_AVX512_KNM 0
+#endif
+#ifndef CV_AVX512_SKX
+#  define CV_AVX512_SKX 0
+#endif
+#ifndef CV_AVX512_CNL
+#  define CV_AVX512_CNL 0
+#endif
+#ifndef CV_AVX512_CLX
+#  define CV_AVX512_CLX 0
+#endif
+#ifndef CV_AVX512_ICL
+#  define CV_AVX512_ICL 0
+#endif
+
+#ifndef CV_NEON
+#  define CV_NEON 0
+#endif
+
+#ifndef CV_RVV071
+#  define CV_RVV071 0
+#endif
+
+#ifndef CV_VSX
+#  define CV_VSX 0
+#endif
+
+#ifndef CV_VSX3
+#  define CV_VSX3 0
+#endif
+
+#ifndef CV_MSA
+#  define CV_MSA 0
+#endif
+
+#ifndef CV_WASM_SIMD
+#  define CV_WASM_SIMD 0
+#endif
+
+#ifndef CV_RVV
+#  define CV_RVV 0
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_helper.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_helper.h
new file mode 100644
index 0000000..39ae0b9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cv_cpu_helper.h
@@ -0,0 +1,508 @@
+// AUTOGENERATED, DO NOT EDIT
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE
+#  define CV_TRY_SSE 1 // CV_CPU_COMPILE_SSE branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSE 1
+#  define CV_CPU_HAS_SUPPORT_SSE 1
+#  define CV_CPU_CALL_SSE(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE_(fn, args) return (opt_SSE::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE
+#  define CV_TRY_SSE 1 // CV_CPU_DISPATCH_COMPILE_SSE branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSE 0
+#  define CV_CPU_HAS_SUPPORT_SSE (cv::checkHardwareSupport(CV_CPU_SSE))
+#  define CV_CPU_CALL_SSE(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
+#  define CV_CPU_CALL_SSE_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE) return (opt_SSE::fn args)
+#else // fallback: every SSE flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSE 0
+#  define CV_CPU_FORCE_SSE 0
+#  define CV_CPU_HAS_SUPPORT_SSE 0
+#  define CV_CPU_CALL_SSE(fn, args)
+#  define CV_CPU_CALL_SSE_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSE first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSE(fn, args, mode, ...)  CV_CPU_CALL_SSE(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE2
+#  define CV_TRY_SSE2 1 // CV_CPU_COMPILE_SSE2 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSE2 1
+#  define CV_CPU_HAS_SUPPORT_SSE2 1
+#  define CV_CPU_CALL_SSE2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE2_(fn, args) return (opt_SSE2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE2
+#  define CV_TRY_SSE2 1 // CV_CPU_DISPATCH_COMPILE_SSE2 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSE2 0
+#  define CV_CPU_HAS_SUPPORT_SSE2 (cv::checkHardwareSupport(CV_CPU_SSE2))
+#  define CV_CPU_CALL_SSE2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
+#  define CV_CPU_CALL_SSE2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE2) return (opt_SSE2::fn args)
+#else // fallback: every SSE2 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSE2 0
+#  define CV_CPU_FORCE_SSE2 0
+#  define CV_CPU_HAS_SUPPORT_SSE2 0
+#  define CV_CPU_CALL_SSE2(fn, args)
+#  define CV_CPU_CALL_SSE2_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSE2 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSE2(fn, args, mode, ...)  CV_CPU_CALL_SSE2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE3
+#  define CV_TRY_SSE3 1 // CV_CPU_COMPILE_SSE3 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSE3 1
+#  define CV_CPU_HAS_SUPPORT_SSE3 1
+#  define CV_CPU_CALL_SSE3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE3_(fn, args) return (opt_SSE3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE3
+#  define CV_TRY_SSE3 1 // CV_CPU_DISPATCH_COMPILE_SSE3 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSE3 (cv::checkHardwareSupport(CV_CPU_SSE3))
+#  define CV_CPU_CALL_SSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
+#  define CV_CPU_CALL_SSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE3) return (opt_SSE3::fn args)
+#else // fallback: every SSE3 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSE3 0
+#  define CV_CPU_FORCE_SSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSE3 0
+#  define CV_CPU_CALL_SSE3(fn, args)
+#  define CV_CPU_CALL_SSE3_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSE3 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSE3(fn, args, mode, ...)  CV_CPU_CALL_SSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSSE3
+#  define CV_TRY_SSSE3 1 // CV_CPU_COMPILE_SSSE3 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSSE3 1
+#  define CV_CPU_HAS_SUPPORT_SSSE3 1
+#  define CV_CPU_CALL_SSSE3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSSE3_(fn, args) return (opt_SSSE3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSSE3
+#  define CV_TRY_SSSE3 1 // CV_CPU_DISPATCH_COMPILE_SSSE3 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSSE3 (cv::checkHardwareSupport(CV_CPU_SSSE3))
+#  define CV_CPU_CALL_SSSE3(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
+#  define CV_CPU_CALL_SSSE3_(fn, args) if (CV_CPU_HAS_SUPPORT_SSSE3) return (opt_SSSE3::fn args)
+#else // fallback: every SSSE3 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSSE3 0
+#  define CV_CPU_FORCE_SSSE3 0
+#  define CV_CPU_HAS_SUPPORT_SSSE3 0
+#  define CV_CPU_CALL_SSSE3(fn, args)
+#  define CV_CPU_CALL_SSSE3_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSSE3 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSSE3(fn, args, mode, ...)  CV_CPU_CALL_SSSE3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_1
+#  define CV_TRY_SSE4_1 1 // CV_CPU_COMPILE_SSE4_1 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSE4_1 1
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 1
+#  define CV_CPU_CALL_SSE4_1(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args) return (opt_SSE4_1::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_1
+#  define CV_TRY_SSE4_1 1 // CV_CPU_DISPATCH_COMPILE_SSE4_1 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSE4_1 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 (cv::checkHardwareSupport(CV_CPU_SSE4_1))
+#  define CV_CPU_CALL_SSE4_1(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_1) return (opt_SSE4_1::fn args)
+#else // fallback: every SSE4_1 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSE4_1 0
+#  define CV_CPU_FORCE_SSE4_1 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_1 0
+#  define CV_CPU_CALL_SSE4_1(fn, args)
+#  define CV_CPU_CALL_SSE4_1_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSE4_1 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSE4_1(fn, args, mode, ...)  CV_CPU_CALL_SSE4_1(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_SSE4_2
+#  define CV_TRY_SSE4_2 1 // CV_CPU_COMPILE_SSE4_2 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_SSE4_2 1
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 1
+#  define CV_CPU_CALL_SSE4_2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args) return (opt_SSE4_2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_SSE4_2
+#  define CV_TRY_SSE4_2 1 // CV_CPU_DISPATCH_COMPILE_SSE4_2 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_SSE4_2 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 (cv::checkHardwareSupport(CV_CPU_SSE4_2))
+#  define CV_CPU_CALL_SSE4_2(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args) if (CV_CPU_HAS_SUPPORT_SSE4_2) return (opt_SSE4_2::fn args)
+#else // fallback: every SSE4_2 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_SSE4_2 0
+#  define CV_CPU_FORCE_SSE4_2 0
+#  define CV_CPU_HAS_SUPPORT_SSE4_2 0
+#  define CV_CPU_CALL_SSE4_2(fn, args)
+#  define CV_CPU_CALL_SSE4_2_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_SSE4_2 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_SSE4_2(fn, args, mode, ...)  CV_CPU_CALL_SSE4_2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_POPCNT
+#  define CV_TRY_POPCNT 1 // CV_CPU_COMPILE_POPCNT branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_POPCNT 1
+#  define CV_CPU_HAS_SUPPORT_POPCNT 1
+#  define CV_CPU_CALL_POPCNT(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_POPCNT_(fn, args) return (opt_POPCNT::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_POPCNT
+#  define CV_TRY_POPCNT 1 // CV_CPU_DISPATCH_COMPILE_POPCNT branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_POPCNT 0
+#  define CV_CPU_HAS_SUPPORT_POPCNT (cv::checkHardwareSupport(CV_CPU_POPCNT))
+#  define CV_CPU_CALL_POPCNT(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
+#  define CV_CPU_CALL_POPCNT_(fn, args) if (CV_CPU_HAS_SUPPORT_POPCNT) return (opt_POPCNT::fn args)
+#else // fallback: every POPCNT flag is 0 and both call macros expand to nothing
+#  define CV_TRY_POPCNT 0
+#  define CV_CPU_FORCE_POPCNT 0
+#  define CV_CPU_HAS_SUPPORT_POPCNT 0
+#  define CV_CPU_CALL_POPCNT(fn, args)
+#  define CV_CPU_CALL_POPCNT_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_POPCNT first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_POPCNT(fn, args, mode, ...)  CV_CPU_CALL_POPCNT(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX
+#  define CV_TRY_AVX 1 // CV_CPU_COMPILE_AVX branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX 1
+#  define CV_CPU_HAS_SUPPORT_AVX 1
+#  define CV_CPU_CALL_AVX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX_(fn, args) return (opt_AVX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX
+#  define CV_TRY_AVX 1 // CV_CPU_DISPATCH_COMPILE_AVX branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX 0
+#  define CV_CPU_HAS_SUPPORT_AVX (cv::checkHardwareSupport(CV_CPU_AVX))
+#  define CV_CPU_CALL_AVX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
+#  define CV_CPU_CALL_AVX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX) return (opt_AVX::fn args)
+#else // fallback: every AVX flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX 0
+#  define CV_CPU_FORCE_AVX 0
+#  define CV_CPU_HAS_SUPPORT_AVX 0
+#  define CV_CPU_CALL_AVX(fn, args)
+#  define CV_CPU_CALL_AVX_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX(fn, args, mode, ...)  CV_CPU_CALL_AVX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FP16
+#  define CV_TRY_FP16 1 // CV_CPU_COMPILE_FP16 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_FP16 1
+#  define CV_CPU_HAS_SUPPORT_FP16 1
+#  define CV_CPU_CALL_FP16(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_FP16_(fn, args) return (opt_FP16::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FP16
+#  define CV_TRY_FP16 1 // CV_CPU_DISPATCH_COMPILE_FP16 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_FP16 0
+#  define CV_CPU_HAS_SUPPORT_FP16 (cv::checkHardwareSupport(CV_CPU_FP16))
+#  define CV_CPU_CALL_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
+#  define CV_CPU_CALL_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_FP16) return (opt_FP16::fn args)
+#else // fallback: every FP16 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_FP16 0
+#  define CV_CPU_FORCE_FP16 0
+#  define CV_CPU_HAS_SUPPORT_FP16 0
+#  define CV_CPU_CALL_FP16(fn, args)
+#  define CV_CPU_CALL_FP16_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_FP16 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_FP16(fn, args, mode, ...)  CV_CPU_CALL_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX2
+#  define CV_TRY_AVX2 1 // CV_CPU_COMPILE_AVX2 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX2 1
+#  define CV_CPU_HAS_SUPPORT_AVX2 1
+#  define CV_CPU_CALL_AVX2(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX2_(fn, args) return (opt_AVX2::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX2
+#  define CV_TRY_AVX2 1 // CV_CPU_DISPATCH_COMPILE_AVX2 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX2 0
+#  define CV_CPU_HAS_SUPPORT_AVX2 (cv::checkHardwareSupport(CV_CPU_AVX2))
+#  define CV_CPU_CALL_AVX2(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
+#  define CV_CPU_CALL_AVX2_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX2) return (opt_AVX2::fn args)
+#else // fallback: every AVX2 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX2 0
+#  define CV_CPU_FORCE_AVX2 0
+#  define CV_CPU_HAS_SUPPORT_AVX2 0
+#  define CV_CPU_CALL_AVX2(fn, args)
+#  define CV_CPU_CALL_AVX2_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX2 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX2(fn, args, mode, ...)  CV_CPU_CALL_AVX2(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_FMA3
+#  define CV_TRY_FMA3 1 // CV_CPU_COMPILE_FMA3 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_FMA3 1
+#  define CV_CPU_HAS_SUPPORT_FMA3 1
+#  define CV_CPU_CALL_FMA3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_FMA3_(fn, args) return (opt_FMA3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_FMA3
+#  define CV_TRY_FMA3 1 // CV_CPU_DISPATCH_COMPILE_FMA3 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_FMA3 0
+#  define CV_CPU_HAS_SUPPORT_FMA3 (cv::checkHardwareSupport(CV_CPU_FMA3))
+#  define CV_CPU_CALL_FMA3(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
+#  define CV_CPU_CALL_FMA3_(fn, args) if (CV_CPU_HAS_SUPPORT_FMA3) return (opt_FMA3::fn args)
+#else // fallback: every FMA3 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_FMA3 0
+#  define CV_CPU_FORCE_FMA3 0
+#  define CV_CPU_HAS_SUPPORT_FMA3 0
+#  define CV_CPU_CALL_FMA3(fn, args)
+#  define CV_CPU_CALL_FMA3_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_FMA3 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_FMA3(fn, args, mode, ...)  CV_CPU_CALL_FMA3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX_512F
+#  define CV_TRY_AVX_512F 1 // CV_CPU_COMPILE_AVX_512F branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX_512F 1
+#  define CV_CPU_HAS_SUPPORT_AVX_512F 1
+#  define CV_CPU_CALL_AVX_512F(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args) return (opt_AVX_512F::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX_512F
+#  define CV_TRY_AVX_512F 1 // CV_CPU_DISPATCH_COMPILE_AVX_512F branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX_512F 0
+#  define CV_CPU_HAS_SUPPORT_AVX_512F (cv::checkHardwareSupport(CV_CPU_AVX_512F))
+#  define CV_CPU_CALL_AVX_512F(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX_512F) return (opt_AVX_512F::fn args)
+#else // fallback: every AVX_512F flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX_512F 0
+#  define CV_CPU_FORCE_AVX_512F 0
+#  define CV_CPU_HAS_SUPPORT_AVX_512F 0
+#  define CV_CPU_CALL_AVX_512F(fn, args)
+#  define CV_CPU_CALL_AVX_512F_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX_512F first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...)  CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_COMMON
+#  define CV_TRY_AVX512_COMMON 1 // CV_CPU_COMPILE_AVX512_COMMON branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_COMMON 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 1
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) return (opt_AVX512_COMMON::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_COMMON
+#  define CV_TRY_AVX512_COMMON 1 // CV_CPU_DISPATCH_COMPILE_AVX512_COMMON branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON (cv::checkHardwareSupport(CV_CPU_AVX512_COMMON))
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
+#else // fallback: every AVX512_COMMON flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_COMMON 0
+#  define CV_CPU_FORCE_AVX512_COMMON 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_COMMON 0
+#  define CV_CPU_CALL_AVX512_COMMON(fn, args)
+#  define CV_CPU_CALL_AVX512_COMMON_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_COMMON first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_COMMON(fn, args, mode, ...)  CV_CPU_CALL_AVX512_COMMON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNL
+#  define CV_TRY_AVX512_KNL 1 // CV_CPU_COMPILE_AVX512_KNL branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_KNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 1
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) return (opt_AVX512_KNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNL
+#  define CV_TRY_AVX512_KNL 1 // CV_CPU_DISPATCH_COMPILE_AVX512_KNL branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL (cv::checkHardwareSupport(CV_CPU_AVX512_KNL))
+#  define CV_CPU_CALL_AVX512_KNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
+#else // fallback: every AVX512_KNL flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_KNL 0
+#  define CV_CPU_FORCE_AVX512_KNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNL 0
+#  define CV_CPU_CALL_AVX512_KNL(fn, args)
+#  define CV_CPU_CALL_AVX512_KNL_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_KNL first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNM
+#  define CV_TRY_AVX512_KNM 1 // CV_CPU_COMPILE_AVX512_KNM branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_KNM 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 1
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) return (opt_AVX512_KNM::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNM
+#  define CV_TRY_AVX512_KNM 1 // CV_CPU_DISPATCH_COMPILE_AVX512_KNM branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM (cv::checkHardwareSupport(CV_CPU_AVX512_KNM))
+#  define CV_CPU_CALL_AVX512_KNM(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
+#else // fallback: every AVX512_KNM flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_KNM 0
+#  define CV_CPU_FORCE_AVX512_KNM 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_KNM 0
+#  define CV_CPU_CALL_AVX512_KNM(fn, args)
+#  define CV_CPU_CALL_AVX512_KNM_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_KNM first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNM(fn, args, mode, ...)  CV_CPU_CALL_AVX512_KNM(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX
+#  define CV_TRY_AVX512_SKX 1 // CV_CPU_COMPILE_AVX512_SKX branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_SKX 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX 1
+#  define CV_CPU_CALL_AVX512_SKX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args) return (opt_AVX512_SKX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_SKX
+#  define CV_TRY_AVX512_SKX 1 // CV_CPU_DISPATCH_COMPILE_AVX512_SKX branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_SKX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX (cv::checkHardwareSupport(CV_CPU_AVX512_SKX))
+#  define CV_CPU_CALL_AVX512_SKX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_SKX) return (opt_AVX512_SKX::fn args)
+#else // fallback: every AVX512_SKX flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_SKX 0
+#  define CV_CPU_FORCE_AVX512_SKX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_SKX 0
+#  define CV_CPU_CALL_AVX512_SKX(fn, args)
+#  define CV_CPU_CALL_AVX512_SKX_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_SKX first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...)  CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CNL
+#  define CV_TRY_AVX512_CNL 1 // CV_CPU_COMPILE_AVX512_CNL branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_CNL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 1
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) return (opt_AVX512_CNL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CNL
+#  define CV_TRY_AVX512_CNL 1 // CV_CPU_DISPATCH_COMPILE_AVX512_CNL branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL (cv::checkHardwareSupport(CV_CPU_AVX512_CNL))
+#  define CV_CPU_CALL_AVX512_CNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
+#else // fallback: every AVX512_CNL flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_CNL 0
+#  define CV_CPU_FORCE_AVX512_CNL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CNL 0
+#  define CV_CPU_CALL_AVX512_CNL(fn, args)
+#  define CV_CPU_CALL_AVX512_CNL_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_CNL first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CNL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CLX
+#  define CV_TRY_AVX512_CLX 1 // CV_CPU_COMPILE_AVX512_CLX branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_CLX 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX 1
+#  define CV_CPU_CALL_AVX512_CLX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args) return (opt_AVX512_CLX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CLX
+#  define CV_TRY_AVX512_CLX 1 // CV_CPU_DISPATCH_COMPILE_AVX512_CLX branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_CLX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX (cv::checkHardwareSupport(CV_CPU_AVX512_CLX))
+#  define CV_CPU_CALL_AVX512_CLX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
+#else // fallback: every AVX512_CLX flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_CLX 0
+#  define CV_CPU_FORCE_AVX512_CLX 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_CLX 0
+#  define CV_CPU_CALL_AVX512_CLX(fn, args)
+#  define CV_CPU_CALL_AVX512_CLX_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_CLX first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_CLX(fn, args, mode, ...)  CV_CPU_CALL_AVX512_CLX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_ICL
+#  define CV_TRY_AVX512_ICL 1 // CV_CPU_COMPILE_AVX512_ICL branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_AVX512_ICL 1
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 1
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) return (opt_AVX512_ICL::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_ICL
+#  define CV_TRY_AVX512_ICL 1 // CV_CPU_DISPATCH_COMPILE_AVX512_ICL branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL (cv::checkHardwareSupport(CV_CPU_AVX512_ICL))
+#  define CV_CPU_CALL_AVX512_ICL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
+#else // fallback: every AVX512_ICL flag is 0 and both call macros expand to nothing
+#  define CV_TRY_AVX512_ICL 0
+#  define CV_CPU_FORCE_AVX512_ICL 0
+#  define CV_CPU_HAS_SUPPORT_AVX512_ICL 0
+#  define CV_CPU_CALL_AVX512_ICL(fn, args)
+#  define CV_CPU_CALL_AVX512_ICL_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_AVX512_ICL first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...)  CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
+#  define CV_TRY_NEON 1 // CV_CPU_COMPILE_NEON branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_NEON 1
+#  define CV_CPU_HAS_SUPPORT_NEON 1
+#  define CV_CPU_CALL_NEON(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_NEON_(fn, args) return (opt_NEON::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON
+#  define CV_TRY_NEON 1 // CV_CPU_DISPATCH_COMPILE_NEON branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_NEON 0
+#  define CV_CPU_HAS_SUPPORT_NEON (cv::checkHardwareSupport(CV_CPU_NEON))
+#  define CV_CPU_CALL_NEON(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
+#  define CV_CPU_CALL_NEON_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON) return (opt_NEON::fn args)
+#else // fallback: every NEON flag is 0 and both call macros expand to nothing
+#  define CV_TRY_NEON 0
+#  define CV_CPU_FORCE_NEON 0
+#  define CV_CPU_HAS_SUPPORT_NEON 0
+#  define CV_CPU_CALL_NEON(fn, args)
+#  define CV_CPU_CALL_NEON_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_NEON first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...)  CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
+#  define CV_TRY_MSA 1 // CV_CPU_COMPILE_MSA branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_MSA 1
+#  define CV_CPU_HAS_SUPPORT_MSA 1
+#  define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA
+#  define CV_TRY_MSA 1 // CV_CPU_DISPATCH_COMPILE_MSA branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_MSA 0
+#  define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA))
+#  define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+#  define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
+#else // fallback: every MSA flag is 0 and both call macros expand to nothing
+#  define CV_TRY_MSA 0
+#  define CV_CPU_FORCE_MSA 0
+#  define CV_CPU_HAS_SUPPORT_MSA 0
+#  define CV_CPU_CALL_MSA(fn, args)
+#  define CV_CPU_CALL_MSA_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_MSA first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...)  CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
+#  define CV_TRY_VSX 1 // CV_CPU_COMPILE_VSX branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_VSX 1
+#  define CV_CPU_HAS_SUPPORT_VSX 1
+#  define CV_CPU_CALL_VSX(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_VSX_(fn, args) return (opt_VSX::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX
+#  define CV_TRY_VSX 1 // CV_CPU_DISPATCH_COMPILE_VSX branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_VSX 0
+#  define CV_CPU_HAS_SUPPORT_VSX (cv::checkHardwareSupport(CV_CPU_VSX))
+#  define CV_CPU_CALL_VSX(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#  define CV_CPU_CALL_VSX_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX) return (opt_VSX::fn args)
+#else // fallback: every VSX flag is 0 and both call macros expand to nothing
+#  define CV_TRY_VSX 0
+#  define CV_CPU_FORCE_VSX 0
+#  define CV_CPU_HAS_SUPPORT_VSX 0
+#  define CV_CPU_CALL_VSX(fn, args)
+#  define CV_CPU_CALL_VSX_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_VSX first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_VSX(fn, args, mode, ...)  CV_CPU_CALL_VSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX3
+#  define CV_TRY_VSX3 1 // CV_CPU_COMPILE_VSX3 branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_VSX3 1
+#  define CV_CPU_HAS_SUPPORT_VSX3 1
+#  define CV_CPU_CALL_VSX3(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_VSX3_(fn, args) return (opt_VSX3::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_VSX3
+#  define CV_TRY_VSX3 1 // CV_CPU_DISPATCH_COMPILE_VSX3 branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_VSX3 0
+#  define CV_CPU_HAS_SUPPORT_VSX3 (cv::checkHardwareSupport(CV_CPU_VSX3))
+#  define CV_CPU_CALL_VSX3(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
+#  define CV_CPU_CALL_VSX3_(fn, args) if (CV_CPU_HAS_SUPPORT_VSX3) return (opt_VSX3::fn args)
+#else // fallback: every VSX3 flag is 0 and both call macros expand to nothing
+#  define CV_TRY_VSX3 0
+#  define CV_CPU_FORCE_VSX3 0
+#  define CV_CPU_HAS_SUPPORT_VSX3 0
+#  define CV_CPU_CALL_VSX3(fn, args)
+#  define CV_CPU_CALL_VSX3_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_VSX3 first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...)  CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV
+#  define CV_TRY_RVV 1 // CV_CPU_COMPILE_RVV branch: the call macros below return unconditionally
+#  define CV_CPU_FORCE_RVV 1
+#  define CV_CPU_HAS_SUPPORT_RVV 1
+#  define CV_CPU_CALL_RVV(fn, args) return (cpu_baseline::fn args)
+#  define CV_CPU_CALL_RVV_(fn, args) return (opt_RVV::fn args)
+#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV
+#  define CV_TRY_RVV 1 // CV_CPU_DISPATCH_COMPILE_RVV branch: calls are guarded by cv::checkHardwareSupport()
+#  define CV_CPU_FORCE_RVV 0
+#  define CV_CPU_HAS_SUPPORT_RVV (cv::checkHardwareSupport(CV_CPU_RVV))
+#  define CV_CPU_CALL_RVV(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
+#  define CV_CPU_CALL_RVV_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
+#else // fallback: every RVV flag is 0 and both call macros expand to nothing
+#  define CV_TRY_RVV 0
+#  define CV_CPU_FORCE_RVV 0
+#  define CV_CPU_HAS_SUPPORT_RVV 0
+#  define CV_CPU_CALL_RVV(fn, args)
+#  define CV_CPU_CALL_RVV_(fn, args)
+#endif // the chain macro below emits CV_CPU_CALL_RVV first, then expands __CV_CPU_DISPATCH_CHAIN_<mode>
+#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...)  CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
+
+#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) // unconditional return of cpu_baseline::fn; no feature test
+#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...)  CV_CPU_CALL_BASELINE(fn, args) /* last in sequence: expands no further chain macro; mode and the variadic arguments are unused */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvdef.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvdef.h
new file mode 100644
index 0000000..f785f32
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvdef.h
@@ -0,0 +1,967 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVDEF_H
+#define OPENCV_CORE_CVDEF_H
+
+#include "opencv2/core/version.hpp"
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef OPENCV_INCLUDE_PORT_FILE  // User-provided header file with custom platform configuration
+#include OPENCV_INCLUDE_PORT_FILE
+#endif
+
+#if !defined CV_DOXYGEN && !defined CV_IGNORE_DEBUG_BUILD_GUARD
+#if (defined(_MSC_VER) && (defined(DEBUG) || defined(_DEBUG))) || \
+    (defined(_GLIBCXX_DEBUG) || defined(_GLIBCXX_DEBUG_PEDANTIC))
+// Guard to prevent using of binary incompatible binaries / runtimes
+// https://github.com/opencv/opencv/pull/9161
+#define CV__DEBUG_NS_BEGIN namespace debug_build_guard {
+#define CV__DEBUG_NS_END }
+namespace cv { namespace debug_build_guard { } using namespace debug_build_guard; }
+#endif
+#endif
+
+#ifndef CV__DEBUG_NS_BEGIN
+#define CV__DEBUG_NS_BEGIN
+#define CV__DEBUG_NS_END
+#endif
+
+
+#ifdef __OPENCV_BUILD
+#include "cvconfig.h"
+#endif
+
+#ifndef __CV_EXPAND
+#define __CV_EXPAND(x) x  /* expands to its argument unchanged */
+#endif
+
+#ifndef __CV_CAT
+#define __CV_CAT__(x, y) x ## y  /* raw token paste: x and y are pasted as written */
+#define __CV_CAT_(x, y) __CV_CAT__(x, y)
+#define __CV_CAT(x, y) __CV_CAT_(x, y)  /* token paste through two forwarding layers, so macro arguments are expanded before pasting */
+#endif
+
+#define __CV_VA_NUM_ARGS_HELPER(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N  /* selects its 11th argument */
+#define __CV_VA_NUM_ARGS(...) __CV_EXPAND(__CV_VA_NUM_ARGS_HELPER(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))  /* number of arguments passed, valid for 1 to 10 arguments */
+
+#ifdef CV_Func  /* CV_Func: expression that names the enclosing function; picked per compiler below */
+// keep current value (through OpenCV port file)
+#elif defined __GNUC__ || (defined (__cplusplus) && (__cplusplus >= 201103))  // GNU-compatible compilers, or any C++11 compiler
+#define CV_Func __func__
+#elif defined __clang__ && (__clang_major__ * 100 + __clang_minor__ >= 305)  // clang 3.5 or newer (major * 100 + minor)
+#define CV_Func __func__
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)  // C99 or newer
+#define CV_Func __func__
+#elif defined _MSC_VER
+#define CV_Func __FUNCTION__
+#elif defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600)
+#define CV_Func __FUNCTION__
+#elif defined __IBMCPP__ && __IBMCPP__ >=500
+#define CV_Func __FUNCTION__
+#elif defined __BORLANDC__ && (__BORLANDC__ >= 0x550)
+#define CV_Func __FUNC__
+#else
+#define CV_Func "<unknown>"  // no known spelling for this compiler: fall back to a fixed string
+#endif
+
+//! @cond IGNORED
+
+//////////////// static assert /////////////////
+#define CVAUX_CONCAT_EXP(a, b) a##b
+#define CVAUX_CONCAT(a, b) CVAUX_CONCAT_EXP(a,b)
+
+#if defined(__clang__)
+#  ifndef __has_extension
+#    define __has_extension __has_feature /* compatibility, for older versions of clang */
+#  endif
+#  if __has_extension(cxx_static_assert)
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  elif __has_extension(c_static_assert)
+#    define CV_StaticAssert(condition, reason)    _Static_assert((condition), reason " " #condition)
+#  endif
+#elif defined(__GNUC__)
+#  if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L)
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  endif
+#elif defined(_MSC_VER)
+#  if _MSC_VER >= 1600 /* MSVC 10 */
+#    define CV_StaticAssert(condition, reason)    static_assert((condition), reason " " #condition)
+#  endif
+#endif
+#ifndef CV_StaticAssert
+#  if !defined(__clang__) && defined(__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 302)
+#    define CV_StaticAssert(condition, reason) ({ extern int __attribute__((error("CV_StaticAssert: " reason " " #condition))) CV_StaticAssert(); ((condition) ? 0 : CV_StaticAssert()); })
+#  else
+namespace cv {
+     template <bool x> struct CV_StaticAssert_failed;
+     template <> struct CV_StaticAssert_failed<true> { enum { val = 1 }; };
+     template<int x> struct CV_StaticAssert_test {};
+}
+#    define CV_StaticAssert(condition, reason)\
+       typedef cv::CV_StaticAssert_test< sizeof(cv::CV_StaticAssert_failed< static_cast<bool>(condition) >) > CVAUX_CONCAT(CV_StaticAssert_failed_at_, __LINE__)
+#  endif
+#endif
+
+// Suppress warning "-Wdeprecated-declarations" / C4996
+#if defined(_MSC_VER)
+    #define CV_DO_PRAGMA(x) __pragma(x)
+#elif defined(__GNUC__)
+    #define CV_DO_PRAGMA(x) _Pragma (#x)
+#else
+    #define CV_DO_PRAGMA(x)
+#endif
+
+#ifdef _MSC_VER
+#define CV_SUPPRESS_DEPRECATED_START \
+    CV_DO_PRAGMA(warning(push)) \
+    CV_DO_PRAGMA(warning(disable: 4996))
+#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(warning(pop))
+#elif defined (__clang__) || ((__GNUC__)  && (__GNUC__*100 + __GNUC_MINOR__ > 405))
+#define CV_SUPPRESS_DEPRECATED_START \
+    CV_DO_PRAGMA(GCC diagnostic push) \
+    CV_DO_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
+#define CV_SUPPRESS_DEPRECATED_END CV_DO_PRAGMA(GCC diagnostic pop)
+#else
+#define CV_SUPPRESS_DEPRECATED_START
+#define CV_SUPPRESS_DEPRECATED_END
+#endif
+
+#define CV_UNUSED(name) (void)name
+
+//! @endcond
+
+// undef problematic defines sometimes defined by system headers (windows.h in particular)
+#undef small
+#undef min
+#undef max
+#undef abs
+#undef Complex
+
+#if defined __cplusplus
+#include <limits>
+#else
+#include <limits.h>
+#endif
+
+#include "opencv2/core/hal/interface.h"
+
+#if defined __ICL
+#  define CV_ICC   __ICL
+#elif defined __ICC
+#  define CV_ICC   __ICC
+#elif defined __ECL
+#  define CV_ICC   __ECL
+#elif defined __ECC
+#  define CV_ICC   __ECC
+#elif defined __INTEL_COMPILER
+#  define CV_ICC   __INTEL_COMPILER
+#endif
+
+#ifndef CV_INLINE
+#  if defined __cplusplus
+#    define CV_INLINE static inline
+#  elif defined _MSC_VER
+#    define CV_INLINE __inline
+#  else
+#    define CV_INLINE static
+#  endif
+#endif
+
+#ifndef CV_ALWAYS_INLINE
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define CV_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define CV_ALWAYS_INLINE __forceinline
+#else
+#define CV_ALWAYS_INLINE inline
+#endif
+#endif
+
+#if defined CV_DISABLE_OPTIMIZATION || (defined CV_ICC && !defined CV_ENABLE_UNROLLED)
+#  define CV_ENABLE_UNROLLED 0
+#else
+#  define CV_ENABLE_UNROLLED 1
+#endif
+
+#ifdef __GNUC__
+#  define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#elif defined _MSC_VER
+#  define CV_DECL_ALIGNED(x) __declspec(align(x))
+#else
+#  define CV_DECL_ALIGNED(x)
+#endif
+
+/* CPU features and intrinsics support */
+#define CV_CPU_NONE             0
+#define CV_CPU_MMX              1
+#define CV_CPU_SSE              2
+#define CV_CPU_SSE2             3
+#define CV_CPU_SSE3             4
+#define CV_CPU_SSSE3            5
+#define CV_CPU_SSE4_1           6
+#define CV_CPU_SSE4_2           7
+#define CV_CPU_POPCNT           8
+#define CV_CPU_FP16             9
+#define CV_CPU_AVX              10
+#define CV_CPU_AVX2             11
+#define CV_CPU_FMA3             12
+
+#define CV_CPU_AVX_512F         13
+#define CV_CPU_AVX_512BW        14
+#define CV_CPU_AVX_512CD        15
+#define CV_CPU_AVX_512DQ        16
+#define CV_CPU_AVX_512ER        17
+#define CV_CPU_AVX_512IFMA512   18 // deprecated
+#define CV_CPU_AVX_512IFMA      18
+#define CV_CPU_AVX_512PF        19
+#define CV_CPU_AVX_512VBMI      20
+#define CV_CPU_AVX_512VL        21
+#define CV_CPU_AVX_512VBMI2     22
+#define CV_CPU_AVX_512VNNI      23
+#define CV_CPU_AVX_512BITALG    24
+#define CV_CPU_AVX_512VPOPCNTDQ 25
+#define CV_CPU_AVX_5124VNNIW    26
+#define CV_CPU_AVX_5124FMAPS    27
+
+#define CV_CPU_NEON             100
+
+#define CV_CPU_MSA              150
+
+#define CV_CPU_RISCVV           170
+
+#define CV_CPU_VSX              200
+#define CV_CPU_VSX3             201
+
+#define CV_CPU_RVV              210
+
+// CPU features groups
+#define CV_CPU_AVX512_SKX       256
+#define CV_CPU_AVX512_COMMON    257
+#define CV_CPU_AVX512_KNL       258
+#define CV_CPU_AVX512_KNM       259
+#define CV_CPU_AVX512_CNL       260
+#define CV_CPU_AVX512_CLX       261
+#define CV_CPU_AVX512_ICL       262
+
+// when adding to this list remember to update the following enum
+#define CV_HARDWARE_MAX_FEATURE 512
+
+/** @brief Available CPU features.
+*/
+enum CpuFeatures {
+    CPU_MMX             = 1,
+    CPU_SSE             = 2,
+    CPU_SSE2            = 3,
+    CPU_SSE3            = 4,
+    CPU_SSSE3           = 5,
+    CPU_SSE4_1          = 6,
+    CPU_SSE4_2          = 7,
+    CPU_POPCNT          = 8,
+    CPU_FP16            = 9,
+    CPU_AVX             = 10,
+    CPU_AVX2            = 11,
+    CPU_FMA3            = 12,
+
+    CPU_AVX_512F        = 13,
+    CPU_AVX_512BW       = 14,
+    CPU_AVX_512CD       = 15,
+    CPU_AVX_512DQ       = 16,
+    CPU_AVX_512ER       = 17,
+    CPU_AVX_512IFMA512  = 18, // deprecated
+    CPU_AVX_512IFMA     = 18,
+    CPU_AVX_512PF       = 19,
+    CPU_AVX_512VBMI     = 20,
+    CPU_AVX_512VL       = 21,
+    CPU_AVX_512VBMI2    = 22,
+    CPU_AVX_512VNNI     = 23,
+    CPU_AVX_512BITALG   = 24,
+    CPU_AVX_512VPOPCNTDQ= 25,
+    CPU_AVX_5124VNNIW   = 26,
+    CPU_AVX_5124FMAPS   = 27,
+
+    CPU_NEON            = 100,
+
+    CPU_MSA             = 150,
+
+    CPU_RISCVV          = 170,
+
+    CPU_VSX             = 200,
+    CPU_VSX3            = 201,
+
+    CPU_RVV             = 210,
+
+    CPU_AVX512_SKX      = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
+    CPU_AVX512_COMMON   = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
+    CPU_AVX512_KNL      = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
+    CPU_AVX512_KNM      = 259, //!< Knights Mill with AVX-512F/CD/ER/PF/4FMAPS/4VNNIW/VPOPCNTDQ
+    CPU_AVX512_CNL      = 260, //!< Cannon Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI
+    CPU_AVX512_CLX      = 261, //!< Cascade Lake with AVX-512F/CD/BW/DQ/VL/VNNI
+    CPU_AVX512_ICL      = 262, //!< Ice Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI/VBMI2/BITALG/VPOPCNTDQ
+
+    CPU_MAX_FEATURE     = 512  // see CV_HARDWARE_MAX_FEATURE
+};
+
+
+#include "cv_cpu_dispatch.h"
+
+#if !defined(CV_STRONG_ALIGNMENT) && defined(__arm__) && !(defined(__aarch64__) || defined(_M_ARM64))
+// int*, int64* should be properly aligned pointers on ARMv7
+#define CV_STRONG_ALIGNMENT 1
+#endif
+#if !defined(CV_STRONG_ALIGNMENT)
+#define CV_STRONG_ALIGNMENT 0
+#endif
+
+/* fundamental constants */
+#define CV_PI   3.1415926535897932384626433832795
+#define CV_2PI  6.283185307179586476925286766559
+#define CV_LOG2 0.69314718055994530941723212145818
+
+#if defined __ARM_FP16_FORMAT_IEEE \
+    && !defined __CUDACC__
+#  define CV_FP16_TYPE 1
+#else
+#  define CV_FP16_TYPE 0
+#endif
+
+typedef union Cv16suf
+{
+    short i;
+    ushort u;
+#if CV_FP16_TYPE
+    __fp16 h;
+#endif
+}
+Cv16suf;
+
+typedef union Cv32suf
+{
+    int i;
+    unsigned u;
+    float f;
+}
+Cv32suf;
+
+typedef union Cv64suf
+{
+    int64 i;
+    uint64 u;
+    double f;
+}
+Cv64suf;
+
+#ifndef OPENCV_ABI_COMPATIBILITY
+#define OPENCV_ABI_COMPATIBILITY 400
+#endif
+
+#ifdef __OPENCV_BUILD
+#  define DISABLE_OPENCV_3_COMPATIBILITY
+#  define OPENCV_DISABLE_DEPRECATED_COMPATIBILITY
+#endif
+
+#ifndef CV_EXPORTS
+# if (defined _WIN32 || defined WINCE || defined __CYGWIN__) && defined(CVAPI_EXPORTS)
+#   define CV_EXPORTS __declspec(dllexport)
+# elif defined __GNUC__ && __GNUC__ >= 4 && (defined(CVAPI_EXPORTS) || defined(__APPLE__))
+#   define CV_EXPORTS __attribute__ ((visibility ("default")))
+# endif
+#endif
+
+#ifndef CV_EXPORTS
+# define CV_EXPORTS
+#endif
+
+#ifdef _MSC_VER
+#   define CV_EXPORTS_TEMPLATE
+#else
+#   define CV_EXPORTS_TEMPLATE CV_EXPORTS
+#endif
+
+#ifndef CV_DEPRECATED
+#  if defined(__GNUC__)
+#    define CV_DEPRECATED __attribute__ ((deprecated))
+#  elif defined(_MSC_VER)
+#    define CV_DEPRECATED __declspec(deprecated)
+#  else
+#    define CV_DEPRECATED
+#  endif
+#endif
+
+#ifndef CV_DEPRECATED_EXTERNAL
+#  if defined(__OPENCV_BUILD)
+#    define CV_DEPRECATED_EXTERNAL /* nothing */
+#  else
+#    define CV_DEPRECATED_EXTERNAL CV_DEPRECATED
+#  endif
+#endif
+
+
+#ifndef CV_EXTERN_C
+#  ifdef __cplusplus
+#    define CV_EXTERN_C extern "C"
+#  else
+#    define CV_EXTERN_C
+#  endif
+#endif
+
+/* special informative macros for wrapper generators */
+#define CV_EXPORTS_W CV_EXPORTS
+#define CV_EXPORTS_W_SIMPLE CV_EXPORTS
+#define CV_EXPORTS_AS(synonym) CV_EXPORTS
+#define CV_EXPORTS_W_MAP CV_EXPORTS
+#define CV_IN_OUT
+#define CV_OUT
+#define CV_PROP
+#define CV_PROP_RW
+#define CV_WRAP
+#define CV_WRAP_AS(synonym)
+#define CV_WRAP_MAPPABLE(mappable)
+#define CV_WRAP_PHANTOM(phantom_header)
+#define CV_WRAP_DEFAULT(val)
+
+/****************************************************************************************\
+*                                  Matrix type (Mat)                                     *
+\****************************************************************************************/
+
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+#define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+#define CV_MAT_CONT_FLAG_SHIFT  14
+#define CV_MAT_CONT_FLAG        (1 << CV_MAT_CONT_FLAG_SHIFT)
+#define CV_IS_MAT_CONT(flags)   ((flags) & CV_MAT_CONT_FLAG)
+#define CV_IS_CONT_MAT          CV_IS_MAT_CONT
+#define CV_SUBMAT_FLAG_SHIFT    15  /* bit position of the sub-matrix flag inside the flags word */
+#define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)
+#define CV_IS_SUBMAT(flags)     ((flags) & CV_SUBMAT_FLAG)  /* non-zero when the CV_SUBMAT_FLAG bit is set in flags */
+
+/** Size of each channel item,
+   0x28442211 = 0010 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15)
+
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type))
+
+#ifndef MIN
+#  define MIN(a,b)  ((a) > (b) ? (b) : (a))
+#endif
+
+#ifndef MAX
+#  define MAX(a,b)  ((a) < (b) ? (b) : (a))
+#endif
+
+///////////////////////////////////////// Enum operators ///////////////////////////////////////
+
+/**
+
+Provides compatibility operators for both classical and C++11 enum classes,
+as well as exposing the C++11 enum class members for backwards compatibility
+
+@code
+    // Provides operators required for flag enums
+    CV_ENUM_FLAGS(AccessFlag)
+
+    // Exposes the listed members of the enum class AccessFlag to the current namespace
+    CV_ENUM_CLASS_EXPOSE(AccessFlag, ACCESS_READ [, ACCESS_WRITE [, ...] ]);
+@endcode
+*/
+
+#define __CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST)                                              \
+static const EnumType MEMBER_CONST = EnumType::MEMBER_CONST;                                          \
+
+#define __CV_ENUM_CLASS_EXPOSE_2(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_1(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_3(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_2(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_4(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_3(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_5(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_4(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_6(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_5(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_7(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_6(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_8(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_7(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_CLASS_EXPOSE_9(EnumType, MEMBER_CONST, ...)                                         \
+__CV_ENUM_CLASS_EXPOSE_1(EnumType, MEMBER_CONST);                                                     \
+__CV_EXPAND(__CV_ENUM_CLASS_EXPOSE_8(EnumType, __VA_ARGS__));                                         \
+
+#define __CV_ENUM_FLAGS_LOGICAL_NOT(EnumType)                                                         \
+static inline bool operator!(const EnumType& val)                                                     \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return !static_cast<UnderlyingType>(val);                                                         \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_LOGICAL_NOT_EQ(Arg1Type, Arg2Type)                                            \
+static inline bool operator!=(const Arg1Type& a, const Arg2Type& b)                                   \
+{                                                                                                     \
+    return static_cast<int>(a) != static_cast<int>(b);                                                \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_LOGICAL_EQ(Arg1Type, Arg2Type)                                                \
+static inline bool operator==(const Arg1Type& a, const Arg2Type& b)                                   \
+{                                                                                                     \
+    return static_cast<int>(a) == static_cast<int>(b);                                                \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_NOT(EnumType)                                                         \
+static inline EnumType operator~(const EnumType& val)                                                 \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(~static_cast<UnderlyingType>(val));                                  \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_OR(EnumType, Arg1Type, Arg2Type)                                      \
+static inline EnumType operator|(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) | static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_AND(EnumType, Arg1Type, Arg2Type)                                     \
+static inline EnumType operator&(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) & static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_XOR(EnumType, Arg1Type, Arg2Type)                                     \
+static inline EnumType operator^(const Arg1Type& a, const Arg2Type& b)                                \
+{                                                                                                     \
+    typedef std::underlying_type<EnumType>::type UnderlyingType;                                      \
+    return static_cast<EnumType>(static_cast<UnderlyingType>(a) ^ static_cast<UnderlyingType>(b));    \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_OR_EQ(EnumType, Arg1Type)                                             \
+static inline EnumType& operator|=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) | static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_AND_EQ(EnumType, Arg1Type)                                            \
+static inline EnumType& operator&=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) & static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define __CV_ENUM_FLAGS_BITWISE_XOR_EQ(EnumType, Arg1Type)                                            \
+static inline EnumType& operator^=(EnumType& _this, const Arg1Type& val)                              \
+{                                                                                                     \
+    _this = static_cast<EnumType>(static_cast<int>(_this) ^ static_cast<int>(val));                   \
+    return _this;                                                                                     \
+}                                                                                                     \
+
+#define CV_ENUM_CLASS_EXPOSE(EnumType, ...)                                                           \
+__CV_EXPAND(__CV_CAT(__CV_ENUM_CLASS_EXPOSE_, __CV_VA_NUM_ARGS(__VA_ARGS__))(EnumType, __VA_ARGS__)); \
+
+#define CV_ENUM_FLAGS(EnumType)                                                                       \
+__CV_ENUM_FLAGS_LOGICAL_NOT      (EnumType)                                                           \
+__CV_ENUM_FLAGS_LOGICAL_EQ       (EnumType, int)                                                      \
+__CV_ENUM_FLAGS_LOGICAL_NOT_EQ   (EnumType, int)                                                      \
+                                                                                                      \
+__CV_ENUM_FLAGS_BITWISE_NOT      (EnumType)                                                           \
+__CV_ENUM_FLAGS_BITWISE_OR       (EnumType, EnumType, EnumType)                                       \
+__CV_ENUM_FLAGS_BITWISE_AND      (EnumType, EnumType, EnumType)                                       \
+__CV_ENUM_FLAGS_BITWISE_XOR      (EnumType, EnumType, EnumType)                                       \
+                                                                                                      \
+__CV_ENUM_FLAGS_BITWISE_OR_EQ    (EnumType, EnumType)                                                 \
+__CV_ENUM_FLAGS_BITWISE_AND_EQ   (EnumType, EnumType)                                                 \
+__CV_ENUM_FLAGS_BITWISE_XOR_EQ   (EnumType, EnumType)                                                 \
+
+/****************************************************************************************\
+*                                    static analysis                                     *
+\****************************************************************************************/
+
+// In practice, some macro are not processed correctly (noreturn is not detected).
+// We need to use simplified definition for them.
+#ifndef CV_STATIC_ANALYSIS
+# if defined(__KLOCWORK__) || defined(__clang_analyzer__) || defined(__COVERITY__)
+#   define CV_STATIC_ANALYSIS 1
+# endif
+#else
+# if defined(CV_STATIC_ANALYSIS) && !(__CV_CAT(1, CV_STATIC_ANALYSIS) == 1)  // defined and not empty
+#   if 0 == CV_STATIC_ANALYSIS
+#     undef CV_STATIC_ANALYSIS
+#   endif
+# endif
+#endif
+
+/****************************************************************************************\
+*                                    Thread sanitizer                                    *
+\****************************************************************************************/
+#ifndef CV_THREAD_SANITIZER
+# if defined(__has_feature)
+#   if __has_feature(thread_sanitizer)
+#     define CV_THREAD_SANITIZER
+#   endif
+# endif
+#endif
+
+/****************************************************************************************\
+*          exchange-add operation for atomic operations on reference counters            *
+\****************************************************************************************/
+
+#ifdef CV_XADD
+  // CV_XADD(addr, delta): exchange-add on an int counter. A definition supplied by the user is kept as-is.
+#elif defined __GNUC__ || defined __clang__
+#  if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)  && !defined __INTEL_COMPILER
+#    ifdef __ATOMIC_ACQ_REL
+#      define CV_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#    else
+#      define CV_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4) /* 4: presumably the numeric value of __ATOMIC_ACQ_REL, which is undefined in this branch - confirm */
+#    endif
+#  else
+#    if defined __ATOMIC_ACQ_REL && !defined __clang__
+       // version for gcc >= 4.7
+#      define CV_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#    else
+#      define CV_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#    endif
+#  endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#  include <intrin.h>
+#  define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)(addr), (delta)) /* arguments parenthesised so compound expressions are cast and passed as a whole */
+#else
+  #ifdef OPENCV_FORCE_UNSAFE_XADD
+    CV_INLINE int CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } // NOT atomic: plain read-modify-write, returns the value held before the addition
+  #else
+    #error "OpenCV: can't define safe CV_XADD macro for current platform (unsupported). Define CV_XADD macro through custom port header (see OPENCV_INCLUDE_PORT_FILE)"
+  #endif
+#endif
+
+
+/****************************************************************************************\
+*                                  CV_NORETURN attribute                                 *
+\****************************************************************************************/
+
+#ifndef CV_NORETURN
+#  if defined(__GNUC__)
+#    define CV_NORETURN __attribute__((__noreturn__))
+#  elif defined(_MSC_VER) && (_MSC_VER >= 1300)
+#    define CV_NORETURN __declspec(noreturn)
+#  else
+#    define CV_NORETURN /* nothing by default */
+#  endif
+#endif
+
+/****************************************************************************************\
+*                       CV_NODISCARD_STD attribute (C++17)                               *
+* encourages the compiler to issue a warning if the return value is discarded            *
+\****************************************************************************************/
+#ifndef CV_NODISCARD_STD
+#  ifndef __has_cpp_attribute
+//   workaround preprocessor non-compliance https://reviews.llvm.org/D57851
+#    define __has_cpp_attribute(__x) 0
+#  endif
+#  if __has_cpp_attribute(nodiscard)
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif __cplusplus >= 201703L
+//   available when compiler is C++17 compliant
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__INTEL_COMPILER)
+     // see above, available when C++17 is enabled
+#  elif defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L
+//   available with VS2017 v15.3+ with /std:c++17 or higher; works on functions and classes
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__GNUC__) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 700) && (__cplusplus >= 201103L)
+//   available with GCC 7.0+; works on functions, works or silently fails on classes
+#    define CV_NODISCARD_STD [[nodiscard]]
+#  elif defined(__GNUC__) && (((__GNUC__ * 100) + __GNUC_MINOR__) >= 408) && (__cplusplus >= 201103L)
+//   available with GCC 4.8+ but it usually does nothing and can fail noisily -- therefore not used
+//   define CV_NODISCARD_STD [[gnu::warn_unused_result]]
+#  endif
+#endif
+#ifndef CV_NODISCARD_STD
+#  define CV_NODISCARD_STD /* nothing by default */
+#endif
+
+
+/****************************************************************************************\
+*                      CV_NODISCARD attribute (deprecated, GCC only)                     *
+* DONT USE: use instead the standard CV_NODISCARD_STD macro above                        *
+*           this legacy method silently fails to issue warning until some version        *
+*           after gcc 6.3.0. Yet with gcc 7+ you can use the above standard method       *
+*           which makes this method useless. Don't use it.                               *
+* @deprecated use instead CV_NODISCARD_STD                                               *
+\****************************************************************************************/
+#ifndef CV_NODISCARD
+#  if defined(__GNUC__)
+#    define CV_NODISCARD __attribute__((__warn_unused_result__))
+#  elif defined(__clang__) && defined(__has_attribute)
+#    if __has_attribute(__warn_unused_result__)
+#      define CV_NODISCARD __attribute__((__warn_unused_result__))
+#    endif
+#  endif
+#endif
+#ifndef CV_NODISCARD
+#  define CV_NODISCARD /* nothing by default */
+#endif
+
+
+/****************************************************************************************\
+*                                    C++ 11                                              *
+\****************************************************************************************/
+#ifndef CV_CXX11
+#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
+#    define CV_CXX11 1
+#  endif
+#else
+#  if CV_CXX11 == 0
+#    undef CV_CXX11
+#  endif
+#endif
+#ifndef CV_CXX11
+#  error "OpenCV 4.x+ requires enabled C++11 support"
+#endif
+
+#define CV_CXX_MOVE_SEMANTICS 1
+#define CV_CXX_MOVE(x) std::move(x)
+#define CV_CXX_STD_ARRAY 1
+#include <array>
+#ifndef CV_OVERRIDE
+#  define CV_OVERRIDE override
+#endif
+#ifndef CV_FINAL
+#  define CV_FINAL final
+#endif
+
+#ifndef CV_NOEXCEPT
+#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
+#    define CV_NOEXCEPT noexcept
+#  endif
+#endif
+#ifndef CV_NOEXCEPT
+#  define CV_NOEXCEPT
+#endif
+
+#ifndef CV_CONSTEXPR
+#  if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
+#    define CV_CONSTEXPR constexpr
+#  endif
+#endif
+#ifndef CV_CONSTEXPR
+#  define CV_CONSTEXPR
+#endif
+
+// Integer types portability
+#ifdef OPENCV_STDINT_HEADER
+#include OPENCV_STDINT_HEADER
+#elif defined(__cplusplus)
+#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */
+namespace cv {
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef signed short int16_t;
+typedef unsigned short uint16_t;
+typedef signed int int32_t;
+typedef unsigned int uint32_t;
+typedef signed __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+}
+#elif defined(_MSC_VER) || __cplusplus >= 201103L
+#include <cstdint>
+namespace cv {
+using std::int8_t;
+using std::uint8_t;
+using std::int16_t;
+using std::uint16_t;
+using std::int32_t;
+using std::uint32_t;
+using std::int64_t;
+using std::uint64_t;
+}
+#else
+#include <stdint.h>
+namespace cv {
+typedef ::int8_t int8_t;
+typedef ::uint8_t uint8_t;
+typedef ::int16_t int16_t;
+typedef ::uint16_t uint16_t;
+typedef ::int32_t int32_t;
+typedef ::uint32_t uint32_t;
+typedef ::int64_t int64_t;
+typedef ::uint64_t uint64_t;
+}
+#endif
+#else // pure C
+#include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+namespace cv
+{
+
+class float16_t  // 16-bit floating-point value: stored as __fp16 when CV_FP16_TYPE is set, otherwise as raw bits in a ushort
+{
+public:
+#if CV_FP16_TYPE
+    // Native path: conversions are plain casts to and from the compiler's __fp16 type.
+    float16_t() : h(0) {}
+    explicit float16_t(float x) { h = (__fp16)x; }
+    operator float() const { return (float)h; }
+    static float16_t fromBits(ushort w)  // builds a value from a raw 16-bit pattern via the Cv16suf union
+    {
+        Cv16suf u;
+        u.u = w;
+        float16_t result;
+        result.h = u.h;
+        return result;
+    }
+    static float16_t zero()
+    {
+        float16_t result;
+        result.h = (__fp16)0;
+        return result;
+    }
+    ushort bits() const  // returns the raw 16-bit pattern of the stored value
+    {
+        Cv16suf u;
+        u.h = h;
+        return u.u;
+    }
+protected:
+    __fp16 h;
+
+#else
+    float16_t() : w(0) {}
+    explicit float16_t(float x)
+    {
+    #if CV_FP16
+        __m128 v = _mm_load_ss(&x);  // intrinsic path: convert with _mm_cvtps_ph and keep the low 16 bits
+        w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
+    #else
+        Cv32suf in;  // software path: operate on the float's bit pattern
+        in.f = x;
+        unsigned sign = in.u & 0x80000000;  // isolate the sign bit
+        in.u ^= sign;  // clear it so the comparisons below see the magnitude only
+
+        if( in.u >= 0x47800000 )  // magnitude >= 2^16 (65536): not representable as a finite half
+            w = (ushort)(in.u > 0x7f800000 ? 0x7e00 : 0x7c00);  // NaN input -> 0x7e00, otherwise infinity 0x7c00
+        else
+        {
+            if (in.u < 0x38800000)  // magnitude < 2^-14: result is a subnormal half (or zero)
+            {
+                in.f += 0.5f;  // the float addition rounds the value into the low mantissa bits
+                w = (ushort)(in.u - 0x3f000000);  // 0x3f000000 is the bit pattern of 0.5f
+            }
+            else
+            {
+                unsigned t = in.u + 0xc8000fff;  // rebias exponent by -112 (mod 2^32) and add 0xfff of rounding bias
+                w = (ushort)((t + ((in.u >> 13) & 1)) >> 13);  // + lowest kept bit => ties round to even; drop 13 mantissa bits
+            }
+        }
+
+        w = (ushort)(w | (sign >> 16));  // sign moves from bit 31 to bit 15
+    #endif
+    }
+
+    operator float() const
+    {
+    #if CV_FP16
+        float f;
+        _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
+        return f;
+    #else
+        Cv32suf out;
+
+        unsigned t = ((w & 0x7fff) << 13) + 0x38000000;  // exponent/mantissa moved to float32 positions, exponent rebiased by +112
+        unsigned sign = (w & 0x8000) << 16;  // sign moves from bit 15 to bit 31
+        unsigned e = w & 0x7c00;  // half exponent field
+
+        out.u = t + (1 << 23);  // pre-loaded for the e == 0 branch below
+        out.u = (e >= 0x7c00 ? t + 0x38000000 :  // inf/NaN: raise the exponent to all ones
+                 e == 0 ? (static_cast<void>(out.f -= 6.103515625e-05f), out.u) : t) | sign;  // zero/subnormal: subtract 2^-14
+        return out.f;
+    #endif
+    }
+
+    static float16_t fromBits(ushort b)
+    {
+        float16_t result;
+        result.w = b;
+        return result;
+    }
+    static float16_t zero()
+    {
+        float16_t result;
+        result.w = (ushort)0;
+        return result;
+    }
+    ushort bits() const { return w; }
+protected:
+    ushort w;
+
+#endif
+};
+
+}
+#endif
+
+//! @}
+
+#ifndef __cplusplus
+#include "opencv2/core/fast_math.hpp" // define cvRound(double)
+#endif
+
+#endif // OPENCV_CORE_CVDEF_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.hpp
new file mode 100644
index 0000000..6ce9e4b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.hpp
@@ -0,0 +1,190 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVSTD_HPP
+#define OPENCV_CORE_CVSTD_HPP
+
+#ifndef __cplusplus
+#  error cvstd.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include <cstddef>
+#include <cstring>
+#include <cctype>
+
+#include <string>
+
+// import useful primitives from stl
+#  include <algorithm>
+#  include <utility>
+#  include <cstdlib> //for abs(int)
+#  include <cmath>
+
+namespace cv
+{
+    static inline uchar abs(uchar a) { return a; }  // unsigned input: returned unchanged
+    static inline ushort abs(ushort a) { return a; }
+    static inline unsigned abs(unsigned a) { return a; }
+    static inline uint64 abs(uint64 a) { return a; }
+    // Make the std:: primitives below reachable as cv::min, cv::max, cv::abs, ...
+    using std::min;
+    using std::max;
+    using std::abs;
+    using std::swap;
+    using std::sqrt;
+    using std::exp;
+    using std::pow;
+    using std::log;
+}
+
+#include "cvstd_wrapper.hpp"
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+//////////////////////////// memory management functions ////////////////////////////
+
+/** @brief Allocates an aligned memory buffer.
+
+The function allocates the buffer of the specified size and returns it. When the buffer size is 16
+bytes or more, the returned buffer is aligned to 16 bytes.
+@param bufSize Allocated buffer size.
+ */
+CV_EXPORTS void* fastMalloc(size_t bufSize);
+
+/** @brief Deallocates a memory buffer.
+
+The function deallocates the buffer allocated with fastMalloc . If NULL pointer is passed, the
+function does nothing. C version of the function clears the pointer *pptr* to avoid problems with
+double memory deallocation.
+@param ptr Pointer to the allocated buffer.
+ */
+CV_EXPORTS void fastFree(void* ptr);
+
+/*!
+  The STL-compliant memory Allocator based on cv::fastMalloc() and cv::fastFree()
+*/
+template<typename _Tp> class Allocator  // allocator interface over cv::fastMalloc() / cv::fastFree()
+{
+public:
+    typedef _Tp value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+    typedef value_type& reference;
+    typedef const value_type& const_reference;
+    typedef size_t size_type;
+    typedef ptrdiff_t difference_type;
+    template<typename U> struct rebind { typedef Allocator<U> other; };  // struct, so that 'other' is publicly accessible
+    // The allocator holds no state, so every constructor is a no-op.
+    explicit Allocator() {}
+    ~Allocator() {}
+    explicit Allocator(Allocator const&) {}
+    template<typename U>
+    explicit Allocator(Allocator<U> const&) {}
+
+    // address
+    pointer address(reference r) { return &r; }
+    const_pointer address(const_reference r) { return &r; }
+    // raw storage: allocate() requests count * sizeof(_Tp) bytes from fastMalloc, deallocate() hands them to fastFree
+    pointer allocate(size_type count, const void* =0) { return reinterpret_cast<pointer>(fastMalloc(count * sizeof (_Tp))); }
+    void deallocate(pointer p, size_type) { fastFree(p); }
+    // object lifetime: copy-construct in place / run the destructor without releasing the storage
+    void construct(pointer p, const _Tp& v) { new(static_cast<void*>(p)) _Tp(v); }
+    void destroy(pointer p) { p->~_Tp(); }
+    // largest element count whose byte size fits in size_type, never less than 1
+    size_type max_size() const { return cv::max(static_cast<size_type>(-1) / sizeof(_Tp), static_cast<size_type>(1)); }
+};
+
+//! @} core_utils
+
+//! @endcond
+
+//! @addtogroup core_basic
+//! @{
+
+//////////////////////////////// string class ////////////////////////////////
+
+class CV_EXPORTS FileNode; //for string constructor from FileNode
+
+typedef std::string String;
+
+#ifndef OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
+
+//! @cond IGNORED
+namespace details {
+// std::tolower takes an int that must be EOF or representable as unsigned char, so convert through unsigned char
+static inline char char_tolower(char ch)
+{
+    return (char)std::tolower((int)(unsigned char)ch);
+}
+// std::toupper has the same int contract; passing a negative plain char directly would be undefined behavior
+static inline char char_toupper(char ch)
+{
+    return (char)std::toupper((int)(unsigned char)ch);
+}
+} // namespace details
+//! @endcond
+
+static inline std::string toLowerCase(const std::string& str)
+{
+    std::string lowered(str.size(), '\0');  // same length as the input, filled in by the transform below
+    std::transform(str.begin(), str.end(), lowered.begin(), details::char_tolower);
+    return lowered;
+}
+
+static inline std::string toUpperCase(const std::string& str)
+{
+    std::string raised(str.size(), '\0');  // same length as the input, filled in by the transform below
+    std::transform(str.begin(), str.end(), raised.begin(), details::char_toupper);
+    return raised;
+}
+
+#endif // OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
+
+//! @} core_basic
+} // cv
+
+#endif //OPENCV_CORE_CVSTD_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.inl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.inl.hpp
new file mode 100644
index 0000000..37ad1e6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd.inl.hpp
@@ -0,0 +1,197 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_CVSTDINL_HPP
+#define OPENCV_CORE_CVSTDINL_HPP
+
+#include <complex>
+#include <ostream>
+#include <sstream>
+
+//! @cond IGNORED
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 4127 )
+#endif
+
+namespace cv
+{
+
+template<typename _Tp> class DataType< std::complex<_Tp> >
+{
+public:
+    typedef std::complex<_Tp>  value_type;
+    typedef value_type         work_type;
+    typedef _Tp                channel_type;
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,
+           channels     = 2,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+           type         = CV_MAKETYPE(depth, channels) };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+// Calls fmtd->reset(), then writes every string returned by fmtd->next() until it returns null.
+static inline std::ostream& operator << (std::ostream& out, Ptr<Formatted> fmtd)
+{
+    fmtd->reset();
+    const char* chunk = fmtd->next();
+    while (chunk) { out << chunk; chunk = fmtd->next(); }
+    return out;
+}
+
+static inline
+std::ostream& operator << (std::ostream& out, const Mat& mtx)
+{
+    return out << Formatter::get()->format(mtx);
+}
+
+static inline
+std::ostream& operator << (std::ostream& out, const UMat& m)
+{
+    return out << m.getMat(ACCESS_READ);
+}
+
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const Complex<_Tp>& c)
+{
+    return out << "(" << c.re << "," << c.im << ")";
+}
+
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const std::vector<Point_<_Tp> >& vec)
+{
+    return out << Formatter::get()->format(Mat(vec));
+}
+
+
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const std::vector<Point3_<_Tp> >& vec)
+{
+    return out << Formatter::get()->format(Mat(vec));
+}
+
+
+template<typename _Tp, int m, int n> static inline
+std::ostream& operator << (std::ostream& out, const Matx<_Tp, m, n>& matx)
+{
+    return out << Formatter::get()->format(Mat(matx));
+}
+
+// Streams a 2D point as "[x, y]".
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const Point_<_Tp>& p)
+{
+    return out << "[" << p.x << ", " << p.y << "]";
+}
+
+// Streams a 3D point as "[x, y, z]".
+template<typename _Tp> static inline
+std::ostream& operator << (std::ostream& out, const Point3_<_Tp>& p)
+{
+    return out << "[" << p.x << ", " << p.y << ", " << p.z << "]";
+}
+
+template<typename _Tp, int n> static inline
+std::ostream& operator << (std::ostream& out, const Vec<_Tp, n>& vec)
+{
+    // Element types whose depth is at most CV_32S are cast to int before printing;
+    // all other element types are streamed as-is.
+    const bool printAsInt = cv::traits::Depth<_Tp>::value <= CV_32S;
+
+    out << "[";
+    for (int i = 0; i < n - 1; ++i)
+    {
+        if (printAsInt)
+            out << (int)vec[i] << ", ";
+        else
+            out << vec[i] << ", ";
+    }
+    if (printAsInt)
+        out << (int)vec[n-1] << "]";
+    else
+        out << vec[n-1] << "]";
+    return out;
+}
+
+template<typename _Tp> static inline std::ostream& operator << (std::ostream& out, const Size_<_Tp>& size)
+{
+    out << "[" << size.width << " x " << size.height << "]";  // rendered as "[width x height]"
+    return out;
+}
+
+template<typename _Tp> static inline std::ostream& operator << (std::ostream& out, const Rect_<_Tp>& rect)
+{
+    out << "[" << rect.width << " x " << rect.height << " from (" << rect.x << ", " << rect.y << ")]";  // size first, then origin
+    return out;
+}
+
+static inline std::ostream& operator << (std::ostream& out, const MatSize& msize)
+{
+    const int count = msize.dims();  // entries are joined by " x " with no trailing separator
+    for (int d = 0; d < count; ++d)
+    {
+        if (d > 0)
+            out << " x ";
+        out << msize[d];
+    }
+    return out;
+}
+
+static inline std::ostream &operator<< (std::ostream &s, const cv::Range &r)  // const&: const ranges and temporaries can be streamed too
+{
+    return s << "[" << r.start << " : " << r.end << ")";  // printed in half-open notation "[start : end)"
+}
+
+} // cv
+
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+//! @endcond
+
+#endif // OPENCV_CORE_CVSTDINL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd_wrapper.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd_wrapper.hpp
new file mode 100644
index 0000000..25e0041
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/cvstd_wrapper.hpp
@@ -0,0 +1,154 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_CVSTD_WRAPPER_HPP
+#define OPENCV_CORE_CVSTD_WRAPPER_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#include <string>
+#include <memory>  // std::shared_ptr
+#include <type_traits>  // std::enable_if
+
+namespace cv {
+
+using std::nullptr_t;
+
+//! @addtogroup core_basic
+//! @{
+
+#ifdef CV_DOXYGEN
+
+template <typename _Tp> using Ptr = std::shared_ptr<_Tp>;  // In ideal world it should look like this, but we need some compatibility workarounds below
+
+template<typename _Tp, typename ... A1> static inline
+Ptr<_Tp> makePtr(const A1&... a1) { return std::make_shared<_Tp>(a1...); }
+
+#else  // cv::Ptr with compatibility workarounds
+
+// It should be defined for C-API types only.
+// C++ types should use regular "delete" operator.
+template<typename Y> struct DefaultDeleter;
+#if 0
+{
+    void operator()(Y* p) const;
+};
+#endif
+
+namespace sfinae {
+template<typename C, typename Ret, typename... Args>
+struct has_parenthesis_operator  // compile-time probe: does C have an operator() callable with Args...?
+{
+private:
+    template<typename T>
+    static CV_CONSTEXPR std::true_type has_parenthesis_operator_check(typename std::is_same<typename std::decay<decltype(std::declval<T>().operator()(std::declval<Args>()...))>::type, Ret>::type*);
+    // NOTE(review): is_same<...>::type names a type whether or not the result equals Ret, so Ret does not seem to affect the outcome -- confirm.
+    template<typename> static CV_CONSTEXPR std::false_type has_parenthesis_operator_check(...);
+    // Calling with 0 prefers the pointer overload when its signature is well-formed for C, otherwise the variadic fallback.
+    typedef decltype(has_parenthesis_operator_check<C>(0)) type;
+
+public:
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
+    static CV_CONSTEXPR bool value = type::value;
+#else
+    // support MSVS 2013
+    static const int value = type::value;
+#endif
+};
+} // namespace sfinae
+
+template <typename T, typename = void>
+struct has_custom_delete  // primary template: used when the specialization below does not apply
+        : public std::false_type {};
+
+// Force has_custom_delete to std::false_type when NVCC is compiling CUDA source files
+#ifndef __CUDACC__
+template <typename T>
+struct has_custom_delete<T, typename std::enable_if< sfinae::has_parenthesis_operator<DefaultDeleter<T>, void, T*>::value >::type >
+        : public std::true_type {};  // selected when DefaultDeleter<T> has an operator() callable with a T*
+#endif
+
+template<typename T>
+struct Ptr : public std::shared_ptr<T>  // std::shared_ptr<T> plus OpenCV-specific construction, reset and cast helpers
+{
+#if 0
+    using std::shared_ptr<T>::shared_ptr;  // GCC 5.x can't handle this
+#else
+    inline Ptr() CV_NOEXCEPT : std::shared_ptr<T>() {}
+    inline Ptr(nullptr_t) CV_NOEXCEPT : std::shared_ptr<T>(nullptr) {}
+    template<typename Y, typename D> inline Ptr(Y* p, D d) : std::shared_ptr<T>(p, d) {}
+    template<typename D> inline Ptr(nullptr_t, D d) : std::shared_ptr<T>(nullptr, d) {}
+    // Forwards to the two-argument (owner, pointer) constructor of std::shared_ptr.
+    template<typename Y> inline Ptr(const Ptr<Y>& r, T* ptr) CV_NOEXCEPT : std::shared_ptr<T>(r, ptr) {}
+
+    inline Ptr(const Ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    inline Ptr(Ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+
+    template<typename Y> inline Ptr(const Ptr<Y>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    template<typename Y> inline Ptr(Ptr<Y>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+#endif
+    inline Ptr(const std::shared_ptr<T>& o) CV_NOEXCEPT : std::shared_ptr<T>(o) {}
+    inline Ptr(std::shared_ptr<T>&& o) CV_NOEXCEPT : std::shared_ptr<T>(std::move(o)) {}
+
+    // Overload with custom DefaultDeleter: Ptr<IplImage>(...)
+    template<typename Y>
+    inline Ptr(const std::true_type&, Y* ptr) : std::shared_ptr<T>(ptr, DefaultDeleter<Y>()) {}
+
+    // Overload without custom deleter: Ptr<std::string>(...);
+    template<typename Y>
+    inline Ptr(const std::false_type&, Y* ptr) : std::shared_ptr<T>(ptr) {}
+    // Raw-pointer entry point: dispatches on has_custom_delete<Y> to one of the two tagged constructors above.
+    template<typename Y = T>
+    inline Ptr(Y* ptr) : Ptr(has_custom_delete<Y>(), ptr) {}
+
+    // Overload with custom DefaultDeleter: Ptr<IplImage>(...)
+    template<typename Y>
+    inline void reset(const std::true_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr, DefaultDeleter<Y>()); }
+
+    // Overload without custom deleter: Ptr<std::string>(...);
+    template<typename Y>
+    inline void reset(const std::false_type&, Y* ptr) { std::shared_ptr<T>::reset(ptr); }
+    // Raw-pointer reset: dispatches on has_custom_delete<Y> to one of the two tagged reset overloads above.
+    template<typename Y>
+    inline void reset(Y* ptr) { Ptr<T>::reset(has_custom_delete<Y>(), ptr); }
+
+    template<class Y, class Deleter>
+    void reset(Y* ptr, Deleter d) { std::shared_ptr<T>::reset(ptr, d); }
+
+    void reset() CV_NOEXCEPT { std::shared_ptr<T>::reset(); }
+    // Only copy-style assignment is declared; no move-assignment overload appears in this struct.
+    Ptr& operator=(const Ptr& o) { std::shared_ptr<T>::operator =(o); return *this; }
+    template<typename Y> inline Ptr& operator=(const Ptr<Y>& o) { std::shared_ptr<T>::operator =(o); return *this; }
+    // Both accessors go straight through get(); neither checks for a null pointer.
+    T* operator->() const CV_NOEXCEPT { return std::shared_ptr<T>::get();}
+    typename std::add_lvalue_reference<T>::type operator*() const CV_NOEXCEPT { return *std::shared_ptr<T>::get(); }
+
+    // OpenCV 3.x methods (not a part of standard C++ library)
+    inline void release() { std::shared_ptr<T>::reset(); }
+    inline operator T* () const { return std::shared_ptr<T>::get(); }  // implicit conversion to the raw pointer
+    inline bool empty() const { return std::shared_ptr<T>::get() == nullptr; }
+    // Cast helpers forwarding to std::static_pointer_cast / std::const_pointer_cast / std::dynamic_pointer_cast.
+    template<typename Y> inline
+    Ptr<Y> staticCast() const CV_NOEXCEPT { return std::static_pointer_cast<Y>(*this); }
+
+    template<typename Y> inline
+    Ptr<Y> constCast() const CV_NOEXCEPT { return std::const_pointer_cast<Y>(*this); }
+
+    template<typename Y> inline
+    Ptr<Y> dynamicCast() const CV_NOEXCEPT { return std::dynamic_pointer_cast<Y>(*this); }
+};
+
+// Wraps std::make_shared<_Tp>(a1...) in a Ptr; rejected at compile time when has_custom_delete<_Tp> is true.
+template<typename _Tp, typename ... A1> static inline Ptr<_Tp> makePtr(const A1&... a1)
+{
+    static_assert( !has_custom_delete<_Tp>::value, "Can't use this makePtr with custom DefaultDeleter");
+    return Ptr<_Tp>(std::make_shared<_Tp>(a1...));
+}
+
+#endif // CV_DOXYGEN
+
+//! @} core_basic
+} // cv
+
+#endif //OPENCV_CORE_CVSTD_WRAPPER_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/async_promise.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/async_promise.hpp
new file mode 100644
index 0000000..6eb3fb5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/async_promise.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_PROMISE_HPP
+#define OPENCV_CORE_ASYNC_PROMISE_HPP
+
+#include "../async.hpp"
+
+#include "exception_ptr.hpp"
+
+namespace cv {
+
+/** @addtogroup core_async
+@{
+*/
+
+
+/** @brief Provides result of asynchronous operations
+
+*/
+class CV_EXPORTS AsyncPromise
+{
+public:
+    ~AsyncPromise() CV_NOEXCEPT;
+    AsyncPromise() CV_NOEXCEPT;
+    explicit AsyncPromise(const AsyncPromise& o) CV_NOEXCEPT;
+    AsyncPromise& operator=(const AsyncPromise& o) CV_NOEXCEPT;
+    void release() CV_NOEXCEPT;
+
+    /** Returns associated AsyncArray
+    @note Can be called once
+    */
+    AsyncArray getArrayResult();
+
+    /** Stores asynchronous result.
+    @param[in] value result
+    */
+    void setValue(InputArray value);
+
+    // TODO "move" setters
+
+#if CV__EXCEPTION_PTR
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(std::exception_ptr exception);
+#endif
+
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(const cv::Exception& exception);
+    // The inline move operations below only touch the implementation pointer p.
+#ifdef CV_CXX11
+    explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }  // takes o's pointer and leaves o with NULL
+    AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }  // exchanges pointers with o
+#endif
+
+
+    // PImpl: Impl is an alias for AsyncArray::Impl, held through the pointer p
+    typedef struct AsyncArray::Impl Impl; friend struct AsyncArray::Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }  // exposes p as an untyped pointer
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_PROMISE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/dispatch_helper.impl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/dispatch_helper.impl.hpp
new file mode 100644
index 0000000..d6ec676
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/dispatch_helper.impl.hpp
@@ -0,0 +1,49 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
+#define OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
+
+//! @cond IGNORED
+
+namespace cv {
+namespace detail {
+
+template<template<typename> class Functor, typename... Args>
+static inline void depthDispatch(const int depth, Args&&... args)
+{
+    switch (depth)  // one case per supported depth: instantiate Functor for that element type and call it
+    {
+        case CV_8U:
+            Functor<uint8_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_8S:
+            Functor<int8_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16U:
+            Functor<uint16_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16S:
+            Functor<int16_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_32S:
+            Functor<int32_t>{}(std::forward<Args>(args)...);
+            return;
+        case CV_32F:
+            Functor<float>{}(std::forward<Args>(args)...);
+            return;
+        case CV_64F:
+            Functor<double>{}(std::forward<Args>(args)...);
+            return;
+        case CV_16F:  // shares the error path with every unlisted depth
+        default:
+            CV_Error(cv::Error::BadDepth, "Unsupported matrix type.");
+    }
+}
+
+}}
+
+//! @endcond
+
+#endif //OPENCV_CORE_DETAIL_DISPATCH_HELPER_IMPL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/exception_ptr.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/exception_ptr.hpp
new file mode 100644
index 0000000..d98ffc4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/detail/exception_ptr.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+#define OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+
+#ifndef CV__EXCEPTION_PTR
+#  if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
+#    define CV__EXCEPTION_PTR 0  // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
+#  elif defined(CV_CXX11)
+#    define CV__EXCEPTION_PTR 1
+#  elif defined(_MSC_VER)
+#    define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
+#  elif defined(__clang__)
+#    define CV__EXCEPTION_PTR 0  // C++11 only (see above)
+#  elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#    define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
+#  endif
+#endif
+#ifndef CV__EXCEPTION_PTR
+#  define CV__EXCEPTION_PTR 0
+#elif CV__EXCEPTION_PTR
+#  include <exception>  // std::exception_ptr
+#endif
+
+#endif // OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/directx.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/directx.hpp
new file mode 100644
index 0000000..056a85a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/directx.hpp
@@ -0,0 +1,184 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_DIRECTX_HPP
+#define OPENCV_CORE_DIRECTX_HPP
+
+#include "mat.hpp"
+#include "ocl.hpp"
+
+#if !defined(__d3d11_h__)
+struct ID3D11Device;
+struct ID3D11Texture2D;
+#endif
+
+#if !defined(__d3d10_h__)
+struct ID3D10Device;
+struct ID3D10Texture2D;
+#endif
+
+#if !defined(_D3D9_H_)
+struct IDirect3DDevice9;
+struct IDirect3DDevice9Ex;
+struct IDirect3DSurface9;
+#endif
+
+
+namespace cv { namespace directx {
+
+namespace ocl {
+using namespace cv::ocl;
+
+//! @addtogroup core_directx
+// This section describes OpenCL and DirectX interoperability.
+//
+// To enable DirectX support, configure OpenCV using CMake with WITH_DIRECTX=ON . Note, DirectX is
+// supported only on Windows.
+//
+// To use OpenCL functionality you should first initialize OpenCL context from DirectX resource.
+//
+//! @{
+
+// TODO static functions in the Context class
+//! @brief Creates OpenCL context from D3D11 device
+//
+//! @param pD3D11Device - pointer to D3D11 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromD3D11Device(ID3D11Device* pD3D11Device);
+
+//! @brief Creates OpenCL context from D3D10 device
+//
+//! @param pD3D10Device - pointer to D3D10 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromD3D10Device(ID3D10Device* pD3D10Device);
+
+//! @brief Creates OpenCL context from Direct3DDevice9Ex device
+//
+//! @param pDirect3DDevice9Ex - pointer to Direct3DDevice9Ex device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9Ex(IDirect3DDevice9Ex* pDirect3DDevice9Ex);
+
+//! @brief Creates OpenCL context from Direct3DDevice9 device
+//
+//! @param pDirect3DDevice9 - pointer to Direct3DDevice9 device
+//! @return Returns reference to OpenCL Context
+CV_EXPORTS Context& initializeContextFromDirect3DDevice9(IDirect3DDevice9* pDirect3DDevice9);
+
+//! @}
+
+} // namespace cv::directx::ocl
+
+//! @addtogroup core_directx
+//! @{
+
+//! @brief Converts InputArray to ID3D11Texture2D. If the destination texture format is DXGI_FORMAT_NV12, the
+//!        input UMat is expected to be in BGR format and the data will be downsampled and color-converted to NV12.
+//
+//! @note Note: Destination texture must be allocated by application. Function does memory copy from src to
+//!             pD3D11Texture2D
+//
+//! @param src - source InputArray
+//! @param pD3D11Texture2D - destination D3D11 texture
+CV_EXPORTS void convertToD3D11Texture2D(InputArray src, ID3D11Texture2D* pD3D11Texture2D);
+
+//! @brief Converts ID3D11Texture2D to OutputArray. If input texture format is DXGI_FORMAT_NV12 then
+//!        data will be upsampled and color-converted to BGR format.
+//
+//! @note Note: Destination matrix will be re-allocated if it does not have enough memory to match the texture size.
+//!             The function does a memory copy from pD3D11Texture2D to dst.
+//
+//! @param pD3D11Texture2D - source D3D11 texture
+//! @param dst             - destination OutputArray
+CV_EXPORTS void convertFromD3D11Texture2D(ID3D11Texture2D* pD3D11Texture2D, OutputArray dst);
+
+//! @brief Converts InputArray to ID3D10Texture2D
+//
+//! @note Note: function does memory copy from src to
+//!             pD3D10Texture2D
+//
+//! @param src             - source InputArray
+//! @param pD3D10Texture2D - destination D3D10 texture
+CV_EXPORTS void convertToD3D10Texture2D(InputArray src, ID3D10Texture2D* pD3D10Texture2D);
+
+//! @brief Converts ID3D10Texture2D to OutputArray
+//
+//! @note Note: function does memory copy from pD3D10Texture2D
+//!             to dst
+//
+//! @param pD3D10Texture2D - source D3D10 texture
+//! @param dst             - destination OutputArray
+CV_EXPORTS void convertFromD3D10Texture2D(ID3D10Texture2D* pD3D10Texture2D, OutputArray dst);
+
+//! @brief Converts InputArray to IDirect3DSurface9
+//
+//! @note Note: function does memory copy from src to
+//!             pDirect3DSurface9
+//
+//! @param src                 - source InputArray
+//! @param pDirect3DSurface9   - destination Direct3D 9 surface
+//! @param surfaceSharedHandle - shared handle
+CV_EXPORTS void convertToDirect3DSurface9(InputArray src, IDirect3DSurface9* pDirect3DSurface9, void* surfaceSharedHandle = NULL);
+
+//! @brief Converts IDirect3DSurface9 to OutputArray
+//
+//! @note Note: function does memory copy from pDirect3DSurface9
+//!             to dst
+//
+//! @param pDirect3DSurface9   - source Direct3D 9 surface
+//! @param dst                 - destination OutputArray
+//! @param surfaceSharedHandle - shared handle
+CV_EXPORTS void convertFromDirect3DSurface9(IDirect3DSurface9* pDirect3DSurface9, OutputArray dst, void* surfaceSharedHandle = NULL);
+
+//! @brief Get OpenCV type from DirectX type
+//! @param iDXGI_FORMAT - enum DXGI_FORMAT for D3D10/D3D11
+//! @return OpenCV type or -1 if there is no equivalent
+CV_EXPORTS int getTypeFromDXGI_FORMAT(const int iDXGI_FORMAT); // enum DXGI_FORMAT for D3D10/D3D11
+
+//! @brief Get OpenCV type from DirectX type
+//! @param iD3DFORMAT - enum D3DFORMAT for D3D9
+//! @return OpenCV type or -1 if there is no equivalent
+CV_EXPORTS int getTypeFromD3DFORMAT(const int iD3DFORMAT); // enum D3DTYPE for D3D9
+
+//! @}
+
+} } // namespace cv::directx
+
+#endif // OPENCV_CORE_DIRECTX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.hpp
new file mode 100644
index 0000000..1f644e9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.hpp
@@ -0,0 +1,979 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <kongliangqian@huawei.com>
+//         Longbu Wang <wanglongbu@huawei.com>
+#ifndef OPENCV_CORE_DUALQUATERNION_HPP
+#define OPENCV_CORE_DUALQUATERNION_HPP
+
+#include <opencv2/core/quaternion.hpp>
+#include <opencv2/core/affine.hpp>
+
+namespace cv{
+//! @addtogroup core
+//! @{
+
+template <typename _Tp> class DualQuat;
+template <typename _Tp> std::ostream& operator<<(std::ostream&, const DualQuat<_Tp>&);
+
+/**
+ * Dual quaternions were introduced to describe rotation together with translation while ordinary
+ * quaternions can only describe rotation. It can be used for shortest path pose interpolation,
+ * local pose optimization or volumetric deformation. More details can be found
+ * - https://en.wikipedia.org/wiki/Dual_quaternion
+ * - ["A beginners guide to dual-quaternions: what they are, how they work, and how to use them for 3D character hierarchies", Ben Kenwright, 2012](https://borodust.org/public/shared/beginner_dual_quats.pdf)
+ * - ["Dual Quaternions", Yan-Bin Jia, 2013](http://web.cs.iastate.edu/~cs577/handouts/dual-quaternion.pdf)
+ * - ["Geometric Skinning with Approximate Dual Quaternion Blending", Kavan, 2008](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric)
+ * - http://rodolphe-vaillant.fr/?e=29
+ *
+ * A unit dual quaternion can be classically represented as:
+ * \f[
+ * \begin{equation}
+ * \begin{split}
+ * \sigma &= \left(r+\frac{\epsilon}{2}tr\right)\\
+ * &= [w, x, y, z, w\_, x\_, y\_, z\_]
+ * \end{split}
+ * \end{equation}
+ * \f]
+ * where \f$r, t\f$ represents the rotation (ordinary unit quaternion) and translation (pure ordinary quaternion) respectively.
+ *
+ * A general dual quaternion, which consists of two quaternions, is usually represented in the form:
+ * \f[
+ * \sigma = p + \epsilon q
+ * \f]
+ * where the introduced dual unit \f$\epsilon\f$ satisfies \f$\epsilon^2 = \epsilon^3 =...=0\f$, and \f$p, q\f$ are quaternions.
+ *
+ * Alternatively, dual quaternions can also be interpreted as four components which are all [dual numbers](https://www.cs.utah.edu/~ladislav/kavan08geometric/kavan08geometric):
+ * \f[
+ * \sigma = \hat{q}_w + \hat{q}_xi + \hat{q}_yj + \hat{q}_zk
+ * \f]
+ * If we set \f$\hat{q}_x, \hat{q}_y\f$ and \f$\hat{q}_z\f$ equal to 0, a dual quaternion is transformed to a dual number. see normalize().
+ *
+ * If you want to create a dual quaternion, you can use:
+ *
+ * ```
+ * using namespace cv;
+ * double angle = CV_PI;
+ *
+ * // create from eight numbers
+ * DualQuatd dq1(1, 2, 3, 4, 5, 6, 7, 8); //p = [1,2,3,4]. q=[5,6,7,8]
+ *
+ * // create from Vec
+ * Vec<double, 8> v{1,2,3,4,5,6,7,8};
+ * DualQuatd dq_v{v};
+ *
+ * // create from two quaternion
+ * Quatd p(1, 2, 3, 4);
+ * Quatd q(5, 6, 7, 8);
+ * DualQuatd dq2 = DualQuatd::createFromQuat(p, q);
+ *
+ * // create from an angle, an axis and a translation
+ * Vec3d axis{0, 0, 1};
+ * Vec3d trans{3, 4, 5};
+ * DualQuatd dq3 = DualQuatd::createFromAngleAxisTrans(angle, axis, trans);
+ *
+ * // If you already have an instance of class Affine3, then you can use
+ * Affine3d R = dq3.toAffine3();
+ * DualQuatd dq4 = DualQuatd::createFromAffine3(R);
+ *
+ * // or create directly by affine transformation matrix Rt
+ * // see createFromMat() in detail for the form of Rt
+ * Matx44d Rt = dq3.toMat();
+ * DualQuatd dq5 = DualQuatd::createFromMat(Rt);
+ *
+ * // Any rotation + translation movement can
+ * // be expressed as a rotation + translation around the same line in space (expressed by Plucker
+ * // coords), and here's a way to represent it this way.
+ * axis = Vec3d(1, 1, 1); // axis will be normalized in createFromPitch
+ * trans = Vec3d(3, 4, 5);
+ * axis = axis / std::sqrt(axis.dot(axis));// The formula for computing moment that I use below requires a normalized axis
+ * Vec3d moment = 1.0 / 2 * (trans.cross(axis) + axis.cross(trans.cross(axis)) *
+ *                            std::cos(angle / 2) / std::sin(angle / 2));
+ * double d = trans.dot(axis);
+ * DualQuatd dq6 = DualQuatd::createFromPitch(angle, d, axis, moment);
+ * ```
+ *
+ * A point \f$v=(x, y, z)\f$ in form of dual quaternion is \f$[1+\epsilon v]=[1,0,0,0,0,x,y,z]\f$.
+ * The transformation of a point \f$v_1\f$ to another point \f$v_2\f$ under the dual quaternion \f$\sigma\f$ is
+ * \f[
+ * 1 + \epsilon v_2 = \sigma * (1 + \epsilon v_1) * \sigma^{\star}
+ * \f]
+ * where \f$\sigma^{\star}=p^*-\epsilon q^*.\f$
+ *
+ * A line in the \f$Pl\ddot{u}cker\f$ coordinates \f$(\hat{l}, m)\f$ defined by the dual quaternion \f$l=\hat{l}+\epsilon m\f$.
+ * To transform a line, \f[l_2 = \sigma * l_1 * \sigma^*,\f] where \f$\sigma=r+\frac{\epsilon}{2}tr\f$ and
+ * \f$\sigma^*=p^*+\epsilon q^*\f$.
+ *
+ * To extract the Vec<double, 8> or Vec<float, 8>, see toVec();
+ *
+ * To extract the affine transformation matrix, see toMat();
+ *
+ * To extract the instance of Affine3, see toAffine3();
+ *
+ * If two dual quaternions \f$q_0, q_1\f$ need to be interpolated, you can use sclerp()
+ * ```
+ * DualQuatd::sclerp(q0, q1, t)
+ * ```
+ * or dqblend().
+ * ```
+ * DualQuatd::dqblend(q0, q1, t)
+ * ```
+ * With more than two dual quaternions to be blended, you can use generalized linear dual quaternion blending
+ * with the corresponding weights, i.e. gdqblend().
+ *
+ */
+template <typename _Tp>
+class CV_EXPORTS DualQuat{
+    static_assert(std::is_floating_point<_Tp>::value, "Dual quaternion only make sense with type of float or double");
+    using value_type = _Tp;
+
+public:
+    static constexpr _Tp CV_DUAL_QUAT_EPS = (_Tp)1.e-6;
+
+    DualQuat();
+
+    /**
+     * @brief create from eight same type numbers.
+     */
+    DualQuat(const _Tp w, const _Tp x, const _Tp y, const _Tp z, const _Tp w_, const _Tp x_, const _Tp y_, const _Tp z_);
+
+    /**
+     * @brief create from a double or float vector.
+     */
+    DualQuat(const Vec<_Tp, 8> &q);
+
+    _Tp w, x, y, z, w_, x_, y_, z_;
+
+    /**
+     * @brief create Dual Quaternion from two same type quaternions p and q.
+     * A Dual Quaternion \f$\sigma\f$ has the form:
+     * \f[\sigma = p + \epsilon q\f]
+     * where p and q are defined as follows:
+     * \f[\begin{equation}
+     *    \begin{split}
+     *    p &= w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\\
+     *    q &= w\_ + x\_\boldsymbol{i} + y\_\boldsymbol{j} + z\_\boldsymbol{k}.
+     *    \end{split}
+     *   \end{equation}
+     * \f]
+     * The p and q are the real part and dual part respectively.
+     * @param realPart a quaternion, real part of dual quaternion.
+     * @param dualPart a quaternion, dual part of dual quaternion.
+     * @sa Quat
+    */
+    static DualQuat<_Tp> createFromQuat(const Quat<_Tp> &realPart, const Quat<_Tp> &dualPart);
+
+    /**
+     * @brief create a dual quaternion from a rotation angle \f$\theta\f$, a rotation axis
+     * \f$\boldsymbol{u}\f$ and a translation \f$\boldsymbol{t}\f$.
+     * It generates a dual quaternion \f$\sigma\f$ in the form of
+     * \f[\begin{equation}
+     *    \begin{split}
+     *    \sigma &= r + \frac{\epsilon}{2}\boldsymbol{t}r \\
+     *           &= [\cos(\frac{\theta}{2}), \boldsymbol{u}\sin(\frac{\theta}{2})]
+     *           + \frac{\epsilon}{2}[0, \boldsymbol{t}][[\cos(\frac{\theta}{2}),
+     *           \boldsymbol{u}\sin(\frac{\theta}{2})]]\\
+     *           &= \cos(\frac{\theta}{2}) + \boldsymbol{u}\sin(\frac{\theta}{2})
+     *           + \frac{\epsilon}{2}(-(\boldsymbol{t} \cdot \boldsymbol{u})\sin(\frac{\theta}{2})
+     *           + \boldsymbol{t}\cos(\frac{\theta}{2}) + \boldsymbol{u} \times \boldsymbol{t} \sin(\frac{\theta}{2})).
+     *    \end{split}
+     *    \end{equation}\f]
+     * @param angle rotation angle.
+     * @param axis rotation axis.
+     * @param translation a vector of length 3.
+     * @note The axis will be normalized in this function, and the translation is applied
+     * after the rotation. Use @ref createFromQuat(r, r * t / 2) to create a dual quaternion
+     * whose translation is applied before the rotation.
+     * @sa Quat
+     */
+    static DualQuat<_Tp> createFromAngleAxisTrans(const _Tp angle, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &translation);
+
+    /**
+     * @brief Create a dual quaternion from an affine transformation matrix \f$M\f$.
+     * Dual quaternion consists of a rotation \f$r=[a,b,c,d]\f$ and a translation \f$t=[\Delta x,\Delta y,\Delta z]\f$. The
+     * affine transformation matrix \f$M\f$ has the form
+     * \f[
+     * \begin{bmatrix}
+     * 1-2(e_2^2 +e_3^2) &2(e_1e_2-e_0e_3) &2(e_0e_2+e_1e_3) &\Delta x\\
+     * 2(e_0e_3+e_1e_2)  &1-2(e_1^2+e_3^2) &2(e_2e_3-e_0e_1) &\Delta y\\
+     * 2(e_1e_3-e_0e_2)  &2(e_0e_1+e_2e_3) &1-2(e_1^2+e_2^2) &\Delta z\\
+     * 0&0&0&1
+     * \end{bmatrix}
+     * \f]
+     *  if A is a matrix consisting of  n points to be transformed, this could be achieved by
+     * \f[
+     *  new\_A = M * A
+     * \f]
+     * where A has the form
+     * \f[
+     * \begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n\\
+     * 1&1&1&...&1
+     * \end{bmatrix}
+     * \f]
+     * where the same subscript represents the same point. The size of A should be \f$[4,n]\f$,
+     * and matrix new_A has the same size.
+     * @param _R 4x4 matrix that represents rotations and translation.
+     * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create
+     * a dual quaternion whose translation is applied before the rotation.
+     */
+    static DualQuat<_Tp> createFromMat(InputArray _R);
+
+    /**
+     * @brief create dual quaternion from an affine matrix. The definition of affine matrix can refer to  createFromMat()
+     */
+    static DualQuat<_Tp> createFromAffine3(const Affine3<_Tp> &R);
+
+    /**
+     * @brief A dual quaternion is a vector in form of
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma &=\boldsymbol{p} + \epsilon \boldsymbol{q}\\
+     * &= \cos\hat{\frac{\theta}{2}}+\overline{\hat{l}}\sin\frac{\hat{\theta}}{2}
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\hat{\theta}\f$ is dual angle and \f$\overline{\hat{l}}\f$ is dual axis:
+     * \f[
+     * \hat{\theta}=\theta + \epsilon d,\\
+     * \overline{\hat{l}}= \hat{l} +\epsilon m.
+     * \f]
+     * In this representation, \f$\theta\f$ is rotation angle and \f$(\hat{l},m)\f$ is the screw axis, d is the translation distance along the axis.
+     *
+     * @param angle rotation angle.
+     * @param d translation along the rotation axis.
+     * @param axis rotation axis represented by quaternion with w = 0.
+     * @param moment the moment of line, and it should be orthogonal to axis.
+     * @note Translation is applied after the rotation. Use createFromQuat(r, r * t / 2) to create
+     * a dual quaternion whose translation is applied before the rotation.
+     */
+    static DualQuat<_Tp> createFromPitch(const _Tp angle, const _Tp d, const Vec<_Tp, 3> &axis, const Vec<_Tp, 3> &moment);
+
+    /**
+     * @brief return a quaternion which represent the real part of dual quaternion.
+     * The definition of real part is in createFromQuat().
+     * @sa createFromQuat, getDualPart
+     */
+    Quat<_Tp> getRealPart() const;
+
+    /**
+     * @brief return a quaternion which represent the dual part of dual quaternion.
+     * The definition of dual part is in createFromQuat().
+     * @sa createFromQuat, getRealPart
+     */
+    Quat<_Tp> getDualPart() const;
+
+    /**
+     * @brief return the conjugate of a dual quaternion.
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma^* &= (p + \epsilon q)^* \\
+     *          &= (p^* + \epsilon q^*)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * @param dq a dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> conjugate(const DualQuat<T> &dq);
+
+    /**
+     * @brief return the conjugate of a dual quaternion.
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma^* &= (p + \epsilon q)^* \\
+     *          &= (p^* + \epsilon q^*)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     */
+    DualQuat<_Tp> conjugate() const;
+
+    /**
+     * @brief return the rotation in quaternion form.
+     */
+    Quat<_Tp> getRotation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the translation vector.
+     * The rotation \f$r\f$ in this dual quaternion \f$\sigma\f$ is applied before translation \f$t\f$.
+     * The dual quaternion \f$\sigma\f$ is defined as
+     * \f[\begin{equation}
+     * \begin{split}
+     * \sigma &= p + \epsilon q \\
+     *        &= r + \frac{\epsilon}{2}{t}r.
+     * \end{split}
+     * \end{equation}\f]
+     * Thus, the translation can be obtained as follows
+     * \f[t = 2qp^*.\f]
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     * @note This dual quaternion's translation is applied after the rotation.
+     */
+    Vec<_Tp, 3> getTranslation(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the norm \f$||\sigma||\f$ of dual quaternion \f$\sigma = p + \epsilon q\f$.
+     * \f[
+     *  \begin{equation}
+     *  \begin{split}
+     *  ||\sigma|| &= \sqrt{\sigma * \sigma^*} \\
+     *        &= ||p|| + \epsilon \frac{p \cdot q}{||p||}.
+     *  \end{split}
+     *  \end{equation}
+     *  \f]
+     * Generally speaking, the norm of a non-unit dual
+     * quaternion is a dual number. For convenience, we return it in the form of a dual quaternion,
+     * i.e.
+     * \f[ ||\sigma|| = [||p||, 0, 0, 0, \frac{p \cdot q}{||p||}, 0, 0, 0].\f]
+     *
+     * @note The data type of dual number is dual quaternion.
+     */
+    DualQuat<_Tp> norm() const;
+
+    /**
+     * @brief return a normalized dual quaternion.
+     * A dual quaternion can be expressed as
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \sigma &= p + \epsilon q\\
+     * &=||\sigma||\left(r+\frac{\epsilon}{2}tr\right)
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$r, t\f$ represents the rotation (ordinary quaternion) and translation (pure ordinary quaternion) respectively,
+     * and \f$||\sigma||\f$ is the norm of dual quaternion(a dual number).
+     * A dual quaternion is unit if and only if
+     * \f[
+     * ||p||=1, p \cdot q=0
+     * \f]
+     * where \f$\cdot\f$ means dot product.
+     * The process of normalization is
+     * \f[
+     * \sigma_{u}=\frac{\sigma}{||\sigma||}
+     * \f]
+     * Next, we simply prove that \f$\sigma_u\f$ is a unit dual quaternion:
+     * \f[
+     * \renewcommand{\Im}{\operatorname{Im}}
+     * \begin{equation}
+     * \begin{split}
+     * \sigma_{u}=\frac{\sigma}{||\sigma||}&=\frac{p + \epsilon q}{||p||+\epsilon\frac{p\cdot q}{||p||}}\\
+     * &=\frac{p}{||p||}+\epsilon\left(\frac{q}{||p||}-p\frac{p\cdot q}{||p||^3}\right)\\
+     * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\left(qp^{*}-p\cdot q\right)\frac{p}{||p||}\\
+     * &=\frac{p}{||p||}+\epsilon\frac{1}{||p||^2}\Im(qp^*)\frac{p}{||p||}.\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * As expected, the real part is a rotation and dual part is a pure quaternion.
+     */
+    DualQuat<_Tp> normalize() const;
+
+    /**
+     * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion, p is not zero,
+     * the inverse dual quaternion is
+     * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f]
+     * or equivalently,
+     * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f]
+     * @param dq a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief if \f$\sigma = p + \epsilon q\f$ is a dual quaternion, p is not zero,
+     * the inverse dual quaternion is
+     * \f[\sigma^{-1} = \frac{\sigma^*}{||\sigma||^2}, \f]
+     * or equivalently,
+     * \f[\sigma^{-1} = p^{-1} - \epsilon p^{-1}qp^{-1}.\f]
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> inv(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the dot product of two dual quaternion.
+     * @param p other dual quaternion.
+     */
+    _Tp dot(DualQuat<_Tp> p) const;
+
+    /**
+     ** @brief return the value of \f$p^t\f$ where p is a dual quaternion.
+     * This could be calculated as:
+     * \f[
+     * p^t = \exp(t\ln p)
+     * \f]
+     * @param dq a dual quaternion.
+     * @param t index of power function.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit);
+
+    /**
+     ** @brief return the value of \f$p^t\f$ where p is a dual quaternion.
+     * This could be calculated as:
+     * \f[
+     * p^t = \exp(t\ln p)
+     * \f]
+     *
+     * @param t index of power function.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> power(const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of \f$p^q\f$ where p and q are dual quaternions.
+     * This could be calculated as:
+     * \f[
+     * p^q = \exp(q\ln p)
+     * \f]
+     * @param p a dual quaternion.
+     * @param q a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion p assume to be a dual unit quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> power(const DualQuat<T>& p, const DualQuat<T>& q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of \f$p^q\f$ where p and q are dual quaternions.
+     * This could be calculated as:
+     * \f[
+     * p^q = \exp(q\ln p)
+     * \f]
+     *
+     * @param q a dual quaternion
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a dual unit quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> power(const DualQuat<_Tp>& q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of the exponential function
+     * @param dq a dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> exp(const DualQuat<T> &dq);
+
+    /**
+     * @brief return the value of the exponential function
+     */
+    DualQuat<_Tp> exp() const;
+
+    /**
+     * @brief return the value of the logarithm function
+     *
+     * @param dq a dual quaternion.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, dual quaternion dq assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    template <typename T>
+    friend DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of the logarithm function
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     */
+    DualQuat<_Tp> log(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief Transform this dual quaternion to a vector.
+     */
+    Vec<_Tp, 8> toVec() const;
+
+    /**
+     * @brief Transform this dual quaternion to an affine transformation matrix.
+     * For the form of the matrix, see createFromMat().
+     */
+    Matx<_Tp, 4, 4> toMat(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+      * @brief Transform this dual quaternion to an instance of Affine3.
+      */
+    Affine3<_Tp> toAffine3(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief The screw linear interpolation (ScLERP) is an extension of spherical linear interpolation to dual quaternions.
+     * If \f$\sigma_1\f$ and \f$\sigma_2\f$ are two dual quaternions representing the initial and final pose,
+     * the ScLERP interpolation can be defined as:
+     * \f[
+     * ScLERP(t;\sigma_1,\sigma_2) = \sigma_1 * (\sigma_1^{-1} * \sigma_2)^t, t\in[0,1]
+     * \f]
+     *
+     * @param q1 a dual quaternion representing the initial pose.
+     * @param q2 a dual quaternion representing the final pose.
+     * @param t interpolation parameter
+     * @param directChange if true, it always returns the shortest path.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     *
+     * For example
+     * ```
+     * double angle1 = CV_PI / 2;
+     * Vec3d axis{0, 0, 1};
+     * Vec3d t(0, 0, 3);
+     * DualQuatd initial = DualQuatd::createFromAngleAxisTrans(angle1, axis, t);
+     * double angle2 = CV_PI;
+     * DualQuatd final = DualQuatd::createFromAngleAxisTrans(angle2, axis, t);
+     * DualQuatd inter = DualQuatd::sclerp(initial, final, 0.5);
+     * ```
+     */
+    static DualQuat<_Tp> sclerp(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t,
+                                bool directChange=true, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+    /**
+     * @brief The method of Dual Quaternion linear Blending(DQB) is to compute a transformation between dual quaternion
+     * \f$q_1\f$ and \f$q_2\f$ and can be defined as:
+     * \f[
+     * DQB(t;{\boldsymbol{q}}_1,{\boldsymbol{q}}_2)=
+     * \frac{(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2}{||(1-t){\boldsymbol{q}}_1+t{\boldsymbol{q}}_2||}.
+     * \f]
+     * where \f$q_1\f$ and \f$q_2\f$ are unit dual quaternions representing the input transformations.
+     * If you want to use DQB that works for more than two rigid transformations, see @ref gdqblend
+     *
+     * @param q1 a unit dual quaternion representing the input transformations.
+     * @param q2 a unit dual quaternion representing the input transformations.
+     * @param t parameter \f$t\in[0,1]\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, this dual quaternion assume to be a unit dual quaternion
+     * and this function will save some computations.
+     *
+     * @sa gdqblend
+     */
+    static DualQuat<_Tp> dqblend(const DualQuat<_Tp> &q1, const DualQuat<_Tp> &q2, const _Tp t,
+                                   QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations.
+     * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights
+     * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply
+     * \f[
+     * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n}
+     * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}.
+     * \f]
+     * @param dualquat vector of dual quaternions
+     * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should
+     * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions assume to be unit quaternions
+     * and this function will save some computations.
+     * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat.
+     */
+    template <int cn>
+    static DualQuat<_Tp> gdqblend(const Vec<DualQuat<_Tp>, cn> &dualquat, InputArray weights,
+                                QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief The generalized Dual Quaternion linear Blending works for more than two rigid transformations.
+     * If these transformations are expressed as unit dual quaternions \f$q_1,...,q_n\f$ with convex weights
+     * \f$w = (w_1,...,w_n)\f$, the generalized DQB is simply
+     * \f[
+     * gDQB(\boldsymbol{w};{\boldsymbol{q}}_1,...,{\boldsymbol{q}}_n)=\frac{w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n}
+     * {||w_1{\boldsymbol{q}}_1+...+w_n{\boldsymbol{q}}_n||}.
+     * \f]
+     * @param dualquat The dual quaternions which have 8 channels and 1 row or 1 col.
+     * @param weights vector of weights, the size of weights should be the same as dualquat, and the weights should
+     * satisfy \f$\sum_0^n w_{i} = 1\f$ and \f$w_i>0\f$.
+     * @param assumeUnit if @ref QUAT_ASSUME_UNIT, these dual quaternions assume to be unit quaternions
+     * and this function will save some computations.
+     * @note the type of weights' element should be the same as the data type of dual quaternion inside the dualquat.
+     */
+    static DualQuat<_Tp> gdqblend(InputArray dualquat, InputArray weights,
+                                QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief Return opposite dual quaternion \f$-p\f$
+     * which satisfies \f$p + (-p) = 0.\f$
+     *
+     * For example
+     * ```
+     * DualQuatd q{1, 2, 3, 4, 5, 6, 7, 8};
+     * std::cout << -q << std::endl; // [-1, -2, -3, -4, -5, -6, -7, -8]
+     * ```
+     */
+    DualQuat<_Tp> operator-() const;
+
+    /**
+     * @brief return true if two dual quaternions p and q are nearly equal, i.e. when the absolute
+     * value of each difference \f$p_i - q_i\f$ is less than CV_DUAL_QUAT_EPS.
+     */
+    bool operator==(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction operator of two dual quaternions p and q.
+     * It returns a new dual quaternion that each value is the sum of \f$p_i\f$ and \f$-q_i\f$.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p - q << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4]
+     * ```
+     */
+    DualQuat<_Tp> operator-(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction assignment operator of two dual quaternions p and q.
+     * It subtracts right operand from the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p -= q; // equivalent to p = p - q
+     * std::cout << p << std::endl; //[-4, -4, -4, -4, -4, -4, -4, -4]
+     *
+     * ```
+     */
+    DualQuat<_Tp>& operator-=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Addition operator of two dual quaternions p and q.
+     * It returns a new dual quaternion that each value is the sum of \f$p_i\f$ and \f$q_i\f$.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p + q << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20]
+     * ```
+     */
+    DualQuat<_Tp> operator+(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Addition assignment operator of two dual quaternions p and q.
+     * It adds right operand to the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p += q; // equivalent to p = p + q
+     * std::cout << p << std::endl; //[6, 8, 10, 12, 14, 16, 18, 20]
+     *
+     * ```
+     */
+    DualQuat<_Tp>& operator+=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of two dual quaternions.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion multiplication:
+     * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [A, B][C, D]\\
+     * &=[AC, AD + BC]
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p *= q;
+     * std::cout << p << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120]
+     * ```
+     */
+    DualQuat<_Tp>& operator*=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of a dual quaternion and a scalar.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     *  &=[w   s, x   s, y   s, z   s, w\_  \space  s, x\_  \space  s, y\_ \space  s, z\_ \space  s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * p *= s;
+     * std::cout << p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    DualQuat<_Tp> operator*=(const _Tp s);
+
+
+    /**
+     * @brief Multiplication operator of two dual quaternions q and p.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of dual quaternion multiplication:
+     * The dual quaternion can be written as an ordered pair of quaternions [A, B]. Thus
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [A, B][C, D]\\
+     * &=[AC, AD + BC]
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p * q << std::endl; //[-60, 12, 30, 24, -216, 80, 124, 120]
+     * ```
+     */
+    DualQuat<_Tp> operator*(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Division operator of a dual quaternion and a scalar.
+     * It divides every component of the left operand by the scalar and returns the result as a new dual quaternion.
+     *
+     * Rule of dual quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z, w\_, x\_, y\_, z\_] / s\\
+     * &=[w/s, x/s, y/s, z/s, w\_/s, x\_/s, y\_/s, z\_/s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * DualQuatd r = p / s;
+     * std::cout << r << std::endl; //[0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4]
+     * ```
+     * @note the type of scalar should be equal to this dual quaternion.
+     */
+    DualQuat<_Tp> operator/(const _Tp s) const;
+
+    /**
+     * @brief Division operator of two dual quaternions p and q.
+     * Divides left hand operand by right hand operand.
+     *
+     * Rule of dual quaternion division with a dual quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * std::cout << p / q << std::endl; // equivalent to p * q.inv()
+     * ```
+     */
+    DualQuat<_Tp> operator/(const DualQuat<_Tp>&) const;
+
+    /**
+     * @brief Division assignment operator of two dual quaternions p and q;
+     * It divides left operand with the right operand and assign the result to left operand.
+     *
+     * Rule of dual quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q&= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * DualQuatd q{5, 6, 7, 8, 9, 10, 11, 12};
+     * p /= q; // equivalent to p = p * q.inv()
+     * std::cout << p << std::endl;
+     * ```
+     */
+    DualQuat<_Tp>& operator/=(const DualQuat<_Tp>&);
+
+    /**
+     * @brief Division assignment operator of a dual quaternion and a scalar.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of dual quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z, w\_, x\_, y\_ ,z\_] / s\\
+     * &=[w / s, x / s, y / s, z / s, w\_ / \space s, x\_ / \space s, y\_ / \space s, z\_ / \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    DualQuat<_Tp>& operator/=(const _Tp s);
+
+    /**
+     * @brief Addition operator of a scalar and a dual quaternions.
+     * Adds right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator+(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Addition operator of a dual quaternions and a scalar.
+     * Adds right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator+(const DualQuat<T>&, const T s);
+
+    /**
+     * @brief Multiplication operator of a scalar and a dual quaternion.
+     * It multiplies every component of the dual quaternion by the scalar and returns the result as a new dual quaternion.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * std::cout << s * p << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator*(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Subtraction operator of a dual quaternion and a scalar.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << p - scalar << std::endl; //[-1, 2, 3, 4, 5, 6, 7, 8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator-(const DualQuat<T>&, const T s);
+
+    /**
+     * @brief Subtraction operator of a scalar and a dual quaternions.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double scalar = 2.0;
+     * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4, -5, -6, -7, -8]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator-(const T s, const DualQuat<T>&);
+
+    /**
+     * @brief Multiplication operator of a dual quaternion and a scalar.
+     * It multiplies every component of the dual quaternion by the scalar and returns the result as a new dual quaternion.
+     *
+     * Rule of dual quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z, w\_, x\_, y\_, z\_] * s\\
+     * &=[w s, x s, y s, z s, w\_ \space s, x\_ \space s, y\_ \space s, z\_ \space s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * DualQuatd p{1, 2, 3, 4, 5, 6, 7, 8};
+     * double s = 2.0;
+     * std::cout << p * s << std::endl; //[2, 4, 6, 8, 10, 12, 14, 16]
+     * ```
+     * @note the type of scalar should be equal to the dual quaternion.
+     */
+    template <typename T>
+    friend DualQuat<T> cv::operator*(const DualQuat<T>&, const T s);
+
+    template <typename S>
+    friend std::ostream& cv::operator<<(std::ostream&, const DualQuat<S>&);
+
+};
+
+using DualQuatd = DualQuat<double>;
+using DualQuatf = DualQuat<float>;
+
+//! @} core
+}//namespace
+
+#include "dualquaternion.inl.hpp"
+
+#endif /* OPENCV_CORE_DUALQUATERNION_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.inl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.inl.hpp
new file mode 100644
index 0000000..6abb159
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/dualquaternion.inl.hpp
@@ -0,0 +1,487 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <kongliangqian@huawei.com>
+//         Longbu Wang <wanglongbu@huawei.com>
+
+#ifndef OPENCV_CORE_DUALQUATERNION_INL_HPP
+#define OPENCV_CORE_DUALQUATERNION_INL_HPP
+
+#ifndef OPENCV_CORE_DUALQUATERNION_HPP
+#error This is not a standalone header. Include dualquaternion.hpp instead.
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////
+//Implementation
+namespace cv {
+
+// Default constructor: zero-initialises all eight components.
+template <typename T>
+DualQuat<T>::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}
+
+// Component-wise constructor: the first four arguments initialise the real part
+// (w, x, y, z) and the last four initialise the dual part (w_, x_, y_, z_).
+template <typename T>
+DualQuat<T>::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z):
+                      w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}
+
+// Constructs from an 8-element vector laid out as (w, x, y, z, w_, x_, y_, z_).
+template <typename T>
+DualQuat<T>::DualQuat(const Vec<T, 8> &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]),
+                                          w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}
+
+// Assembles a dual quaternion from two ordinary quaternions: realPart supplies
+// (w, x, y, z) and dualPart supplies (w_, x_, y_, z_).
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromQuat(const Quat<T> &realPart, const Quat<T> &dualPart)
+{
+    return DualQuat<T>(realPart.w, realPart.x, realPart.y, realPart.z,
+                       dualPart.w, dualPart.x, dualPart.y, dualPart.z);
+}
+
+// Builds a dual quaternion from a rotation (angle about axis) and a translation.
+// The real part is the rotation quaternion r; the dual part is t * r * 0.5,
+// where t is the quaternion (0, trans[0], trans[1], trans[2]).
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAngleAxisTrans(const T angle, const Vec<T, 3> &axis, const Vec<T, 3> &trans)
+{
+    Quat<T> rotation = Quat<T>::createFromAngleAxis(angle, axis);
+    Quat<T> translation{0, trans[0], trans[1], trans[2]};
+    Quat<T> dual = translation * rotation * T(0.5);
+    return createFromQuat(rotation, dual);
+}
+
+// Builds a dual quaternion from a 4x4 matrix whose element type matches T.
+// The upper-left 3x3 block is converted to the rotation quaternion r and the
+// first three entries of the last column form the translation quaternion t;
+// the dual part is t * r * 0.5. Raises StsBadArg when the input is not 4x4.
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromMat(InputArray _R)
+{
+    CV_CheckTypeEQ(_R.type(), cv::traits::Type<T>::value, "");
+    const bool is4x4 = (_R.size() == Size(4, 4));
+    if (!is4x4)
+    {
+        CV_Error(Error::StsBadArg, "The input matrix must have 4 columns and 4 rows");
+    }
+    Mat pose = _R.getMat();
+    Quat<T> rotation = Quat<T>::createFromRotMat(pose.colRange(0, 3).rowRange(0, 3));
+    Quat<T> translation(0, pose.at<T>(0, 3), pose.at<T>(1, 3), pose.at<T>(2, 3));
+    Quat<T> dual = translation * rotation * T(0.5);
+    return createFromQuat(rotation, dual);
+}
+
+// Builds a dual quaternion from an Affine3 transform by forwarding its
+// `matrix` member to createFromMat().
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromAffine3(const Affine3<T> &R)
+{
+    return createFromMat(R.matrix);
+}
+
+// Builds a dual quaternion from a rotation angle, a distance d, an axis and a
+// moment vector. The real part is the rotation quaternion for (angle, axis).
+// The dual part combines the half-distance with the normalised axis and with
+// the moment after a multiple of the normalised axis has been subtracted from it.
+// NOTE(review): the subtraction coefficient is axis.dot(moment) with the raw
+// (un-normalised) axis, so it is an exact projection only for a unit-length
+// axis -- confirm that callers pass a unit axis.
+template <typename T>
+DualQuat<T> DualQuat<T>::createFromPitch(const T angle, const T d, const Vec<T, 3> &axis, const Vec<T, 3> &moment)
+{
+    const T halfAngle = angle * T(0.5);
+    const T halfDist = d * T(0.5);
+    Quat<T> unitAxis = Quat<T>(0, axis[0], axis[1], axis[2]).normalize();
+    Quat<T> orthoMoment = Quat<T>(0, moment[0], moment[1], moment[2]);
+    orthoMoment -= unitAxis * axis.dot(moment);
+    Quat<T> dual = -halfDist * std::sin(halfAngle) + std::sin(halfAngle) * orthoMoment +
+        halfDist * std::cos(halfAngle) * unitAxis;
+    Quat<T> real = Quat<T>::createFromAngleAxis(angle, axis);
+    return createFromQuat(real, dual);
+}
+
+// Approximate equality: true when every pair of corresponding components
+// differs by less than CV_DUAL_QUAT_EPS in absolute value.
+template <typename T>
+inline bool DualQuat<T>::operator==(const DualQuat<T> &q) const
+{
+    // Real part first; bail out early so the dual part is only checked when needed.
+    if (!(abs(w - q.w) < CV_DUAL_QUAT_EPS && abs(x - q.x) < CV_DUAL_QUAT_EPS &&
+          abs(y - q.y) < CV_DUAL_QUAT_EPS && abs(z - q.z) < CV_DUAL_QUAT_EPS))
+    {
+        return false;
+    }
+    return abs(w_ - q.w_) < CV_DUAL_QUAT_EPS && abs(x_ - q.x_) < CV_DUAL_QUAT_EPS &&
+           abs(y_ - q.y_) < CV_DUAL_QUAT_EPS && abs(z_ - q.z_) < CV_DUAL_QUAT_EPS;
+}
+
+// Returns the real part as the quaternion (w, x, y, z).
+template <typename T>
+inline Quat<T> DualQuat<T>::getRealPart() const
+{
+    Quat<T> real(w, x, y, z);
+    return real;
+}
+
+// Returns the dual part as the quaternion (w_, x_, y_, z_).
+template <typename T>
+inline Quat<T> DualQuat<T>::getDualPart() const
+{
+    Quat<T> dual(w_, x_, y_, z_);
+    return dual;
+}
+
+// Free-function form of DualQuat::conjugate(); forwards to the member.
+template <typename T>
+inline DualQuat<T> conjugate(const DualQuat<T> &dq)
+{
+    return dq.conjugate();
+}
+
+// Returns the conjugate: the scalar components w and w_ are kept and the
+// vector components of both the real and the dual part are negated.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::conjugate() const
+{
+    return DualQuat<T>(w,  -x,  -y,  -z,
+                       w_, -x_, -y_, -z_);
+}
+
+// Returns the norm as a dual number stored in a dual quaternion: w holds the
+// norm of the real part and w_ holds dot(real, dual) / norm(real); all other
+// components are zero. When the real norm is below CV_DUAL_QUAT_EPS the
+// all-zero dual quaternion is returned to avoid the division.
+template <typename T>
+DualQuat<T> DualQuat<T>::norm() const
+{
+    Quat<T> real = getRealPart();
+    const T realNorm = real.norm();
+    if (realNorm < CV_DUAL_QUAT_EPS)
+    {
+        return DualQuat<T>(0, 0, 0, 0, 0, 0, 0, 0);
+    }
+    const T dualNorm = real.dot(getDualPart()) / realNorm;
+    return DualQuat<T>(realNorm, 0, 0, 0, dualNorm, 0, 0, 0);
+}
+
+// Returns the rotation quaternion: the real part as-is when the caller
+// asserts unit length (assumeUnit is non-zero), otherwise the real part normalised.
+template <typename T>
+inline Quat<T> DualQuat<T>::getRotation(QuatAssumeType assumeUnit) const
+{
+    return assumeUnit ? getRealPart() : getRealPart().normalize();
+}
+
+// Returns the translation vector: the x, y, z components of
+// 2 * dual * real.inv(assumeUnit).
+template <typename T>
+inline Vec<T, 3> DualQuat<T>::getTranslation(QuatAssumeType assumeUnit) const
+{
+    Quat<T> realInv = getRealPart().inv(assumeUnit);
+    Quat<T> shift = T(2.0) * (getDualPart() * realInv);
+    return Vec<T, 3>{shift[1], shift[2], shift[3]};
+}
+
+// Returns a normalised copy. Both parts are divided by the norm of the real
+// part, then the component of the scaled dual part along the scaled real part
+// is subtracted. Raises StsBadArg when the real norm is below CV_DUAL_QUAT_EPS.
+template <typename T>
+DualQuat<T> DualQuat<T>::normalize() const
+{
+    Quat<T> real = getRealPart();
+    Quat<T> dual = getDualPart();
+    const T realNorm = real.norm();
+    if (realNorm < CV_DUAL_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "Cannot normalize this dual quaternion: the norm is too small.");
+    }
+    Quat<T> unitReal = real / realNorm;
+    Quat<T> scaledDual = dual / realNorm;
+    Quat<T> orthoDual = scaledDual - unitReal * unitReal.dot(scaledDual);
+    return createFromQuat(unitReal, orthoDual);
+}
+
+// Returns the 8-component dot product of this dual quaternion with q
+// (real components first, then dual components).
+template <typename T>
+inline T DualQuat<T>::dot(DualQuat<T> q) const
+{
+    return q.w * w + q.x * x + q.y * y + q.z * z
+         + q.w_ * w_ + q.x_ * x_ + q.y_ * y_ + q.z_ * z_;
+}
+
+// Free-function form of DualQuat::inv(); forwards dq and assumeUnit to the member.
+template <typename T>
+inline DualQuat<T> inv(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.inv(assumeUnit);
+}
+
+// Returns the inverse: real part is real.inv(assumeUnit) and dual part is
+// -real.inv * dual * real.inv. The real inverse is computed once and reused.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::inv(QuatAssumeType assumeUnit) const
+{
+    Quat<T> realInv = getRealPart().inv(assumeUnit);
+    Quat<T> dualInv = -realInv * getDualPart() * realInv;
+    return createFromQuat(realInv, dualInv);
+}
+
+// Component-wise subtraction of q from this dual quaternion.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w - q.w,   x - q.x,   y - q.y,   z - q.z,
+                       w_ - q.w_, x_ - q.x_, y_ - q.y_, z_ - q.z_);
+}
+
+// Unary minus: negates all eight components.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator-() const
+{
+    return DualQuat<T>(-w,  -x,  -y,  -z,
+                       -w_, -x_, -y_, -z_);
+}
+
+// Component-wise addition of this dual quaternion and q.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator+(const DualQuat<T> &q) const
+{
+    return DualQuat<T>(w + q.w,   x + q.x,   y + q.y,   z + q.z,
+                       w_ + q.w_, x_ + q.x_, y_ + q.y_, z_ + q.z_);
+}
+
+// In-place addition, implemented through operator+; returns *this.
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator+=(const DualQuat<T> &q)
+{
+    return *this = *this + q;
+}
+
+// Dual quaternion product. Writing this = [A, B] and q = [C, D] as
+// (real, dual) pairs, the result is [A*C, A*D + B*C].
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator*(const DualQuat<T> &q) const
+{
+    Quat<T> lhsReal = getRealPart(), lhsDual = getDualPart();
+    Quat<T> rhsReal = q.getRealPart(), rhsDual = q.getDualPart();
+    Quat<T> realProduct = lhsReal * rhsReal;
+    Quat<T> dualProduct = lhsReal * rhsDual + lhsDual * rhsReal;
+    return DualQuat<T>::createFromQuat(realProduct, dualProduct);
+}
+
+// In-place product with another dual quaternion, implemented through
+// operator*; returns *this.
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator*=(const DualQuat<T> &q)
+{
+    return *this = *this * q;
+}
+
+// scalar + dual quaternion: the scalar is added to the w component only;
+// the remaining seven components are copied unchanged.
+template <typename T>
+inline DualQuat<T> operator+(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a + q.w, q.x,  q.y,  q.z,
+                       q.w_,    q.x_, q.y_, q.z_);
+}
+
+// dual quaternion + scalar: same result as scalar + dual quaternion, so this
+// delegates to that overload (the scalar is added to w only).
+template <typename T>
+inline DualQuat<T> operator+(const DualQuat<T> &q, const T a)
+{
+    return a + q;
+}
+
+// dual quaternion - scalar: the scalar is subtracted from the w component
+// only; the remaining seven components are copied unchanged.
+template <typename T>
+inline DualQuat<T> operator-(const DualQuat<T> &q, const T a)
+{
+    return DualQuat<T>(q.w - a, q.x,  q.y,  q.z,
+                       q.w_,    q.x_, q.y_, q.z_);
+}
+
+// In-place subtraction, implemented through operator-; returns *this.
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator-=(const DualQuat<T> &q)
+{
+    return *this = *this - q;
+}
+
+// scalar - dual quaternion: w becomes a - q.w and the other seven
+// components are negated.
+template <typename T>
+inline DualQuat<T> operator-(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(a - q.w, -q.x,  -q.y,  -q.z,
+                       -q.w_,   -q.x_, -q.y_, -q.z_);
+}
+
+// scalar * dual quaternion: every component is multiplied by the scalar.
+template <typename T>
+inline DualQuat<T> operator*(const T a, const DualQuat<T> &q)
+{
+    return DualQuat<T>(q.w * a,  q.x * a,  q.y * a,  q.z * a,
+                       q.w_ * a, q.x_ * a, q.y_ * a, q.z_ * a);
+}
+
+// dual quaternion * scalar: same result as scalar * dual quaternion, so this
+// delegates to that overload (every component is multiplied by the scalar).
+template <typename T>
+inline DualQuat<T> operator*(const DualQuat<T> &q, const T a)
+{
+    return a * q;
+}
+
+// Division by a scalar: every component is divided by a.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const T a) const
+{
+    return DualQuat<T>(w / a,  x / a,  y / a,  z / a,
+                       w_ / a, x_ / a, y_ / a, z_ / a);
+}
+
+// Division by a dual quaternion, defined as multiplication by q.inv()
+// (inv() is called with its default assumeUnit argument).
+template <typename T>
+inline DualQuat<T> DualQuat<T>::operator/(const DualQuat<T> &q) const
+{
+    DualQuat<T> qInv = q.inv();
+    return *this * qInv;
+}
+
+// In-place division by a dual quaternion, implemented through operator/;
+// returns *this.
+template <typename T>
+inline DualQuat<T>& DualQuat<T>::operator/=(const DualQuat<T> &q)
+{
+    return *this = *this / q;
+}
+
+// Streams the dual quaternion as the text "DualQuat " followed by its eight
+// components formatted as a Vec<T, 8>.
+template <typename T>
+std::ostream & operator<<(std::ostream &os, const DualQuat<T> &q)
+{
+    const Vec<T, 8> components{q.w, q.x, q.y, q.z, q.w_, q.x_, q.y_, q.z_};
+    os << "DualQuat " << components;
+    return os;
+}
+
+// Free-function form of DualQuat::exp(); forwards to the member.
+template <typename T>
+inline DualQuat<T> exp(const DualQuat<T> &dq)
+{
+    return dq.exp();
+}
+
+namespace detail {
+
+// Helper shared by DualQuat::exp() and DualQuat::log(): returns a 4x4 matrix,
+// scaled by exp(q.w), that those functions apply to the dual part.
+// Presumably this is the Jacobian of the quaternion exponential at q (going by
+// the name) -- confirm against the derivation.
+// When the norm of the vector part is below CV_DUAL_QUAT_EPS, sin(n)/n and
+// (cos(n) - sin(n)/n)/n^2 are replaced by 1 - n^2/6 and -1/3 so that no
+// division by a tiny value takes place.
+template <typename _Tp>
+Matx<_Tp, 4, 4> jacob_exp(const Quat<_Tp> &q)
+{
+    const _Tp vecNorm = std::sqrt(q.x * q.x + q.y * q.y + q.z * q.z);
+    const bool tiny = abs(vecNorm) < cv::DualQuat<_Tp>::CV_DUAL_QUAT_EPS;
+    const _Tp sinc = tiny ? _Tp(1.0) - vecNorm * vecNorm * _Tp(1.0/6.0) : std::sin(vecNorm) / vecNorm;
+    const _Tp coef = tiny ? -_Tp(1.0/3.0) : (std::cos(vecNorm) - sinc) / vecNorm / vecNorm;
+    Matx<_Tp, 4, 4> jacobian {
+        std::cos(vecNorm), -sinc * q.x,  -sinc * q.y,  -sinc * q.z,
+        sinc * q.x, coef * q.x * q.x + sinc, coef * q.x * q.y, coef * q.x * q.z,
+        sinc * q.y, coef * q.y * q.x, coef * q.y * q.y + sinc, coef * q.y * q.z,
+        sinc * q.z, coef * q.z * q.x, coef * q.z * q.y, coef * q.z * q.z + sinc
+    };
+    return std::exp(q.w) * jacobian;
+}
+
+} // namespace detail
+
+// Exponential of the dual quaternion: the real part is exp(real) and the dual
+// part is detail::jacob_exp(real) applied to the dual part's 4-vector.
+template <typename T>
+DualQuat<T> DualQuat<T>::exp() const
+{
+    Quat<T> real = getRealPart();
+    Quat<T> dualExp(detail::jacob_exp(real) * getDualPart().toVec());
+    return createFromQuat(real.exp(), dualExp);
+}
+
+// Free-function form of DualQuat::log(); forwards dq and assumeUnit to the member.
+template <typename T>
+DualQuat<T> log(const DualQuat<T> &dq, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.log(assumeUnit);
+}
+
+// Logarithm of the dual quaternion: the real part is log(real) and the dual
+// part is the inverse of detail::jacob_exp(log(real)) applied to the dual
+// part's 4-vector, i.e. the reverse of the mapping used by exp().
+template <typename T>
+DualQuat<T> DualQuat<T>::log(QuatAssumeType assumeUnit) const
+{
+    Quat<T> realLog = getRealPart().log(assumeUnit);
+    Matx<T, 4, 4> expJacobian = detail::jacob_exp(realLog);
+    Quat<T> dualLog(expJacobian.inv() * getDualPart().toVec());
+    return createFromQuat(realLog, dualLog);
+}
+
+// Free-function form of DualQuat::power(t, assumeUnit) with a scalar exponent;
+// forwards to the member.
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &dq, const T t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return dq.power(t, assumeUnit);
+}
+
+// Raises this dual quaternion to the scalar power t, computed as exp(t * log(this)).
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const T t, QuatAssumeType assumeUnit) const
+{
+    DualQuat<T> scaledLog = t * log(assumeUnit);
+    return scaledLog.exp();
+}
+
+// Free-function form of DualQuat::power(q, assumeUnit) with a dual quaternion
+// exponent; forwards to the member of p.
+template <typename T>
+inline DualQuat<T> power(const DualQuat<T> &p, const DualQuat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT)
+{
+    return p.power(q, assumeUnit);
+}
+
+// Raises this dual quaternion to the dual quaternion power q, computed as
+// exp(q * log(this)); note the exponent is the left factor of the product.
+template <typename T>
+inline DualQuat<T> DualQuat<T>::power(const DualQuat<T> &q, QuatAssumeType assumeUnit) const
+{
+    DualQuat<T> scaledLog = q * log(assumeUnit);
+    return scaledLog.exp();
+}
+
+// Packs the components into an 8-element vector ordered
+// (w, x, y, z, w_, x_, y_, z_).
+template <typename T>
+inline Vec<T, 8> DualQuat<T>::toVec() const
+{
+    Vec<T, 8> packed(w, x, y, z, w_, x_, y_, z_);
+    return packed;
+}
+
+// Converts to an Affine3 transform by wrapping the 4x4 matrix from toMat(assumeUnit).
+template <typename T>
+Affine3<T> DualQuat<T>::toAffine3(QuatAssumeType assumeUnit) const
+{
+    return Affine3<T>(toMat(assumeUnit));
+}
+
+// Converts to a 4x4 matrix: starts from the 4x4 rotation matrix of
+// getRotation(assumeUnit) and writes getTranslation(assumeUnit) into the
+// first three rows of the last column.
+template <typename T>
+Matx<T, 4, 4> DualQuat<T>::toMat(QuatAssumeType assumeUnit) const
+{
+    Matx<T, 4, 4> pose = getRotation(assumeUnit).toRotMat4x4();
+    const Vec<T, 3> shift = getTranslation(assumeUnit);
+    for (int row = 0; row < 3; ++row)
+    {
+        pose(row, 3) = shift[row];
+    }
+    return pose;
+}
+
+// Interpolates between q0 and q1 at parameter t, computed as
+// q0 * (q0.inv() * q1).power(t). Both inputs are normalised first unless
+// assumeUnit is set. When directChange is true and the real parts have a
+// negative dot product, q0 is negated before interpolating.
+template <typename T>
+DualQuat<T> DualQuat<T>::sclerp(const DualQuat<T> &q0, const DualQuat<T> &q1, const T t, bool directChange, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> from(q0), to(q1);
+    if (!assumeUnit)
+    {
+        from = from.normalize();
+        to = to.normalize();
+    }
+    const bool flip = directChange && to.getRealPart().dot(from.getRealPart()) < 0;
+    if (flip)
+    {
+        from = -from;
+    }
+    DualQuat<T> relative = from.inv() * to;
+    return from * relative.power(t, QUAT_ASSUME_UNIT);
+}
+
+// Blends q1 and q2 linearly with weights (1 - t) and t, then normalises the
+// result. Both inputs are normalised first unless assumeUnit is set. When the
+// two rotations have a negative dot product, q2 is subtracted instead of added.
+template <typename T>
+DualQuat<T> DualQuat<T>::dqblend(const DualQuat<T> &q1, const DualQuat<T> &q2, const T t, QuatAssumeType assumeUnit)
+{
+    DualQuat<T> first(q1), second(q2);
+    if (!assumeUnit)
+    {
+        first = first.normalize();
+        second = second.normalize();
+    }
+    const bool opposite = first.getRotation(assumeUnit).dot(second.getRotation(assumeUnit)) < 0;
+    const DualQuat<T> blended = opposite ? ((1 - t) * first - t * second)
+                                         : ((1 - t) * first + t * second);
+    return blended.normalize();
+}
+
+// Generalised blend of n dual quaternions with per-element weights.
+//
+// _dualquat must be a 1xn or nx1 array with 8 channels of depth T, and
+// _weight must be a single-channel array of type T with the same size;
+// otherwise a type-check error or StsBadArg is raised.
+// Unless assumeUnit is set, every input is normalised first. Each element
+// after the first has its sign flipped when its rotation has a negative dot
+// product with the rotation of element 0. The weighted sum is normalised
+// before it is returned.
+//
+// The caller's array is never modified: normalisation works on a private copy.
+// An empty (1x0 or 0x1) input yields the identity (1, 0, 0, 0, 0, 0, 0, 0),
+// matching the Vec overload's result for zero elements.
+template <typename T>
+DualQuat<T> DualQuat<T>::gdqblend(InputArray _dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    CV_CheckTypeEQ(_weight.type(), cv::traits::Type<T>::value, "");
+    CV_CheckTypeEQ(_dualquat.type(), CV_MAKETYPE(CV_MAT_DEPTH(cv::traits::Type<T>::value), 8), "");
+    Size dq_s = _dualquat.size();
+    if (dq_s != _weight.size() || (dq_s.height != 1 && dq_s.width != 1))
+    {
+        CV_Error(Error::StsBadArg, "The size of weight must be the same as dualquat, both of them should be (1, n) or (n, 1)");
+    }
+    Mat dualquat = _dualquat.getMat(), weight = _weight.getMat();
+    if (dualquat.empty())
+    {
+        // Nothing to blend; element 0 must not be read.
+        return DualQuat<T>(1, 0, 0, 0, 0, 0, 0, 0);
+    }
+    const int cn = std::max(dq_s.width, dq_s.height);
+    if (!assumeUnit)
+    {
+        // getMat() shares storage with the caller's array, so normalise a deep copy.
+        dualquat = dualquat.clone();
+        for (int i = 0; i < cn; ++i)
+        {
+            dualquat.at<Vec<T, 8>>(i) = DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.normalize().toVec();
+        }
+    }
+    Vec<T, 8> dq_blend = dualquat.at<Vec<T, 8>>(0) * weight.at<T>(0);
+    Quat<T> q0 = DualQuat<T> {dualquat.at<Vec<T, 8>>(0)}.getRotation(assumeUnit);
+    for (int i = 1; i < cn; ++i)
+    {
+        // Flip elements whose rotation has a negative dot product with element 0.
+        T k = q0.dot(DualQuat<T>{dualquat.at<Vec<T, 8>>(i)}.getRotation(assumeUnit)) < 0 ? -1: 1;
+        dq_blend = dq_blend + dualquat.at<Vec<T, 8>>(i) * k * weight.at<T>(i);
+    }
+    return DualQuat<T>{dq_blend}.normalize();
+}
+
+// Generalised blend for a fixed-size Vec of dual quaternions.
+// Copies the cn inputs into a cn x 1, 8-channel matrix whose depth matches T
+// and forwards to the InputArray overload. Returns the identity
+// (1, 0, 0, 0, 0, 0, 0, 0) when cn is 0.
+template <typename T>
+template <int cn>
+DualQuat<T> DualQuat<T>::gdqblend(const Vec<DualQuat<T>, cn> &_dualquat, InputArray _weight, QuatAssumeType assumeUnit)
+{
+    if (cn == 0)
+    {
+        return DualQuat<T>(1, 0, 0, 0, 0, 0, 0, 0);
+    }
+    // The element type must follow T: a fixed CV_64F matrix would be written
+    // through at<Vec<float, 8>> and then rejected by the callee's type check
+    // when T is float.
+    Mat dualquat_mat(cn, 1, CV_MAKETYPE(CV_MAT_DEPTH(cv::traits::Type<T>::value), 8));
+    for (int i = 0; i < cn ; ++i)
+    {
+        dualquat_mat.at<Vec<T, 8>>(i) = _dualquat[i].toVec();
+    }
+    return gdqblend(dualquat_mat, _weight, assumeUnit);
+}
+
+} //namespace cv
+
+#endif /*OPENCV_CORE_DUALQUATERNION_INL_HPP*/
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/eigen.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/eigen.hpp
new file mode 100644
index 0000000..51f4147
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/eigen.hpp
@@ -0,0 +1,402 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef OPENCV_CORE_EIGEN_HPP
+#define OPENCV_CORE_EIGEN_HPP
+
+#ifndef EIGEN_WORLD_VERSION
+#error "Wrong usage of OpenCV's Eigen utility header. Include Eigen's headers first. See https://github.com/opencv/opencv/issues/17366"
+#endif
+
+#include "opencv2/core.hpp"
+
+#if defined _MSC_VER && _MSC_VER >= 1200
+#define NOMINMAX // fix https://github.com/opencv/opencv/issues/17548
+#pragma warning( disable: 4714 ) //__forceinline is not inlined
+#pragma warning( disable: 4127 ) //conditional expression is constant
+#pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data
+#endif
+
+#if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
+#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \
+    && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY)
+#include <unsupported/Eigen/CXX11/Tensor>
+#define OPENCV_EIGEN_TENSOR_SUPPORT 1
+#endif  // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
+#endif  // !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
+
+namespace cv
+{
+
+/** @addtogroup core_eigen
+These functions are provided for OpenCV-Eigen interoperability. They convert `Mat`
+objects to corresponding `Eigen::Matrix` objects and vice-versa. Consult the [Eigen
+documentation](https://eigen.tuxfamily.org/dox/group__TutorialMatrixClass.html) for
+information about the `Matrix` template type.
+
+@note Using these functions requires the `Eigen/Dense` or similar header to be
+included before this header.
+*/
+//! @{
+
+#if defined(OPENCV_EIGEN_TENSOR_SUPPORT) || defined(CV_DOXYGEN)
+/** @brief Converts an Eigen::Tensor to a cv::Mat.
+
+The method converts an Eigen::Tensor with shape (H x W x C) to a cv::Mat where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Usage:
+\code
+Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
+// populate tensor with values
+Mat a_mat;
+eigen2cv(a_tensor, a_mat);
+\endcode
+*/
+template <typename _Tp, int _layout> static inline
+void eigen2cv( const Eigen::Tensor<_Tp, 3, _layout> &src, OutputArray dst )
+{   // Copies a rank-3 tensor into dst; the third tensor dimension becomes the channel count.
+    if( !(_layout & Eigen::RowMajorBit) )
+    {   // layout flag lacks the row-major bit: build a temporary with the opposite layout first
+        const std::array<int, 3> shuffle{2, 1, 0}; // reversed dimension order
+        Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor = src.swap_layout().shuffle(shuffle);
+        Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), row_major_tensor.data()); // header over the temporary's buffer
+        _src.copyTo(dst); // deep copy, so dst does not refer to the temporary after it is destroyed
+    }
+    else
+    {
+        Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), (void *)src.data()); // header over src's own buffer; const is cast away only to build the header
+        _src.copyTo(dst); // deep copy into dst
+    }
+}
+
+/** @brief Converts a cv::Mat to an Eigen::Tensor.
+
+The method converts a cv::Mat to an Eigen Tensor with shape (H x W x C) where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Usage:
+\code
+Mat a_mat(...);
+// populate Mat with values
+Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
+cv2eigen(a_mat, a_tensor);
+\endcode
+*/
+template <typename _Tp, int _layout> static inline
+void cv2eigen( const Mat &src, Eigen::Tensor<_Tp, 3, _layout> &dst )
+{   // Fills dst with the contents of src as a (rows, cols, channels) tensor, converting the element type if needed.
+    if( !(_layout & Eigen::RowMajorBit) )
+    {   // layout flag lacks the row-major bit: stage the data in a tensor with the opposite layout
+        Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor(src.rows, src.cols, src.channels());
+        Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), row_major_tensor.data()); // Mat header over the staging tensor
+        if (src.type() == _dst.type())
+            src.copyTo(_dst); // same element type: plain copy
+        else
+            src.convertTo(_dst, _dst.type()); // different element type: convert while copying
+        const std::array<int, 3> shuffle{2, 1, 0}; // reversed dimension order
+        dst = row_major_tensor.swap_layout().shuffle(shuffle); // assign the re-laid-out result to dst
+    }
+    else
+    {
+        dst.resize(src.rows, src.cols, src.channels()); // size dst first so its buffer can be written in place
+        Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), dst.data()); // Mat header over dst's buffer
+        if (src.type() == _dst.type())
+            src.copyTo(_dst); // same element type: plain copy
+        else
+            src.convertTo(_dst, _dst.type()); // different element type: convert while copying
+    }
+}
+
+/** @brief Maps cv::Mat data to an Eigen::TensorMap.
+
+The method wraps an existing Mat data array with an Eigen TensorMap of shape (H x W x C) where:
+ H = number of rows
+ W = number of columns
+ C = number of channels
+
+Explicit instantiation of the return type is required.
+
+@note Caller should be aware of the lifetime of the cv::Mat instance and take appropriate safety measures.
+The cv::Mat instance will retain ownership of the data and the Eigen::TensorMap will lose access when the cv::Mat data is deallocated.
+
+The example below initializes a cv::Mat and produces an Eigen::TensorMap:
+\code
+float arr[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+Mat a_mat(2, 2, CV_32FC3, arr);
+Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>> a_tensormap = cv2eigen_tensormap<float>(a_mat);
+\endcode
+*/
+template <typename _Tp> static inline
+Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>> cv2eigen_tensormap(InputArray src)
+{   // Returns a row-major TensorMap of shape (rows, cols, channels) built directly on src's data pointer.
+    Mat mat = src.getMat();
+    CV_CheckTypeEQ(mat.type(), CV_MAKETYPE(traits::Type<_Tp>::value, mat.channels()), ""); // element type of src must match _Tp
+    return Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>>((_Tp *)mat.data, mat.rows, mat.cols, mat.channels()); // NOTE(review): no continuity check here - presumably src must be continuous; confirm
+}
+#endif // OPENCV_EIGEN_TENSOR_SUPPORT
+
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, OutputArray dst )
+{   // Copies an Eigen matrix into dst; src's buffer is only wrapped by a Mat header, never written.
+    if( !(src.Flags & Eigen::RowMajorBit) )
+    {   // src lacks the row-major flag: view its buffer as a (cols x rows) Mat and transpose into dst
+        Mat _src(src.cols(), src.rows(), traits::Type<_Tp>::value,
+              (void*)src.data(), src.outerStride()*sizeof(_Tp)); // row step in bytes taken from Eigen's outer stride
+        transpose(_src, dst);
+    }
+    else
+    {
+        Mat _src(src.rows(), src.cols(), traits::Type<_Tp>::value,
+                 (void*)src.data(), src.outerStride()*sizeof(_Tp)); // row step in bytes taken from Eigen's outer stride
+        _src.copyTo(dst); // deep copy into dst
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src,
+               Matx<_Tp, _rows, _cols>& dst )
+{   // Fixed-size variant: builds a Matx from src's raw element array and assigns it to dst.
+    if( !(src.Flags & Eigen::RowMajorBit) )
+    {
+        dst = Matx<_Tp, _cols, _rows>(static_cast<const _Tp*>(src.data())).t(); // read as (cols x rows), then transpose
+    }
+    else
+    {
+        dst = Matx<_Tp, _rows, _cols>(static_cast<const _Tp*>(src.data())); // element order already matches
+    }
+}
+
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& dst )
+{   // Writes src into the fixed-size Eigen matrix dst, converting the element type when it differs from _Tp.
+    CV_DbgAssert(src.rows == _rows && src.cols == _cols); // shape check; presumably active only in debug builds - confirm
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (cols x rows) Mat and store the transpose of src
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer, no allocation
+        if( src.type() == _dst.type() )
+            transpose(src, _dst); // same type: transpose straight into dst
+        else if( src.cols == src.rows )
+        {   // square: convert into place, then transpose in place
+            src.convertTo(_dst, _dst.type());
+            transpose(_dst, _dst);
+        }
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type()); // non-square: transpose into a temporary, then convert
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer, no allocation
+        src.convertTo(_dst, _dst.type()); // layouts agree: convert/copy directly
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
+void cv2eigen( const Matx<_Tp, _rows, _cols>& src,
+               Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& dst )
+{   // Writes the fixed-size Matx src into the equally sized Eigen matrix dst (same element type on both sides).
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as (cols x rows) and store the transpose
+        const Mat _dst(_cols, _rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        Mat(src).copyTo(_dst); // wrap the Matx in a Mat so copyTo can be used
+    }
+}
+
+template<typename _Tp>  static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic>& dst )
+{   // Resizes the dynamic Eigen matrix dst to src's shape and fills it, converting the element type if needed.
+    dst.resize(src.rows, src.cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (cols x rows) Mat and store the transpose of src
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+             dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer, no allocation
+        if( src.type() == _dst.type() )
+            transpose(src, _dst); // same type: transpose straight into dst
+        else if( src.cols == src.rows )
+        {   // square: convert into place, then transpose in place
+            src.convertTo(_dst, _dst.type());
+            transpose(_dst, _dst);
+        }
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type()); // non-square: transpose into a temporary, then convert
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer, no allocation
+        src.convertTo(_dst, _dst.type()); // layouts agree: convert/copy directly
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows, int _cols> static inline
+void cv2eigen( const Matx<_Tp, _rows, _cols>& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic>& dst )
+{   // Resizes the dynamic Eigen matrix dst to _rows x _cols and fills it from the fixed-size Matx src.
+    dst.resize(_rows, _cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as (cols x rows) and store the transpose
+        const Mat _dst(_cols, _rows, traits::Type<_Tp>::value,
+             dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        Mat(src).copyTo(_dst); // wrap the Matx in a Mat so copyTo can be used
+    }
+}
+
+template<typename _Tp> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, 1>& dst )
+{   // Fills the dynamic column vector dst from a single-column Mat, converting the element type if needed.
+    CV_Assert(src.cols == 1); // src must be a column
+    dst.resize(src.rows);
+
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (1 x rows) Mat and store the transpose of src
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        if( src.type() == _dst.type() )
+            transpose(src, _dst); // same type: transpose straight into dst
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type()); // transpose into a temporary, then convert
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        src.convertTo(_dst, _dst.type()); // layouts agree: convert/copy directly
+    }
+}
+
+// Matx case
+template<typename _Tp, int _rows> static inline
+void cv2eigen( const Matx<_Tp, _rows, 1>& src,
+               Eigen::Matrix<_Tp, Eigen::Dynamic, 1>& dst )
+{   // Fills the dynamic column vector dst (resized to _rows) from the fixed-size column Matx src.
+    dst.resize(_rows);
+    // Both branches below are compiled on instantiation, so each must be valid for a Matx source.
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (1 x _rows) Mat and store the transpose
+        const Mat _dst(1, _rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(_rows, 1, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        Mat(src).copyTo(_dst); // wrap the Matx in a Mat first, as the sibling Matx overloads do (was src.copyTo)
+    }
+}
+
+
+template<typename _Tp> static inline
+void cv2eigen( const Mat& src,
+               Eigen::Matrix<_Tp, 1, Eigen::Dynamic>& dst )
+{   // Fills the dynamic row vector dst from a single-row Mat, converting the element type if needed.
+    CV_Assert(src.rows == 1); // src must be a row
+    dst.resize(src.cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (cols x 1) Mat and store the transpose of src
+        const Mat _dst(src.cols, src.rows, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        if( src.type() == _dst.type() )
+            transpose(src, _dst); // same type: transpose straight into dst
+        else
+            Mat(src.t()).convertTo(_dst, _dst.type()); // transpose into a temporary, then convert
+    }
+    else
+    {
+        const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        src.convertTo(_dst, _dst.type()); // layouts agree: convert/copy directly
+    }
+}
+
+//Matx
+template<typename _Tp, int _cols> static inline
+void cv2eigen( const Matx<_Tp, 1, _cols>& src,
+               Eigen::Matrix<_Tp, 1, Eigen::Dynamic>& dst )
+{   // Fills the dynamic row vector dst (resized to _cols) from the fixed-size row Matx src.
+    dst.resize(_cols);
+    if( !(dst.Flags & Eigen::RowMajorBit) )
+    {   // dst lacks the row-major flag: wrap its buffer as a (_cols x 1) Mat and store the transpose
+        const Mat _dst(_cols, 1, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        transpose(src, _dst);
+    }
+    else
+    {
+        const Mat _dst(1, _cols, traits::Type<_Tp>::value,
+                 dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); // header over dst's buffer
+        Mat(src).copyTo(_dst); // wrap the Matx in a Mat so copyTo can be used
+    }
+}
+
+//! @}
+
+} // cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/fast_math.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/fast_math.hpp
new file mode 100644
index 0000000..eb4fbe2
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/fast_math.hpp
@@ -0,0 +1,411 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_FAST_MATH_HPP
+#define OPENCV_CORE_FAST_MATH_HPP
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils
+//! @{
+
+/****************************************************************************************\
+*                                      fast math                                         *
+\****************************************************************************************/
+
+#ifdef __cplusplus
+#  include <cmath>
+#else
+#  ifdef __BORLANDC__
+#    include <fastmath.h>
+#  else
+#    include <math.h>
+#  endif
+#endif
+
+#if defined(__CUDACC__)
+  // nothing, intrinsics/asm code is not supported
+#else
+  #if ((defined _MSC_VER && defined _M_X64) \
+      || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__)) \
+      && !defined(OPENCV_SKIP_INCLUDE_EMMINTRIN_H)
+    #include <emmintrin.h>
+  #endif
+
+  #if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 \
+      && !defined(OPENCV_SKIP_INCLUDE_ALTIVEC_H)
+    #include <altivec.h>
+    #undef vector
+    #undef bool
+    #undef pixel
+  #endif
+
+  #if defined(CV_INLINE_ROUND_FLT)
+    // user-specified version
+    // CV_INLINE_ROUND_DBL should be defined too
+  #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
+    // 1. general scheme
+    #define ARM_ROUND(_value, _asm_string) \
+        int res; \
+        float temp; \
+        CV_UNUSED(temp); \
+        __asm__(_asm_string : [res] "=r" (res), [temp] "=w" (temp) : [value] "w" (_value)); \
+        return res
+    // 2. version for double
+    #ifdef __clang__
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
+    #else
+        #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
+    #endif
+    // 3. version for float
+    #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
+  #elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8
+    // P8 and newer machines can convert fp32/64 to int quickly.
+    #define CV_INLINE_ROUND_DBL(value) \
+        int out; \
+        double temp; \
+        __asm__( "fctiw %[temp],%[in]\n\tmfvsrwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
+        return out;
+
+    // FP32 also works with FP64 routine above
+    #define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
+  #endif
+
+  #ifdef CV_INLINE_ISINF_FLT
+    // user-specified version
+    // CV_INLINE_ISINF_DBL should be defined too
+  #elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
+    #define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30);
+    #define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value)
+  #endif
+
+  #ifdef CV_INLINE_ISNAN_FLT
+    // user-specified version
+    // CV_INLINE_ISNAN_DBL should be defined too
+  #elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
+    #define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40);
+    #define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
+  #endif
+
+  #if !defined(OPENCV_USE_FASTMATH_BUILTINS) \
+    && ( \
+        defined(__x86_64__) || defined(__i686__) \
+        || defined(__arm__) \
+        || defined(__PPC64__) \
+    )
+    /* Use builtin C math functions when available. Dedicated hardware is available to
+       round and convert FP values. */
+    #define OPENCV_USE_FASTMATH_BUILTINS 1
+  #endif
+
+  /* Enable builtin math functions if possible, desired, and available.
+     Note, not all math functions inline equally. E.g lrint will not inline
+     without the -fno-math-errno option. */
+  #if defined(CV_ICC)
+    // nothing
+  #elif defined(OPENCV_USE_FASTMATH_BUILTINS) && OPENCV_USE_FASTMATH_BUILTINS
+    #if defined(__clang__)
+      #define CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
+      #if !defined(CV_INLINE_ISNAN_DBL) && __has_builtin(__builtin_isnan)
+        #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT) && __has_builtin(__builtin_isnan)
+        #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL) && __has_builtin(__builtin_isinf)
+        #define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT) && __has_builtin(__builtin_isinf)
+        #define CV_INLINE_ISINF_FLT(value) return __builtin_isinf(value);
+      #endif
+    #elif defined(__GNUC__)
+      #define CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS
+      #if !defined(CV_INLINE_ISNAN_DBL)
+        #define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT)
+        #define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL)
+        #define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT)
+        #define CV_INLINE_ISINF_FLT(value) return __builtin_isinff(value);
+      #endif
+    #elif defined(_MSC_VER)
+      #if !defined(CV_INLINE_ISNAN_DBL)
+        #define CV_INLINE_ISNAN_DBL(value) return isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISNAN_FLT)
+        #define CV_INLINE_ISNAN_FLT(value) return isnan(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_DBL)
+        #define CV_INLINE_ISINF_DBL(value) return isinf(value);
+      #endif
+      #if !defined(CV_INLINE_ISINF_FLT)
+        #define CV_INLINE_ISINF_FLT(value) return isinf(value);
+      #endif
+    #endif
+  #endif
+
+#endif // defined(__CUDACC__)
+
+/** @brief Rounds floating-point number to the nearest integer
+
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int
+cvRound( double value )
+{   // Picks one rounding implementation at compile time; every branch returns an int.
+#if defined CV_INLINE_ROUND_DBL
+    CV_INLINE_ROUND_DBL(value); // the macro expands to the whole body, including the return statement
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
+    && !defined(__CUDACC__)
+    __m128d t = _mm_set_sd( value );
+    return _mm_cvtsd_si32(t); // SSE2 scalar double -> int32 conversion
+#elif defined _MSC_VER && defined _M_IX86
+    int t; // receives the result stored by the x87 fistp instruction below
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif defined CV_ICC || defined __GNUC__
+    return (int)(lrint(value)); // C library rounding (uses the current floating-point rounding mode)
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+       the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5 : -0.5)); // portable fallback: shift by half towards the sign, then truncate
+#endif
+}
+
+
+/** @brief Rounds floating-point number to the nearest integer not larger than the original.
+
+ The function computes an integer i such that:
+ \f[i \le \texttt{value} < i+1\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvFloor( double value )
+{   // Largest int not greater than value.
+#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
+    && ( \
+        defined(__PPC64__) \
+    )
+    return __builtin_floor(value);
+#else
+    const int truncated = (int)value; // the cast truncates towards zero
+    return truncated > value ? truncated - 1 : truncated; // step down when truncation rounded a negative value up
+#endif
+}
+
+/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
+
+ The function computes an integer i such that:
+ \f[i-1 < \texttt{value} \le i\f]
+ @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
+ result is not defined.
+ */
+CV_INLINE int cvCeil( double value )
+{   // Smallest int not less than value.
+#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
+    && ( \
+        defined(__PPC64__) \
+    )
+    return __builtin_ceil(value);
+#else
+    const int truncated = (int)value; // the cast truncates towards zero
+    return truncated < value ? truncated + 1 : truncated; // step up when truncation rounded a positive value down
+#endif
+}
+
+/** @brief Determines if the argument is Not A Number.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is Not A Number (as defined by IEEE754 standard), 0
+ otherwise. */
+CV_INLINE int cvIsNaN( double value )
+{
+#if defined CV_INLINE_ISNAN_DBL
+    CV_INLINE_ISNAN_DBL(value); // the macro expands to a complete return statement
+#else
+    Cv64suf ieee754;
+    ieee754.f = value; // reinterpret the double's bit pattern through the union
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
+           ((unsigned)ieee754.u != 0) > 0x7ff00000; // sign-stripped high word, +1 if any low-word bit is set; above 0x7ff00000 only for all-ones exponent with a non-zero mantissa
+#endif
+}
+
+/** @brief Determines if the argument is Infinity.
+
+ @param value The input floating-point value
+
+ The function returns 1 if the argument is a plus or minus infinity (as defined by IEEE754 standard)
+ and 0 otherwise. */
+CV_INLINE int cvIsInf( double value )
+{   // Returns non-zero only for +/-infinity: exponent all ones and the whole 52-bit mantissa zero.
+#if defined CV_INLINE_ISINF_DBL
+    CV_INLINE_ISINF_DBL(value); // the macro expands to a complete return statement
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__PPC64__)
+    Cv64suf ieee754;
+    ieee754.f = value; // reinterpret the double's bit pattern through the union
+    return (ieee754.u & 0x7fffffffffffffff) ==
+                        0x7ff0000000000000; // compare all 63 non-sign bits (the old mask dropped the low 32 bits, so some NaNs counted as Inf)
+#else
+    Cv64suf ieee754;
+    ieee754.f = value; // reinterpret the double's bit pattern through the union
+    return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
+            (unsigned)ieee754.u == 0; // same test done on two 32-bit halves
+#endif
+}
+
+#ifdef __cplusplus
+
+/** @overload */
+CV_INLINE int cvRound(float value)
+{   // Single-precision counterpart of cvRound(double); the implementation is chosen at compile time.
+#if defined CV_INLINE_ROUND_FLT
+    CV_INLINE_ROUND_FLT(value); // the macro expands to the whole body, including the return statement
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
+    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
+    && !defined(__CUDACC__)
+    __m128 t = _mm_set_ss( value );
+    return _mm_cvtss_si32(t); // SSE scalar float -> int32 conversion
+#elif defined _MSC_VER && defined _M_IX86
+    int t; // receives the result stored by the x87 fistp instruction below
+    __asm
+    {
+        fld value;
+        fistp t;
+    }
+    return t;
+#elif defined CV_ICC || defined __GNUC__
+    return (int)(lrintf(value)); // C library rounding (uses the current floating-point rounding mode)
+#else
+    /* it's ok if round does not comply with IEEE754 standard;
+     the tests should allow +/-1 difference when the tested functions use round */
+    return (int)(value + (value >= 0 ? 0.5f : -0.5f)); // portable fallback: shift by half towards the sign, then truncate
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvRound( int value )
+{
+    return value; // already an integer: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvFloor( float value )
+{   // Largest int not greater than value (single-precision input).
+#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
+    && ( \
+        defined(__PPC64__) \
+    )
+    return __builtin_floorf(value);
+#else
+    const int truncated = (int)value; // the cast truncates towards zero
+    return truncated > value ? truncated - 1 : truncated; // step down when truncation rounded a negative value up
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvFloor( int value )
+{
+    return value; // already an integer: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvCeil( float value )
+{   // Smallest int not less than value (single-precision input).
+#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
+    && ( \
+        defined(__PPC64__) \
+    )
+    return __builtin_ceilf(value);
+#else
+    const int truncated = (int)value; // the cast truncates towards zero
+    return truncated < value ? truncated + 1 : truncated; // step up when truncation rounded a positive value down
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvCeil( int value )
+{
+    return value; // already an integer: returned unchanged
+}
+
+/** @overload */
+CV_INLINE int cvIsNaN( float value )
+{
+#if defined CV_INLINE_ISNAN_FLT
+    CV_INLINE_ISNAN_FLT(value); // the macro expands to a complete return statement
+#else
+    Cv32suf ieee754;
+    ieee754.f = value; // reinterpret the float's bit pattern through the union
+    return (ieee754.u & 0x7fffffff) > 0x7f800000; // sign bit masked off; above 0x7f800000 means all-ones exponent with a non-zero mantissa
+#endif
+}
+
+/** @overload */
+CV_INLINE int cvIsInf( float value )
+{
+#if defined CV_INLINE_ISINF_FLT
+    CV_INLINE_ISINF_FLT(value); // the macro expands to a complete return statement
+#else
+    Cv32suf ieee754;
+    ieee754.f = value; // reinterpret the float's bit pattern through the union
+    return (ieee754.u & 0x7fffffff) == 0x7f800000; // sign bit masked off; equal to 0x7f800000 means all-ones exponent with a zero mantissa
+#endif
+}
+
+#endif // __cplusplus
+
+//! @} core_utils
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/hal.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/hal.hpp
new file mode 100644
index 0000000..0d68078
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/hal.hpp
@@ -0,0 +1,256 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_HPP
+#define OPENCV_HAL_HPP
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/hal/interface.h"
+
+namespace cv { namespace hal {
+
+//! @addtogroup core_hal_functions
+//! @{
+
+CV_EXPORTS int normHamming(const uchar* a, int n);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
+
+CV_EXPORTS int normHamming(const uchar* a, int n, int cellSize);
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
+
+CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
+CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
+CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
+CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);
+
+CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+
+CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
+CV_EXPORTS float normL1_(const float* a, const float* b, int n);
+CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
+
+CV_EXPORTS void exp32f(const float* src, float* dst, int n);
+CV_EXPORTS void exp64f(const double* src, double* dst, int n);
+CV_EXPORTS void log32f(const float* src, float* dst, int n);
+CV_EXPORTS void log64f(const double* src, double* dst, int n);
+
+CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees);
+CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void sqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt64f(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt32f(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt64f(const double* src, double* dst, int len);
+
+CV_EXPORTS void split8u(const uchar* src, uchar** dst, int len, int cn );
+CV_EXPORTS void split16u(const ushort* src, ushort** dst, int len, int cn );
+CV_EXPORTS void split32s(const int* src, int** dst, int len, int cn );
+CV_EXPORTS void split64s(const int64* src, int64** dst, int len, int cn );
+
+CV_EXPORTS void merge8u(const uchar** src, uchar* dst, int len, int cn );
+CV_EXPORTS void merge16u(const ushort** src, ushort* dst, int len, int cn );
+CV_EXPORTS void merge32s(const int** src, int* dst, int len, int cn );
+CV_EXPORTS void merge64s(const int64** src, int64* dst, int len, int cn );
+
+CV_EXPORTS void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+CV_EXPORTS void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* );
+
+CV_EXPORTS void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+CV_EXPORTS void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop);
+
+CV_EXPORTS void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void recip8u( const uchar *, size_t, const uchar * src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip8s( const schar *, size_t, const schar * src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16u( const ushort *, size_t, const ushort * src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip16s( const short *, size_t, const short * src2, size_t step2, short* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32s( const int *, size_t, const int * src2, size_t step2, int* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip32f( const float *, size_t, const float * src2, size_t step2, float* dst, size_t step, int width, int height, void* scale);
+CV_EXPORTS void recip64f( const double *, size_t, const double * src2, size_t step2, double* dst, size_t step, int width, int height, void* scale);
+
+CV_EXPORTS void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars );
+CV_EXPORTS void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars );
+CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars );
+
+CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len );
+CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len );
+
+CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len );
+CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len );
+
+struct CV_EXPORTS DFT1D // abstract interface: apply() is pure virtual, instances are obtained through create()
+{
+    static Ptr<DFT1D> create(int len, int count, int depth, int flags, bool * useBuffer = 0); // factory; useBuffer is optional (defaults to null). NOTE(review): flags are presumably the CV_HAL_DFT_* values - confirm
+    virtual void apply(const uchar *src, uchar *dst) = 0; // runs the transform from src into dst; buffers are passed as raw bytes
+    virtual ~DFT1D() {} // virtual so deleting through a DFT1D pointer runs the derived destructor
+};
+
+struct CV_EXPORTS DFT2D // abstract interface: apply() is pure virtual, instances are obtained through create()
+{
+    static Ptr<DFT2D> create(int width, int height, int depth,
+                             int src_channels, int dst_channels,
+                             int flags, int nonzero_rows = 0); // factory; nonzero_rows is optional (defaults to 0). NOTE(review): flags are presumably the CV_HAL_DFT_* values - confirm
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // raw byte buffers, each with its own step. NOTE(review): steps presumably in bytes per row - confirm
+    virtual ~DFT2D() {} // virtual so deleting through a DFT2D pointer runs the derived destructor
+};
+
+struct CV_EXPORTS DCT2D // abstract interface: apply() is pure virtual, instances are obtained through create()
+{
+    static Ptr<DCT2D> create(int width, int height, int depth, int flags); // factory returning a concrete implementation behind the interface
+    virtual void apply(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step) = 0; // raw byte buffers, each with its own step. NOTE(review): steps presumably in bytes per row - confirm
+    virtual ~DCT2D() {} // virtual so deleting through a DCT2D pointer runs the derived destructor
+};
+
+//! @} core_hal_functions
+
+//=============================================================================
+// for binary compatibility with 3.0
+
+//! @cond IGNORED
+
+CV_EXPORTS int LU(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS int LU(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(float* A, size_t astep, int m, float* b, size_t bstep, int n);
+CV_EXPORTS bool Cholesky(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+
+CV_EXPORTS void exp(const float* src, float* dst, int n);
+CV_EXPORTS void exp(const double* src, double* dst, int n);
+CV_EXPORTS void log(const float* src, float* dst, int n);
+CV_EXPORTS void log(const double* src, double* dst, int n);
+
+CV_EXPORTS void fastAtan2(const float* y, const float* x, float* dst, int n, bool angleInDegrees);
+CV_EXPORTS void magnitude(const float* x, const float* y, float* dst, int n);
+CV_EXPORTS void magnitude(const double* x, const double* y, double* dst, int n);
+CV_EXPORTS void sqrt(const float* src, float* dst, int len);
+CV_EXPORTS void sqrt(const double* src, double* dst, int len);
+CV_EXPORTS void invSqrt(const float* src, float* dst, int len);
+CV_EXPORTS void invSqrt(const double* src, double* dst, int len);
+
+//! @endcond
+
+}} //cv::hal
+
+#endif //OPENCV_HAL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/interface.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/interface.h
new file mode 100644
index 0000000..6f0a83d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/interface.h
@@ -0,0 +1,190 @@
+#ifndef OPENCV_CORE_HAL_INTERFACE_H
+#define OPENCV_CORE_HAL_INTERFACE_H
+
+//! @addtogroup core_hal_interface
+//! @{
+
+//! @name Return codes
+//! @{
+#define CV_HAL_ERROR_OK 0
+#define CV_HAL_ERROR_NOT_IMPLEMENTED 1
+#define CV_HAL_ERROR_UNKNOWN -1
+//! @}
+
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stddef.h>
+#include <stdbool.h>
+#endif
+
+//! @name Data types
+//! primitive types
+//! - schar  - signed 1 byte integer
+//! - uchar  - unsigned 1 byte integer
+//! - short  - signed 2 byte integer
+//! - ushort - unsigned 2 byte integer
+//! - int    - signed 4 byte integer
+//! - uint   - unsigned 4 byte integer
+//! - int64  - signed 8 byte integer
+//! - uint64 - unsigned 8 byte integer
+//! @{
+#if !defined _MSC_VER && !defined __BORLANDC__
+#  if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__
+#    include <cstdint>
+#    ifdef __NEWLIB__
+        typedef unsigned int uint;
+#    else
+        typedef std::uint32_t uint;
+#    endif
+#  else
+#    include <stdint.h>
+     typedef uint32_t uint;
+#  endif
+#else
+   typedef unsigned uint;
+#endif
+
+typedef signed char schar;
+
+#ifndef __IPL_H__
+   typedef unsigned char uchar;
+   typedef unsigned short ushort;
+#endif
+
+#if defined _MSC_VER || defined __BORLANDC__
+   typedef __int64 int64;
+   typedef unsigned __int64 uint64;
+#  define CV_BIG_INT(n)   n##I64
+#  define CV_BIG_UINT(n)  n##UI64
+#else
+   typedef int64_t int64;
+   typedef uint64_t uint64;
+#  define CV_BIG_INT(n)   n##LL
+#  define CV_BIG_UINT(n)  n##ULL
+#endif
+
+#define CV_USRTYPE1 (void)"CV_USRTYPE1 support has been dropped in OpenCV 4.0"
+
+#define CV_CN_MAX     512 // NOTE(review): presumably the upper bound on channel count; nothing in this header enforces it
+#define CV_CN_SHIFT   3 // number of low bits that hold the depth code; the channel field starts above them (see CV_MAKETYPE)
+#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT) // = 8: how many depth codes fit in CV_CN_SHIFT bits
+
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+#define CV_16F  7
+
+#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1) // = 7: mask selecting the low CV_CN_SHIFT bits
+#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK) // extracts the depth code from a packed type value
+
+#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT)) // packs depth in the low bits and (cn - 1) above them, e.g. CV_MAKETYPE(CV_8U,3) == 16
+#define CV_MAKE_TYPE CV_MAKETYPE // alternate spelling of the same macro
+
+#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
+#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
+#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
+#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
+#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
+
+#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
+#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
+#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
+#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
+#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
+
+#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
+#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
+#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
+#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
+#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
+
+#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
+#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
+#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
+#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
+#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
+
+#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
+#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
+#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
+#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
+#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
+
+#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
+#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
+#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
+#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
+#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
+
+#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
+#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
+#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
+#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
+#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+
+#define CV_16FC1 CV_MAKETYPE(CV_16F,1)
+#define CV_16FC2 CV_MAKETYPE(CV_16F,2)
+#define CV_16FC3 CV_MAKETYPE(CV_16F,3)
+#define CV_16FC4 CV_MAKETYPE(CV_16F,4)
+#define CV_16FC(n) CV_MAKETYPE(CV_16F,(n))
+//! @}
+
+//! @name Comparison operation
+//! @sa cv::CmpTypes
+//! @{
+#define CV_HAL_CMP_EQ 0
+#define CV_HAL_CMP_GT 1
+#define CV_HAL_CMP_GE 2
+#define CV_HAL_CMP_LT 3
+#define CV_HAL_CMP_LE 4
+#define CV_HAL_CMP_NE 5
+//! @}
+
+//! @name Border processing modes
+//! @sa cv::BorderTypes
+//! @{
+#define CV_HAL_BORDER_CONSTANT 0
+#define CV_HAL_BORDER_REPLICATE 1
+#define CV_HAL_BORDER_REFLECT 2
+#define CV_HAL_BORDER_WRAP 3
+#define CV_HAL_BORDER_REFLECT_101 4
+#define CV_HAL_BORDER_TRANSPARENT 5
+#define CV_HAL_BORDER_ISOLATED 16
+//! @}
+
+//! @name DFT flags
+//! @{
+#define CV_HAL_DFT_INVERSE        1
+#define CV_HAL_DFT_SCALE          2
+#define CV_HAL_DFT_ROWS           4
+#define CV_HAL_DFT_COMPLEX_OUTPUT 16
+#define CV_HAL_DFT_REAL_OUTPUT    32
+#define CV_HAL_DFT_TWO_STAGE      64
+#define CV_HAL_DFT_STAGE_COLS    128
+#define CV_HAL_DFT_IS_CONTINUOUS 512
+#define CV_HAL_DFT_IS_INPLACE 1024
+//! @}
+
+//! @name SVD flags
+//! @{
+#define CV_HAL_SVD_NO_UV    1
+#define CV_HAL_SVD_SHORT_UV 2
+#define CV_HAL_SVD_MODIFY_A 4
+#define CV_HAL_SVD_FULL_UV  8
+//! @}
+
+//! @name Gemm flags
+//! @{
+#define CV_HAL_GEMM_1_T 1
+#define CV_HAL_GEMM_2_T 2
+#define CV_HAL_GEMM_3_T 4
+//! @}
+
+//! @}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin.hpp
new file mode 100644
index 0000000..ac331f2
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin.hpp
@@ -0,0 +1,706 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_HPP
+#define OPENCV_HAL_INTRIN_HPP
+
+#include <cmath>
+#include <float.h>
+#include <stdlib.h>
+#include "opencv2/core/cvdef.h"
+
+#define OPENCV_HAL_ADD(a, b) ((a) + (b)) // expands to the sum of both operands
+#define OPENCV_HAL_AND(a, b) ((a) & (b)) // expands to the bitwise AND of both operands
+#define OPENCV_HAL_NOP(a) (a)            // identity: expands to its argument unchanged
+#define OPENCV_HAL_1ST(a, b) (a)         // expands to the first argument; the second is discarded
+
+namespace {
+// Index of the lowest set bit of value; every branch returns 32 for value == 0, matching _tzcnt_u32.
+inline unsigned int trailingZeros32(unsigned int value) {
+#if defined(_MSC_VER)
+#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
+    unsigned long index = 0;
+    return _BitScanForward(&index, value) ? (unsigned int)index : 32u; // returns 0 when no bit is set
+#elif defined(__clang__)
+    // clang-cl doesn't export _tzcnt_u32 for non BMI systems
+    return value ? __builtin_ctz(value) : 32;
+#else
+    return _tzcnt_u32(value);
+#endif
+#elif defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
+    // __builtin_ctz(0) is undefined, so zero is handled explicitly
+    return value ? __builtin_ctz(value) : 32;
+#elif defined(__ICC) || defined(__INTEL_COMPILER)
+    return value ? _bit_scan_forward(value) : 32;
+#else
+    static const int MultiplyDeBruijnBitPosition[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
+    // the multiply-and-lookup alone would report bit 0 for a zero input
+    return value ? MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27] : 32;
+#endif
+}
+}
+
+// unlike the HAL API, which lives in cv::hal,
+// we put the intrinsics into the cv namespace to make them
+// easier to access from within OpenCV code
+namespace cv {
+
+namespace hal {
+
+enum StoreMode // store-mode selector; its consumers are not visible in this header, so the notes below are inferred from the names
+{
+    STORE_UNALIGNED = 0, // presumably: destination has no alignment guarantee - TODO confirm in the backend headers
+    STORE_ALIGNED = 1, // presumably: destination is aligned for the vector width - TODO confirm
+    STORE_ALIGNED_NOCACHE = 2 // presumably: aligned store that bypasses the cache - TODO confirm
+};
+
+}
+
+// TODO FIXIT: Don't use "God" traits. Split on separate cases.
+template<typename _Tp> struct V_TypeTraits
+{ // primary template is empty; members exist only in the explicit specializations generated by CV_INTRIN_DEF_TYPE_TRAITS*
+};
+// Generates V_TypeTraits<type>: the related typedefs plus bit-for-bit conversion to/from int_type through a union.
+#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
+    template<> struct V_TypeTraits<type> \
+    { \
+        typedef type value_type; \
+        typedef int_type_ int_type; \
+        typedef abs_type_ abs_type; \
+        typedef uint_type_ uint_type; \
+        typedef w_type_ w_type; \
+        typedef q_type_ q_type; \
+        typedef sum_type_ sum_type; \
+    \
+        static inline int_type reinterpret_int(type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.l = x; \
+            return v.i; \
+        } \
+    \
+        static inline type reinterpret_from_int(int_type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.i = x; \
+            return v.l; \
+        } \
+    }
+// Same expansion as CV_INTRIN_DEF_TYPE_TRAITS except that no q_type typedef is emitted (there is no q_type_ parameter).
+#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
+    template<> struct V_TypeTraits<type> \
+    { \
+        typedef type value_type; \
+        typedef int_type_ int_type; \
+        typedef abs_type_ abs_type; \
+        typedef uint_type_ uint_type; \
+        typedef w_type_ w_type; \
+        typedef sum_type_ sum_type; \
+    \
+        static inline int_type reinterpret_int(type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.l = x; \
+            return v.i; \
+        } \
+    \
+        static inline type reinterpret_from_int(int_type x) \
+        { \
+            union { type l; int_type i; } v; \
+            v.i = x; \
+            return v.l; \
+        } \
+    }
+
+CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned); // argument order: type, int, uint, abs, w, q, sum
+CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
+CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
+CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned); // argument order: type, int, uint, abs, w, sum
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64); // the 64-bit element types pass void as w_type
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
+CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
+
+#ifndef CV_DOXYGEN
+
+#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
+#ifdef CV_FORCE_SIMD128_CPP
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#elif defined(CV_CPU_DISPATCH_MODE)
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#else
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
+    #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
+#endif
+#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN // opened and closed on the next line with nothing inside: this only declares the namespace
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; // a using-directive needs the namespace to be declared already, hence the empty pair above
+#endif
+}
+
+#ifdef CV_DOXYGEN
+#   undef CV_AVX2
+#   undef CV_SSE2
+#   undef CV_NEON
+#   undef CV_VSX
+#   undef CV_FP16
+#   undef CV_MSA
+#   undef CV_RVV
+#endif
+
+#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP)
+#define CV__SIMD_FORWARD 128
+#include "opencv2/core/hal/intrin_forward.hpp"
+#endif
+
+#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP) // 128-bit backend selection: only the first matching branch of this chain is included
+
+#include "opencv2/core/hal/intrin_sse_em.hpp"
+#include "opencv2/core/hal/intrin_sse.hpp"
+
+#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_neon.hpp"
+
+#elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
+#define CV_SIMD128_CPP 0 // the only branch of this chain that defines CV_SIMD128_CPP itself before including its backend
+#include "opencv2/core/hal/intrin_rvv071.hpp"
+
+#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_vsx.hpp"
+
+#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
+
+#include "opencv2/core/hal/intrin_msa.hpp"
+
+#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
+#include "opencv2/core/hal/intrin_wasm.hpp"
+
+#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
+#include "opencv2/core/hal/intrin_rvv.hpp"
+
+#else // reached when no branch above matched, which is always the case when CV_FORCE_SIMD128_CPP is defined
+
+#include "opencv2/core/hal/intrin_cpp.hpp"
+
+#endif
+
+// AVX2 can be used together with SSE2, so
+// we define those two sets of intrinsics at once.
+// Most of the intrinsics do not conflict (the proper overloaded variant is
+// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
+// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
+// Correspondingly, the wide intrinsics (which are mapped to the "widest"
+// available instruction set) will get vx_ prefix
+// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
+#if CV_AVX2
+
+#define CV__SIMD_FORWARD 256
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx.hpp"
+
+#endif
+
+// AVX512 can be used together with SSE2 and AVX2, so
+// we define those sets of intrinsics at once.
+// For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load().
+// Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load())
+#if CV_AVX512_SKX
+
+#define CV__SIMD_FORWARD 512
+#include "opencv2/core/hal/intrin_forward.hpp"
+#include "opencv2/core/hal/intrin_avx512.hpp"
+
+#endif
+
+//! @cond IGNORED
+
+namespace cv {
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+#endif
+
+#ifndef CV_SIMD128
+#define CV_SIMD128 0
+#endif
+
+#ifndef CV_SIMD128_CPP
+#define CV_SIMD128_CPP 0
+#endif
+
+#ifndef CV_SIMD128_64F
+#define CV_SIMD128_64F 0
+#endif
+
+#ifndef CV_SIMD256
+#define CV_SIMD256 0
+#endif
+
+#ifndef CV_SIMD256_64F
+#define CV_SIMD256_64F 0
+#endif
+
+#ifndef CV_SIMD512
+#define CV_SIMD512 0
+#endif
+
+#ifndef CV_SIMD512_64F
+#define CV_SIMD512_64F 0
+#endif
+
+#ifndef CV_SIMD128_FP16
+#define CV_SIMD128_FP16 0
+#endif
+
+#ifndef CV_SIMD256_FP16
+#define CV_SIMD256_FP16 0
+#endif
+
+#ifndef CV_SIMD512_FP16
+#define CV_SIMD512_FP16 0
+#endif
+
+//==================================================================================================
+
+template<typename _Tp> struct V_RegTraits
+{ // primary template is empty; register types get explicit specializations through CV_DEF_REG_TRAITS
+};
+// Specializes V_RegTraits for _reg. The prefix, lane_type and suffix parameters are accepted but not used in the expansion.
+#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
+    template<> struct V_RegTraits<_reg> \
+    { \
+        typedef _reg reg; \
+        typedef _u_reg u_reg; \
+        typedef _w_reg w_reg; \
+        typedef _q_reg q_reg; \
+        typedef _int_reg int_reg; \
+        typedef _round_reg round_reg; \
+    }
+
+#if CV_SIMD128 || CV_SIMD128_CPP
+    CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
+    CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
+    CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
+    CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
+    CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
+    CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
+#if CV_SIMD128_64F || CV_SIMD128_CPP
+    CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
+#else
+    CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
+#endif
+    CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
+    CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
+#if CV_SIMD128_64F
+    CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
+#endif
+#endif
+
+#if CV_SIMD256
+    CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
+    CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
+    CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
+    CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
+    CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
+    CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
+    CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
+    CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
+    CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
+    CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
+#endif
+
+#if CV_SIMD512
+    CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
+    CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
+    CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
+    CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
+    CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
+    CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
+#endif
+//! @endcond
+
+#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
+#define CV__SIMD_NAMESPACE simd512
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD512_64F
+    #define CV_SIMD_FP16 CV_SIMD512_FP16
+    #define CV_SIMD_WIDTH 64
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x64    v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x64     v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x32   v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x32    v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x16   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x16    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x8    v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x8     v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x16  v_float32;
+    #if CV_SIMD512_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x8   v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v512##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
+#define CV__SIMD_NAMESPACE simd256
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD 1
+    #define CV_SIMD_64F CV_SIMD256_64F
+    #define CV_SIMD_FP16 CV_SIMD256_FP16
+    #define CV_SIMD_WIDTH 32
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x32   v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x32    v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x16  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x16   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x8   v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x8    v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x4   v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x4    v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x8  v_float32;
+    #if CV_SIMD256_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x4  v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v256##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
+#if defined CV_SIMD128_CPP // NOTE(review): CV_SIMD128_CPP is defaulted to 0 earlier in this header, so it is always defined here and this test is always true
+#define CV__SIMD_NAMESPACE simd128_cpp
+#else // NOTE(review): appears unreachable for the reason above; testing the value instead would rename the namespace, so confirm before changing
+#define CV__SIMD_NAMESPACE simd128
+#endif
+namespace CV__SIMD_NAMESPACE {
+    #define CV_SIMD CV_SIMD128
+    #define CV_SIMD_64F CV_SIMD128_64F
+    #define CV_SIMD_WIDTH 16
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @brief Maximum available vector register capacity 8-bit unsigned integer values
+    typedef v_uint8x16  v_uint8;
+    //! @brief Maximum available vector register capacity 8-bit signed integer values
+    typedef v_int8x16   v_int8;
+    //! @brief Maximum available vector register capacity 16-bit unsigned integer values
+    typedef v_uint16x8  v_uint16;
+    //! @brief Maximum available vector register capacity 16-bit signed integer values
+    typedef v_int16x8   v_int16;
+    //! @brief Maximum available vector register capacity 32-bit unsigned integer values
+    typedef v_uint32x4  v_uint32;
+    //! @brief Maximum available vector register capacity 32-bit signed integer values
+    typedef v_int32x4   v_int32;
+    //! @brief Maximum available vector register capacity 64-bit unsigned integer values
+    typedef v_uint64x2  v_uint64;
+    //! @brief Maximum available vector register capacity 64-bit signed integer values
+    typedef v_int64x2   v_int64;
+    //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
+    typedef v_float32x4 v_float32;
+    #if CV_SIMD128_64F
+    //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
+    typedef v_float64x2 v_float64;
+    #endif
+//! @}
+
+    #define VXPREFIX(func) v##func
+} // namespace
+using namespace CV__SIMD_NAMESPACE;
+#endif
+
+namespace CV__SIMD_NAMESPACE {
+//! @addtogroup core_hal_intrin
+//! @{
+    //! @name Wide init with value
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to a specific value
+    inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
+    inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
+    inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
+    inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
+    inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
+    inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
+    inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
+    inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
+    inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
+#if CV_SIMD_64F
+    inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
+#endif
+    //! @}
+
+    //! @name Wide init with zero
+    //! @{
+    //! @brief Create maximum available capacity vector with elements set to zero
+    inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
+    inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
+    inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
+    inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
+    inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
+    inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
+    inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
+    inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
+    inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
+#if CV_SIMD_64F
+    inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory
+    inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
+    inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load from memory(aligned)
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory(aligned)
+    inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+    inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load lower half from memory
+    //! @{
+    //! @brief Load lower half of maximum available capacity register from memory
+    inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+    inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
+#endif
+    //! @}
+
+    //! @name Wide load halves from memory
+    //! @{
+    //! @brief Load maximum available capacity register contents from two memory blocks
+    inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+    inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#if CV_SIMD_64F
+    inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of elements
+    //! @{
+    //! @brief Load maximum available capacity register contents with array elements by provided indexes
+    inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+    inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#if CV_SIMD_64F
+    inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of element pairs
+    //! @{
+    //! @brief Load maximum available capacity register contents with array element pairs by provided indexes
+    inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+    inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#if CV_SIMD_64F
+    inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
+#endif
+    //! @}
+
+    //! @name Wide LUT of element quads
+    //! @{
+    //! @brief Load maximum available capacity register contents with array element quads by provided indexes
+    inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
+    //! @}
+
+    //! @name Wide load with double expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with double expand
+    inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
+    inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
+    //! @}
+
+    //! @name Wide load with quad expansion
+    //! @{
+    //! @brief Load maximum available capacity register contents from memory with quad expand
+    inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
+    //! @}
+
+    /** @brief SIMD processing state cleanup call */
+    inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
+
+
+//! @cond IGNORED
+
+    // Kept for backward compatibility: hands dst and v straight to v_store.
+    template<typename _Tp, typename _Tvec> static inline
+    void vx_store(_Tp* dst, const _Tvec& v) { v_store(dst, v); }
+    // Kept for backward compatibility: hands dst and v straight to v_store_aligned.
+    template<typename _Tp, typename _Tvec> static inline
+    void vx_store_aligned(_Tp* dst, const _Tvec& v) { v_store_aligned(dst, v); }
+
+//! @endcond
+
+
+//! @}
+    #undef VXPREFIX
+} // namespace
+
+//! @cond IGNORED
+#ifndef CV_SIMD_64F
+#define CV_SIMD_64F 0
+#endif
+
+#ifndef CV_SIMD_FP16
+#define CV_SIMD_FP16 0  //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
+#endif
+
+#ifndef CV_SIMD
+#define CV_SIMD 0
+#endif
+
+#include "simd_utils.impl.hpp"
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+#endif
+
+} // cv::
+
+//! @endcond
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx.hpp
new file mode 100644
index 0000000..979b616
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx.hpp
@@ -0,0 +1,3177 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX_HPP
+#define OPENCV_HAL_INTRIN_AVX_HPP
+
+// Capability flags published by this AVX header: 256-bit SIMD and its 64-bit float
+// variant are declared available (1); native FP16 arithmetic is declared unavailable (0).
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+#define CV_SIMD256_FP16 0  // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Utils ////////////
+
+// Assemble a 256-bit register from two 128-bit halves: `lo` is widened into the low
+// 128 bits and `hi` is then inserted at half index 1 (the upper 128 bits).
+inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
+{
+    const __m256i widened = _mm256_castsi128_si256(lo);
+    return _mm256_inserti128_si256(widened, hi, 1);
+}
+
+// Single-precision variant of the same combine.
+inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
+{
+    const __m256 widened = _mm256_castps128_ps256(lo);
+    return _mm256_insertf128_ps(widened, hi, 1);
+}
+
+// Double-precision variant of the same combine.
+inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
+{
+    const __m256d widened = _mm256_castpd128_pd256(lo);
+    return _mm256_insertf128_pd(widened, hi, 1);
+}
+
+// Return the lowest 32-bit element of a 256-bit integer register as an int.
+inline int _v_cvtsi256_si32(const __m256i& a)
+{
+    const __m128i low_half = _mm256_castsi256_si128(a);
+    return _mm_cvtsi128_si32(low_half);
+}
+
+// Reorder the four 64-bit elements of v from (0, 1, 2, 3) to (0, 2, 1, 3), i.e. swap the
+// two middle elements, as encoded by _MM_SHUFFLE(3, 1, 2, 0).
+inline __m256i _v256_shuffle_odd_64(const __m256i& v)
+{ return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+// Same reordering for a register of four doubles.
+inline __m256d _v256_shuffle_odd_64(const __m256d& v)
+{ return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
+
+// Raw-register wrappers that pass the compile-time selector `imm` to the 128-bit-half
+// permute intrinsic matching the register type (integer, float, double).
+template<int imm>
+inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
+{ return _mm256_permute2x128_si256(a, b, imm); }
+
+template<int imm>
+inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
+{ return _mm256_permute2f128_ps(a, b, imm); }
+
+template<int imm>
+inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
+{ return _mm256_permute2f128_pd(a, b, imm); }
+
+// Typed wrapper: applies _v256_permute2x128<imm> to the `.val` members of a and b and
+// wraps the resulting register in _Tpvec.
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
+
+// Raw-register wrappers that pass the compile-time selector `imm` to the 64-bit-element
+// permute intrinsic for integer and double registers.
+template<int imm>
+inline __m256i _v256_permute4x64(const __m256i& a)
+{ return _mm256_permute4x64_epi64(a, imm); }
+
+template<int imm>
+inline __m256d _v256_permute4x64(const __m256d& a)
+{ return _mm256_permute4x64_pd(a, imm); }
+
+// Typed wrapper: applies _v256_permute4x64<imm> to a.val and wraps the result in _Tpvec.
+// Only __m256i and __m256d overloads of the raw helper exist, so _Tpvec::val must be one
+// of those types.
+template<int imm, typename _Tpvec>
+inline _Tpvec v256_permute4x64(const _Tpvec& a)
+{ return _Tpvec(_v256_permute4x64<imm>(a.val)); }
+
+// Return the upper 128 bits (half index 1) of a 256-bit register; one overload per
+// register type.
+inline __m128i _v256_extract_high(const __m256i& v)
+{ return _mm256_extracti128_si256(v, 1); }
+
+inline __m128  _v256_extract_high(const __m256& v)
+{ return _mm256_extractf128_ps(v, 1); }
+
+inline __m128d _v256_extract_high(const __m256d& v)
+{ return _mm256_extractf128_pd(v, 1); }
+
+// Return the lower 128 bits of a 256-bit register via a cast intrinsic; one overload
+// per register type.
+inline __m128i _v256_extract_low(const __m256i& v)
+{ return _mm256_castsi256_si128(v); }
+
+inline __m128  _v256_extract_low(const __m256& v)
+{ return _mm256_castps256_ps128(v); }
+
+inline __m128d _v256_extract_low(const __m256d& v)
+{ return _mm256_castpd256_pd128(v); }
+
+// Narrow two registers of unsigned 32-bit values to unsigned 16-bit values. Every
+// element is first clamped to 65535 with an unsigned min, then both inputs are packed
+// with _mm256_packus_epi32.
+inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
+{
+    const __m256i limit = _mm256_set1_epi32(65535);
+    const __m256i a_clamped = _mm256_min_epu32(a, limit);
+    const __m256i b_clamped = _mm256_min_epu32(b, limit);
+    return _mm256_packus_epi32(a_clamped, b_clamped);
+}
+
+// Extract the 8-bit element at compile-time index i from a 256-bit register.
+// When the guard below holds, _mm256_extract_epi8 is used directly; otherwise the
+// 128-bit half selected by (i >> 4) is extracted and byte (i & 15) is read from it.
+template<int i>
+inline int _v256_extract_epi8(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
+    return _mm256_extract_epi8(a, i);
+#else
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
+    return _mm_extract_epi8(b, i & 15);  // SSE4.1
+#endif
+}
+
+// Extract the 16-bit element at compile-time index i from a 256-bit register.
+// When the guard below holds, _mm256_extract_epi16 is used directly; otherwise the
+// 128-bit half selected by (i >> 3) is extracted and element (i & 7) is read from it.
+// NOTE(review): the guard tests CV__SIMD_HAVE_mm256_extract_epi8 although this is the
+// epi16 variant - presumably one flag covers the whole intrinsic family; confirm.
+template<int i>
+inline int _v256_extract_epi16(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
+    return _mm256_extract_epi16(a, i);
+#else
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
+    return _mm_extract_epi16(b, i & 7);  // SSE2
+#endif
+}
+
+// Extract the 32-bit element at compile-time index i from a 256-bit register.
+// When the guard below holds, _mm256_extract_epi32 is used directly; otherwise the
+// 128-bit half selected by (i >> 2) is extracted and element (i & 3) is read from it.
+// NOTE(review): the guard tests CV__SIMD_HAVE_mm256_extract_epi8 although this is the
+// epi32 variant - presumably one flag covers the whole intrinsic family; confirm.
+template<int i>
+inline int _v256_extract_epi32(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
+    return _mm256_extract_epi32(a, i);
+#else
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
+    return _mm_extract_epi32(b, i & 3);  // SSE4.1
+#endif
+}
+
+// Extract the 64-bit element at compile-time index i from a 256-bit register.
+// When the guard below holds, _mm256_extract_epi64 is used directly; otherwise the
+// 128-bit half selected by (i >> 1) is extracted and element (i & 1) is read from it.
+// NOTE(review): the guard tests CV__SIMD_HAVE_mm256_extract_epi8 although this is the
+// epi64 variant - presumably one flag covers the whole intrinsic family; confirm.
+template<int i>
+inline int64 _v256_extract_epi64(const __m256i& a)
+{
+#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
+    return _mm256_extract_epi64(a, i);
+#else
+    __m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
+    return _mm_extract_epi64(b, i & 1);  // SSE4.1
+#endif
+}
+
+///////// Types ////////////
+
+// 256-bit vector holding 32 unsigned 8-bit lanes in one __m256i register.
+struct v_uint8x32
+{
+    typedef uchar lane_type;
+    enum { nlanes = 32 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_uint8x32(__m256i v) : val(v) {}
+
+    // Builds the vector from 32 lane values; v0 is placed in lane 0.
+    v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
+               uchar v4,  uchar v5,  uchar v6,  uchar v7,
+               uchar v8,  uchar v9,  uchar v10, uchar v11,
+               uchar v12, uchar v13, uchar v14, uchar v15,
+               uchar v16, uchar v17, uchar v18, uchar v19,
+               uchar v20, uchar v21, uchar v22, uchar v23,
+               uchar v24, uchar v25, uchar v26, uchar v27,
+               uchar v28, uchar v29, uchar v30, uchar v31)
+        : val(_mm256_setr_epi8(
+            (char)v0,  (char)v1,  (char)v2,  (char)v3,
+            (char)v4,  (char)v5,  (char)v6,  (char)v7,
+            (char)v8,  (char)v9,  (char)v10, (char)v11,
+            (char)v12, (char)v13, (char)v14, (char)v15,
+            (char)v16, (char)v17, (char)v18, (char)v19,
+            (char)v20, (char)v21, (char)v22, (char)v23,
+            (char)v24, (char)v25, (char)v26, (char)v27,
+            (char)v28, (char)v29, (char)v30, (char)v31))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint8x32() {}
+
+    // Returns lane 0: the low 32 bits of val converted to uchar.
+    uchar get0() const
+    {
+        const int low32 = _v_cvtsi256_si32(val);
+        return (uchar)low32;
+    }
+};
+
+// 256-bit vector holding 32 signed 8-bit lanes in one __m256i register.
+struct v_int8x32
+{
+    typedef schar lane_type;
+    enum { nlanes = 32 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_int8x32(__m256i v) : val(v) {}
+
+    // Builds the vector from 32 lane values; v0 is placed in lane 0.
+    v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
+              schar v4,  schar v5,  schar v6,  schar v7,
+              schar v8,  schar v9,  schar v10, schar v11,
+              schar v12, schar v13, schar v14, schar v15,
+              schar v16, schar v17, schar v18, schar v19,
+              schar v20, schar v21, schar v22, schar v23,
+              schar v24, schar v25, schar v26, schar v27,
+              schar v28, schar v29, schar v30, schar v31)
+        : val(_mm256_setr_epi8(
+            v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,
+            v8,  v9,  v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int8x32() {}
+
+    // Returns lane 0: the low 32 bits of val converted to schar.
+    schar get0() const
+    {
+        const int low32 = _v_cvtsi256_si32(val);
+        return (schar)low32;
+    }
+};
+
+// 256-bit vector holding 16 unsigned 16-bit lanes in one __m256i register.
+struct v_uint16x16
+{
+    typedef ushort lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_uint16x16(__m256i v) : val(v) {}
+
+    // Builds the vector from 16 lane values; v0 is placed in lane 0.
+    v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
+                ushort v4,  ushort v5,  ushort v6,  ushort v7,
+                ushort v8,  ushort v9,  ushort v10, ushort v11,
+                ushort v12, ushort v13, ushort v14, ushort v15)
+        : val(_mm256_setr_epi16(
+            (short)v0,  (short)v1,  (short)v2,  (short)v3,
+            (short)v4,  (short)v5,  (short)v6,  (short)v7,
+            (short)v8,  (short)v9,  (short)v10, (short)v11,
+            (short)v12, (short)v13, (short)v14, (short)v15))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint16x16() {}
+
+    // Returns lane 0: the low 32 bits of val converted to ushort.
+    ushort get0() const
+    {
+        const int low32 = _v_cvtsi256_si32(val);
+        return (ushort)low32;
+    }
+};
+
+// 256-bit vector holding 16 signed 16-bit lanes in one __m256i register.
+struct v_int16x16
+{
+    typedef short lane_type;
+    enum { nlanes = 16 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_int16x16(__m256i v) : val(v) {}
+
+    // Builds the vector from 16 lane values; v0 is placed in lane 0.
+    v_int16x16(short v0,  short v1,  short v2,  short v3,
+               short v4,  short v5,  short v6,  short v7,
+               short v8,  short v9,  short v10, short v11,
+               short v12, short v13, short v14, short v15)
+        : val(_mm256_setr_epi16(
+            v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int16x16() {}
+
+    // Returns lane 0: the low 32 bits of val converted to short.
+    short get0() const
+    {
+        const int low32 = _v_cvtsi256_si32(val);
+        return (short)low32;
+    }
+};
+
+// 256-bit vector holding 8 unsigned 32-bit lanes in one __m256i register.
+struct v_uint32x8
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_uint32x8(__m256i v) : val(v) {}
+
+    // Builds the vector from 8 lane values; v0 is placed in lane 0.
+    // _mm256_setr_epi32 takes int arguments, so each value is converted explicitly to
+    // int (same bit pattern), matching how the other unsigned vector types in this
+    // file cast to the signed parameter type of their set intrinsic.
+    v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
+               unsigned v4, unsigned v5, unsigned v6, unsigned v7)
+    {
+        val = _mm256_setr_epi32((int)v0, (int)v1, (int)v2,
+            (int)v3, (int)v4, (int)v5, (int)v6, (int)v7);
+    }
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint32x8() {}
+
+    // Returns lane 0: the low 32 bits of val reinterpreted as unsigned.
+    unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
+};
+
+// 256-bit vector holding 8 signed 32-bit lanes in one __m256i register.
+struct v_int32x8
+{
+    typedef int lane_type;
+    enum { nlanes = 8 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_int32x8(__m256i v) : val(v) {}
+
+    // Builds the vector from 8 lane values; v0 is placed in lane 0.
+    v_int32x8(int v0, int v1, int v2, int v3,
+              int v4, int v5, int v6, int v7)
+        : val(_mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int32x8() {}
+
+    // Returns lane 0 (the low 32 bits of val).
+    int get0() const
+    {
+        return _v_cvtsi256_si32(val);
+    }
+};
+
+// 256-bit vector holding 8 single-precision lanes in one __m256 register.
+struct v_float32x8
+{
+    typedef float lane_type;
+    enum { nlanes = 8 };
+    __m256 val;
+
+    // Wraps an existing register value.
+    explicit v_float32x8(__m256 v) : val(v) {}
+
+    // Builds the vector from 8 lane values; v0 is placed in lane 0.
+    v_float32x8(float v0, float v1, float v2, float v3,
+                float v4, float v5, float v6, float v7)
+        : val(_mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float32x8() {}
+
+    // Returns lane 0, read from the low 128 bits of val.
+    float get0() const
+    {
+        const __m128 low_half = _mm256_castps256_ps128(val);
+        return _mm_cvtss_f32(low_half);
+    }
+};
+
+// 256-bit vector holding 4 unsigned 64-bit lanes in one __m256i register.
+struct v_uint64x4
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_uint64x4(__m256i v) : val(v) {}
+
+    // Builds the vector from 4 lane values; v0 is placed in lane 0.
+    v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
+        : val(_mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint64x4() {}
+
+    // Returns lane 0. 64-bit x86 targets read it with one 64-bit move; other targets
+    // read the low and high 32-bit words separately and join them.
+    uint64 get0() const
+    {
+    #if defined __x86_64__ || defined _M_X64
+        return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
+    #else
+        int lo_word = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
+        int hi_word = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
+        return (unsigned)lo_word | ((uint64)(unsigned)hi_word << 32);
+    #endif
+    }
+};
+
+// 256-bit vector holding 4 signed 64-bit lanes in one __m256i register.
+struct v_int64x4
+{
+    typedef int64 lane_type;
+    enum { nlanes = 4 };
+    __m256i val;
+
+    // Wraps an existing register value.
+    explicit v_int64x4(__m256i v) : val(v) {}
+
+    // Builds the vector from 4 lane values; v0 is placed in lane 0.
+    v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
+        : val(_mm256_setr_epi64x(v0, v1, v2, v3))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int64x4() {}
+
+    // Returns lane 0. 64-bit x86 targets read it with one 64-bit move; other targets
+    // read the low and high 32-bit words separately and join them.
+    int64 get0() const
+    {
+    #if defined __x86_64__ || defined _M_X64
+        return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val));
+    #else
+        int lo_word = _mm_cvtsi128_si32(_mm256_castsi256_si128(val));
+        int hi_word = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_srli_epi64(val, 32)));
+        return (int64)((unsigned)lo_word | ((uint64)(unsigned)hi_word << 32));
+    #endif
+    }
+};
+
+// 256-bit vector holding 4 double-precision lanes in one __m256d register.
+struct v_float64x4
+{
+    typedef double lane_type;
+    enum { nlanes = 4 };
+    __m256d val;
+
+    // Wraps an existing register value.
+    explicit v_float64x4(__m256d v) : val(v) {}
+
+    // Builds the vector from 4 lane values; v0 is placed in lane 0.
+    v_float64x4(double v0, double v1, double v2, double v3)
+        : val(_mm256_setr_pd(v0, v1, v2, v3))
+    {}
+
+    // Default construction leaves val uninitialized.
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float64x4() {}
+
+    // Returns lane 0, read from the low 128 bits of val.
+    double get0() const
+    {
+        const __m128d low_half = _mm256_castpd256_pd128(val);
+        return _mm_cvtsd_f64(low_half);
+    }
+};
+
+//////////////// Load and store operations ///////////////
+
+// Generates the load/store functions for one integer vector type _Tpvec with lane type _Tp:
+//   v256_load / v256_load_aligned  - 256-bit load using the unaligned / aligned intrinsic.
+//   v256_load_low                  - unaligned 128-bit load placed in the low half; the upper
+//                                    half is whatever _mm256_castsi128_si256 leaves there.
+//   v256_load_halves               - two unaligned 128-bit loads: ptr0 -> low half, ptr1 -> high half.
+//   v_store / v_store_aligned /
+//   v_store_aligned_nocache        - 256-bit store using the storeu / store / stream intrinsic.
+//   v_store(ptr, a, mode)          - picks storeu for STORE_UNALIGNED, stream for
+//                                    STORE_ALIGNED_NOCACHE, and the aligned store for any other mode.
+//   v_store_low / v_store_high     - unaligned 128-bit store of the low / high half.
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp)                    \
+    inline _Tpvec v256_load(const _Tp* ptr)                           \
+    { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); }       \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                   \
+    { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); }        \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                       \
+    {                                                                 \
+        __m128i v128 = _mm_loadu_si128((const __m128i*)ptr);          \
+        return _Tpvec(_mm256_castsi128_si256(v128));                  \
+    }                                                                 \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
+    {                                                                 \
+        __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0);          \
+        __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1);          \
+        return _Tpvec(_v256_combine(vlo, vhi));                       \
+    }                                                                 \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm256_storeu_si256((__m256i*)ptr, a.val); }                    \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
+    { _mm256_store_si256((__m256i*)ptr, a.val); }                     \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
+    { _mm256_stream_si256((__m256i*)ptr, a.val); }                    \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_si256((__m256i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm256_stream_si256((__m256i*)ptr, a.val); \
+        else \
+            _mm256_store_si256((__m256i*)ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
+    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); }    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
+    { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
+
+// Instantiate the load/store set for every integer vector type.
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32,  uchar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32,   schar)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16,  short)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8,  unsigned)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8,   int)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4,  uint64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4,   int64)
+
+// Floating-point counterpart of the integer load/store generator. `suffix` (ps or pd)
+// selects the intrinsic family and `halfreg` is the matching 128-bit register type used
+// for the two halves in v256_load_halves. The generated function set is the same:
+// v256_load, v256_load_aligned, v256_load_low, v256_load_halves, v_store,
+// v_store_aligned, v_store_aligned_nocache, v_store with a StoreMode argument
+// (storeu for STORE_UNALIGNED, stream for STORE_ALIGNED_NOCACHE, aligned store
+// otherwise), v_store_low and v_store_high.
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg)   \
+    inline _Tpvec v256_load(const _Tp* ptr)                               \
+    { return _Tpvec(_mm256_loadu_##suffix(ptr)); }                        \
+    inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(_mm256_load_##suffix(ptr)); }                         \
+    inline _Tpvec v256_load_low(const _Tp* ptr)                           \
+    {                                                                     \
+        return _Tpvec(_mm256_cast##suffix##128_##suffix##256              \
+                     (_mm_loadu_##suffix(ptr)));                          \
+    }                                                                     \
+    inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                     \
+        halfreg vlo = _mm_loadu_##suffix(ptr0);                           \
+        halfreg vhi = _mm_loadu_##suffix(ptr1);                           \
+        return _Tpvec(_v256_combine(vlo, vhi));                           \
+    }                                                                     \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
+    { _mm256_storeu_##suffix(ptr, a.val); }                               \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
+    { _mm256_store_##suffix(ptr, a.val); }                                \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
+    { _mm256_stream_##suffix(ptr, a.val); }                               \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm256_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm256_stream_##suffix(ptr, a.val); \
+        else \
+            _mm256_store_##suffix(ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); }               \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
+    { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
+
+// Instantiate for the single- and double-precision vector types.
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float,  ps, __m128)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
+
+// Generates v_reinterpret_as_<suffix>(const _Tpvecf&): applies `cast` to the source
+// register and wraps the result in the destination type _Tpvec.
+#define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
+    { return _Tpvec(cast(a.val)); }
+
+// Generates the initializers and reinterpret casts for one integer vector type:
+//   v256_setzero_<suffix>()   - all-zero register.
+//   v256_setall_<suffix>(v)   - every lane set to v, converted to ctype_s for the
+//                               _mm256_set1_<ssuffix> intrinsic.
+//   v_reinterpret_as_<suffix> - one overload per source vector type. Integer sources go
+//                               through OPENCV_HAL_NOP (defined elsewhere; presumably an
+//                               identity - confirm), float/double sources through the
+//                               matching _mm256_cast*_si256 intrinsic.
+#define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)          \
+    inline _Tpvec v256_setzero_##suffix()                                        \
+    { return _Tpvec(_mm256_setzero_si256()); }                                   \
+    inline _Tpvec v256_setall_##suffix(_Tp v)                                    \
+    { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)        \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256)   \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
+
+// Instantiate for every integer vector type. The last two arguments name the set1
+// intrinsic suffix and the signed C type its argument is cast to.
+OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32,  uchar,    u8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX_INIT(v_int8x32,   schar,    s8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort,   u16, epi16,  short)
+OPENCV_HAL_IMPL_AVX_INIT(v_int16x16,  short,    s16, epi16,  short)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8,  unsigned, u32, epi32,  int)
+OPENCV_HAL_IMPL_AVX_INIT(v_int32x8,   int,      s32, epi32,  int)
+OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4,  uint64,   u64, epi64x, int64)
+OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)
+
+// Floating-point counterpart of the integer init generator: produces
+// v256_setzero_<suffix>(), v256_setall_<suffix>(v) and v_reinterpret_as_<suffix>
+// overloads taking each of the eight integer vector types, all converted with `cast`.
+// Float-to-float and double-to-float reinterprets are not generated by this macro.
+#define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
+    inline _Tpvec v256_setzero_##suffix()                                \
+    { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
+    inline _Tpvec v256_setall_##suffix(_Tp v)                            \
+    { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, cast)
+
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float,  f32, ps, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
+
+// Reinterpret as v_float32x8: the same-type overload returns its argument unchanged,
+// the v_float64x4 overload converts the register with _mm256_castpd_ps.
+inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
+{ return a; }
+inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
+{ return v_float32x8(_mm256_castpd_ps(a.val)); }
+
+// Reinterpret as v_float64x4: the same-type overload returns its argument unchanged,
+// the v_float32x8 overload converts the register with _mm256_castps_pd.
+inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
+{ return a; }
+inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
+{ return v_float64x4(_mm256_castps_pd(a.val)); }
+
+/* Recombine */
+/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm)                    \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
+    { return _Tpvec(perm(a.val, b.val, 0x20)); }                     \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(perm(a.val, b.val, 0x31)); }                     \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
+                             _Tpvec& c, _Tpvec& d)                   \
+    { c = v_combine_low(a, b); d = v_combine_high(a, b); }
+
+#define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix)                  \
+    OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256)   \
+    inline void v_zip(const _Tpvec& a0, const _Tpvec& a1,            \
+                             _Tpvec& b0, _Tpvec& b1)                 \
+    {                                                                \
+        __m256i v0 = _v256_shuffle_odd_64(a0.val);                   \
+        __m256i v1 = _v256_shuffle_odd_64(a1.val);                   \
+        b0.val = _mm256_unpacklo_##suffix(v0, v1);                   \
+        b1.val = _mm256_unpackhi_##suffix(v0, v1);                   \
+    }
+
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4,  epi64)
+OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4,   epi64)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
+OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
+
+inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
+{
+    __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
+    __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
+    v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
+}
+
+inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
+{
+    __m256d v0 = _v_shuffle_odd_64(a0.val);
+    __m256d v1 = _v_shuffle_odd_64(a1.val);
+    b0.val = _mm256_unpacklo_pd(v0, v1);
+    b1.val = _mm256_unpackhi_pd(v0, v1);
+}*/
+
+//////////////// Variant Value reordering ///////////////
+
+// unpacks
+// Generates v256_unpacklo / v256_unpackhi for one vector type as thin wrappers over
+// _mm256_unpacklo_<suffix> / _mm256_unpackhi_<suffix>. Those AVX intrinsics interleave
+// within each 128-bit half independently, not across the whole 256-bit register.
+#define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix)                 \
+    inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); }     \
+    inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4,  epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4,   epi64)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
+
+// blend
+// Generates v256_blend<m>(a, b) for one vector type: a wrapper over
+// _mm256_blend_<suffix> with the compile-time mask m passed through unchanged.
+// No 8-bit instantiation is provided here; the 64-bit integer types get dedicated
+// overloads rather than an instantiation of this macro.
+#define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix)               \
+    template<int m>                                             \
+    inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
+
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16,  epi16)
+OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8,  epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8,   epi32)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
+
+// 64-bit blend emulated with the 32-bit blend intrinsic. The low four bits of m select,
+// per 64-bit lane, between a (0) and b (1). The enums expand that mask at compile time:
+//   M1 moves bits 2..3 of m up to positions 4..5 (bits 0..1 stay),
+//   M2 spreads the four bits to the even positions 0, 2, 4, 6,
+//   MM duplicates each into the adjacent odd position,
+// so both 32-bit halves of every 64-bit lane receive the same selector bit.
+template<int m>
+inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
+{
+    enum {M0 = m};
+    enum {M1 = (M0 | (M0 << 2)) & 0x33};
+    enum {M2 = (M1 | (M1 << 1)) & 0x55};
+    enum {MM =  M2 | (M2 << 1)};
+    return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
+}
+// Signed 64-bit blend: rewraps both registers as v_uint64x4, reuses the unsigned
+// overload with the same mask m, and wraps the resulting register as v_int64x4.
+template<int m>
+inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
+{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
+
+// shuffle
+// todo: emulate 64bit
+// Generates v256_shuffle<m>(a) for one vector type: a wrapper over _mm256_<intrin>
+// with the compile-time control m passed through unchanged. Only the 32-bit integer,
+// float and double vector types are instantiated.
+#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin)  \
+    template<int m>                                  \
+    inline _Tpvec v256_shuffle(const _Tpvec& a)      \
+    { return _Tpvec(_mm256_##intrin(a.val, m)); }
+
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8,  shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8,   shuffle_epi32)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
+OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
+
+// Writes v256_unpacklo(a, b) to ab0 and v256_unpackhi(a, b) to ab1.
+// The two assignments run in this order and read a and b by reference, so passing an
+// output that is the same object as an input changes what the second unpack sees.
+template<typename _Tpvec>
+inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
+{
+    ab0 = v256_unpacklo(a, b);
+    ab1 = v256_unpackhi(a, b);
+}
+
+// Returns a vector whose low 128 bits come from a and whose high 128 bits come from b.
+// Generic (integer) version: 32-bit blend with mask 0xf0, i.e. elements 4..7 from b.
+template<typename _Tpvec>
+inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
+
+// float32 version: same 0xf0 mask through the typed blend.
+inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
+{ return v256_blend<0xf0>(a, b); }
+
+// float64 version: mask 0xc selects elements 2..3 from b.
+inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
+{ return v256_blend<0xc>(a, b); }
+
+// Returns { high 128 bits of a, low 128 bits of b } as (low half, high half) of the
+// result, using the 128-bit permute with selector 0x21.
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
+{ return v256_permute2x128<0x21>(a, b); }
+
+// Within each 128-bit half independently: the low 64 bits of the result are the high
+// 64 bits of b and the high 64 bits of the result are the low 64 bits of a.
+// The generic version passes .val to _mm256_alignr_epi8 (8-byte shift), so it applies
+// to vector types whose val is __m256i.
+template<typename _Tpvec>
+inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
+{ return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
+// float64 version: the same element selection expressed with _mm256_shuffle_pd.
+inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
+{ return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
+// todo: emulate float32
+
+// Exchanges the low and high 128-bit halves of a (128-bit permute of a with itself,
+// selector 1).
+template<typename _Tpvec>
+inline _Tpvec v256_swap_halves(const _Tpvec& a)
+{ return v256_permute2x128<1>(a, a); }
+
+// Reverses the order of the four 64-bit elements of a: (0, 1, 2, 3) -> (3, 2, 1, 0).
+template<typename _Tpvec>
+inline _Tpvec v256_reverse_64(const _Tpvec& a)
+{ return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
+
+// ZIP
+// Generates the half-recombination and full-width zip functions for one vector type:
+//   v_combine_low(a, b)      - { low half of a, low half of b }   (selector 0x20).
+//   v_combine_high(a, b)     - { high half of a, high half of b } (selector 0x31).
+//   v_recombine(a, b, c, d)  - c = { a.low, b.low }, d = { a.high, b.high }. Built from one
+//                              shared 128-bit permute (a1b0 = { a.high, b.low }) and two blends.
+//   v_zip(a, b, ab0, ab1)    - v256_zip interleaves within each 128-bit half; v_recombine
+//                              then regroups the four 128-bit pieces into ab0 and ab1.
+#define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec)                              \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
+    { return v256_permute2x128<0x20>(a, b); }                        \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
+    { return v256_permute2x128<0x31>(a, b); }                        \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
+                             _Tpvec& c, _Tpvec& d)                   \
+    {                                                                \
+        _Tpvec a1b0 = v256_alignr_128(a, b);                         \
+        c = v256_combine_diagonal(a, a1b0);                          \
+        d = v256_combine_diagonal(a1b0, b);                          \
+    }                                                                \
+    inline void v_zip(const _Tpvec& a, const _Tpvec& b,              \
+                      _Tpvec& ab0, _Tpvec& ab1)                      \
+    {                                                                \
+        _Tpvec ab0ab2, ab1ab3;                                       \
+        v256_zip(a, b, ab0ab2, ab1ab3);                              \
+        v_recombine(ab0ab2, ab1ab3, ab0, ab1);                       \
+    }
+
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
+OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+
+/** Arithmetics **/
+// Generates `a bin_op b` and `a bin_op= b` for one vector type by applying `intrin`
+// to the two registers.
+#define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin)            \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
+    { return _Tpvec(intrin(a.val, b.val)); }                          \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
+    { a.val = intrin(a.val, b.val); return a; }
+
+// 8- and 16-bit + and - use the saturating adds/subs intrinsics.
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32,  _mm256_adds_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32,  _mm256_subs_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32,   _mm256_adds_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32,   _mm256_subs_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16,  _mm256_adds_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16,  _mm256_subs_epi16)
+// 32- and 64-bit + and - use the plain add/sub intrinsics; 32-bit * keeps the low
+// 32 bits of each product (mullo).
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8,  _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8,  _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8,  _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8,   _mm256_add_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8,   _mm256_sub_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8,   _mm256_mullo_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4,  _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4,  _mm256_sub_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4,   _mm256_add_epi64)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4,   _mm256_sub_epi64)
+
+// Floating-point types get all four arithmetic operators.
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
+OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
+OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
+
+// saturating multiply 8-bit, 16-bit
+// Unsigned 8-bit: widen-multiply into two 16-bit vectors with v_mul_expand, then
+// narrow the pair back to 8 bits with v_pack.
+inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b)
+{
+    v_uint16x16 wide0, wide1;
+    v_mul_expand(a, b, wide0, wide1);
+    return v_pack(wide0, wide1);
+}
+// Signed 8-bit multiply: widen-multiply into two 16-bit vectors with v_mul_expand,
+// then narrow the pair back to 8 bits with v_pack.
+inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b)
+{
+    v_int16x16 wide0, wide1;
+    v_mul_expand(a, b, wide0, wide1);
+    return v_pack(wide0, wide1);
+}
+// Unsigned 16-bit multiply with saturation. The low and high 16-bit halves of every
+// product are interleaved into full 32-bit products, which are then packed back to
+// 16 bits after clamping to 65535 (_v256_packs_epu32).
+inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i prod_low16  = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i prod_high16 = _mm256_mulhi_epu16(a.val, b.val);
+    const __m256i prod32_lo = _mm256_unpacklo_epi16(prod_low16, prod_high16);
+    const __m256i prod32_hi = _mm256_unpackhi_epi16(prod_low16, prod_high16);
+    return v_uint16x16(_v256_packs_epu32(prod32_lo, prod32_hi));
+}
+// Signed 16-bit multiply with saturation. The low and high 16-bit halves of every
+// product are interleaved into full 32-bit products, which are then packed back to
+// 16 bits with signed saturation (_mm256_packs_epi32).
+inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i prod_low16  = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i prod_high16 = _mm256_mulhi_epi16(a.val, b.val);
+    const __m256i prod32_lo = _mm256_unpacklo_epi16(prod_low16, prod_high16);
+    const __m256i prod32_hi = _mm256_unpackhi_epi16(prod_low16, prod_high16);
+    return v_int16x16(_mm256_packs_epi32(prod32_lo, prod32_hi));
+}
+// Compound multiply for the 8- and 16-bit vector types: evaluates a * b with the
+// matching operator *, stores the product in a, and returns a.
+inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b)
+{
+    return a = a * b;
+}
+inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b)
+{
+    return a = a * b;
+}
+inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b)
+{
+    return a = a * b;
+}
+inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b)
+{
+    return a = a * b;
+}
+
+/** Non-saturating arithmetics **/
+// Generates a named two-argument function `func(a, b)` for one vector type by applying
+// `intrin` to the two registers. Used below for the wrap-around (non-saturating)
+// v_add_wrap / v_sub_wrap on 8- and 16-bit types and v_mul_wrap on 16-bit types.
+#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)   \
+    { return _Tpvec(intrin(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32,  _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32,   _mm256_add_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16,  _mm256_add_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32,  _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32,   _mm256_sub_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16,  _mm256_sub_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_uint16x16, _mm256_mullo_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_mul_wrap, v_int16x16,  _mm256_mullo_epi16)
+
+// Wrapping 8-bit multiply assembled from 16-bit multiplies: even bytes are
+// taken from the direct 16-bit product, odd bytes from the product of the
+// high bytes shifted back into place.
+inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b)
+{
+    // move the odd (high) byte of every 16-bit lane down to the low byte
+    const __m256i a_odd = _mm256_srai_epi16(a.val, 8);
+    const __m256i b_odd = _mm256_srai_epi16(b.val, 8);
+
+    const __m256i even_prod = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i odd_prod  = _mm256_slli_epi16(_mm256_mullo_epi16(a_odd, b_odd), 8);
+
+    // 0xFF mask bytes pick odd_prod, 0x00 mask bytes pick even_prod
+    const __m256i odd_bytes = _mm256_set1_epi32(0xFF00FF00);
+    return v_uint8x32(_mm256_blendv_epi8(even_prod, odd_prod, odd_bytes));
+}
+// Signed variant: delegates to the unsigned implementation on the raw bits.
+inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_uint8x32 ua = v_reinterpret_as_u8(a);
+    const v_uint8x32 ub = v_reinterpret_as_u8(b);
+    return v_reinterpret_as_s8(v_mul_wrap(ua, ub));
+}
+
+//  Multiply and expand
+// 8-bit unsigned: widen both operands to 16 bits, then multiply the matching
+// halves; c and d receive the two halves produced by v_expand.
+inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b,
+                         v_uint16x16& c, v_uint16x16& d)
+{
+    v_uint16x16 a_lo, a_hi;
+    v_uint16x16 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c = v_mul_wrap(a_lo, b_lo);
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+// 8-bit signed: same scheme with signed 16-bit intermediates.
+inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b,
+                         v_int16x16& c, v_int16x16& d)
+{
+    v_int16x16 a_lo, a_hi;
+    v_int16x16 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c = v_mul_wrap(a_lo, b_lo);
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+// 16-bit signed: the low 16 bits (wrapping multiply) and high 16 bits
+// (mulhi) of every product are zipped together to form 32-bit results.
+inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
+                         v_int32x8& c, v_int32x8& d)
+{
+    const v_int16x16 prod_hi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
+    const v_int16x16 prod_lo = v_mul_wrap(a, b);
+
+    v_int16x16 zipped0, zipped1;
+    v_zip(prod_lo, prod_hi, zipped0, zipped1);
+
+    c = v_reinterpret_as_s32(zipped0);
+    d = v_reinterpret_as_s32(zipped1);
+}
+
+// 16-bit unsigned: same scheme with the unsigned high-half multiply.
+inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
+                         v_uint32x8& c, v_uint32x8& d)
+{
+    const v_uint16x16 prod_hi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
+    const v_uint16x16 prod_lo = v_mul_wrap(a, b);
+
+    v_uint16x16 zipped0, zipped1;
+    v_zip(prod_lo, prod_hi, zipped0, zipped1);
+
+    c = v_reinterpret_as_u32(zipped0);
+    d = v_reinterpret_as_u32(zipped1);
+}
+
+// 32-bit unsigned: multiply the even lanes directly and the odd lanes after
+// shifting them into the even positions, then zip the 64-bit products.
+inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
+                         v_uint64x4& c, v_uint64x4& d)
+{
+    const __m256i even_prod = _mm256_mul_epu32(a.val, b.val);
+    const __m256i a_odd     = _mm256_srli_epi64(a.val, 32);
+    const __m256i b_odd     = _mm256_srli_epi64(b.val, 32);
+    const __m256i odd_prod  = _mm256_mul_epu32(a_odd, b_odd);
+    v_zip(v_uint64x4(even_prod), v_uint64x4(odd_prod), c, d);
+}
+
+// High 16 bits of every signed 16x16-bit product.
+inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b)
+{
+    return v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
+}
+// High 16 bits of every unsigned 16x16-bit product.
+inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b)
+{
+    return v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
+}
+
+/** Bitwise shifts
+ *
+ * OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) defines, for an
+ * unsigned/signed vector pair, operator<< and operator>> taking a run-time
+ * count, plus the templates v_shl<imm> and v_shr<imm>.
+ * Left shifts and the unsigned right shift use _mm256_slli_<suffix> and
+ * _mm256_srli_<suffix>; the signed right shift calls the `srai` argument.
+ **/
+#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)  \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)            \
+    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)            \
+    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)            \
+    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)            \
+    { return _Tpsvec(srai(a.val, imm)); }                             \
+    template<int imm>                                                 \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                            \
+    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                            \
+    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                            \
+    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
+    template<int imm>                                                 \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                            \
+    { return _Tpsvec(srai(a.val, imm)); }
+
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8,  v_int32x8,  epi32, _mm256_srai_epi32)
+
+// Emulated 64-bit arithmetic right shift, passed below as the signed shift
+// for v_int64x4: bias the input by 2^63, shift logically, then subtract the
+// equally shifted bias.
+inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
+{
+    const __m256i sign_bias    = _mm256_set1_epi64x((int64)1 << 63);
+    const __m256i biased       = _mm256_add_epi64(a, sign_bias);
+    const __m256i shifted      = _mm256_srli_epi64(biased, imm);
+    const __m256i shifted_bias = _mm256_srli_epi64(sign_bias, imm);
+    return _mm256_sub_epi64(shifted, shifted_bias);
+}
+OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4,  v_int64x4,  epi64, _mm256_srai_epi64xx)
+
+
+/** Bitwise logic
+ *
+ * OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) instantiates the
+ * &, | and ^ operators through OPENCV_HAL_IMPL_AVX_BIN_OP and defines
+ * operator~ as an XOR with `not_const`, which every instantiation below
+ * passes as a register with all bits set.
+ **/
+#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const)  \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix)   \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix)    \
+    OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix)   \
+    inline _Tpvec operator ~ (const _Tpvec& a)                   \
+    { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
+
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32,    si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16,  si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8,   si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8,    si256, _mm256_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4,   si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4,    si256, _mm256_set1_epi64x(-1))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8,  ps,    _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
+OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4,  pd,    _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+
+/** Select
+ *
+ * v_select(mask, a, b) forwards to _mm256_blendv_<suffix>(b.val, a.val,
+ * mask.val), so `a` is the operand chosen where the mask is set and `b`
+ * where it is clear. All integer widths instantiated here use the byte-wise
+ * epi8 blend; float and double use the ps / pd blends.
+ * NOTE(review): blendv presumably looks only at the top bit of each mask
+ * element, so masks are expected to be uniform per lane - confirm.
+ **/
+#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix)                               \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
+
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32,   epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8,  epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8,   epi8)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
+
+/** Comparison
+ *
+ * Every operator returns a vector of the operand type that wraps the mask
+ * produced by the underlying compare intrinsic.
+ *
+ * OPENCV_HAL_IMPL_AVX_CMP_OP_OV derives !=, <, >= and <= from == and >.
+ * OPENCV_HAL_IMPL_AVX_CMP_OP_INT defines == and > for an unsigned/signed
+ * pair and then applies CMP_OP_OV to both; the unsigned > XORs both
+ * operands with `sbit` (the sign bit of the lane type) so that the signed
+ * compare intrinsic can be reused.
+ * OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT defines only == and != for 64-bit lanes.
+ * OPENCV_HAL_IMPL_AVX_CMP_OP_FLT maps each operator to _mm256_cmp_<suffix>
+ * with an ordered, quiet predicate (the _OQ constants).
+ * NOTE(review): with _CMP_NEQ_OQ, `!=` presumably yields false for lanes
+ * that contain NaN - confirm this is the intended semantics.
+ **/
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec)                     \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a == b); }                                         \
+    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)  \
+    { return b > a; }                                             \
+    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)  \
+    { return ~(a < b); }                                          \
+    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)  \
+    { return b >= a; }
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit)   \
+    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)      \
+    { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
+    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)       \
+    {                                                                    \
+        __m256i smask = _mm256_set1_##suffix(sbit);                      \
+        return _Tpuvec(_mm256_cmpgt_##suffix(                            \
+                       _mm256_xor_si256(a.val, smask),                   \
+                       _mm256_xor_si256(b.val, smask)));                 \
+    }                                                                    \
+    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)      \
+    { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
+    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)       \
+    { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); }             \
+    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec)                               \
+    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32,  v_int8x32,  epi8,  (char)-128)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
+OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec)                 \
+    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); }         \
+    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+    { return ~(a == b); }
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
+
+#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix)    \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
+
+#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix)               \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, suffix)     \
+    OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, suffix)
+
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
+
+// Mask of lanes that compare "ordered" against themselves (_CMP_ORD_Q).
+inline v_float32x8 v_not_nan(const v_float32x8& a)
+{
+    const __m256 ordered = _mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q);
+    return v_float32x8(ordered);
+}
+// Double-precision counterpart of the check above.
+inline v_float64x4 v_not_nan(const v_float64x4& a)
+{
+    const __m256d ordered = _mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q);
+    return v_float64x4(ordered);
+}
+
+/** min/max
+ *
+ * Lane-wise v_min and v_max for 8-, 16- and 32-bit integers and for
+ * float/double. Each one is generated by OPENCV_HAL_IMPL_AVX_BIN_FUNC and
+ * forwards to the intrinsic named in its third argument.
+ **/
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32,   _mm256_min_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32,   _mm256_max_epi8)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16,  _mm256_min_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16,  _mm256_max_epi16)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8,  _mm256_min_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8,  _mm256_max_epu32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8,   _mm256_min_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8,   _mm256_max_epi32)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
+OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
+
+/** Rotate
+ *
+ * Byte-granular lane shifts of a v_uint8x32 by a compile-time amount `imm`.
+ * The two-operand forms fill the vacated bytes from `b`; the one-operand
+ * forms fill them with zeros. `imm == 0` returns `a` unchanged and
+ * `imm > 32` returns a default-constructed vector.
+ * The derived immediates are masked with 0xFF, presumably so that they stay
+ * valid constants in the branches that are not taken for a given `imm`.
+ **/
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
+{
+    enum {IMM_R = (16 - imm) & 0xFF};
+    enum {IMM_R2 = (32 - imm) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03); // low half = b.high, high half = a.low
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swap, IMM_R));
+    return v_uint8x32(_mm256_alignr_epi8(swap, b.val, IMM_R2)); // 16 < imm < 32
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0)  return a;
+    if (imm == 32) return b;
+    if (imm > 32)  return v_uint8x32();
+
+    __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21); // low half = a.high, high half = b.low
+    if (imm == 16) return v_uint8x32(swap);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
+    return v_uint8x32(_mm256_alignr_epi8(b.val, swap, IMM_L)); // 16 < imm < 32
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+    enum {IMM_R = (16 - imm) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // control bit 3 zeroes the low 128 bits: swapz = [low: 0, high: a.low]
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(a.val, swapz, IMM_R));
+    return v_uint8x32(_mm256_slli_si256(swapz, IMM_L)); // 16 < imm <= 32
+}
+
+template<int imm>
+inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
+{
+    enum {IMM_L = (imm - 16) & 0xFF};
+
+    if (imm == 0) return a;
+    if (imm > 32) return v_uint8x32();
+
+    // control bit 7 zeroes the high 128 bits: swapz = [low: a.high, high: 0]
+    __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
+    if (imm == 16) return v_uint8x32(swapz);
+    if (imm < 16)  return v_uint8x32(_mm256_alignr_epi8(swapz, a.val, imm));
+    return v_uint8x32(_mm256_srli_si256(swapz, IMM_L)); // 16 < imm <= 32
+}
+/*
+ * OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) defines the one- and
+ * two-operand template `intrin<imm>` for `_Tpvec` on top of the v_uint8x32
+ * implementation: the element count `imm` is scaled by the lane size to a
+ * byte count (IMMxW), the operands are reinterpreted as bytes, and the result
+ * register is passed through `cast` before being wrapped in `_Tpvec`.
+ * OPENCV_HAL_IMPL_AVX_ROTATE instantiates both directions with
+ * OPENCV_HAL_NOP as the cast; float and double pass the castsi256
+ * intrinsics instead.
+ */
+#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)     \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)        \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a),    \
+                                       v_reinterpret_as_u8(b));   \
+        return _Tpvec(cast(ret.val));                             \
+    }                                                             \
+    template<int imm>                                             \
+    inline _Tpvec intrin(const _Tpvec& a)                         \
+    {                                                             \
+        enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)};  \
+        v_uint8x32 ret = intrin<IMMxW>(v_reinterpret_as_u8(a));   \
+        return _Tpvec(cast(ret.val));                             \
+    }
+
+#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \
+    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
+    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
+
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
+
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float64x4, _mm256_castsi256_pd)
+OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
+
+/** Reverse **/
+// Reverses the order of all 32 bytes: bytes are reversed inside each 128-bit
+// half by the shuffle, then the two halves are exchanged.
+inline v_uint8x32 v_reverse(const v_uint8x32 &a)
+{
+    static const __m256i byte_rev = _mm256_setr_epi8(
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const __m256i halves_reversed = _mm256_shuffle_epi8(a.val, byte_rev);
+    return v_uint8x32(_mm256_permute2x128_si256(halves_reversed, halves_reversed, 1));
+}
+
+// Signed bytes: reverse through the unsigned implementation.
+inline v_int8x32 v_reverse(const v_int8x32 &a)
+{
+    const v_uint8x32 bits = v_reinterpret_as_u8(a);
+    return v_reinterpret_as_s8(v_reverse(bits));
+}
+
+// Reverses the order of all 16 16-bit lanes: the shuffle reverses the lanes
+// of each 128-bit half (byte pairs stay together), then the halves are
+// exchanged.
+inline v_uint16x16 v_reverse(const v_uint16x16 &a)
+{
+    static const __m256i word_rev = _mm256_setr_epi8(
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
+            14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    const __m256i halves_reversed = _mm256_shuffle_epi8(a.val, word_rev);
+    return v_uint16x16(_mm256_permute2x128_si256(halves_reversed, halves_reversed, 1));
+}
+
+// Signed 16-bit lanes: reverse through the unsigned implementation.
+inline v_int16x16 v_reverse(const v_int16x16 &a)
+{
+    const v_uint16x16 bits = v_reinterpret_as_u16(a);
+    return v_reinterpret_as_s16(v_reverse(bits));
+}
+
+// Reverses the order of the eight 32-bit lanes with one permute whose index
+// vector lists the lanes in descending order.
+inline v_uint32x8 v_reverse(const v_uint32x8 &a)
+{
+    static const __m256i lane_order = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, lane_order));
+}
+
+// Signed 32-bit lanes: reverse through the unsigned implementation.
+inline v_int32x8 v_reverse(const v_int32x8 &a)
+{
+    const v_uint32x8 bits = v_reinterpret_as_u32(a);
+    return v_reinterpret_as_s32(v_reverse(bits));
+}
+
+// Float lanes: reverse through the unsigned 32-bit implementation.
+inline v_float32x8 v_reverse(const v_float32x8 &a)
+{
+    const v_uint32x8 bits = v_reinterpret_as_u32(a);
+    return v_reinterpret_as_f32(v_reverse(bits));
+}
+
+// Reverses the order of the four 64-bit lanes with a single permute.
+inline v_uint64x4 v_reverse(const v_uint64x4 &a)
+{
+    const __m256i reversed = _mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3));
+    return v_uint64x4(reversed);
+}
+
+// Signed 64-bit lanes: reverse through the unsigned implementation.
+inline v_int64x4 v_reverse(const v_int64x4 &a)
+{
+    const v_uint64x4 bits = v_reinterpret_as_u64(a);
+    return v_reinterpret_as_s64(v_reverse(bits));
+}
+
+// Double lanes: reverse through the unsigned 64-bit implementation.
+inline v_float64x4 v_reverse(const v_float64x4 &a)
+{
+    const v_uint64x4 bits = v_reinterpret_as_u64(a);
+    return v_reinterpret_as_f64(v_reverse(bits));
+}
+
+////////// Reduce and mask /////////
+
+/** Reduce **/
+// Sum of all 32 unsigned bytes: a SAD against zero produces four partial
+// sums, which are then folded into one scalar.
+inline unsigned v_reduce_sum(const v_uint8x32& a)
+{
+    const __m256i partial = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
+    const __m128i pair    = _mm_add_epi32(_v256_extract_low(partial), _v256_extract_high(partial));
+    const __m128i total   = _mm_add_epi32(pair, _mm_unpackhi_epi64(pair, pair));
+    return (unsigned)_mm_cvtsi128_si32(total);
+}
+// Sum of all 32 signed bytes: flipping the sign bit turns every lane into
+// value + 128 as an unsigned byte, so 32 * 128 = 4096 is subtracted at the end.
+inline int v_reduce_sum(const v_int8x32& a)
+{
+    const __m256i biased  = _mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128));
+    const __m256i partial = _mm256_sad_epu8(biased, _mm256_setzero_si256());
+    const __m128i pair    = _mm_add_epi32(_v256_extract_low(partial), _v256_extract_high(partial));
+    const __m128i total   = _mm_add_epi32(pair, _mm_unpackhi_epi64(pair, pair));
+    return (unsigned)_mm_cvtsi128_si32(total) - 4096;
+}
+// Horizontal reduction (instantiated for min/max) over 32 8-bit lanes:
+// combine the two 128-bit halves, then repeatedly fold the register onto
+// itself shifted by 8, 4, 2 and 1 bytes; the result is read from byte 0.
+#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                  \
+    {                                                               \
+        const __m128i lo = _v256_extract_low(a.val);                \
+        const __m128i hi = _v256_extract_high(a.val);               \
+        __m128i acc = intrin(lo, hi);                               \
+        acc = intrin(acc, _mm_srli_si128(acc, 8));                  \
+        acc = intrin(acc, _mm_srli_si128(acc, 4));                  \
+        acc = intrin(acc, _mm_srli_si128(acc, 2));                  \
+        acc = intrin(acc, _mm_srli_si128(acc, 1));                  \
+        return (sctype)_mm_cvtsi128_si32(acc);                      \
+    }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, min, _mm_min_epi8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
+OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32,  schar, max, _mm_max_epi8)
+
+// Horizontal reduction (instantiated for min/max) over 16 16-bit lanes:
+// combine the two 128-bit halves, then fold by 8, 4 and 2 bytes; the result
+// is read from the lowest lane.
+#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                  \
+    {                                                               \
+        __m128i acc = intrin(_v256_extract_low(a.val),              \
+                             _v256_extract_high(a.val));            \
+        acc = intrin(acc, _mm_srli_si128(acc, 8));                  \
+        acc = intrin(acc, _mm_srli_si128(acc, 4));                  \
+        acc = intrin(acc, _mm_srli_si128(acc, 2));                  \
+        return (sctype) _mm_cvtsi128_si32(acc);                     \
+    }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  min, _mm_min_epi16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
+OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  max, _mm_max_epi16)
+
+// Horizontal reduction (instantiated for min/max) over eight 32-bit lanes:
+// combine the two 128-bit halves, then fold by 8 and 4 bytes; the result is
+// read from the lowest lane.
+#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
+    inline sctype v_reduce_##func(const _Tpvec& a)                 \
+    {                                                              \
+        __m128i acc = intrin(_v256_extract_low(a.val),             \
+                             _v256_extract_high(a.val));           \
+        acc = intrin(acc, _mm_srli_si128(acc, 8));                 \
+        acc = intrin(acc, _mm_srli_si128(acc, 4));                 \
+        return (sctype) _mm_cvtsi128_si32(acc);                    \
+    }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      min, _mm_min_epi32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
+OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      max, _mm_max_epi32)
+
+// Horizontal min/max over eight floats: combine the two 128-bit halves, then
+// fold the upper pair onto the lower pair and finally lane 1 onto lane 0.
+#define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin)                     \
+    inline float v_reduce_##func(const v_float32x8& a)                   \
+    {                                                                    \
+        __m128 acc = intrin(_v256_extract_low(a.val),                    \
+                            _v256_extract_high(a.val));                  \
+        acc = intrin(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2))); \
+        acc = intrin(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1))); \
+        return _mm_cvtss_f32(acc);                                       \
+    }
+
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
+OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
+
+// Sum of the eight 32-bit lanes: two horizontal adds, then the two 128-bit
+// halves are added together and lane 0 is returned.
+inline int v_reduce_sum(const v_int32x8& a)
+{
+    const __m256i pairs     = _mm256_hadd_epi32(a.val, a.val);
+    const __m256i half_sums = _mm256_hadd_epi32(pairs, pairs);
+
+    const __m128i total = _mm_add_epi32(_v256_extract_low(half_sums),
+                                        _v256_extract_high(half_sums));
+    return _mm_cvtsi128_si32(total);
+}
+
+// Unsigned 32-bit lanes: summed through the signed implementation.
+inline unsigned v_reduce_sum(const v_uint32x8& a)
+{
+    return v_reduce_sum(v_reinterpret_as_s32(a));
+}
+
+// 16-bit lanes: widen both halves to 32 bits, add them, then reduce.
+inline int v_reduce_sum(const v_int16x16& a)
+{
+    return v_reduce_sum(v_expand_low(a) + v_expand_high(a));
+}
+inline unsigned v_reduce_sum(const v_uint16x16& a)
+{
+    return v_reduce_sum(v_expand_low(a) + v_expand_high(a));
+}
+
+// Sum of the eight float lanes: two horizontal adds, then the two 128-bit
+// halves are added together and lane 0 is returned.
+inline float v_reduce_sum(const v_float32x8& a)
+{
+    const __m256 pairs     = _mm256_hadd_ps(a.val, a.val);
+    const __m256 half_sums = _mm256_hadd_ps(pairs, pairs);
+
+    const __m128 total = _mm_add_ps(_v256_extract_low(half_sums),
+                                    _v256_extract_high(half_sums));
+    return _mm_cvtss_f32(total);
+}
+
+// Sum of the four unsigned 64-bit lanes: add the two 128-bit halves, store
+// the resulting pair to an aligned buffer and add its two elements.
+inline uint64 v_reduce_sum(const v_uint64x4& a)
+{
+    const __m128i partial = _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val));
+    uint64 CV_DECL_ALIGNED(32) lanes[2];
+    _mm_store_si128((__m128i*)lanes, partial);
+    return lanes[0] + lanes[1];
+}
+// Signed 64-bit counterpart of the reduction above.
+inline int64 v_reduce_sum(const v_int64x4& a)
+{
+    const __m128i partial = _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val));
+    int64 CV_DECL_ALIGNED(32) lanes[2];
+    _mm_store_si128((__m128i*)lanes, partial);
+    return lanes[0] + lanes[1];
+}
+// Sum of the four double lanes: one horizontal add, then the two 128-bit
+// halves are added together and lane 0 is returned.
+inline double v_reduce_sum(const v_float64x4& a)
+{
+    const __m256d pairs = _mm256_hadd_pd(a.val, a.val);
+    const __m128d total = _mm_add_pd(_v256_extract_low(pairs), _v256_extract_high(pairs));
+    return _mm_cvtsd_f64(total);
+}
+
+// Combines four vectors with two levels of horizontal adds: (a, b) and
+// (c, d) are pair-summed first, then the two intermediate results.
+// NOTE(review): hadd presumably works inside each 128-bit half, so the sums
+// would be per half rather than across the full register - confirm.
+inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
+                                 const v_float32x8& c, const v_float32x8& d)
+{
+    const __m256 sum_ab = _mm256_hadd_ps(a.val, b.val);
+    const __m256 sum_cd = _mm256_hadd_ps(c.val, d.val);
+    const __m256 sum_abcd = _mm256_hadd_ps(sum_ab, sum_cd);
+    return v_float32x8(sum_abcd);
+}
+
+// Sum of absolute differences over all 32 unsigned bytes: _mm256_sad_epu8
+// yields four partial sums, which are folded into one scalar.
+inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const __m256i partial = _mm256_sad_epu8(a.val, b.val);
+    const __m128i pair    = _mm_add_epi32(_v256_extract_low(partial), _v256_extract_high(partial));
+    const __m128i total   = _mm_add_epi32(pair, _mm_unpackhi_epi64(pair, pair));
+    return (unsigned)_mm_cvtsi128_si32(total);
+}
+// Sum of absolute differences over all 32 signed 8-bit lanes.
+// _mm256_sad_epu8 works on unsigned bytes, so both inputs are first remapped
+// by flipping the sign bit (XOR 0x80). That maps [-128, 127] monotonically
+// onto [0, 255] and therefore keeps every |a - b| unchanged; it is the same
+// bias v_reduce_sum(const v_int8x32&) uses. Adding 0x7f instead wraps -128
+// around to 255 and produces wrong distances for lanes holding -128.
+inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
+{
+    const __m256i bias = _mm256_set1_epi8((schar)-128);
+    __m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, bias), _mm256_xor_si256(b.val, bias));
+    // fold the four partial sums into one scalar
+    __m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
+}
+// Unsigned 16-bit SAD: the per-lane absolute difference is formed as
+// (a - b) + (b - a) with a wrapping add, widened to 32 bits and summed.
+// NOTE(review): this presumably relies on `-` saturating at zero for
+// v_uint16x16 - confirm against the operator definition.
+inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
+{
+    const v_uint16x16 abs_diff = v_add_wrap(a - b, b - a);
+    v_uint32x8 wide_lo, wide_hi;
+    v_expand(abs_diff, wide_lo, wide_hi);
+    return v_reduce_sum(wide_lo + wide_hi);
+}
+// Signed 16-bit SAD: max - min with a wrapping subtract gives the absolute
+// difference as an unsigned 16-bit value, which is widened and summed.
+inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_uint16x16 abs_diff = v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
+    v_uint32x8 wide_lo, wide_hi;
+    v_expand(abs_diff, wide_lo, wide_hi);
+    return v_reduce_sum(wide_lo + wide_hi);
+}
+// Unsigned 32-bit SAD: max - min per lane, then a horizontal sum.
+inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b)
+{
+    const v_uint32x8 abs_diff = v_max(a, b) - v_min(a, b);
+    return v_reduce_sum(abs_diff);
+}
+// Signed 32-bit SAD: the difference is conditionally negated with the
+// compare mask ((x ^ m) - m negates x where m is all ones), then summed.
+inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 neg_mask = a < b;
+    const v_int32x8 diff = a - b;
+    return v_reduce_sum(v_reinterpret_as_u32((diff ^ neg_mask) - neg_mask));
+}
+// Float SAD: the sign bit of each difference is cleared before summing.
+inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
+{
+    const v_float32x8 abs_mask = v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
+    return v_reduce_sum((a - b) & abs_mask);
+}
+
+/** Popcount **/
+// Per-byte population count via a 16-entry lookup: the table holds the bit
+// count of every 4-bit value, and the counts of the low and high nibble of
+// each byte are added.
+inline v_uint8x32 v_popcount(const v_uint8x32& a)
+{
+    const __m256i nibble_bits = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    const __m256i nibble_mask = _mm256_set1_epi8(0x0F);
+
+    const __m256i lo_nibbles = _mm256_and_si256(a.val, nibble_mask);
+    const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(a.val, 4), nibble_mask);
+
+    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(nibble_bits, lo_nibbles),
+                                      _mm256_shuffle_epi8(nibble_bits, hi_nibbles)));
+}
+// 16-bit lanes: per-byte counts are accumulated into the low byte of each
+// lane by adding the vector shifted down one byte, then masked.
+inline v_uint16x16 v_popcount(const v_uint16x16& a)
+{
+    v_uint8x32 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts);
+    return v_reinterpret_as_u16(counts) & v256_setall_u16(0x00ff);
+}
+// 32-bit lanes: two accumulation steps (by one byte, then by two bytes)
+// gather all four byte counts in the low byte of each lane.
+inline v_uint32x8 v_popcount(const v_uint32x8& a)
+{
+    v_uint8x32 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts);
+    counts += v_rotate_right<2>(counts);
+    return v_reinterpret_as_u32(counts) & v256_setall_u32(0x000000ff);
+}
+// 64-bit lanes: a SAD against zero adds up the eight byte counts per lane.
+inline v_uint64x4 v_popcount(const v_uint64x4& a)
+{
+    const v_uint8x32 counts = v_popcount(v_reinterpret_as_u8(a));
+    return v_uint64x4(_mm256_sad_epu8(counts.val, _mm256_setzero_si256()));
+}
+// Signed inputs: counted through the unsigned overload of the same width;
+// the result type is always the unsigned vector.
+inline v_uint8x32 v_popcount(const v_int8x32& a)
+{
+    return v_popcount(v_reinterpret_as_u8(a));
+}
+inline v_uint16x16 v_popcount(const v_int16x16& a)
+{
+    return v_popcount(v_reinterpret_as_u16(a));
+}
+inline v_uint32x8 v_popcount(const v_int32x8& a)
+{
+    return v_popcount(v_reinterpret_as_u32(a));
+}
+inline v_uint64x4 v_popcount(const v_int64x4& a)
+{
+    return v_popcount(v_reinterpret_as_u64(a));
+}
+
+/** Mask **/
+// 8-bit lanes: one mask bit per byte, taken directly from movemask.
+inline int v_signmask(const v_int8x32& a)
+{
+    return _mm256_movemask_epi8(a.val);
+}
+inline int v_signmask(const v_uint8x32& a)
+{
+    return v_signmask(v_reinterpret_as_s8(a));
+}
+
+// 16-bit lanes: the vector is packed with itself down to bytes and only the
+// low 16 bits of the resulting byte mask are kept.
+inline int v_signmask(const v_int16x16& a)
+{
+    const int byte_mask = v_signmask(v_pack(a, a));
+    return byte_mask & 0xFFFF;
+}
+inline int v_signmask(const v_uint16x16& a)
+{
+    return v_signmask(v_reinterpret_as_s16(a));
+}
+
+// Float / double lanes: one mask bit per lane from the movemask intrinsics.
+inline int v_signmask(const v_float32x8& a)
+{
+    return _mm256_movemask_ps(a.val);
+}
+inline int v_signmask(const v_float64x4& a)
+{
+    return _mm256_movemask_pd(a.val);
+}
+
+// 32-bit integer lanes: reuse the float overload on the same bits.
+inline int v_signmask(const v_int32x8& a)
+{
+    return v_signmask(v_reinterpret_as_f32(a));
+}
+inline int v_signmask(const v_uint32x8& a)
+{
+    return v_signmask(v_reinterpret_as_f32(a));
+}
+
+// 64-bit integer lanes: reuse the double overload on the same bits.
+inline int v_signmask(const v_int64x4& a)
+{
+    return v_signmask(v_reinterpret_as_f64(a));
+}
+inline int v_signmask(const v_uint64x4& a)
+{
+    return v_signmask(v_reinterpret_as_f64(a));
+}
+
+// Every overload takes the byte-wise sign mask, counts its trailing zeros
+// and divides that byte position by the lane size in bytes.
+inline int v_scan_forward(const v_int8x32& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x32& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x16& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x16& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_int32x8& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_uint32x8& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_float32x8& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
+inline int v_scan_forward(const v_int64x4& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_uint64x4& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+inline int v_scan_forward(const v_float64x4& a)
+{ return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
+
+/** Checks
+ *
+ * v_check_all and v_check_any test the mask returned by v_signmask.
+ * OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask): v_check_all compares the mask
+ * with `allmask` (-1 for the 8-bit types, 255 for the 32-bit and float
+ * types, 15 for the 64-bit and double types); v_check_any tests for a
+ * non-zero mask.
+ * OPENCV_HAL_IMPL_AVX_CHECK_SHORT covers the 16-bit types: it takes the
+ * byte-wise mask and keeps only the odd bits (0xaaaaaaaa), i.e. the bit of
+ * the upper byte of every 16-bit lane.
+ **/
+#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
+    inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
+    inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
+OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
+
+#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec)  \
+    inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
+    inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
+
+////////// Other math /////////
+
+/** Some frequent operations
+ *
+ * OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) defines v_fma and v_muladd,
+ * both computing a * b + c: with CV_FMA3 they use _mm256_fmadd_<suffix>,
+ * otherwise a separate multiply followed by an add.
+ * OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix) defines v_sqrt, v_sqr_magnitude
+ * (a*a + b*b via v_fma) and v_magnitude (the square root of that value).
+ * Both macros are instantiated for v_float32x8 and v_float64x4.
+ **/
+#if CV_FMA3
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
+    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }
+#else
+#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                                    \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)            \
+    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); } \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)         \
+    { return _Tpvec(_mm256_add_##suffix(_mm256_mul_##suffix(a.val, b.val), c.val)); }
+#endif
+
+#define OPENCV_HAL_IMPL_AVX_MISC(_Tpvec, suffix)                              \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
+    { return _Tpvec(_mm256_sqrt_##suffix(x.val)); }                           \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
+    { return v_fma(a, a, b * b); }                                            \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
+    { return v_sqrt(v_fma(a, a, b*b)); }
+
+OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
+OPENCV_HAL_IMPL_AVX_MISC(v_float32x8, ps)
+OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd)
+
+// Integer multiply-add for 32-bit lanes, built from the * and + operators.
+inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    const v_int32x8 product = a * b;
+    return product + c;
+}
+
+// Alias of v_fma for 32-bit integer lanes.
+inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c)
+{
+    const v_int32x8 result = v_fma(a, b, c);
+    return result;
+}
+
+// Reciprocal square root for floats: starts from the _mm256_rsqrt_ps
+// estimate and applies one refinement step est * (1.5 - 0.5 * x * est * est).
+inline v_float32x8 v_invsqrt(const v_float32x8& x)
+{
+    const v_float32x8 half_x = x * v256_setall_f32(0.5);
+    v_float32x8 estimate = v_float32x8(_mm256_rsqrt_ps(x.val));
+    // todo: _mm256_fnmsub_ps
+    estimate *= v256_setall_f32(1.5) - ((estimate * estimate) * half_x);
+    return estimate;
+}
+
+// Reciprocal square root for doubles: computed as 1 / sqrt(x).
+inline v_float64x4 v_invsqrt(const v_float64x4& x)
+{
+    const v_float64x4 one = v256_setall_f64(1.);
+    return one / v_sqrt(x);
+}
+
+/** Absolute values
+ *
+ * OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) defines v_abs for the signed
+ * integer vector `v_<_Tpvec>`; the result of _mm256_abs_<suffix> is returned
+ * as the unsigned counterpart `v_u<_Tpvec>`.
+ **/
+#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix)         \
+    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)       \
+    { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
+
+OPENCV_HAL_IMPL_AVX_ABS(int8x32,  epi8)
+OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
+OPENCV_HAL_IMPL_AVX_ABS(int32x8,  epi32)
+
+// Absolute value of 8 floats: AND every lane with 0x7fffffff, i.e. clear the sign bit.
+inline v_float32x8 v_abs(const v_float32x8& x)
+{
+    const __m256i all_but_sign = _mm256_set1_epi32(0x7fffffff);
+    const v_float32x8 mask = v_float32x8(_mm256_castsi256_ps(all_but_sign));
+    return x & mask;
+}
+// Absolute value of 4 doubles: the mask is all ones shifted right by one bit,
+// which clears only the top (sign) bit of each 64-bit lane.
+inline v_float64x4 v_abs(const v_float64x4& x)
+{
+    const __m256i all_but_sign = _mm256_srli_epi64(_mm256_set1_epi64x(-1), 1);
+    const v_float64x4 mask = v_float64x4(_mm256_castsi256_pd(all_but_sign));
+    return x & mask;
+}
+
+/** Absolute difference **/
+// Absolute difference of unsigned lanes.
+// 8/16 bit: (a - b) and (b - a) are combined with a wrapping add.
+// NOTE(review): this presumably relies on operator- saturating to 0 for these
+// types, so one of the two terms vanishes - confirm against the operator definitions.
+inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const v_uint8x32 a_minus_b = a - b;
+    const v_uint8x32 b_minus_a = b - a;
+    return v_add_wrap(a_minus_b, b_minus_a);
+}
+inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
+{
+    const v_uint16x16 a_minus_b = a - b;
+    const v_uint16x16 b_minus_a = b - a;
+    return v_add_wrap(a_minus_b, b_minus_a);
+}
+// 32 bit: larger operand minus smaller operand.
+inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
+{
+    const v_uint32x8 larger  = v_max(a, b);
+    const v_uint32x8 smaller = v_min(a, b);
+    return larger - smaller;
+}
+
+// |a - b| for signed 8-bit lanes, returned reinterpreted as unsigned lanes.
+inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_int8x32 diff   = v_sub_wrap(a, b); // wrapping a - b
+    const v_int8x32 negate = a < b;            // lanes whose difference must be negated
+    // (diff ^ negate) - negate keeps a lane as is when negate is 0 and gives its
+    // two's-complement negation when negate is all ones (assuming the comparison
+    // yields an all-ones lane mask).
+    const v_int8x32 flipped = diff ^ negate;
+    return v_reinterpret_as_u8(v_sub_wrap(flipped, negate));
+}
+
+// |a - b| for signed 16-bit lanes: wrapping (max - min), reinterpreted as unsigned.
+inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_int16x16 larger  = v_max(a, b);
+    const v_int16x16 smaller = v_min(a, b);
+    return v_reinterpret_as_u16(v_sub_wrap(larger, smaller));
+}
+
+// |a - b| for signed 32-bit lanes, returned reinterpreted as unsigned lanes.
+inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 diff   = a - b;
+    const v_int32x8 negate = a < b; // lanes whose difference must be negated
+    // conditional two's-complement negation: (diff ^ negate) - negate
+    const v_int32x8 flipped = diff ^ negate;
+    return v_reinterpret_as_u32(flipped - negate);
+}
+
+// |a - b| for 8 floats.
+inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
+{
+    const v_float32x8 diff = a - b;
+    return v_abs(diff);
+}
+
+// |a - b| for 4 doubles.
+inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
+{
+    const v_float64x4 diff = a - b;
+    return v_abs(diff);
+}
+
+/** Saturating absolute difference **/
+// Absolute difference of signed 8-bit lanes that stays in the signed type.
+// Uses the type's own operator- for both the difference and the conditional negation.
+inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_int8x32 diff   = a - b;
+    const v_int8x32 negate = a < b; // lanes where a - b is to be negated
+    const v_int8x32 flipped = diff ^ negate;
+    return flipped - negate;
+}
+// Absolute difference of signed 16-bit lanes in the signed type: max(a, b) - min(a, b).
+inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_int16x16 larger  = v_max(a, b);
+    const v_int16x16 smaller = v_min(a, b);
+    return larger - smaller;
+}
+
+////////// Conversions /////////
+
+/** Rounding **/
+// Converts 8 floats to int32 with _mm256_cvtps_epi32 (the rounding, non-truncating conversion).
+inline v_int32x8 v_round(const v_float32x8& a)
+{
+    const __m256i rounded = _mm256_cvtps_epi32(a.val);
+    return v_int32x8(rounded);
+}
+
+// Rounds the 4 doubles of `a` to int32 and returns them in the low 4 lanes.
+// The upper 4 lanes are explicitly set to zero: a plain
+// _mm256_castsi128_si256 would leave the upper 128 bits undefined.
+inline v_int32x8 v_round(const v_float64x4& a)
+{
+    const __m128i rounded = _mm256_cvtpd_epi32(a.val);
+    return v_int32x8(_v256_combine(rounded, _mm_setzero_si128()));
+}
+
+// Rounds two vectors of 4 doubles to int32: `a` fills the low 4 lanes of the
+// result and `b` the high 4 lanes.
+inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b)
+{
+    const __m128i low_half  = _mm256_cvtpd_epi32(a.val);
+    const __m128i high_half = _mm256_cvtpd_epi32(b.val);
+    return v_int32x8(_v256_combine(low_half, high_half));
+}
+
+// Converts 8 floats to int32 with _mm256_cvttps_epi32 (the truncating conversion).
+inline v_int32x8 v_trunc(const v_float32x8& a)
+{
+    const __m256i truncated = _mm256_cvttps_epi32(a.val);
+    return v_int32x8(truncated);
+}
+
+// Truncates the 4 doubles of `a` to int32 and returns them in the low 4 lanes.
+// The upper 4 lanes are explicitly set to zero: a plain
+// _mm256_castsi128_si256 would leave the upper 128 bits undefined.
+inline v_int32x8 v_trunc(const v_float64x4& a)
+{
+    const __m128i truncated = _mm256_cvttpd_epi32(a.val);
+    return v_int32x8(_v256_combine(truncated, _mm_setzero_si128()));
+}
+
+// Floor of 8 floats as int32: round down in floating point first, then convert
+// the already-integral values with the truncating conversion.
+inline v_int32x8 v_floor(const v_float32x8& a)
+{
+    const __m256 floored = _mm256_floor_ps(a.val);
+    return v_int32x8(_mm256_cvttps_epi32(floored));
+}
+
+// Floor of 4 doubles as int32: round down in floating point, then delegate to v_trunc.
+inline v_int32x8 v_floor(const v_float64x4& a)
+{
+    const v_float64x4 floored = v_float64x4(_mm256_floor_pd(a.val));
+    return v_trunc(floored);
+}
+
+// Ceiling of 8 floats as int32: round up in floating point first, then convert
+// the already-integral values with the truncating conversion.
+inline v_int32x8 v_ceil(const v_float32x8& a)
+{
+    const __m256 raised = _mm256_ceil_ps(a.val);
+    return v_int32x8(_mm256_cvttps_epi32(raised));
+}
+
+// Ceiling of 4 doubles as int32: round up in floating point, then delegate to v_trunc.
+inline v_int32x8 v_ceil(const v_float64x4& a)
+{
+    const v_float64x4 raised = v_float64x4(_mm256_ceil_pd(a.val));
+    return v_trunc(raised);
+}
+
+/** To float **/
+// Converts 8 int32 lanes to 8 floats.
+inline v_float32x8 v_cvt_f32(const v_int32x8& a)
+{
+    const __m256 as_float = _mm256_cvtepi32_ps(a.val);
+    return v_float32x8(as_float);
+}
+
+// Narrows the 4 doubles of `a` to float and returns them in the low 4 lanes.
+// The upper 4 lanes are explicitly set to zero: a plain
+// _mm256_castps128_ps256 would leave the upper 128 bits undefined.
+inline v_float32x8 v_cvt_f32(const v_float64x4& a)
+{
+    const __m128 narrowed = _mm256_cvtpd_ps(a.val);
+    return v_float32x8(_v256_combine(narrowed, _mm_setzero_ps()));
+}
+
+// Narrows two vectors of 4 doubles to float: `a` fills the low 4 lanes of the
+// result and `b` the high 4 lanes.
+inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
+{
+    const __m128 low_half  = _mm256_cvtpd_ps(a.val);
+    const __m128 high_half = _mm256_cvtpd_ps(b.val);
+    return v_float32x8(_v256_combine(low_half, high_half));
+}
+
+// Converts the low 128-bit half of `a` (4 int32 lanes) to 4 doubles.
+inline v_float64x4 v_cvt_f64(const v_int32x8& a)
+{
+    const __m128i low_half = _v256_extract_low(a.val);
+    return v_float64x4(_mm256_cvtepi32_pd(low_half));
+}
+
+// Converts the high 128-bit half of `a` (4 int32 lanes) to 4 doubles.
+inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
+{
+    const __m128i high_half = _v256_extract_high(a.val);
+    return v_float64x4(_mm256_cvtepi32_pd(high_half));
+}
+
+// Widens the low 128-bit half of `a` (4 floats) to 4 doubles.
+inline v_float64x4 v_cvt_f64(const v_float32x8& a)
+{
+    const __m128 low_half = _v256_extract_low(a.val);
+    return v_float64x4(_mm256_cvtps_pd(low_half));
+}
+
+// Widens the high 128-bit half of `a` (4 floats) to 4 doubles.
+inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
+{
+    const __m128 high_half = _v256_extract_high(a.val);
+    return v_float64x4(_mm256_cvtps_pd(high_half));
+}
+
+// Converts 4 signed 64-bit integers to 4 doubles without a native 64-bit
+// integer-to-double instruction: the low and high 32-bit halves of every lane
+// are planted into the mantissa of large "magic" doubles, and the magic offsets
+// are then cancelled with floating-point arithmetic.
+// from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x4 v_cvt_f64(const v_int64x4& v)
+{
+    // constants encoded as floating-point
+    __m256i magic_i_lo   = _mm256_set1_epi64x(0x4330000000000000); // 2^52
+    __m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
+    __m256i magic_i_all  = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m256d magic_d_all  = _mm256_castsi256_pd(magic_i_all);
+
+    // Blend the 32 least significant bits of v with magic_i_lo
+    // (mask 0x55 takes the even 32-bit elements, i.e. the low half of each lane, from v)
+    __m256i v_lo         = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
+    // Extract the 32 most significant bits of v
+    __m256i v_hi         = _mm256_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm256_xor_si256(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m256d v_hi_dbl     = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
+    __m256d result       = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
+    return v_float64x4(result);
+}
+
+////////////// Lookup table access ////////////////////
+
+// Table lookup for 32 signed bytes: lane i of the result is tab[idx[i]], i = 0..31.
+// Reads exactly 32 entries from idx; the gather is done with scalar loads.
+inline v_int8x32 v256_lut(const schar* tab, const int* idx)
+{
+    return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                      tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
+                                      tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
+                                      tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
+}
+// Pairwise table lookup: for i = 0..15 the two consecutive bytes starting at
+// tab + idx[i] are fetched as one 16-bit value and placed in bytes 2i, 2i+1
+// of the result. Reads 16 entries from idx.
+// NOTE(review): the bytes are read through a `const short*` cast at an arbitrary
+// byte offset, so this relies on the toolchain tolerating unaligned,
+// type-punned loads - confirm for any new target/compiler.
+inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
+{
+    return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
+                                       *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
+                                       *(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
+                                       *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
+}
+// Quad-wise table lookup: for i = 0..7 the four consecutive bytes starting at
+// tab + idx[i] are gathered as one 32-bit element (gather scale 1, so idx holds
+// byte offsets). Reads 8 entries from idx.
+inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
+{
+    const __m256i byte_offsets = _mm256_loadu_si256((const __m256i*)idx);
+    const __m256i gathered = _mm256_i32gather_epi32((const int*)tab, byte_offsets, 1);
+    return v_int8x32(gathered);
+}
+// Unsigned 8-bit lookups: forward to the schar overloads and reinterpret the lanes.
+inline v_uint8x32 v256_lut(const uchar* tab, const int* idx)
+{
+    const schar* signed_tab = (const schar *)tab;
+    return v_reinterpret_as_u8(v256_lut(signed_tab, idx));
+}
+inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx)
+{
+    const schar* signed_tab = (const schar *)tab;
+    return v_reinterpret_as_u8(v256_lut_pairs(signed_tab, idx));
+}
+inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx)
+{
+    const schar* signed_tab = (const schar *)tab;
+    return v_reinterpret_as_u8(v256_lut_quads(signed_tab, idx));
+}
+
+// Table lookup for 16 signed 16-bit values: lane i of the result is tab[idx[i]],
+// i = 0..15. Reads exactly 16 entries from idx; the gather is done with scalar loads.
+inline v_int16x16 v256_lut(const short* tab, const int* idx)
+{
+    return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                        tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
+}
+// Pairwise lookup: for i = 0..7 the two consecutive shorts starting at tab[idx[i]]
+// are gathered as one 32-bit element (gather scale 2 = sizeof(short)).
+inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
+{
+    const __m256i indices = _mm256_loadu_si256((const __m256i*)idx);
+    const __m256i gathered = _mm256_i32gather_epi32((const int*)tab, indices, 2);
+    return v_int16x16(gathered);
+}
+// Quad-wise lookup: for i = 0..3 the four consecutive shorts starting at
+// tab[idx[i]] are gathered as one 64-bit element (gather scale 2 = sizeof(short)).
+inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
+{
+    const __m128i indices = _mm_loadu_si128((const __m128i*)idx);
+#if defined(__GNUC__)
+    // Looks like the intrinsic has a wrong definition here: the base pointer
+    // has to be passed as `const long long int*`.
+    return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, indices, 2));
+#else
+    return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, indices, 2));
+#endif
+}
+// Unsigned 16-bit lookups: forward to the short overloads and reinterpret the lanes.
+inline v_uint16x16 v256_lut(const ushort* tab, const int* idx)
+{
+    const short* signed_tab = (const short *)tab;
+    return v_reinterpret_as_u16(v256_lut(signed_tab, idx));
+}
+inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx)
+{
+    const short* signed_tab = (const short *)tab;
+    return v_reinterpret_as_u16(v256_lut_pairs(signed_tab, idx));
+}
+inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx)
+{
+    const short* signed_tab = (const short *)tab;
+    return v_reinterpret_as_u16(v256_lut_quads(signed_tab, idx));
+}
+
+// Table lookup for 8 int32 values: lane i is tab[idx[i]] (hardware gather, scale 4).
+inline v_int32x8 v256_lut(const int* tab, const int* idx)
+{
+    const __m256i indices = _mm256_loadu_si256((const __m256i*)idx);
+    return v_int32x8(_mm256_i32gather_epi32(tab, indices, 4));
+}
+// Pairwise lookup: for i = 0..3 the two consecutive ints starting at tab[idx[i]]
+// are gathered as one 64-bit element (gather scale 4 = sizeof(int)).
+inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
+{
+    const __m128i indices = _mm_loadu_si128((const __m128i*)idx);
+#if defined(__GNUC__)
+    // the base pointer has to be passed as `const long long int*` here
+    return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, indices, 4));
+#else
+    return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, indices, 4));
+#endif
+}
+// Quad-wise lookup: two unaligned 128-bit loads of 4 ints each, from
+// tab + idx[0] (low half of the result) and tab + idx[1] (high half).
+inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
+{
+    const __m128i low_quad  = _mm_loadu_si128((const __m128i*)(tab + idx[0]));
+    const __m128i high_quad = _mm_loadu_si128((const __m128i*)(tab + idx[1]));
+    return v_int32x8(_v256_combine(low_quad, high_quad));
+}
+// Unsigned 32-bit lookups: forward to the int overloads and reinterpret the lanes.
+inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx)
+{
+    const int* signed_tab = (const int *)tab;
+    return v_reinterpret_as_u32(v256_lut(signed_tab, idx));
+}
+inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx)
+{
+    const int* signed_tab = (const int *)tab;
+    return v_reinterpret_as_u32(v256_lut_pairs(signed_tab, idx));
+}
+inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx)
+{
+    const int* signed_tab = (const int *)tab;
+    return v_reinterpret_as_u32(v256_lut_quads(signed_tab, idx));
+}
+
+// Table lookup for 4 int64 values: lane i is tab[idx[i]] (hardware gather, scale 8).
+inline v_int64x4 v256_lut(const int64* tab, const int* idx)
+{
+    const __m128i indices = _mm_loadu_si128((const __m128i*)idx);
+#if defined(__GNUC__)
+    // the base pointer has to be passed as `const long long int*` here
+    return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, indices, 8));
+#else
+    return v_int64x4(_mm256_i32gather_epi64(tab, indices, 8));
+#endif
+}
+// Pairwise lookup: two unaligned 128-bit loads of 2 int64 each, from
+// tab + idx[0] (low half of the result) and tab + idx[1] (high half).
+inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
+{
+    const __m128i low_pair  = _mm_loadu_si128((const __m128i*)(tab + idx[0]));
+    const __m128i high_pair = _mm_loadu_si128((const __m128i*)(tab + idx[1]));
+    return v_int64x4(_v256_combine(low_pair, high_pair));
+}
+// Unsigned 64-bit lookups: forward to the int64 overloads and reinterpret the lanes.
+inline v_uint64x4 v256_lut(const uint64* tab, const int* idx)
+{
+    const int64* signed_tab = (const int64 *)tab;
+    return v_reinterpret_as_u64(v256_lut(signed_tab, idx));
+}
+inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx)
+{
+    const int64* signed_tab = (const int64 *)tab;
+    return v_reinterpret_as_u64(v256_lut_pairs(signed_tab, idx));
+}
+
+// Table lookup for 8 floats: lane i is tab[idx[i]] (hardware gather, scale 4).
+inline v_float32x8 v256_lut(const float* tab, const int* idx)
+{
+    const __m256i indices = _mm256_loadu_si256((const __m256i*)idx);
+    return v_float32x8(_mm256_i32gather_ps(tab, indices, 4));
+}
+// Float pair/quad lookups: the table is read through the int overloads and the
+// resulting bits are reinterpreted as floats.
+inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx)
+{
+    const int* bits_tab = (const int *)tab;
+    return v_reinterpret_as_f32(v256_lut_pairs(bits_tab, idx));
+}
+inline v_float32x8 v256_lut_quads(const float* tab, const int* idx)
+{
+    const int* bits_tab = (const int *)tab;
+    return v_reinterpret_as_f32(v256_lut_quads(bits_tab, idx));
+}
+
+// Table lookup for 4 doubles: lane i is tab[idx[i]] (hardware gather, scale 8).
+inline v_float64x4 v256_lut(const double* tab, const int* idx)
+{
+    const __m128i indices = _mm_loadu_si128((const __m128i*)idx);
+    return v_float64x4(_mm256_i32gather_pd(tab, indices, 8));
+}
+// Pairwise lookup: two unaligned loads of 2 doubles each, from tab + idx[0]
+// (low half of the result) and tab + idx[1] (high half).
+inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx)
+{
+    const __m128d low_pair  = _mm_loadu_pd(tab + idx[0]);
+    const __m128d high_pair = _mm_loadu_pd(tab + idx[1]);
+    return v_float64x4(_v256_combine(low_pair, high_pair));
+}
+
+// Gather with indices supplied as a vector: lane i is tab[idxvec[i]] (scale 4).
+inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
+{
+    const __m256i gathered = _mm256_i32gather_epi32(tab, idxvec.val, 4);
+    return v_int32x8(gathered);
+}
+
+// Unsigned variant: forwards to the int overload and reinterprets the lanes.
+inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec)
+{
+    const int* signed_tab = (const int *)tab;
+    return v_reinterpret_as_u32(v_lut(signed_tab, idxvec));
+}
+
+// Gather 8 floats: lane i is tab[idxvec[i]] (scale 4).
+inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
+{
+    const __m256 gathered = _mm256_i32gather_ps(tab, idxvec.val, 4);
+    return v_float32x8(gathered);
+}
+
+// Gather 4 doubles: only the low 4 indices of idxvec are used (scale 8).
+inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
+{
+    const __m128i low_indices = _mm256_castsi256_si128(idxvec.val);
+    return v_float64x4(_mm256_i32gather_pd(tab, low_indices, 8));
+}
+
+// Loads 8 adjacent (x, y) float pairs, pair i from tab + idxvec[i], and splits
+// them: on return x holds the first element of every pair and y the second,
+// both in index order 0..7.
+inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
+{
+    // spill the indices so each one can be used as a scalar offset
+    int CV_DECL_ALIGNED(32) idx[8];
+    v_store_aligned(idx, idxvec);
+    __m128 z = _mm_setzero_ps();
+    __m128 xy01, xy45, xy23, xy67;
+    // each 128-bit register receives two 64-bit (x, y) pairs: low half, then high half
+    xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
+    xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
+    xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
+    xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
+    // 256-bit layout: [x0 y0 x1 y1 | x4 y4 x5 y5]
+    __m256 xy0145 = _v256_combine(xy01, xy45);
+    xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
+    xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
+    xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
+    xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
+    // 256-bit layout: [x2 y2 x3 y3 | x6 y6 x7 y7]
+    __m256 xy2367 = _v256_combine(xy23, xy67);
+
+    // first unpack round (per 128-bit lane):
+    //   xxyy0145 = [x0 x2 y0 y2 | x4 x6 y4 y6], xxyy2367 = [x1 x3 y1 y3 | x5 x7 y5 y7]
+    __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
+    __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
+
+    // second unpack round puts all x values (low) and all y values (high) in order
+    x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
+    y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
+}
+
+// Loads 4 adjacent (x, y) double pairs, pair i from tab + idxvec[i] (only the
+// low 4 indices are used), and splits them: x receives the first element of
+// every pair and y the second, both in index order 0..3.
+inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_low(idx, idxvec);
+
+    const __m128d pair0 = _mm_loadu_pd(tab + idx[0]);
+    const __m128d pair1 = _mm_loadu_pd(tab + idx[1]);
+    const __m128d pair2 = _mm_loadu_pd(tab + idx[2]);
+    const __m128d pair3 = _mm_loadu_pd(tab + idx[3]);
+
+    // [x0 y0 | x2 y2] and [x1 y1 | x3 y3]
+    const __m256d even_pairs = _v256_combine(pair0, pair2);
+    const __m256d odd_pairs  = _v256_combine(pair1, pair3);
+
+    // per-lane unpack: low picks the x values, high picks the y values
+    x = v_float64x4(_mm256_unpacklo_pd(even_pairs, odd_pairs));
+    y = v_float64x4(_mm256_unpackhi_pd(even_pairs, odd_pairs));
+}
+
+// Byte shuffle applied to every 4-byte group: order 0,1,2,3 becomes 0,2,1,3.
+// The same 16-byte control is used for both 128-bit lanes.
+inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
+{
+    const __m256i byte_order = _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200,
+                                                 0x0f0d0e0c0b090a08, 0x0705060403010200);
+    return v_int8x32(_mm256_shuffle_epi8(vec.val, byte_order));
+}
+// Unsigned variant: reinterpret, shuffle, reinterpret back.
+inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec)
+{
+    const v_int8x32 shuffled = v_interleave_pairs(v_reinterpret_as_s8(vec));
+    return v_reinterpret_as_u8(shuffled);
+}
+// Byte shuffle applied to every 8-byte group: order 0..7 becomes 0,4,1,5,2,6,3,7.
+// The same 16-byte control is used for both 128-bit lanes.
+inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
+{
+    const __m256i byte_order = _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400,
+                                                 0x0f0b0e0a0d090c08, 0x0703060205010400);
+    return v_int8x32(_mm256_shuffle_epi8(vec.val, byte_order));
+}
+// Unsigned variant: reinterpret, shuffle, reinterpret back.
+inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec)
+{
+    const v_int8x32 shuffled = v_interleave_quads(v_reinterpret_as_s8(vec));
+    return v_reinterpret_as_u8(shuffled);
+}
+
+// 16-bit shuffle applied to every group of four elements: order 0,1,2,3 becomes
+// 0,2,1,3 (implemented as a byte shuffle, same control for both 128-bit lanes).
+inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
+{
+    const __m256i byte_order = _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100,
+                                                 0x0f0e0b0a0d0c0908, 0x0706030205040100);
+    return v_int16x16(_mm256_shuffle_epi8(vec.val, byte_order));
+}
+// Unsigned variant: reinterpret, shuffle, reinterpret back.
+inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec)
+{
+    const v_int16x16 shuffled = v_interleave_pairs(v_reinterpret_as_s16(vec));
+    return v_reinterpret_as_u16(shuffled);
+}
+// 16-bit shuffle applied to every group of eight elements: order 0..7 becomes
+// 0,4,1,5,2,6,3,7 (implemented as a byte shuffle, same control for both lanes).
+inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
+{
+    const __m256i byte_order = _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100,
+                                                 0x0f0e07060d0c0504, 0x0b0a030209080100);
+    return v_int16x16(_mm256_shuffle_epi8(vec.val, byte_order));
+}
+// Unsigned variant: reinterpret, shuffle, reinterpret back.
+inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec)
+{
+    const v_int16x16 shuffled = v_interleave_quads(v_reinterpret_as_s16(vec));
+    return v_reinterpret_as_u16(shuffled);
+}
+
+// 32-bit shuffle inside each 128-bit lane: element order 0,1,2,3 becomes 0,2,1,3.
+inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
+{
+    const __m256i shuffled = _mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
+    return v_int32x8(shuffled);
+}
+// Unsigned and float variants: reinterpret as s32, shuffle, reinterpret back.
+inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec)
+{
+    const v_int32x8 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec));
+    return v_reinterpret_as_u32(shuffled);
+}
+inline v_float32x8 v_interleave_pairs(const v_float32x8& vec)
+{
+    const v_int32x8 shuffled = v_interleave_pairs(v_reinterpret_as_s32(vec));
+    return v_reinterpret_as_f32(shuffled);
+}
+
+// Compacts 8-bit data stored as groups of four into groups of three: within
+// each 128-bit lane the bytes 0,1,2, 4,5,6, 8,9,10, 12,13,14 are moved to the
+// front, then the three useful 32-bit words of each lane are packed together
+// across lanes. The trailing lanes of the result carry leftover values.
+inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
+{
+    const __m256i in_lane_order = _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100));
+    const __m256i cross_lane_order = _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000);
+    const __m256i compacted = _mm256_shuffle_epi8(vec.val, in_lane_order);
+    return v_int8x32(_mm256_permutevar8x32_epi32(compacted, cross_lane_order));
+}
+// Unsigned variant: reinterpret, pack, reinterpret back.
+inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec)
+{
+    const v_int8x32 packed = v_pack_triplets(v_reinterpret_as_s8(vec));
+    return v_reinterpret_as_u8(packed);
+}
+
+// Compacts 16-bit data stored as groups of four into groups of three: within
+// each 128-bit lane elements 0,1,2 and 4,5,6 are moved to the front, then the
+// three useful 32-bit words of each lane are packed together across lanes.
+// The trailing lanes of the result carry leftover values.
+inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
+{
+    const __m256i in_lane_order = _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100));
+    const __m256i cross_lane_order = _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000);
+    const __m256i compacted = _mm256_shuffle_epi8(vec.val, in_lane_order);
+    return v_int16x16(_mm256_permutevar8x32_epi32(compacted, cross_lane_order));
+}
+// Unsigned variant: reinterpret, pack, reinterpret back.
+inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec)
+{
+    const v_int16x16 packed = v_pack_triplets(v_reinterpret_as_s16(vec));
+    return v_reinterpret_as_u16(packed);
+}
+
+// Compacts 32-bit data stored as groups of four into groups of three:
+// result lanes are source lanes 0,1,2,4,5,6 followed by lane 7 twice.
+inline v_int32x8 v_pack_triplets(const v_int32x8& vec)
+{
+    const __m256i lane_order = _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000);
+    return v_int32x8(_mm256_permutevar8x32_epi32(vec.val, lane_order));
+}
+// Unsigned variant: reinterpret, pack, reinterpret back.
+inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec)
+{
+    const v_int32x8 packed = v_pack_triplets(v_reinterpret_as_s32(vec));
+    return v_reinterpret_as_u32(packed);
+}
+// Float variant: same lane order, using the float permute.
+inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
+{
+    const __m256i lane_order = _mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000);
+    return v_float32x8(_mm256_permutevar8x32_ps(vec.val, lane_order));
+}
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+
+// 16 >> 32
+// Each 32-bit result lane is a[2i]*b[2i] + a[2i+1]*b[2i+1] (_mm256_madd_epi16).
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i pair_sums = _mm256_madd_epi16(a.val, b.val);
+    return v_int32x8(pair_sums);
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{
+    const v_int32x8 pair_sums = v_dotprod(a, b);
+    return pair_sums + c;
+}
+
+// 32 >> 64
+// Each 64-bit result lane is a[2i]*b[2i] + a[2i+1]*b[2i+1], with the products
+// computed as signed 32x32 -> 64 bit multiplies.
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
+{
+    // _mm256_mul_epi32 multiplies the low 32 bits of every 64-bit lane, so the
+    // odd elements are first shifted down into that position.
+    const __m256i a_odd = _mm256_srli_epi64(a.val, 32);
+    const __m256i b_odd = _mm256_srli_epi64(b.val, 32);
+    const __m256i even_products = _mm256_mul_epi32(a.val, b.val);
+    const __m256i odd_products  = _mm256_mul_epi32(a_odd, b_odd);
+    return v_int64x4(_mm256_add_epi64(even_products, odd_products));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{
+    const v_int64x4 pair_sums = v_dotprod(a, b);
+    return pair_sums + c;
+}
+
+// 8 >> 32
+// Unsigned 8-bit dot product widened to 32 bit: each result lane is the sum of
+// the four products a[4i+k]*b[4i+k], k = 0..3.
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const __m256i odd_byte_mask = _mm256_set1_epi32(0xFF00FF00);
+    const __m256i zero = _mm256_setzero_si256();
+
+    // even-indexed bytes as 16-bit values (odd bytes blended to zero)
+    const __m256i a_even = _mm256_blendv_epi8(a.val, zero, odd_byte_mask);
+    const __m256i b_even = _mm256_blendv_epi8(b.val, zero, odd_byte_mask);
+    // odd-indexed bytes as 16-bit values (shifted down into the low byte)
+    const __m256i a_odd = _mm256_srli_epi16(a.val, 8);
+    const __m256i b_odd = _mm256_srli_epi16(b.val, 8);
+
+    const __m256i even_sums = _mm256_madd_epi16(a_even, b_even);
+    const __m256i odd_sums  = _mm256_madd_epi16(a_odd, b_odd);
+    return v_uint32x8(_mm256_add_epi32(even_sums, odd_sums));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{
+    const v_uint32x8 quad_sums = v_dotprod_expand(a, b);
+    return quad_sums + c;
+}
+
+// Signed 8-bit dot product widened to 32 bit: each result lane is the sum of
+// the four products a[4i+k]*b[4i+k], k = 0..3.
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
+{
+    // even-indexed bytes: move them into the high byte of each 16-bit slot,
+    // then sign-extend with an arithmetic shift
+    const __m256i a_even = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
+    const __m256i b_even = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
+    // odd-indexed bytes already sit in the high byte
+    const __m256i a_odd = _mm256_srai_epi16(a.val, 8);
+    const __m256i b_odd = _mm256_srai_epi16(b.val, 8);
+
+    const __m256i even_sums = _mm256_madd_epi16(a_even, b_even);
+    const __m256i odd_sums  = _mm256_madd_epi16(a_odd, b_odd);
+    return v_int32x8(_mm256_add_epi32(even_sums, odd_sums));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{
+    const v_int32x8 quad_sums = v_dotprod_expand(a, b);
+    return quad_sums + c;
+}
+
+// 16 >> 64
+// Unsigned 16-bit dot product widened to 64 bit: each result lane is the sum of
+// the four consecutive products a[4i+k]*b[4i+k], k = 0..3.
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
+{
+    // full 32-bit products, rebuilt from their low and high 16-bit halves
+    const __m256i prod_lo16 = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i prod_hi16 = _mm256_mulhi_epu16(a.val, b.val);
+    const __m256i prods_first  = _mm256_unpacklo_epi16(prod_lo16, prod_hi16);
+    const __m256i prods_second = _mm256_unpackhi_epi16(prod_lo16, prod_hi16);
+
+    // zero-extend to 64 bit: even products via blend with zero, odd ones via shift
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i first_even  = _mm256_blend_epi32(prods_first, zero, 0xAA);
+    const __m256i first_odd   = _mm256_srli_epi64(prods_first, 32);
+    const __m256i second_even = _mm256_blend_epi32(prods_second, zero, 0xAA);
+    const __m256i second_odd  = _mm256_srli_epi64(prods_second, 32);
+
+    const __m256i first_pairs  = _mm256_add_epi64(first_even, first_odd);
+    const __m256i second_pairs = _mm256_add_epi64(second_even, second_odd);
+
+    // regroup so that neighbouring pair sums end up in the same lane
+    const __m256i lower = _mm256_unpacklo_epi64(first_pairs, second_pairs);
+    const __m256i upper = _mm256_unpackhi_epi64(first_pairs, second_pairs);
+    return v_uint64x4(_mm256_add_epi64(lower, upper));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{
+    const v_uint64x4 quad_sums = v_dotprod_expand(a, b);
+    return quad_sums + c;
+}
+
+// Signed 16-bit dot product widened to 64 bit: each result lane is the sum of
+// the four consecutive products a[4i+k]*b[4i+k], k = 0..3.
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
+{
+    // 32-bit sums of adjacent product pairs, plus their sign words for widening
+    const __m256i pair_sums = _mm256_madd_epi16(a.val, b.val);
+    const __m256i sign_words = _mm256_srai_epi32(pair_sums, 31);
+
+    // sign-extend to 64 bit by interleaving each value with its sign word
+    const __m256i wide_first  = _mm256_unpacklo_epi32(pair_sums, sign_words);
+    const __m256i wide_second = _mm256_unpackhi_epi32(pair_sums, sign_words);
+
+    // regroup so that neighbouring pair sums end up in the same lane
+    const __m256i lower = _mm256_unpacklo_epi64(wide_first, wide_second);
+    const __m256i upper = _mm256_unpackhi_epi64(wide_first, wide_second);
+    return v_int64x4(_mm256_add_epi64(lower, upper));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{
+    const v_int64x4 quad_sums = v_dotprod_expand(a, b);
+    return quad_sums + c;
+}
+
+// 32 >> 64f
+// 32-bit dot product returned as doubles: the 64-bit integer result of
+// v_dotprod(a, b) is converted with v_cvt_f64.
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int64x4 pair_sums = v_dotprod(a, b);
+    return v_cvt_f64(pair_sums);
+}
+// Same as above with the accumulator c added lane-wise (in double precision).
+inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{
+    const v_float64x4 pair_sums = v_dotprod_expand(a, b);
+    return pair_sums + c;
+}
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+// The "fast" 16 >> 32 dot products simply forward to v_dotprod.
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_int32x8 result = v_dotprod(a, b);
+    return result;
+}
+inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
+{
+    const v_int32x8 result = v_dotprod(a, b, c);
+    return result;
+}
+
+// 32 >> 64
+// The "fast" 32 >> 64 dot products simply forward to v_dotprod.
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int64x4 result = v_dotprod(a, b);
+    return result;
+}
+inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
+{
+    const v_int64x4 result = v_dotprod(a, b, c);
+    return result;
+}
+
+// 8 >> 32
+// The "fast" 8 >> 32 dot products simply forward to v_dotprod_expand.
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
+{
+    const v_uint32x8 result = v_dotprod_expand(a, b);
+    return result;
+}
+inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
+{
+    const v_uint32x8 result = v_dotprod_expand(a, b, c);
+    return result;
+}
+
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
+{
+    const v_int32x8 result = v_dotprod_expand(a, b);
+    return result;
+}
+inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
+{
+    const v_int32x8 result = v_dotprod_expand(a, b, c);
+    return result;
+}
+
+// 16 >> 64
+// Unsigned 16-bit dot product widened to 64 bit, without the final regrouping
+// step of v_dotprod_expand: the four products summed into one result lane are
+// therefore not four consecutive ones.
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
+{
+    // full 32-bit products, rebuilt from their low and high 16-bit halves
+    const __m256i prod_lo16 = _mm256_mullo_epi16(a.val, b.val);
+    const __m256i prod_hi16 = _mm256_mulhi_epu16(a.val, b.val);
+    const __m256i prods_first  = _mm256_unpacklo_epi16(prod_lo16, prod_hi16);
+    const __m256i prods_second = _mm256_unpackhi_epi16(prod_lo16, prod_hi16);
+
+    // zero-extend to 64 bit: even products via blend with zero, odd ones via shift
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i first_even  = _mm256_blend_epi32(prods_first, zero, 0xAA);
+    const __m256i first_odd   = _mm256_srli_epi64(prods_first, 32);
+    const __m256i second_even = _mm256_blend_epi32(prods_second, zero, 0xAA);
+    const __m256i second_odd  = _mm256_srli_epi64(prods_second, 32);
+
+    const __m256i first_pairs  = _mm256_add_epi64(first_even, first_odd);
+    const __m256i second_pairs = _mm256_add_epi64(second_even, second_odd);
+
+    return v_uint64x4(_mm256_add_epi64(first_pairs, second_pairs));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
+{
+    const v_uint64x4 sums = v_dotprod_expand_fast(a, b);
+    return sums + c;
+}
+
+// Signed 16-bit dot product widened to 64 bit, without the final regrouping
+// step of v_dotprod_expand: the sign-extended pair sums are added as they
+// come out of the unpack, so a result lane does not cover four consecutive products.
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i pair_sums  = _mm256_madd_epi16(a.val, b.val);
+    const __m256i sign_words = _mm256_srai_epi32(pair_sums, 31);
+    // sign-extend to 64 bit by interleaving each value with its sign word
+    const __m256i wide_first  = _mm256_unpacklo_epi32(pair_sums, sign_words);
+    const __m256i wide_second = _mm256_unpackhi_epi32(pair_sums, sign_words);
+    return v_int64x4(_mm256_add_epi64(wide_first, wide_second));
+}
+// Same as above with the accumulator c added lane-wise.
+inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
+{
+    const v_int64x4 sums = v_dotprod_expand_fast(a, b);
+    return sums + c;
+}
+
+// 32 >> 64f
+// The "fast" 32 >> 64f dot products simply forward to v_dotprod_expand.
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_float64x4 result = v_dotprod_expand(a, b);
+    return result;
+}
+inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
+{
+    const v_float64x4 result = v_dotprod_expand(a, b, c);
+    return result;
+}
+
+// Broadcasts element `im` (0..3) of each 128-bit lane of the float vector `a`
+// to all four elements of that lane. The argument `a` is parenthesized so that
+// the macro also expands correctly when it is passed an expression.
+#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
+    v_float32x8(_mm256_permute_ps((a).val, _MM_SHUFFLE(im, im, im, im)))
+
+// Per 128-bit lane: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, where
+// v[k] is element k of that lane of `v` broadcast across the lane.
+inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
+                            const v_float32x8& m1, const v_float32x8& m2,
+                            const v_float32x8& m3)
+{
+    const v_float32x8 bcast0 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+    const v_float32x8 bcast1 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+    const v_float32x8 bcast2 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+    const v_float32x8 bcast3 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
+
+    // accumulate from the innermost term outwards
+    v_float32x8 acc = bcast3 * m3;
+    acc = v_fma(bcast2, m2, acc);
+    acc = v_fma(bcast1, m1, acc);
+    return v_fma(bcast0, m0, acc);
+}
+
+// Per 128-bit lane: result = v[0]*m0 + v[1]*m1 + v[2]*m2 + a, where v[k] is
+// element k of that lane of `v` broadcast across the lane (element 3 is unused).
+inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
+                               const v_float32x8& m1, const v_float32x8& m2,
+                               const v_float32x8& a)
+{
+    const v_float32x8 bcast0 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
+    const v_float32x8 bcast1 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
+    const v_float32x8 bcast2 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
+
+    // accumulate from the innermost term outwards
+    v_float32x8 acc = v_fma(bcast2, m2, a);
+    acc = v_fma(bcast1, m1, acc);
+    return v_fma(bcast0, m0, acc);
+}
+
+// Generates v_transpose4x4 for a vector type with 32-bit elements. The 4x4
+// transpose is performed independently inside each 128-bit lane: a 32-bit
+// unpack of row pairs followed by a 64-bit unpack. cast_from / cast_to convert
+// between the vector's native register type and __m256i for the 64-bit step.
+#define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to)    \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
+                               const _Tpvec& a2, const _Tpvec& a3,              \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
+    {                                                                           \
+        const __m256i lo01 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
+        const __m256i lo23 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
+        const __m256i hi01 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
+        const __m256i hi23 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
+        b0.val = cast_to(_mm256_unpacklo_epi64(lo01, lo23));                    \
+        b1.val = cast_to(_mm256_unpackhi_epi64(lo01, lo23));                    \
+        b2.val = cast_to(_mm256_unpacklo_epi64(hi01, hi23));                    \
+        b3.val = cast_to(_mm256_unpackhi_epi64(hi01, hi23));                    \
+    }
+
+// Integer vectors need no register cast; the float vector is cast to/from __m256i.
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
+
+//////////////// Value reordering ///////////////
+
+/* Expand */
+// Generates the widening helpers for one narrow/wide vector type pair:
+//   v_expand_low / v_expand_high - widen the low / high 128-bit half of `a`
+//   v_expand                     - both halves at once, into b0 (low) and b1 (high)
+//   v256_load_expand             - unaligned 128-bit load from ptr, then widen
+// `intrin` is the widening conversion applied to a 128-bit half.
+#define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
+    {                                                               \
+        return _Tpwvec(intrin(_v256_extract_low(a.val)));           \
+    }                                                               \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
+    {                                                               \
+        return _Tpwvec(intrin(_v256_extract_high(a.val)));          \
+    }                                                               \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+    {                                                               \
+        b0.val = intrin(_v256_extract_low(a.val));                  \
+        b1.val = intrin(_v256_extract_high(a.val));                 \
+    }                                                               \
+    inline _Tpwvec v256_load_expand(const _Tp* ptr)                 \
+    {                                                               \
+        const __m128i narrow = _mm_loadu_si128((const __m128i*)ptr); \
+        return _Tpwvec(intrin(narrow));                             \
+    }
+
+// 8 -> 16, 16 -> 32 and 32 -> 64 bit, zero-extending for unsigned and
+// sign-extending for signed element types.
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    _mm256_cvtepu8_epi16)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32,   v_int16x16,  schar,    _mm256_cvtepi8_epi16)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   _mm256_cvtepu16_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64)
+OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64)
+
+// Generates v256_load_expand_q: loads 64 bits (8 elements of type _Tp) from ptr
+// and widens them to the 8 lanes of _Tpvec using `intrin`.
+#define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin)           \
+    inline _Tpvec v256_load_expand_q(const _Tp* ptr)                \
+    {                                                               \
+        const __m128i narrow = _mm_loadl_epi64((const __m128i*)ptr); \
+        return _Tpvec(intrin(narrow));                              \
+    }
+
+// 8 -> 32 bit: zero-extending for uchar, sign-extending for schar.
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
+OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8,  schar, _mm256_cvtepi8_epi32)
+
+/* pack */
+// 16
+// Packs two signed 16-bit vectors into one signed 8-bit vector with signed
+// saturation (_mm256_packs_epi16). The pack works per 128-bit lane, so the
+// 64-bit blocks are reordered afterwards to put a's values before b's.
+inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i lane_packed = _mm256_packs_epi16(a.val, b.val);
+    return v_int8x32(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Packs two unsigned 16-bit vectors into one unsigned 8-bit vector, saturating
+// at 255. The inputs are clamped first because _mm256_packus_epi16 interprets
+// its inputs as signed 16-bit values.
+inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
+{
+    const __m256i limit = _mm256_set1_epi16(255);
+    const __m256i a_clamped = _mm256_min_epu16(a.val, limit);
+    const __m256i b_clamped = _mm256_min_epu16(b.val, limit);
+    const __m256i lane_packed = _mm256_packus_epi16(a_clamped, b_clamped);
+    return v_uint8x32(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Packs two signed 16-bit vectors into one unsigned 8-bit vector with unsigned
+// saturation (_mm256_packus_epi16), then fixes the per-lane ordering.
+inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    const __m256i lane_packed = _mm256_packus_epi16(a.val, b.val);
+    return v_uint8x32(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Packs `a` to signed 8 bit and stores the low half of the packed vector to ptr.
+inline void v_pack_store(schar* ptr, const v_int16x16& a)
+{
+    const v_int8x32 packed = v_pack(a, a);
+    v_store_low(ptr, packed);
+}
+
+// Packs `a` to unsigned 8 bit (saturating at 255) and stores the low half of
+// the packed vector to ptr. Packing `a` with itself through v_pack performs
+// the same clamp / pack / reorder sequence.
+inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    const v_uint8x32 packed = v_pack(a, a);
+    v_store_low(ptr, packed);
+}
+
+// Packs signed 16-bit `a` to unsigned 8 bit and stores the low half of the
+// packed vector to ptr.
+inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
+{
+    const v_uint8x32 packed = v_pack_u(a, a);
+    v_store_low(ptr, packed);
+}
+
+// Rounding right shift by n followed by a pack to unsigned 8 bit:
+// (x + (1 << (n-1))) >> n for every lane of a and b.
+template<int n> inline
+v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
+{
+    // n > 0 is assumed, so after the shift every 16-bit value can be treated as signed.
+    const v_uint16x16 rounding = v256_setall_u16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = v_reinterpret_as_s16((a + rounding) >> n);
+    const v_int16x16 b_shifted = v_reinterpret_as_s16((b + rounding) >> n);
+    return v_pack_u(a_shifted, b_shifted);
+}
+
+// Store variant: same rounding shift applied to `a`, result written through v_pack_u_store.
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
+{
+    const v_uint16x16 rounding = v256_setall_u16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = v_reinterpret_as_s16((a + rounding) >> n);
+    v_pack_u_store(ptr, a_shifted);
+}
+
+// Rounding right shift by n of signed 16-bit lanes, packed to unsigned 8 bit.
+template<int n> inline
+v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_int16x16 rounding = v256_setall_s16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = (a + rounding) >> n;
+    const v_int16x16 b_shifted = (b + rounding) >> n;
+    return v_pack_u(a_shifted, b_shifted);
+}
+
+// Store variant: same rounding shift applied to `a`, result written through v_pack_u_store.
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
+{
+    const v_int16x16 rounding = v256_setall_s16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = (a + rounding) >> n;
+    v_pack_u_store(ptr, a_shifted);
+}
+
+// Rounding right shift by n of signed 16-bit lanes, packed to signed 8 bit.
+template<int n> inline
+v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
+{
+    const v_int16x16 rounding = v256_setall_s16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = (a + rounding) >> n;
+    const v_int16x16 b_shifted = (b + rounding) >> n;
+    return v_pack(a_shifted, b_shifted);
+}
+
+// Store variant: same rounding shift applied to `a`, result written through v_pack_store.
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
+{
+    const v_int16x16 rounding = v256_setall_s16((short)(1 << (n-1)));
+    const v_int16x16 a_shifted = (a + rounding) >> n;
+    v_pack_store(ptr, a_shifted);
+}
+
+// 32
+// Signed 32 -> signed 16 bit pack with signed saturation; the 64-bit blocks are
+// reordered afterwards because the pack instruction works per 128-bit lane.
+inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
+{
+    const __m256i lane_packed = _mm256_packs_epi32(a.val, b.val);
+    return v_int16x16(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Unsigned 32 -> unsigned 16 bit pack via the _v256_packs_epu32 helper,
+// followed by the same reordering.
+inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
+{
+    const __m256i lane_packed = _v256_packs_epu32(a.val, b.val);
+    return v_uint16x16(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Signed 32 -> unsigned 16 bit pack with unsigned saturation, followed by the
+// same reordering.
+inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
+{
+    const __m256i lane_packed = _mm256_packus_epi32(a.val, b.val);
+    return v_uint16x16(_v256_shuffle_odd_64(lane_packed));
+}
+
+// Packs `a` to signed 16 bit and stores the low half of the packed vector to ptr.
+inline void v_pack_store(short* ptr, const v_int32x8& a)
+{
+    const v_int16x16 packed = v_pack(a, a);
+    v_store_low(ptr, packed);
+}
+
+// Packs `a` to unsigned 16 bit (saturating at 65535) and stores the low half of
+// the packed vector to ptr. The clamp comes first because _mm256_packus_epi32
+// interprets its inputs as signed 32-bit values.
+inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    const __m256i limit = _mm256_set1_epi32(65535);
+    const __m256i clamped = _mm256_min_epu32(a.val, limit);
+    const __m256i lane_packed = _mm256_packus_epi32(clamped, clamped);
+    v_store_low(ptr, v_uint16x16(_v256_shuffle_odd_64(lane_packed)));
+}
+
+// Packs signed 32-bit `a` to unsigned 16 bit and stores the low half of the
+// packed vector to ptr.
+inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
+{
+    const v_uint16x16 packed = v_pack_u(a, a);
+    v_store_low(ptr, packed);
+}
+
+
+// Rounding right shift by n followed by a pack to unsigned 16 bit:
+// (x + (1 << (n-1))) >> n for every lane of a and b.
+template<int n> inline
+v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
+{
+    // n > 0 is assumed, so after the shift every 32-bit value can be treated as signed.
+    const v_uint32x8 rounding = v256_setall_u32(1 << (n-1));
+    const v_int32x8 a_shifted = v_reinterpret_as_s32((a + rounding) >> n);
+    const v_int32x8 b_shifted = v_reinterpret_as_s32((b + rounding) >> n);
+    return v_pack_u(a_shifted, b_shifted);
+}
+
+// Store variant: same rounding shift applied to `a`, result written through v_pack_u_store.
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
+{
+    const v_uint32x8 rounding = v256_setall_u32(1 << (n-1));
+    const v_int32x8 a_shifted = v_reinterpret_as_s32((a + rounding) >> n);
+    v_pack_u_store(ptr, a_shifted);
+}
+
+// Rounding right shift of two int32 vectors by n bits, then narrowing to uint16
+// with v_pack_u (unsigned saturation).
+template<int n> inline
+v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 half = v256_setall_s32(1 << (n-1));
+    const v_int32x8 first = (a + half) >> n;
+    const v_int32x8 second = (b + half) >> n;
+    return v_pack_u(first, second);
+}
+
+// Rounding right shift of one int32 vector by n bits; the result is written to ptr
+// through v_pack_u_store.
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
+{
+    const v_int32x8 half = v256_setall_s32(1 << (n-1));
+    const v_int32x8 rounded = (a + half) >> n;
+    v_pack_u_store(ptr, rounded);
+}
+
+// Rounding right shift of two int32 vectors by n bits, then narrowing to int16
+// with v_pack (signed saturation).
+template<int n> inline
+v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
+{
+    const v_int32x8 half = v256_setall_s32(1 << (n-1));
+    const v_int32x8 first = (a + half) >> n;
+    const v_int32x8 second = (b + half) >> n;
+    return v_pack(first, second);
+}
+
+// Rounding right shift of one int32 vector by n bits; the result is written to ptr
+// through v_pack_store.
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x8& a)
+{
+    const v_int32x8 half = v256_setall_s32(1 << (n-1));
+    const v_int32x8 rounded = (a + half) >> n;
+    v_pack_store(ptr, rounded);
+}
+
+// 64
+// Non-saturating pack
+// Truncating (non-saturating) narrowing of two uint64 vectors: keeps the low
+// 32 bits of every 64-bit element.
+inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    // Move the low dwords of both 64-bit elements of each lane into that lane's low 64 bits.
+    const __m256i a_low = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    const __m256i b_low = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
+    // Result per lane: a0, a1, b0, b1 | a2, a3, b2, b3
+    const __m256i mixed = _mm256_unpacklo_epi64(a_low, b_low);
+    return v_uint32x8(_v256_shuffle_odd_64(mixed));
+}
+
+// Signed 64-bit variant: reinterprets both inputs as unsigned, reuses the uint64
+// v_pack and reinterprets the result back to signed 32-bit lanes.
+inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
+{
+    const v_uint64x4 ua = v_reinterpret_as_u64(a);
+    const v_uint64x4 ub = v_reinterpret_as_u64(b);
+    return v_reinterpret_as_s32(v_pack(ua, ub));
+}
+
+// Keeps the low 32 bits of each uint64 element of a and stores the low half of the
+// reordered vector to ptr.
+inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    // Gather the low dwords of each lane's two elements into the lane's low 64 bits.
+    const __m256i low_dwords = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
+    const v_uint32x8 packed(_v256_shuffle_odd_64(low_dwords));
+    v_store_low(ptr, packed);
+}
+
+// Signed 64-bit variant: forwards to the unsigned overload after reinterpreting
+// both the destination pointer and the vector.
+inline void v_pack_store(int* ptr, const v_int64x4& b)
+{
+    unsigned* const dst = (unsigned*)ptr;
+    v_pack_store(dst, v_reinterpret_as_u64(b));
+}
+
+// Rounding right shift of two uint64 vectors by n bits, then truncating v_pack to uint32.
+template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
+{
+    const v_uint64x4 half = v256_setall_u64((uint64)1 << (n-1));
+    const v_uint64x4 first = (a + half) >> n;
+    const v_uint64x4 second = (b + half) >> n;
+    return v_pack(first, second);
+}
+
+// Rounding right shift of one uint64 vector by n bits; the result is written to ptr
+// through v_pack_store.
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
+{
+    const v_uint64x4 half = v256_setall_u64((uint64)1 << (n-1));
+    const v_uint64x4 rounded = (a + half) >> n;
+    v_pack_store(ptr, rounded);
+}
+
+// Rounding right shift of two int64 vectors by n bits, then truncating v_pack to int32.
+template<int n> inline
+v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
+{
+    const v_int64x4 half = v256_setall_s64((int64)1 << (n-1));
+    const v_int64x4 first = (a + half) >> n;
+    const v_int64x4 second = (b + half) >> n;
+    return v_pack(first, second);
+}
+
+// Rounding right shift of one int64 vector by n bits; the result is written to ptr
+// through v_pack_store.
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x4& a)
+{
+    const v_int64x4 half = v256_setall_s64((int64)1 << (n-1));
+    const v_int64x4 rounded = (a + half) >> n;
+    v_pack_store(ptr, rounded);
+}
+
+// pack boolean
+// Packs two 16-bit vectors into one 8-bit vector with signed saturation.
+// NOTE(review): presumably the inputs are comparison masks (all-ones / all-zeros
+// lanes), for which signed saturation preserves the mask value - confirm with callers.
+inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b)
+{
+    return v_uint8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val)));
+}
+
+// Packs four 32-bit vectors into one 8-bit vector with two rounds of signed-saturating
+// packs, followed by reordering steps that compensate for the per-lane pack behaviour.
+// NOTE(review): presumably the inputs are comparison masks - confirm with callers.
+inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b,
+                           const v_uint32x8& c, const v_uint32x8& d)
+{
+    // 32 -> 16 bits
+    const __m256i lo16 = _mm256_packs_epi32(a.val, b.val);
+    const __m256i hi16 = _mm256_packs_epi32(c.val, d.val);
+
+    // 16 -> 8 bits, then reorder 64-bit quarters and 32-bit groups
+    const __m256i bytes = _v256_shuffle_odd_64(_mm256_packs_epi16(lo16, hi16));
+    return v_uint8x32(_mm256_shuffle_epi32(bytes, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+// Packs eight 64-bit vectors into one 8-bit vector using three rounds of
+// signed-saturating packs followed by a reordering step.
+// NOTE(review): presumably the inputs are comparison masks (all-ones / all-zeros
+// lanes); each 64-bit element is processed as two 32-bit halves - confirm with callers.
+inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                           const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f,
+                           const v_uint64x4& g, const v_uint64x4& h)
+{
+    // Round 1: every 32-bit half of each 64-bit element becomes a 16-bit value.
+    __m256i ab = _mm256_packs_epi32(a.val, b.val);
+    __m256i cd = _mm256_packs_epi32(c.val, d.val);
+    __m256i ef = _mm256_packs_epi32(e.val, f.val);
+    __m256i gh = _mm256_packs_epi32(g.val, h.val);
+
+    // Round 2: each pair of 16-bit values (one original element) is read as a
+    // 32-bit integer and packed to a single 16-bit value.
+    __m256i abcd = _mm256_packs_epi32(ab, cd);
+    __m256i efgh = _mm256_packs_epi32(ef, gh);
+    // Round 3: 16 -> 8 bits, then reorder the 64-bit quarters.
+    __m256i pkall = _v256_shuffle_odd_64(_mm256_packs_epi16(abcd, efgh));
+
+    // Interleave 16-bit groups of each lane's low half with its high half
+    // (rev is pkall rotated by 8 bytes within each 128-bit lane).
+    __m256i rev = _mm256_alignr_epi8(pkall, pkall, 8);
+    return v_uint8x32(_mm256_unpacklo_epi16(pkall, rev));
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+// Defines v_extract<s>(a, b) for one vector type by forwarding to v_rotate_right<s>(a, b).
+#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec)                    \
+    template<int s> inline                                     \
+    _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)         \
+    {                                                          \
+        return v_rotate_right<s>(a, b);                        \
+    }
+
+// One instantiation per 256-bit vector type.
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
+OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
+
+// Returns element i of an 8-bit unsigned vector (i is a compile-time index).
+template<int i>
+inline uchar v_extract_n(v_uint8x32 a)
+{
+    const uchar lane = (uchar)_v256_extract_epi8<i>(a.val);
+    return lane;
+}
+
+// Returns element i of an 8-bit signed vector by reusing the unsigned overload.
+template<int i>
+inline schar v_extract_n(v_int8x32 a)
+{
+    const v_uint8x32 bits = v_reinterpret_as_u8(a);
+    return (schar)v_extract_n<i>(bits);
+}
+
+// Returns element i of a 16-bit unsigned vector (i is a compile-time index).
+template<int i>
+inline ushort v_extract_n(v_uint16x16 a)
+{
+    const ushort lane = (ushort)_v256_extract_epi16<i>(a.val);
+    return lane;
+}
+
+// Returns element i of a 16-bit signed vector by reusing the unsigned overload.
+template<int i>
+inline short v_extract_n(v_int16x16 a)
+{
+    const v_uint16x16 bits = v_reinterpret_as_u16(a);
+    return (short)v_extract_n<i>(bits);
+}
+
+// Returns element i of a 32-bit unsigned vector (i is a compile-time index).
+template<int i>
+inline uint v_extract_n(v_uint32x8 a)
+{
+    const uint lane = (uint)_v256_extract_epi32<i>(a.val);
+    return lane;
+}
+
+// Returns element i of a 32-bit signed vector by reusing the unsigned overload.
+template<int i>
+inline int v_extract_n(v_int32x8 a)
+{
+    const v_uint32x8 bits = v_reinterpret_as_u32(a);
+    return (int)v_extract_n<i>(bits);
+}
+
+// Returns element i of a 64-bit unsigned vector (i is a compile-time index).
+template<int i>
+inline uint64 v_extract_n(v_uint64x4 a)
+{
+    const uint64 lane = (uint64)_v256_extract_epi64<i>(a.val);
+    return lane;
+}
+
+// Returns element i of a 64-bit signed vector by reusing the unsigned overload.
+template<int i>
+inline int64 v_extract_n(v_int64x4 v)
+{
+    const v_uint64x4 bits = v_reinterpret_as_u64(v);
+    return (int64)v_extract_n<i>(bits);
+}
+
+// Returns element i of a float vector: the lane is extracted as a 32-bit integer
+// and its bit pattern is read back as a float through a union.
+template<int i>
+inline float v_extract_n(v_float32x8 v)
+{
+    union { uint bits; float value; } pun;
+    pun.bits = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return pun.value;
+}
+
+// Returns element i of a double vector: the lane is extracted as a 64-bit integer
+// and its bit pattern is read back as a double through a union.
+template<int i>
+inline double v_extract_n(v_float64x4 v)
+{
+    union { uint64 bits; double value; } pun;
+    pun.bits = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return pun.value;
+}
+
+// Broadcasts 32-bit element i of a to all eight lanes of the result.
+// The index vector is a plain local constant rather than a function-local static:
+// a static whose initializer is not a constant expression may need a guarded
+// (thread-safe) initialization check on every call, while a local built from the
+// compile-time index i can be materialized directly by the compiler.
+template<int i>
+inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
+{
+    const __m256i perm = _mm256_set1_epi32((char)i);
+    return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
+}
+
+// Signed 32-bit variant: reinterprets to unsigned, broadcasts element i, and
+// reinterprets the result back.
+template<int i>
+inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
+{
+    const v_uint32x8 bits = v_reinterpret_as_u32(a);
+    return v_reinterpret_as_s32(v_broadcast_element<i>(bits));
+}
+
+// Float variant: reinterprets to unsigned 32-bit, broadcasts element i, and
+// reinterprets the result back to float lanes.
+template<int i>
+inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
+{
+    const v_uint32x8 bits = v_reinterpret_as_u32(a);
+    return v_reinterpret_as_f32(v_broadcast_element<i>(bits));
+}
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+// Loads 64 interleaved bytes (32 two-channel elements) from ptr with unaligned loads.
+// a receives the even-indexed bytes, b the odd-indexed bytes.
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
+{
+    const __m256i src0 = _mm256_loadu_si256((const __m256i*)ptr);
+    const __m256i src1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    // Within each 128-bit lane: even bytes to the low 8 bytes, odd bytes to the high 8 bytes.
+    const __m256i split = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                           0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    const __m256i s0 = _mm256_shuffle_epi8(src0, split);
+    const __m256i s1 = _mm256_shuffle_epi8(src1, split);
+
+    // 0x20: low lanes of (s0, s1); 0x31: high lanes of (s0, s1).
+    const __m256i lows = _mm256_permute2x128_si256(s0, s1, 0x20);
+    const __m256i highs = _mm256_permute2x128_si256(s0, s1, 0x31);
+
+    a = v_uint8x32(_mm256_unpacklo_epi64(lows, highs));
+    b = v_uint8x32(_mm256_unpackhi_epi64(lows, highs));
+}
+
+// Loads 32 interleaved 16-bit values (16 two-channel elements) from ptr with
+// unaligned loads. a receives the even-indexed values, b the odd-indexed values.
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
+{
+    const __m256i src0 = _mm256_loadu_si256((const __m256i*)ptr);
+    const __m256i src1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    // Within each 128-bit lane: even 16-bit values to the low half, odd ones to the high half.
+    const __m256i split = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                           0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+    const __m256i s0 = _mm256_shuffle_epi8(src0, split);
+    const __m256i s1 = _mm256_shuffle_epi8(src1, split);
+
+    // 0x20: low lanes of (s0, s1); 0x31: high lanes of (s0, s1).
+    const __m256i lows = _mm256_permute2x128_si256(s0, s1, 0x20);
+    const __m256i highs = _mm256_permute2x128_si256(s0, s1, 0x31);
+
+    a = v_uint16x16(_mm256_unpacklo_epi64(lows, highs));
+    b = v_uint16x16(_mm256_unpackhi_epi64(lows, highs));
+}
+
+// Loads 16 interleaved 32-bit values (8 two-channel elements) from ptr with
+// unaligned loads. a receives the even-indexed values, b the odd-indexed values.
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
+{
+    const __m256i src0 = _mm256_loadu_si256((const __m256i*)ptr);
+    const __m256i src1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    // Within each 128-bit lane reorder dwords to 0, 2, 1, 3 (evens first, then odds).
+    const __m256i s0 = _mm256_shuffle_epi32(src0, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m256i s1 = _mm256_shuffle_epi32(src1, _MM_SHUFFLE(3, 1, 2, 0));
+
+    // 0x20: low lanes of (s0, s1); 0x31: high lanes of (s0, s1).
+    const __m256i lows = _mm256_permute2x128_si256(s0, s1, 0x20);
+    const __m256i highs = _mm256_permute2x128_si256(s0, s1, 0x31);
+
+    a = v_uint32x8(_mm256_unpacklo_epi64(lows, highs));
+    b = v_uint32x8(_mm256_unpackhi_epi64(lows, highs));
+}
+
+// Loads 8 interleaved 64-bit values (4 two-channel elements) from ptr with
+// unaligned loads. a receives the even-indexed values, b the odd-indexed values.
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
+{
+    const __m256i src0 = _mm256_loadu_si256((const __m256i*)ptr);
+    const __m256i src1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+
+    // 0x20: low lanes of (src0, src1); 0x31: high lanes of (src0, src1).
+    const __m256i lows = _mm256_permute2x128_si256(src0, src1, 0x20);
+    const __m256i highs = _mm256_permute2x128_si256(src0, src1, 0x31);
+
+    a = v_uint64x4(_mm256_unpacklo_epi64(lows, highs));
+    b = v_uint64x4(_mm256_unpackhi_epi64(lows, highs));
+}
+
+// Loads 96 interleaved bytes (32 three-channel elements) from ptr with unaligned
+// loads and splits them into planar vectors:
+// a = bytes 0,3,6,..., b = bytes 1,4,7,..., c = bytes 2,5,8,....
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+
+    // Regroup the first and third loads: both low 128-bit halves together,
+    // both high halves together.
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    // Byte-select masks for blendv: a -1 byte takes the second blend operand.
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
+
+    // Each pair of blends collects all bytes of one channel into a single vector;
+    // within each lane they are still out of order.
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+
+    // Per-lane byte shuffles that put each channel's bytes into ascending element order.
+    const __m256i
+    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
+                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
+    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
+                            1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
+    sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
+                            2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b);
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    a = v_uint8x32(b0);
+    b = v_uint8x32(g0);
+    c = v_uint8x32(r0);
+}
+
+// Loads 48 interleaved 16-bit values (16 three-channel elements) from ptr with
+// unaligned loads and splits them into three planar vectors a, b, c
+// (first, second and third channel respectively).
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    // Regroup the first and third loads: both low 128-bit halves together,
+    // both high halves together.
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    // Byte-select masks for blendv (set in byte pairs, one pair per 16-bit value);
+    // a -1 byte takes the second blend operand.
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+    // Each pair of blends collects the values of one channel into a single vector.
+    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
+    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
+    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
+    // Per-lane shuffles that reorder each channel's values within the 128-bit lanes.
+    const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+                                                 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
+                                                 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+                                                 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    b0 = _mm256_shuffle_epi8(b0, sh_b);
+    g0 = _mm256_shuffle_epi8(g0, sh_g);
+    r0 = _mm256_shuffle_epi8(r0, sh_r);
+
+    a = v_uint16x16(b0);
+    b = v_uint16x16(g0);
+    c = v_uint16x16(r0);
+}
+
+// Loads 24 interleaved 32-bit values (8 three-channel elements) from ptr with
+// unaligned loads and splits them into planar vectors:
+// a = values 0,3,6,..., b = values 1,4,7,..., c = values 2,5,8,....
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+
+    // Regroup the first and third loads: both low 128-bit halves together,
+    // both high halves together.
+    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
+    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
+
+    // Dword blends: a set bit k in the immediate takes dword k from the second operand.
+    // Each pair of blends collects the values of one channel into a single vector.
+    __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
+    __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
+    __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
+
+    // Per-lane dword shuffles that put each channel into ascending element order.
+    b0 = _mm256_shuffle_epi32(b0, 0x6c);
+    g0 = _mm256_shuffle_epi32(g0, 0xb1);
+    r0 = _mm256_shuffle_epi32(r0, 0xc6);
+
+    a = v_uint32x8(b0);
+    b = v_uint32x8(g0);
+    c = v_uint32x8(r0);
+}
+
+// Loads 12 interleaved 64-bit values e0..e11 (4 three-channel elements) from ptr
+// with unaligned loads and splits them into planar vectors:
+// a = e0,e3,e6,e9, b = e1,e4,e7,e10, c = e2,e5,e8,e11.
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+
+    // Blend mask 0xf0 takes the upper 128 bits from the second operand.
+    __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);   // e0 e1 | e6 e7
+    __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);   // e4 e5 | e10 e11
+    // 0x1b reverses the four 64-bit elements: e3 e2 | e9 e8
+    __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
+    __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
+    // Per lane: high 64 bits of s01 followed by low 64 bits of s12.
+    __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
+    __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
+
+    a = v_uint64x4(b0);
+    b = v_uint64x4(g0);
+    c = v_uint64x4(r0);
+}
+
+// Loads 128 interleaved bytes (32 four-channel elements) from ptr with unaligned
+// loads and splits them into four planar vectors: a, b, c, d receive the first,
+// second, third and fourth byte of every element respectively.
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
+    // Within each 128-bit lane (4 elements) group the bytes by channel:
+    // one dword per channel.
+    const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                               0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    // Dword transpose, step 1: pair up same-channel dwords of neighbouring loads
+    // (low unpack: channels 0 and 1, high unpack: channels 2 and 3).
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    // Step 2: gather the low 128-bit lanes and the high lanes of each pair.
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    // Step 3: final dword interleave leaves one complete channel per vector.
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint8x32(b0);
+    b = v_uint8x32(g0);
+    c = v_uint8x32(r0);
+    d = v_uint8x32(a0);
+}
+
+// Loads 64 interleaved 16-bit values (16 four-channel elements) from ptr with
+// unaligned loads and splits them into four planar vectors: a, b, c, d receive
+// the first, second, third and fourth value of every element respectively.
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d )
+{
+    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+    __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
+    // Within each 128-bit lane (2 elements) group the 16-bit values by channel:
+    // one dword per channel.
+    const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+                                               0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+    __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
+    __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
+    __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
+
+    // Dword transpose, step 1: pair up same-channel dwords of neighbouring loads.
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    // Step 2: gather the low 128-bit lanes and the high lanes of each pair.
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    // Step 3: final dword interleave leaves one complete channel per vector.
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint16x16(b0);
+    b = v_uint16x16(g0);
+    c = v_uint16x16(r0);
+    d = v_uint16x16(a0);
+}
+
+// Loads 32 interleaved 32-bit values (8 four-channel elements) from ptr with
+// unaligned loads and splits them into four planar vectors: a, b, c, d receive
+// the first, second, third and fourth value of every element respectively.
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d )
+{
+    __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
+    __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
+
+    // Dword transpose, step 1: interleave same-channel dwords of neighbouring loads
+    // (low unpack: channels 0 and 1, high unpack: channels 2 and 3).
+    __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
+    __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
+    __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
+    __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
+
+    // Step 2: gather the low 128-bit lanes and the high lanes of each pair.
+    __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
+    __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
+    __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
+    __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
+
+    // Step 3: final dword interleave leaves one complete channel per vector.
+    __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
+    __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
+    __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
+    __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
+
+    a = v_uint32x8(b0);
+    b = v_uint32x8(g0);
+    c = v_uint32x8(r0);
+    d = v_uint32x8(a0);
+}
+
+// Loads 16 interleaved 64-bit values (4 four-channel elements, one element per
+// 256-bit load) from ptr with unaligned loads and splits them into four planar
+// vectors: a, b, c, d receive the first, second, third and fourth value of every
+// element respectively.
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d )
+{
+    __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
+    __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
+    __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
+
+    // l*: low 128 bits (channels 0 and 1) of two elements; h*: high 128 bits (channels 2 and 3).
+    __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
+    __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
+    __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
+    __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
+
+    // 64-bit interleave separates the two channels held in each 128-bit lane.
+    __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
+    __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
+    __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
+    __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
+
+    a = v_uint64x4(b0);
+    b = v_uint64x4(g0);
+    c = v_uint64x4(r0);
+    d = v_uint64x4(a0);
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+// Interleaves the bytes of x and y (x0 y0 x1 y1 ...) and writes the resulting
+// 64 bytes to ptr. mode selects the store instruction: non-temporal stream store,
+// aligned store, or (default) unaligned store.
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    const __m256i lo = _mm256_unpacklo_epi8(x.val, y.val);
+    const __m256i hi = _mm256_unpackhi_epi8(x.val, y.val);
+
+    // 0x20: low lanes of (lo, hi); 0x31: high lanes of (lo, hi).
+    const __m256i first = _mm256_permute2x128_si256(lo, hi, 0x20);
+    const __m256i second = _mm256_permute2x128_si256(lo, hi, 0x31);
+
+    __m256i* const dst = (__m256i*)ptr;
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, first);
+        _mm256_stream_si256(dst + 1, second);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, first);
+        _mm256_store_si256(dst + 1, second);
+        break;
+    default:
+        _mm256_storeu_si256(dst, first);
+        _mm256_storeu_si256(dst + 1, second);
+        break;
+    }
+}
+
+// Interleaves the 16-bit values of x and y (x0 y0 x1 y1 ...) and writes the
+// resulting 32 values to ptr. mode selects the store instruction: non-temporal
+// stream store, aligned store, or (default) unaligned store.
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    const __m256i lo = _mm256_unpacklo_epi16(x.val, y.val);
+    const __m256i hi = _mm256_unpackhi_epi16(x.val, y.val);
+
+    // 0x20: low lanes of (lo, hi); 0x31: high lanes of (lo, hi).
+    const __m256i first = _mm256_permute2x128_si256(lo, hi, 0x20);
+    const __m256i second = _mm256_permute2x128_si256(lo, hi, 0x31);
+
+    __m256i* const dst = (__m256i*)ptr;
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, first);
+        _mm256_stream_si256(dst + 1, second);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, first);
+        _mm256_store_si256(dst + 1, second);
+        break;
+    default:
+        _mm256_storeu_si256(dst, first);
+        _mm256_storeu_si256(dst + 1, second);
+        break;
+    }
+}
+
+// Interleaves the 32-bit values of x and y (x0 y0 x1 y1 ...) and writes the
+// resulting 16 values to ptr. mode selects the store instruction: non-temporal
+// stream store, aligned store, or (default) unaligned store.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    const __m256i lo = _mm256_unpacklo_epi32(x.val, y.val);
+    const __m256i hi = _mm256_unpackhi_epi32(x.val, y.val);
+
+    // 0x20: low lanes of (lo, hi); 0x31: high lanes of (lo, hi).
+    const __m256i first = _mm256_permute2x128_si256(lo, hi, 0x20);
+    const __m256i second = _mm256_permute2x128_si256(lo, hi, 0x31);
+
+    __m256i* const dst = (__m256i*)ptr;
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, first);
+        _mm256_stream_si256(dst + 1, second);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, first);
+        _mm256_store_si256(dst + 1, second);
+        break;
+    default:
+        _mm256_storeu_si256(dst, first);
+        _mm256_storeu_si256(dst + 1, second);
+        break;
+    }
+}
+
+// Interleaves the 64-bit values of x and y (x0 y0 x1 y1 ...) and writes the
+// resulting 8 values to ptr. mode selects the store instruction: non-temporal
+// stream store, aligned store, or (default) unaligned store.
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    const __m256i lo = _mm256_unpacklo_epi64(x.val, y.val);
+    const __m256i hi = _mm256_unpackhi_epi64(x.val, y.val);
+
+    // 0x20: low lanes of (lo, hi); 0x31: high lanes of (lo, hi).
+    const __m256i first = _mm256_permute2x128_si256(lo, hi, 0x20);
+    const __m256i second = _mm256_permute2x128_si256(lo, hi, 0x31);
+
+    __m256i* const dst = (__m256i*)ptr;
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, first);
+        _mm256_stream_si256(dst + 1, second);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, first);
+        _mm256_store_si256(dst + 1, second);
+        break;
+    default:
+        _mm256_storeu_si256(dst, first);
+        _mm256_storeu_si256(dst + 1, second);
+        break;
+    }
+}
+
+// Interleaves three byte vectors into three-channel elements (a0 b0 c0 a1 b1 c1 ...)
+// and writes the resulting 96 bytes to ptr.
+// mode selects the store instruction: STORE_ALIGNED_NOCACHE uses non-temporal
+// stream stores, STORE_ALIGNED uses aligned stores, anything else unaligned stores.
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Per-lane byte shuffles that move every source byte to the in-lane position
+    // from which the blends below pick it.
+    const __m256i sh_b = _mm256_setr_epi8(
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
+            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+    const __m256i sh_g = _mm256_setr_epi8(
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
+            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    const __m256i sh_r = _mm256_setr_epi8(
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
+            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
+    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
+
+    // Byte-select masks for blendv: a -1 byte takes the second blend operand.
+    const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
+                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
+                                               0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+
+    // Mix the three shuffled channels byte by byte into interleaved 128-bit pieces.
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    // Arrange the six 128-bit pieces into memory order.
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
+    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
+    }
+    else
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
+        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
+    }
+}
+
+// Interleaves three 16-bit vectors into three-channel elements
+// (a0 b0 c0 a1 b1 c1 ...) and writes the resulting 48 values to ptr.
+// mode selects the store instruction: STORE_ALIGNED_NOCACHE uses non-temporal
+// stream stores, STORE_ALIGNED uses aligned stores, anything else unaligned stores.
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Per-lane shuffles (in byte pairs) that move every source value to the
+    // in-lane position from which the blends below pick it.
+    const __m256i sh_b = _mm256_setr_epi8(
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
+         0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m256i sh_g = _mm256_setr_epi8(
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
+         10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+    const __m256i sh_r = _mm256_setr_epi8(
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
+         4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+
+    __m256i b0 = _mm256_shuffle_epi8(a.val, sh_b);
+    __m256i g0 = _mm256_shuffle_epi8(b.val, sh_g);
+    __m256i r0 = _mm256_shuffle_epi8(c.val, sh_r);
+
+    // Byte-select masks for blendv; a -1 byte takes the second blend operand.
+    // Unlike the 8-bit variant, the two 128-bit lanes use different patterns.
+    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
+                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
+    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
+                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
+
+    // Mix the three shuffled channels into interleaved 128-bit pieces.
+    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
+    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
+    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
+
+    // First and last output vectors combine lanes of p0 and p2; p1 is stored
+    // unchanged as the middle vector.
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
+    //__m256i bgr1 = p1;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 16), p1);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 16), p1);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
+    }
+    else
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
+    }
+}
+
+// Interleaves three 32-bit vectors into three-channel elements
+// (a0 b0 c0 a1 b1 c1 ...) and writes the resulting 24 values to ptr.
+// mode selects the store instruction: STORE_ALIGNED_NOCACHE uses non-temporal
+// stream stores, STORE_ALIGNED uses aligned stores, anything else unaligned stores.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Per-lane dword shuffles that move every source value to the in-lane
+    // position from which the blends below pick it.
+    __m256i b0 = _mm256_shuffle_epi32(a.val, 0x6c);
+    __m256i g0 = _mm256_shuffle_epi32(b.val, 0xb1);
+    __m256i r0 = _mm256_shuffle_epi32(c.val, 0xc6);
+
+    // Dword blends: a set bit k in the immediate takes dword k from the second operand.
+    __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
+    __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
+    __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
+
+    // First and last output vectors combine lanes of p0 and p1; p2 is stored
+    // unchanged as the middle vector.
+    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    //__m256i bgr1 = p2;
+    __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 8), p2);
+        _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 8), p2);
+        _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
+    }
+    else
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
+        _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
+    }
+}
+
+// Interleaves three 64-bit vectors into three-channel elements
+// (a0 b0 c0 a1 b1 c1 a2 b2 c2 a3 b3 c3) and writes the 12 values to ptr.
+// mode selects the store instruction: STORE_ALIGNED_NOCACHE uses non-temporal
+// stream stores, STORE_ALIGNED uses aligned stores, anything else unaligned stores.
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    __m256i s01 = _mm256_unpacklo_epi64(a.val, b.val);      // a0 b0 | a2 b2
+    __m256i s12 = _mm256_unpackhi_epi64(b.val, c.val);      // b1 c1 | b3 c3
+    // 0xcc takes the odd 64-bit elements from a: c0 a1 | c2 a3
+    __m256i s20 = _mm256_blend_epi32(c.val, a.val, 0xcc);
+
+    __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);   // a0 b0 c0 a1
+    // 0x0f takes the low 128 bits from s12: b1 c1 a2 b2
+    __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
+    __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);   // c2 a3 b3 c3
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgr0);
+        _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm256_store_si256((__m256i*)ptr, bgr0);
+        _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
+    }
+    else
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgr0);
+        _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
+        _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
+    }
+}
+
+// Interleaves four byte vectors into four-channel elements
+// (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the resulting 128 bytes to ptr.
+// mode selects the store instruction: STORE_ALIGNED_NOCACHE uses non-temporal
+// stream stores, STORE_ALIGNED uses aligned stores, anything else unaligned stores.
+inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b,
+                                const v_uint8x32& c, const v_uint8x32& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Step 1: byte interleave a with b and c with d (per 128-bit lane).
+    __m256i bg0 = _mm256_unpacklo_epi8(a.val, b.val);
+    __m256i bg1 = _mm256_unpackhi_epi8(a.val, b.val);
+    __m256i ra0 = _mm256_unpacklo_epi8(c.val, d.val);
+    __m256i ra1 = _mm256_unpackhi_epi8(c.val, d.val);
+
+    // Step 2: 16-bit interleave of the pairs yields complete 4-byte elements.
+    __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
+    __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
+    __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
+    __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
+
+    // Step 3: reorder 128-bit lanes into memory order. The low lanes hold
+    // elements 0..15 and the high lanes elements 16..31, hence the 0/2/1/3 naming.
+    __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
+    __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
+    __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
+    __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm256_stream_si256((__m256i*)ptr, bgra0);
+        _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm256_store_si256((__m256i*)ptr, bgra0);
+        _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
+    }
+    else
+    {
+        _mm256_storeu_si256((__m256i*)ptr, bgra0);
+        _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
+        _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
+        _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
+    }
+}
+
+// Interleaves four u16x16 planes into 16 four-element groups (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the 64 ushorts at ptr.
+inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c,
+                                const v_uint16x16& d, hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Step 1: pair up 16-bit lanes of (a,b) and of (c,d) inside each 128-bit half.
+    __m256i ab_lo = _mm256_unpacklo_epi16(a.val, b.val);
+    __m256i ab_hi = _mm256_unpackhi_epi16(a.val, b.val);
+    __m256i cd_lo = _mm256_unpacklo_epi16(c.val, d.val);
+    __m256i cd_hi = _mm256_unpackhi_epi16(c.val, d.val);
+    // Step 2: merge the 32-bit pairs into complete a-b-c-d groups (still split per 128-bit half).
+    __m256i grp0 = _mm256_unpacklo_epi32(ab_lo, cd_lo);
+    __m256i grp1 = _mm256_unpackhi_epi32(ab_lo, cd_lo);
+    __m256i grp2 = _mm256_unpacklo_epi32(ab_hi, cd_hi);
+    __m256i grp3 = _mm256_unpackhi_epi32(ab_hi, cd_hi);
+    // Step 3: regroup the 128-bit halves so that out0..out3 follow memory order.
+    __m256i out0 = _mm256_permute2x128_si256(grp0, grp1, 0 + 2*16);
+    __m256i out1 = _mm256_permute2x128_si256(grp2, grp3, 0 + 2*16);
+    __m256i out2 = _mm256_permute2x128_si256(grp0, grp1, 1 + 3*16);
+    __m256i out3 = _mm256_permute2x128_si256(grp2, grp3, 1 + 3*16);
+    __m256i* dst = (__m256i*)ptr;  // dst + k addresses the k-th 32-byte output slot
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, out0);
+        _mm256_stream_si256(dst + 1, out1);
+        _mm256_stream_si256(dst + 2, out2);
+        _mm256_stream_si256(dst + 3, out3);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, out0);
+        _mm256_store_si256(dst + 1, out1);
+        _mm256_store_si256(dst + 2, out2);
+        _mm256_store_si256(dst + 3, out3);
+        break;
+    default:  // STORE_UNALIGNED and any other value
+        _mm256_storeu_si256(dst, out0);
+        _mm256_storeu_si256(dst + 1, out1);
+        _mm256_storeu_si256(dst + 2, out2);
+        _mm256_storeu_si256(dst + 3, out3);
+    }
+}
+
+// Interleaves four u32x8 planes into 8 four-element groups (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the 32 values at ptr.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c,
+                                const v_uint32x8& d, hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Step 1: pair up 32-bit lanes of (a,b) and of (c,d) inside each 128-bit half.
+    __m256i ab_lo = _mm256_unpacklo_epi32(a.val, b.val);
+    __m256i ab_hi = _mm256_unpackhi_epi32(a.val, b.val);
+    __m256i cd_lo = _mm256_unpacklo_epi32(c.val, d.val);
+    __m256i cd_hi = _mm256_unpackhi_epi32(c.val, d.val);
+    // Step 2: merge the 64-bit pairs into complete a-b-c-d groups (one group per 128-bit half).
+    __m256i grp0 = _mm256_unpacklo_epi64(ab_lo, cd_lo);
+    __m256i grp1 = _mm256_unpackhi_epi64(ab_lo, cd_lo);
+    __m256i grp2 = _mm256_unpacklo_epi64(ab_hi, cd_hi);
+    __m256i grp3 = _mm256_unpackhi_epi64(ab_hi, cd_hi);
+    // Step 3: regroup the 128-bit halves so that out0..out3 follow memory order.
+    __m256i out0 = _mm256_permute2x128_si256(grp0, grp1, 0 + 2*16);
+    __m256i out1 = _mm256_permute2x128_si256(grp2, grp3, 0 + 2*16);
+    __m256i out2 = _mm256_permute2x128_si256(grp0, grp1, 1 + 3*16);
+    __m256i out3 = _mm256_permute2x128_si256(grp2, grp3, 1 + 3*16);
+    __m256i* dst = (__m256i*)ptr;  // dst + k addresses the k-th 32-byte output slot
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, out0);
+        _mm256_stream_si256(dst + 1, out1);
+        _mm256_stream_si256(dst + 2, out2);
+        _mm256_stream_si256(dst + 3, out3);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, out0);
+        _mm256_store_si256(dst + 1, out1);
+        _mm256_store_si256(dst + 2, out2);
+        _mm256_store_si256(dst + 3, out3);
+        break;
+    default:  // STORE_UNALIGNED and any other value
+        _mm256_storeu_si256(dst, out0);
+        _mm256_storeu_si256(dst + 1, out1);
+        _mm256_storeu_si256(dst + 2, out2);
+        _mm256_storeu_si256(dst + 3, out3);
+    }
+}
+
+// Interleaves four u64x4 planes into 4 four-element groups (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the 16 values at ptr.
+inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c,
+                                const v_uint64x4& d, hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    // Step 1: pair up 64-bit lanes of (a,b) and of (c,d) inside each 128-bit half.
+    __m256i ab_lo = _mm256_unpacklo_epi64(a.val, b.val);
+    __m256i ab_hi = _mm256_unpackhi_epi64(a.val, b.val);
+    __m256i cd_lo = _mm256_unpacklo_epi64(c.val, d.val);
+    __m256i cd_hi = _mm256_unpackhi_epi64(c.val, d.val);
+    // Step 2: join matching 128-bit halves into complete a-b-c-d groups, already in memory order.
+    __m256i out0 = _mm256_permute2x128_si256(ab_lo, cd_lo, 0 + 2*16);
+    __m256i out1 = _mm256_permute2x128_si256(ab_hi, cd_hi, 0 + 2*16);
+    __m256i out2 = _mm256_permute2x128_si256(ab_lo, cd_lo, 1 + 3*16);
+    __m256i out3 = _mm256_permute2x128_si256(ab_hi, cd_hi, 1 + 3*16);
+    __m256i* dst = (__m256i*)ptr;  // dst + k addresses the k-th 32-byte output slot
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm256_stream_si256(dst, out0);
+        _mm256_stream_si256(dst + 1, out1);
+        _mm256_stream_si256(dst + 2, out2);
+        _mm256_stream_si256(dst + 3, out3);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm256_store_si256(dst, out0);
+        _mm256_store_si256(dst + 1, out1);
+        _mm256_store_si256(dst + 2, out2);
+        _mm256_store_si256(dst + 3, out3);
+        break;
+    default:  // STORE_UNALIGNED and any other value
+        _mm256_storeu_si256(dst, out0);
+        _mm256_storeu_si256(dst + 1, out1);
+        _mm256_storeu_si256(dst + 2, out2);
+        _mm256_storeu_si256(dst + 3, out3);
+    }
+}
+// Generates 2/3/4-plane v_load_deinterleave and v_store_interleave overloads for _Tpvec0 by forwarding to the _Tpvec1 overloads through reinterpret casts.
+#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+// Signed-integer and floating-point vectors forward to the unsigned vector type with the same lane width.
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
+OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
+
+//
+// FP16
+//
+
+// Reads 8 half-precision values starting at ptr and widens them into a v_float32x8.
+inline v_float32x8 v256_load_expand(const float16_t* ptr)
+{
+#if CV_FP16
+    return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
+#else
+    float CV_DECL_ALIGNED(32) widened[8];  // scalar fallback: convert element by element
+    for (int k = 0; k != 8; ++k) widened[k] = (float)ptr[k];
+    return v256_load_aligned(widened);
+#endif
+}
+
+// Narrows the 8 floats of a to half precision and writes them to ptr.
+inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
+{
+#if CV_FP16
+    _mm_storeu_si128((__m128i*)ptr, _mm256_cvtps_ph(a.val, 0));  // the immediate 0 is the rounding-control argument
+#else
+    float CV_DECL_ALIGNED(32) lanes[8];  // scalar fallback: spill the vector, then convert element by element
+    v_store_aligned(lanes, a);
+    for (int k = 0; k != 8; ++k)
+        ptr[k] = float16_t(lanes[k]);
+#endif
+}
+
+//
+// end of FP16
+//
+// Resets the AVX register state by calling _mm256_zeroall().
+inline void v256_cleanup() { _mm256_zeroall(); }
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_AVX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx512.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx512.hpp
new file mode 100644
index 0000000..d20d6dd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_avx512.hpp
@@ -0,0 +1,3090 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
+#define OPENCV_HAL_INTRIN_AVX512_HPP
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
+# pragma warning(disable:4146)  // unary minus operator applied to unsigned type, result still unsigned
+# pragma warning(disable:4309)  // 'argument': truncation of constant value
+# pragma warning(disable:4310)  // cast truncates constant value
+#endif
+
+#define CVT_ROUND_MODES_IMPLEMENTED 0
+// Compile-time capability macros defined by this header. NOTE(review): the FP16 remark below names float32x8, presumably carried over from the 256-bit header - confirm.
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#define CV_SIMD512_FP16 0  // no native operations with FP16 type. Only load/store from float32x8 are available (if CV_FP16 == 1)
+// Build a __m512i from lane values listed highest lane first (a0 is lane 0). The narrower variants pack lanes into wider words with shifts/ORs, so each argument must already fit its lane width as an unsigned value.
+#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0))
+#define _v512_set_epu32(a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \
+                         ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0))
+#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                        a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \
+                        ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \
+                        ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \
+                        ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0))
+#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
+                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
+                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \
+                        ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \
+                        ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \
+                        ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \
+                        ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \
+                        ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \
+                        ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \
+                        ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0))
+#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \
+                       a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \
+                       a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \
+                       a15, a14, a13, a12, a11, a10,  a9,  a8,  a7,  a6,  a5,  a4,  a3,  a2,  a1,  a0) \
+        _v512_set_epu8((uchar)(a63), (uchar)(a62), (uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \
+                       (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \
+                       (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \
+                       (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \
+                       (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \
+                       (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \
+                       (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \
+                       (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0))
+// Supplies _mm512_cvtpd_pslo when no macro of that name exists: the __m256 result of _mm512_cvtpd_ps is widened to a __m512.
+#ifndef _mm512_cvtpd_pslo
+#ifdef _mm512_zextsi256_si512  // NOTE(review): tests the si256 zero-extend macro although the ps256 variant is the one used below
+#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a))
+#else
+// Fallback when the zero-extending cast is unavailable: a plain cast is used, so the upper 256 bits are not explicitly zeroed.
+#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a))
+#endif
+#endif
+///////// Utils ////////////
+
+namespace
+{
+// Joins two 256-bit halves into one 512-bit register: `lower` fills bits 0..255, `upper` fills bits 256..511.
+inline __m512i _v512_combine(const __m256i& lower, const __m256i& upper)
+{ return _mm512_inserti32x8(_mm512_castsi256_si512(lower), upper, 1); }
+
+inline __m512 _v512_combine(const __m256& lower, const __m256& upper)
+{ return _mm512_insertf32x8(_mm512_castps256_ps512(lower), upper, 1); }
+
+inline __m512d _v512_combine(const __m256d& lower, const __m256d& upper)
+{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lower), upper, 1); }
+// Returns the lowest 32 bits of a 512-bit integer register as an int.
+inline int _v_cvtsi512_si32(const __m512i& reg)
+{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(reg)); }
+// Returns the upper 256 bits of a 512-bit register.
+inline __m256i _v512_extract_high(const __m512i& reg)
+{ return _mm512_extracti32x8_epi32(reg, 1); }
+
+inline __m256  _v512_extract_high(const __m512& reg)
+{ return _mm512_extractf32x8_ps(reg, 1); }
+
+inline __m256d _v512_extract_high(const __m512d& reg)
+{ return _mm512_extractf64x4_pd(reg, 1); }
+// Returns the lower 256 bits of a 512-bit register (plain cast).
+inline __m256i _v512_extract_low(const __m512i& reg)
+{ return _mm512_castsi512_si256(reg); }
+
+inline __m256  _v512_extract_low(const __m512& reg)
+{ return _mm512_castps512_ps256(reg); }
+
+inline __m256d _v512_extract_low(const __m512d& reg)
+{ return _mm512_castpd512_pd256(reg); }
+// Returns a copy of `dst` whose lower 256 bits are replaced by `half`.
+inline __m512i _v512_insert(const __m512i& dst, const __m256i& half)
+{ return _mm512_inserti32x8(dst, half, 0); }
+
+inline __m512 _v512_insert(const __m512& dst, const __m256& half)
+{ return _mm512_insertf32x8(dst, half, 0); }
+
+inline __m512d _v512_insert(const __m512d& dst, const __m256d& half)
+{ return _mm512_insertf64x4(dst, half, 0); }
+
+}
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+// 64 unsigned 8-bit lanes held in one __m512i.
+struct v_uint8x64
+{
+    typedef uchar lane_type;
+    enum { nlanes = 64 };
+    __m512i val;
+
+    explicit v_uint8x64(__m512i v) : val(v) {}  // wraps an existing register as-is
+    v_uint8x64(uchar v0,  uchar v1,  uchar v2,  uchar v3,
+               uchar v4,  uchar v5,  uchar v6,  uchar v7,
+               uchar v8,  uchar v9,  uchar v10, uchar v11,
+               uchar v12, uchar v13, uchar v14, uchar v15,
+               uchar v16, uchar v17, uchar v18, uchar v19,
+               uchar v20, uchar v21, uchar v22, uchar v23,
+               uchar v24, uchar v25, uchar v26, uchar v27,
+               uchar v28, uchar v29, uchar v30, uchar v31,
+               uchar v32, uchar v33, uchar v34, uchar v35,
+               uchar v36, uchar v37, uchar v38, uchar v39,
+               uchar v40, uchar v41, uchar v42, uchar v43,
+               uchar v44, uchar v45, uchar v46, uchar v47,
+               uchar v48, uchar v49, uchar v50, uchar v51,
+               uchar v52, uchar v53, uchar v54, uchar v55,
+               uchar v56, uchar v57, uchar v58, uchar v59,
+               uchar v60, uchar v61, uchar v62, uchar v63)
+    {  // v0 becomes lane 0; the list is reversed because _v512_set_epu8 takes the highest lane first
+        val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
+    }
+    v_uint8x64() {}  // val is left uninitialized
+
+    static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); }
+
+    uchar get0() const { return (uchar)_v_cvtsi512_si32(val); }  // lane 0: low 32 bits truncated to uchar
+};
+// 64 signed 8-bit lanes held in one __m512i.
+struct v_int8x64
+{
+    typedef schar lane_type;
+    enum { nlanes = 64 };
+    __m512i val;
+
+    explicit v_int8x64(__m512i v) : val(v) {}  // wraps an existing register as-is
+    v_int8x64(schar v0,  schar v1,  schar v2,  schar v3,
+              schar v4,  schar v5,  schar v6,  schar v7,
+              schar v8,  schar v9,  schar v10, schar v11,
+              schar v12, schar v13, schar v14, schar v15,
+              schar v16, schar v17, schar v18, schar v19,
+              schar v20, schar v21, schar v22, schar v23,
+              schar v24, schar v25, schar v26, schar v27,
+              schar v28, schar v29, schar v30, schar v31,
+              schar v32, schar v33, schar v34, schar v35,
+              schar v36, schar v37, schar v38, schar v39,
+              schar v40, schar v41, schar v42, schar v43,
+              schar v44, schar v45, schar v46, schar v47,
+              schar v48, schar v49, schar v50, schar v51,
+              schar v52, schar v53, schar v54, schar v55,
+              schar v56, schar v57, schar v58, schar v59,
+              schar v60, schar v61, schar v62, schar v63)
+    {  // v0 becomes lane 0; _v512_set_epi8 casts every argument to uchar and takes the highest lane first
+        val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
+                             v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
+                             v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                             v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
+    }
+    v_int8x64() {}  // val is left uninitialized
+
+    static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); }
+
+    schar get0() const { return (schar)_v_cvtsi512_si32(val); }  // lane 0: low 32 bits truncated to schar
+};
+// 32 unsigned 16-bit lanes held in one __m512i.
+struct v_uint16x32
+{
+    typedef ushort lane_type;
+    enum { nlanes = 32 };
+    __m512i val;
+
+    explicit v_uint16x32(__m512i v) : val(v) {}  // wraps an existing register as-is
+    v_uint16x32(ushort v0,  ushort v1,  ushort v2,  ushort v3,
+                ushort v4,  ushort v5,  ushort v6,  ushort v7,
+                ushort v8,  ushort v9,  ushort v10, ushort v11,
+                ushort v12, ushort v13, ushort v14, ushort v15,
+                ushort v16, ushort v17, ushort v18, ushort v19,
+                ushort v20, ushort v21, ushort v22, ushort v23,
+                ushort v24, ushort v25, ushort v26, ushort v27,
+                ushort v28, ushort v29, ushort v30, ushort v31)
+    {  // v0 becomes lane 0; the list is reversed because _v512_set_epu16 takes the highest lane first
+        val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
+                              v15, v14, v13, v12, v11, v10, v9,  v8,  v7,  v6,  v5,  v4,  v3,  v2,  v1,  v0);
+    }
+    v_uint16x32() {}  // val is left uninitialized
+
+    static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); }
+
+    ushort get0() const { return (ushort)_v_cvtsi512_si32(val); }  // lane 0: low 32 bits truncated to ushort
+};
+// 32 signed 16-bit lanes held in one __m512i.
+struct v_int16x32
+{
+    typedef short lane_type;
+    enum { nlanes = 32 };
+    __m512i val;
+
+    explicit v_int16x32(__m512i v) : val(v) {}  // wraps an existing register as-is
+    v_int16x32(short v0,  short v1,  short v2,  short v3,  short v4,  short v5,  short v6,  short v7,
+               short v8,  short v9,  short v10, short v11, short v12, short v13, short v14, short v15,
+               short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23,
+               short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31)
+    {  // v0 becomes lane 0; each short is cast to ushort before _v512_set_epu16 packs it (highest lane first)
+        val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24,
+                              (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16,
+                              (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8,
+                              (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0);
+    }
+    v_int16x32() {}  // val is left uninitialized
+
+    static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); }
+
+    short get0() const { return (short)_v_cvtsi512_si32(val); }  // lane 0: low 32 bits truncated to short
+};
+// 16 unsigned 32-bit lanes held in one __m512i.
+struct v_uint32x16
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 16 };
+    __m512i val;
+
+    v_uint32x16() {}  // val is left uninitialized
+    explicit v_uint32x16(__m512i reg) : val(reg) {}
+    // Lane-wise constructor: v0 becomes lane 0, v15 becomes lane 15.
+    v_uint32x16(unsigned v0,  unsigned v1,  unsigned v2,  unsigned v3,
+                unsigned v4,  unsigned v5,  unsigned v6,  unsigned v7,
+                unsigned v8,  unsigned v9,  unsigned v10, unsigned v11,
+                unsigned v12, unsigned v13, unsigned v14, unsigned v15)
+        : val(_mm512_setr_epi32((int)v0,  (int)v1,  (int)v2,  (int)v3,  (int)v4,  (int)v5,  (int)v6,  (int)v7,
+                                (int)v8,  (int)v9,  (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15)) {}
+
+    static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); }
+
+    // Lane 0, read from the low 32 bits of the register.
+    unsigned get0() const { return static_cast<unsigned>(_v_cvtsi512_si32(val)); }
+};
+// 16 signed 32-bit lanes held in one __m512i.
+struct v_int32x16
+{
+    typedef int lane_type;
+    enum { nlanes = 16 };
+    __m512i val;
+
+    v_int32x16() {}  // val is left uninitialized
+    explicit v_int32x16(__m512i reg) : val(reg) {}
+    // Lane-wise constructor: v0 becomes lane 0, v15 becomes lane 15.
+    v_int32x16(int v0, int v1, int v2,  int v3,  int v4,  int v5,  int v6,  int v7,
+               int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15)
+        : val(_mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); }
+
+    // Lane 0, read from the low 32 bits of the register.
+    int get0() const { return _v_cvtsi512_si32(val); }
+};
+// 16 single-precision float lanes held in one __m512.
+struct v_float32x16
+{
+    typedef float lane_type;
+    enum { nlanes = 16 };
+    __m512 val;
+
+    v_float32x16() {}  // val is left uninitialized
+    explicit v_float32x16(__m512 reg) : val(reg) {}
+    // Lane-wise constructor: v0 becomes lane 0, v15 becomes lane 15.
+    v_float32x16(float v0, float v1, float v2,  float v3,  float v4,  float v5,  float v6,  float v7,
+                 float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15)
+        : val(_mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); }
+
+    // Lane 0, taken from the lowest 128 bits of the register.
+    float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); }
+};
+// 8 unsigned 64-bit lanes held in one __m512i.
+struct v_uint64x8
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 8 };
+    __m512i val;
+
+    v_uint64x8() {}  // val is left uninitialized
+    explicit v_uint64x8(__m512i reg) : val(reg) {}
+    v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7)  // v0 becomes lane 0
+        : val(_mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, (int64)v5, (int64)v6, (int64)v7)) {}
+
+    static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); }
+
+    uint64 get0() const
+    {  // lane 0; when the x86-64 macros are absent it is rebuilt from two 32-bit reads
+    #if defined __x86_64__ || defined _M_X64
+        return static_cast<uint64>(_mm_cvtsi128_si64(_mm512_castsi512_si128(val)));
+    #else
+        const int lo32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
+        const int hi32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
+        return (unsigned)lo32 | ((uint64)(unsigned)hi32 << 32);
+    #endif
+    }
+};
+// 8 signed 64-bit lanes held in one __m512i.
+struct v_int64x8
+{
+    typedef int64 lane_type;
+    enum { nlanes = 8 };
+    __m512i val;
+
+    v_int64x8() {}  // val is left uninitialized
+    explicit v_int64x8(__m512i reg) : val(reg) {}
+    v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7)  // v0 becomes lane 0
+        : val(_mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); }
+
+    int64 get0() const
+    {  // lane 0; when the x86-64 macros are absent it is rebuilt from two 32-bit reads
+    #if defined __x86_64__ || defined _M_X64
+        return static_cast<int64>(_mm_cvtsi128_si64(_mm512_castsi512_si128(val)));
+    #else
+        const int lo32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(val));
+        const int hi32 = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32)));
+        return (int64)((unsigned)lo32 | ((uint64)(unsigned)hi32 << 32));
+    #endif
+    }
+};
+// 8 double-precision float lanes held in one __m512d.
+struct v_float64x8
+{
+    typedef double lane_type;
+    enum { nlanes = 8 };
+    __m512d val;
+
+    v_float64x8() {}  // val is left uninitialized
+    explicit v_float64x8(__m512d reg) : val(reg) {}
+    v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7)  // v0 becomes lane 0
+        : val(_mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); }
+
+    double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); }  // lane 0
+};
+
+//////////////// Load and store operations ///////////////
+// For an integer vector type _Tpvec with lane type _Tp, defines the v512_load* loaders and the v_store* family (unaligned, aligned, non-temporal, mode-selected, low half, high half).
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp)                    \
+    inline _Tpvec v512_load(const _Tp* ptr)                           \
+    { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); }       \
+    inline _Tpvec v512_load_aligned(const _Tp* ptr)                   \
+    { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); }        \
+    inline _Tpvec v512_load_low(const _Tp* ptr)                       \
+    { /* loads 256 bits; the widening is a plain cast, not a zero-extension */ \
+        __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr);       \
+        return _Tpvec(_mm512_castsi256_si512(v256));                  \
+    }                                                                 \
+    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
+    {                                                                 \
+        __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0);       \
+        __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1);       \
+        return _Tpvec(_v512_combine(vlo, vhi));                       \
+    }                                                                 \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm512_storeu_si512((__m512i*)ptr, a.val); }                    \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
+    { _mm512_store_si512((__m512i*)ptr, a.val); }                     \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
+    { _mm512_stream_si512((__m512i*)ptr, a.val); }                    \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { /* any mode other than STORE_UNALIGNED / STORE_ALIGNED_NOCACHE takes the aligned store */ \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm512_storeu_si512((__m512i*)ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm512_stream_si512((__m512i*)ptr, a.val); \
+        else \
+            _mm512_store_si512((__m512i*)ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
+    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); }    \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
+    { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); }
+// Instantiate the load/store set for each 512-bit integer vector type.
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64,  uchar)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64,   schar)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32,  short)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16,  unsigned)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16,   int)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8,  uint64)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8,   int64)
+// Floating-point counterpart of the integer load/store macro: `suffix` (ps/pd) selects the intrinsic family and `halfreg` is the matching 256-bit register type.
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg)   \
+    inline _Tpvec v512_load(const _Tp* ptr)                               \
+    { return _Tpvec(_mm512_loadu_##suffix(ptr)); }                        \
+    inline _Tpvec v512_load_aligned(const _Tp* ptr)                       \
+    { return _Tpvec(_mm512_load_##suffix(ptr)); }                         \
+    inline _Tpvec v512_load_low(const _Tp* ptr)                           \
+    { /* loads 256 bits; the widening is a plain cast, not a zero-extension */ \
+        return _Tpvec(_mm512_cast##suffix##256_##suffix##512              \
+                     (_mm256_loadu_##suffix(ptr)));                       \
+    }                                                                     \
+    inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
+    {                                                                     \
+        halfreg vlo = _mm256_loadu_##suffix(ptr0);                        \
+        halfreg vhi = _mm256_loadu_##suffix(ptr1);                        \
+        return _Tpvec(_v512_combine(vlo, vhi));                           \
+    }                                                                     \
+    inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
+    { _mm512_storeu_##suffix(ptr, a.val); }                               \
+    inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
+    { _mm512_store_##suffix(ptr, a.val); }                                \
+    inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
+    { _mm512_stream_##suffix(ptr, a.val); }                               \
+    inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+    { /* any mode other than STORE_UNALIGNED / STORE_ALIGNED_NOCACHE takes the aligned store */ \
+        if( mode == hal::STORE_UNALIGNED ) \
+            _mm512_storeu_##suffix(ptr, a.val); \
+        else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+            _mm512_stream_##suffix(ptr, a.val); \
+        else \
+            _mm512_store_##suffix(ptr, a.val); \
+    } \
+    inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
+    { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); }            \
+    inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
+    { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); }
+// Instantiate the load/store set for the float and double vector types.
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float,  ps, __m256)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d)
+
+#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \
+    inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
+    { return _Tpvec(cast(a.val)); } // defines v_reinterpret_as_<suffix>(_Tpvecf) -> _Tpvec by applying `cast` to the raw register a.val
+
+#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)         \
+    inline _Tpvec v512_setzero_##suffix()                                          \
+    { return _Tpvec(_mm512_setzero_si512()); }                                     \
+    inline _Tpvec v512_setall_##suffix(_Tp v)                                      \
+    { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); }                          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,    suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32,  suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16,  suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,   suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,    suffix, OPENCV_HAL_NOP)      \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8,  suffix, _mm512_castpd_si512)
+// For one integer vector type the macro above defines v512_setzero_<suffix>(), v512_setall_<suffix>(v) and v_reinterpret_as_<suffix>() from every vector type; float/double sources go through _mm512_castps_si512 / _mm512_castpd_si512.
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64,  uchar,    u8,  epi8,   char) // last argument: type the scalar is cast to before _mm512_set1_<ssuffix>
+OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64,   schar,    s8,  epi8,   char)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort,   u16, epi16,  short)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32,  short,    s16, epi16,  short)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32,  int)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16,  int,      s32, epi32,  int)
+OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8,  uint64,   u64, epi64,  int64)
+OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8,   int64,    s64, epi64,  int64)
+
+#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
+    inline _Tpvec v512_setzero_##suffix()                                   \
+    { return _Tpvec(_mm512_setzero_##zsuffix()); }                          \
+    inline _Tpvec v512_setall_##suffix(_Tp v)                               \
+    { return _Tpvec(_mm512_set1_##zsuffix(v)); }                            \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64,   suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8,  suffix, cast)          \
+    OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8,   suffix, cast)
+// Floating-point variant: defines v512_setzero_<suffix>(), v512_setall_<suffix>(v) and v_reinterpret_as_<suffix>() from each integer vector type via `cast`; float<->double reinterprets are written out by hand after these instantiations.
+OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float,  f32, ps, _mm512_castsi512_ps)
+OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8,  double, f64, pd, _mm512_castsi512_pd)
+
+inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a)
+{ return a; } // identity overload: the argument already has the target type
+inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a)
+{ return v_float32x16(_mm512_castpd_ps(a.val)); } // register bit-cast double -> float lanes, no value conversion
+
+inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a)
+{ return a; } // identity overload: the argument already has the target type
+inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a)
+{ return v_float64x8(_mm512_castps_pd(a.val)); } // register bit-cast float -> double lanes, no value conversion
+
+// FP16: read 256 bits of half-precision data with an unaligned load and widen them to a 16-lane float32 vector
+inline v_float32x16 v512_load_expand(const float16_t* ptr)
+{
+    return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr)));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x16& a)
+{
+    // Narrow the 16 float32 lanes to half precision (conversion control 0) and write the 256-bit result unaligned.
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtps_ph(a.val, 0));
+}
+
+/* Recombine & ZIP: v_zip(a, b, ab0, ab1) interleaves the lanes of a and b; ab0 is built from lanes 0..N/2-1 of both inputs, ab1 from lanes N/2..N-1 */
+inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1)
+{
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8( 95,  31,  94,  30,  93,  29,  92,  28,  91,  27,  90,  26,  89,  25,  88,  24,
+                                    87,  23,  86,  22,  85,  21,  84,  20,  83,  19,  82,  18,  81,  17,  80,  16,
+                                    79,  15,  78,  14,  77,  13,  76,  12,  75,  11,  74,  10,  73,   9,  72,   8,
+                                    71,   7,  70,   6,  69,   5,  68,   4,  67,   3,  66,   2,  65,   1,  64,   0);
+    ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val)); // indices 0..31 pick bytes of a, 64..95 pick bytes 0..31 of b
+    __m512i mask1 = _v512_set_epu8(127,  63, 126,  62, 125,  61, 124,  60, 123,  59, 122,  58, 121,  57, 120,  56,
+                                   119,  55, 118,  54, 117,  53, 116,  52, 115,  51, 114,  50, 113,  49, 112,  48,
+                                   111,  47, 110,  46, 109,  45, 108,  44, 107,  43, 106,  42, 105,  41, 104,  40,
+                                   103,  39, 102,  38, 101,  37, 100,  36,  99,  35,  98,  34,  97,  33,  96,  32);
+    ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val)); // indices 32..63 pick bytes of a, 96..127 pick bytes 32..63 of b
+#else
+    __m512i low  = _mm512_unpacklo_epi8(a.val, b.val); // byte interleave inside each 128-bit lane (lower 8 bytes of the lane)
+    __m512i high = _mm512_unpackhi_epi8(a.val, b.val); // byte interleave inside each 128-bit lane (upper 8 bytes of the lane)
+    ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2,  9,  8, 1, 0), high)); // qword indices 0..7 address `low`, 8..15 address `high`
+    ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high));
+#endif
+}
+inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1)
+{ // 16-bit interleave done with two two-source word permutes; indices 0..31 address a, 32..63 address b
+    __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
+                                    39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
+    ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val)); // lanes 0..15 of a and b
+    __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24,
+                                    55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16);
+    ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val)); // lanes 16..31 of a and b
+}
+inline void v_zip(const v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1)
+{ // 32-bit interleave; permute indices 0..15 address a, 16..31 address b
+    __m512i mask0 = _v512_set_epu32(23,  7, 22,  6, 21,  5, 20,  4, 19,  3, 18,  2, 17, 1, 16, 0);
+    ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val)); // lanes 0..7 of a and b
+    __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
+    ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val)); // lanes 8..15 of a and b
+}
+inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1)
+{ // 64-bit interleave; permute indices 0..7 address a, 8..15 address b
+    __m512i mask0 = _v512_set_epu64(11, 3, 10, 2,  9, 1,  8, 0);
+    ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val)); // lanes 0..3 of a and b
+    __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4);
+    ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val)); // lanes 4..7 of a and b
+}
+
+inline void v_zip(const v_uint8x64&  a, const v_uint8x64&  b, v_uint8x64& ab0, v_uint8x64& ab1)
+{
+    const v_int8x64 sa = v_reinterpret_as_s8(a), sb = v_reinterpret_as_s8(b);
+    v_int8x64 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the signed 8-bit overload
+    ab0 = v_reinterpret_as_u8(lo);
+    ab1 = v_reinterpret_as_u8(hi);
+}
+inline void v_zip(const v_uint16x32&  a, const v_uint16x32&  b, v_uint16x32& ab0, v_uint16x32& ab1)
+{
+    const v_int16x32 sa = v_reinterpret_as_s16(a), sb = v_reinterpret_as_s16(b);
+    v_int16x32 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the signed 16-bit overload
+    ab0 = v_reinterpret_as_u16(lo);
+    ab1 = v_reinterpret_as_u16(hi);
+}
+inline void v_zip(const v_uint32x16&  a, const v_uint32x16&  b, v_uint32x16& ab0, v_uint32x16& ab1)
+{
+    const v_int32x16 sa = v_reinterpret_as_s32(a), sb = v_reinterpret_as_s32(b);
+    v_int32x16 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the signed 32-bit overload
+    ab0 = v_reinterpret_as_u32(lo);
+    ab1 = v_reinterpret_as_u32(hi);
+}
+inline void v_zip(const v_uint64x8&  a, const v_uint64x8&  b, v_uint64x8& ab0, v_uint64x8& ab1)
+{
+    const v_int64x8 sa = v_reinterpret_as_s64(a), sb = v_reinterpret_as_s64(b);
+    v_int64x8 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the signed 64-bit overload
+    ab0 = v_reinterpret_as_u64(lo);
+    ab1 = v_reinterpret_as_u64(hi);
+}
+inline void v_zip(const v_float32x16&  a, const v_float32x16&  b, v_float32x16& ab0, v_float32x16& ab1)
+{
+    const v_int32x16 sa = v_reinterpret_as_s32(a), sb = v_reinterpret_as_s32(b);
+    v_int32x16 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the 32-bit integer overload
+    ab0 = v_reinterpret_as_f32(lo);
+    ab1 = v_reinterpret_as_f32(hi);
+}
+inline void v_zip(const v_float64x8&  a, const v_float64x8&  b, v_float64x8& ab0, v_float64x8& ab1)
+{
+    const v_int64x8 sa = v_reinterpret_as_s64(a), sb = v_reinterpret_as_s64(b);
+    v_int64x8 lo, hi; v_zip(sa, sb, lo, hi); // pure lane shuffle: delegate to the 64-bit integer overload
+    ab0 = v_reinterpret_as_f64(lo);
+    ab1 = v_reinterpret_as_f64(hi);
+}
+
+#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix)                                    \
+    inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)                         \
+    { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \
+    inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)                        \
+    { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); }                    \
+    inline void v_recombine(const _Tpvec& a, const _Tpvec& b,                             \
+                                  _Tpvec& c, _Tpvec& d)                                   \
+    {                                                                                     \
+        c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val));         \
+        d.val = _v512_insert(b.val,_v512_extract_high(a.val));                            \
+    }
+// v_combine_low joins the low 256-bit halves of a and b; v_combine_high places the high half of a into b (presumably into b's low half -- confirm against _v512_insert); v_recombine writes both results to c and d. The `suffix` argument is not referenced by the macro body.
+
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64,   epi8)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64,    epi8)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32,  epi16)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32,   epi16)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16,   epi32)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8,   epi64)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8,    epi64)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8,  pd)
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+
+/** Non-saturating arithmetics: BIN_FUNC wraps a two-operand intrinsic as func(a, b); used here for the wrapping v_add_wrap / v_sub_wrap / v_mul_wrap **/
+#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \
+    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)      \
+    { return _Tpvec(intrin(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16) // low 16 bits of each product
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16)
+
+inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b)
+{
+    // Odd bytes: shift them into the low byte of each 16-bit lane, multiply, then shift the product back up.
+    __m512i odd  = _mm512_mullo_epi16(_mm512_srai_epi16(a.val, 8), _mm512_srai_epi16(b.val, 8));
+    odd          = _mm512_slli_epi16(odd, 8);
+    __m512i even = _mm512_mullo_epi16(a.val, b.val); // low byte of each 16-bit product is the even-byte product mod 256
+    return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, even, odd));
+}
+inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b)
+{ // signed wrapping multiply, implemented by reinterpreting to unsigned, calling the unsigned overload and reinterpreting back
+    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
+}
+
+#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin)            \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)     \
+    { return _Tpvec(intrin(a.val, b.val)); }                             \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)       \
+    { a.val = intrin(a.val, b.val); return a; }
+// Each instantiation below defines both `a op b` and `a op= b` for one vector type on top of the named intrinsic.
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64)
+
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) // 32/64-bit products keep only the low bits (mullo)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64)
+
+/** Saturating arithmetics: + and - on 8- and 16-bit lanes use the adds/subs intrinsics **/
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64,  _mm512_adds_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64,  _mm512_subs_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64,   _mm512_adds_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64,   _mm512_subs_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32,  _mm512_adds_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32,  _mm512_subs_epi16)
+
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd)
+
+// saturating multiply
+inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b)
+{
+    v_uint16x32 prod_lo, prod_hi;          // widened 16-bit products of the two input halves
+    v_mul_expand(a, b, prod_lo, prod_hi);
+    return v_pack(prod_lo, prod_hi);       // narrow back to 8-bit lanes via v_pack
+}
+inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b)
+{
+    v_int16x32 prod_lo, prod_hi;           // widened 16-bit products of the two input halves
+    v_mul_expand(a, b, prod_lo, prod_hi);
+    return v_pack(prod_lo, prod_hi);       // narrow back to 8-bit lanes via v_pack
+}
+inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b)
+{
+    const __m512i sat_max = _mm512_set1_epi32(65535);
+    __m512i lo16 = _mm512_mullo_epi16(a.val, b.val);
+    __m512i hi16 = _mm512_mulhi_epu16(a.val, b.val);
+    // Interleave low/high halves into full 32-bit products, clamp them to 65535, then narrow to 16 bits.
+    __m512i prod_lo = _mm512_min_epu32(_mm512_unpacklo_epi16(lo16, hi16), sat_max);
+    __m512i prod_hi = _mm512_min_epu32(_mm512_unpackhi_epi16(lo16, hi16), sat_max);
+    return v_uint16x32(_mm512_packus_epi32(prod_lo, prod_hi));
+}
+inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b)
+{
+    __m512i lo16 = _mm512_mullo_epi16(a.val, b.val);
+    __m512i hi16 = _mm512_mulhi_epi16(a.val, b.val);
+    // Rebuild the signed 32-bit products from their halves; packs_epi32 narrows them with signed saturation.
+    return v_int16x32(_mm512_packs_epi32(_mm512_unpacklo_epi16(lo16, hi16),
+                                         _mm512_unpackhi_epi16(lo16, hi16)));
+}
+
+inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b)
+{ const v_uint8x64 product = a * b; a = product; return a; }   // in-place form of the saturating operator *
+inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b)
+{ const v_int8x64 product = a * b; a = product; return a; }    // in-place form of the saturating operator *
+inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b)
+{ const v_uint16x32 product = a * b; a = product; return a; }  // in-place form of the saturating operator *
+inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b)
+{ const v_int16x32 product = a * b; a = product; return a; }   // in-place form of the saturating operator *
+
+inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } // high 16 bits of each signed 16x16 product
+inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } // high 16 bits of each unsigned 16x16 product
+
+//  Multiply and expand
+inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b,
+                         v_uint16x32& c, v_uint16x32& d)
+{
+    v_uint16x32 a_lo, a_hi, b_lo, b_hi;   // inputs widened to 16-bit lanes by v_expand
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c = v_mul_wrap(a_lo, b_lo);           // an 8-bit x 8-bit product always fits in 16 bits
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b,
+                         v_int16x32& c, v_int16x32& d)
+{
+    v_int16x32 a_lo, a_hi, b_lo, b_hi;    // inputs widened to 16-bit lanes by v_expand
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c = v_mul_wrap(a_lo, b_lo);           // an 8-bit x 8-bit product always fits in 16 bits
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b,
+                         v_int32x16& c, v_int32x16& d)
+{
+    const v_int16x32 low_bits = v_mul_wrap(a, b), high_bits = v_mul_hi(a, b);
+    v_int16x32 zipped0, zipped1;
+    v_zip(low_bits, high_bits, zipped0, zipped1); // adjacent low/high 16-bit halves form each 32-bit product
+    c = v_reinterpret_as_s32(zipped0);
+    d = v_reinterpret_as_s32(zipped1);
+}
+
+inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b,
+                         v_uint32x16& c, v_uint32x16& d)
+{
+    const v_uint16x32 low_bits = v_mul_wrap(a, b), high_bits = v_mul_hi(a, b);
+    v_uint16x32 zipped0, zipped1;
+    v_zip(low_bits, high_bits, zipped0, zipped1); // adjacent low/high 16-bit halves form each 32-bit product
+    c = v_reinterpret_as_u32(zipped0);
+    d = v_reinterpret_as_u32(zipped1);
+}
+
+inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b,
+                         v_uint64x8& c, v_uint64x8& d)
+{ // full 64-bit products of unsigned 32-bit lanes: even and odd lanes are multiplied separately, then zipped back into lane order
+    v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)), // products of the even 32-bit lanes (low half of each 64-bit element)
+          v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); // odd lanes, shifted down first
+}
+
+inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b,
+    v_int64x8& c, v_int64x8& d)
+{ // full 64-bit products of signed 32-bit lanes: even and odd lanes are multiplied separately, then zipped back into lane order
+    v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)), // products of the even 32-bit lanes (low half of each 64-bit element)
+          v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); // odd lanes moved into the low half first
+}
+
+/** Bitwise shifts: operator<< / operator>> with a run-time count and v_shl<imm> / v_shr<imm> with a compile-time count; right shifts are logical (srli) for the unsigned type and arithmetic (srai) for the signed type **/
+#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \
+    inline _Tpuvec operator << (const _Tpuvec& a, int imm)        \
+    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    inline _Tpsvec operator << (const _Tpsvec& a, int imm)        \
+    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)        \
+    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
+    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)        \
+    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpuvec v_shl(const _Tpuvec& a)                        \
+    { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpsvec v_shl(const _Tpsvec& a)                        \
+    { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpuvec v_shr(const _Tpuvec& a)                        \
+    { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); }         \
+    template<int imm>                                             \
+    inline _Tpsvec v_shr(const _Tpsvec& a)                        \
+    { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); }
+
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16)
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32)
+OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8,  v_int64x8,  epi64)
+
+
+/** Bitwise logic: &, |, ^ (with their compound-assignment forms via BIN_OP) and unary ~, which is implemented as XOR with the all-ones constant `not_const` **/
+#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix)  \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix)   \
+    OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix)  \
+    inline _Tpvec operator ~ (const _Tpvec& a)                     \
+    { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); }
+
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64,    si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32,  si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16,  si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16,   si512, _mm512_set1_epi32(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8,   si512, _mm512_set1_epi64(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8,    si512, _mm512_set1_epi64(-1))
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps,    _mm512_castsi512_ps(_mm512_set1_epi32(-1))) // all-ones bit pattern reinterpreted as float lanes
+OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8,  pd,    _mm512_castsi512_pd(_mm512_set1_epi32(-1))) // all-ones bit pattern reinterpreted as double lanes
+
+/** Select: v_select(mask, a, b) takes b in lanes where mask compares equal to zero and a in all other lanes; for float/double the mask lanes are compared as floating-point values against 0 **/
+#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf)                      \
+    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+    { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); }
+
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64,   epi8, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64,    epi8, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32,  epi16, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16,  epi32, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8,  epi64, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8,   epi64, si512)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16,   ps,    ps)
+OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8,    pd,    pd)
+
+/** Comparison: each operator computes a lane mask with _mm512_cmp_<sufcmp>_mask and expands it to a vector holding `tval` in lanes where the predicate holds and zero elsewhere **/
+#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
+    { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); }
+
+#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval)              \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(<,  _MM_CMPINT_LT,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(>,  _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval)
+// Defines all six comparison operators for one integer type; `>` and `>=` are expressed as not-less-or-equal / not-less-than. `sufcmp` selects signed (epi) or unsigned (epu) comparison, `tval` is the all-ones value of the lane width.
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64,   epu8,  epi8, (char)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64,    epi8,  epi8, (char)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32,  epi16, epi16, (short)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16,  epi32, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8,  epu64, epi64, (int64)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8,   epi64, epi64, (int64)-1)
+
+#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)               \
+    { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } // lane mask -> integer lanes of tval/zero -> bit-cast to the floating-point vector type
+
+#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval)           \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, sufcmp, sufset, tval) \
+    OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, sufcmp, sufset, tval)
+// All six predicates are the ordered, quiet (_OQ) forms. NOTE(review): this includes `!=` (_CMP_NEQ_OQ), which is false when either operand is NaN, unlike scalar C++ `!=` -- confirm this is intended.
+OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1)
+OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8,  pd, epi64, (int64)-1)
+
+inline v_float32x16 v_not_nan(const v_float32x16& a)
+{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); } // all-ones lanes where a is ordered with itself (not NaN), zero lanes otherwise
+inline v_float64x8 v_not_nan(const v_float64x8& a)
+{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); } // all-ones lanes where a is ordered with itself (not NaN), zero lanes otherwise
+
+/** min/max: lane-wise v_min / v_max for every vector type, each a direct wrapper over the matching _mm512_min_* / _mm512_max_* intrinsic (epu = unsigned, epi = signed) **/
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64,   _mm512_min_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64,   _mm512_max_epu8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64,    _mm512_min_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64,    _mm512_max_epi8)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32,  _mm512_min_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32,  _mm512_max_epu16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32,   _mm512_min_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32,   _mm512_max_epi16)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16,  _mm512_min_epu32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16,  _mm512_max_epu32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16,   _mm512_min_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16,   _mm512_max_epi32)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8,   _mm512_min_epu64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8,   _mm512_max_epu64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8,    _mm512_min_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8,    _mm512_max_epi64)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8,  _mm512_min_pd)
+OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8,  _mm512_max_pd)
+
+/** Rotate: _v_rotate_right<prec, imm4, part, imm32> is the non-VBMI byte-rotate helper; v_rotate_right<imm>(a, b) instantiates it with prec = (imm % 4 != 0), imm4 = imm % 4, part = (imm / 4 > 15), imm32 = imm / 4 **/
+namespace {
+    template<bool prec, int imm4, bool part, int imm32>
+    struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; // primary template: returns a default-constructed vector
+    template<int imm4, int imm32>
+    struct _v_rotate_right<true, imm4, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) // byte offset not a multiple of 4, whole-dword offset <= 15
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32    ),    imm4 *8), // dwords at offset imm32, shifted right by the residual bytes
+                                         _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8))); // next dwords supply the bytes shifted in from above
+    }};
+    template<int imm4>
+    struct _v_rotate_right<true, imm4, false, 15> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) // imm32 == 15: the "next dword" vector is b itself
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15),    imm4 *8),
+                                         _mm512_slli_epi32(                                b.val, (4-imm4)*8)));
+    }};
+    template<int imm4, int imm32>
+    struct _v_rotate_right<true, imm4, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) // whole-dword offset > 15: only b contributes, zeros are shifted in
+    {
+        return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16),    imm4 *8),
+                                         _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8)));
+    }};
+    template<int imm4>
+    struct _v_rotate_right<true, imm4, true, 31> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) // imm32 == 31: the second term of the general case is omitted
+    { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }};
+    template<int imm32>
+    struct _v_rotate_right<false, 0, false, imm32> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) // offset is a whole number of dwords: a single alignr
+    { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }};
+    template<>
+    struct _v_rotate_right<false, 0, false, 0> { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }}; // zero offset: a unchanged
+    template<int imm32>
+    struct _v_rotate_right<false, 0, true, imm32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) // whole-dword offset > 15: b shifted with zero fill
+    { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }};
+    template<>
+    struct _v_rotate_right<false, 0, true, 16> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }}; // offset of exactly one vector: b unchanged
+    template<>
+    struct _v_rotate_right<false, 0, true, 32> { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; // offset of two vectors: default-constructed result
+}
+template<int imm> inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b)
+{ // two-vector byte shift by the compile-time offset imm; imm >= 128 returns a default-constructed vector
+    return imm >= 128 ? v_int8x64() :
+#if CV_AVX_512VBMI
+    v_int8x64(_mm512_permutex2var_epi8(a.val, // VBMI: one two-source byte permute with per-lane index (lane + imm); NOTE(review): for imm > 64 some indices exceed 127 -- confirm the result matches the zero fill of the fallback path
+    _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm,
+                   0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm,
+                   0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm,
+                   0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm,
+                   0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm,
+                   0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm,
+                   0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm,
+                   0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val));
+#else
+    _v_rotate_right<imm%4!=0, imm%4, (imm/4 > 15), imm/4>::eval(a, b); // fallback: split imm into whole dwords (imm/4) and residual bytes (imm%4)
+#endif
+}
+template<int imm>
+inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b)
+{ // two-vector byte shift in the opposite direction of v_rotate_right, by the compile-time offset imm
+    if (imm == 0) return a; // no shift
+    if (imm == 64) return b; // shift by exactly one vector
+    if (imm >= 128) return v_int8x64(); // shifted past both inputs: default-constructed result
+#if CV_AVX_512VBMI
+    return v_int8x64(_mm512_permutex2var_epi8(b.val, // VBMI: one two-source byte permute with per-lane index (64 + lane - imm); b is the first source, a the second
+           _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm,
+                          0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm,
+                          0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm,
+                          0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm,
+                          0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm,
+                          0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm,
+                          0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm,
+                          0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val));
+#else
+    return imm < 64 ? v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b); // fallback: expressed as the complementary right shift
+#endif
+}
+template<int imm>
+inline v_int8x64 v_rotate_right(const v_int8x64& a)
+{ // single-vector byte shift by the compile-time offset imm
+    if (imm == 0) return a; // no shift
+    if (imm >= 64) return v_int8x64(); // shifted past the whole vector: default-constructed result
+#if CV_AVX_512VBMI
+    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm, // zero-masking clears the top imm lanes; the remaining lanes use index (lane + imm)
+           _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm,
+                          0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm,
+                          0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm,
+                          0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm,
+                          0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm,
+                          0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm,
+                          0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm,
+                          0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val));
+#else
+    return v_rotate_right<imm>(a, v512_setzero_s8()); // fallback: two-vector form with an all-zero second operand
+#endif
+}
+// Single-register byte rotate away from lane 0: result[i] = a[i - imm] for i >= imm,
+// and the low imm lanes are cleared.
+// imm == 0 returns a unchanged; imm >= 64 returns a default-constructed v_int8x64.
+template<int imm>
+inline v_int8x64 v_rotate_left(const v_int8x64& a)
+{
+    if (imm == 0) return a;
+    if (imm >= 64) return v_int8x64();
+#if CV_AVX_512VBMI
+    // Zero-masking permute: the write mask drops the low imm lanes, whose table
+    // entries (i - imm) would otherwise be negative.
+    return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm,
+           _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm,
+                          0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm,
+                          0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm,
+                          0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm,
+                          0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm,
+                          0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm,
+                          0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm,
+                          0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val));
+#else
+    // Equivalent two-register right rotate of (zero, a) by 64 - imm bytes.
+    return v_rotate_right<64 - imm>(v512_setzero_s8(), a);
+#endif
+}
+
+// Lane rotates for vector types that are handled as raw bytes: the lane count is scaled
+// by sizeof(lane_type) and the work is forwarded to the v_int8x64 rotate templates, with
+// the operands reinterpreted on the way in and the result reinterpreted on the way out.
+#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix)                                       \
+template<int imm>                                                                              \
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                  \
+{                                                                                              \
+    const v_int8x64 a8 = v_reinterpret_as_s8(a);                                               \
+    const v_int8x64 b8 = v_reinterpret_as_s8(b);                                               \
+    return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(a8, b8));  \
+}                                                                                              \
+template<int imm>                                                                              \
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                 \
+{                                                                                              \
+    const v_int8x64 a8 = v_reinterpret_as_s8(a);                                               \
+    const v_int8x64 b8 = v_reinterpret_as_s8(b);                                               \
+    return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(a8, b8)); \
+}                                                                                              \
+template<int imm>                                                                              \
+inline _Tpvec v_rotate_left(const _Tpvec& a)                                                   \
+{                                                                                              \
+    const v_int8x64 a8 = v_reinterpret_as_s8(a);                                               \
+    return v_reinterpret_as_##suffix(v_rotate_left<imm * sizeof(_Tpvec::lane_type)>(a8));      \
+}                                                                                              \
+template<int imm>                                                                              \
+inline _Tpvec v_rotate_right(const _Tpvec& a)                                                  \
+{                                                                                              \
+    const v_int8x64 a8 = v_reinterpret_as_s8(a);                                               \
+    return v_reinterpret_as_##suffix(v_rotate_right<imm * sizeof(_Tpvec::lane_type)>(a8));     \
+}
+
+// Lane rotates implemented with the AVX-512 compress/expand instructions.
+// MASK has one bit per lane; SHIFT2 is the complementary lane count (nlanes - imm).
+//  - two-arg left:  compress moves the top imm lanes of b down to lanes [0, imm); the
+//    masked expand then writes the low lanes of a into lanes [imm, nlanes).
+//  - two-arg right: compress moves lanes [imm, nlanes) of a down to lane 0; the masked
+//    expand then writes the low lanes of b into lanes [SHIFT2, nlanes).
+//  - one-arg forms use the zero-masking expand/compress, so vacated lanes become zero.
+// imm == 0 returns a, imm == nlanes returns b (two-arg forms), and out-of-range shifts
+// return _Tpvec::zero().
+// NOTE(review): for nlanes < imm < 2*nlanes SHIFT2 is negative; confirm that range is
+// never instantiated for the two-arg forms.
+#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix)                                                                                   \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)                                                                              \
+{                                                                                                                                          \
+    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
+    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
+    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
+    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)                                                                             \
+{                                                                                                                                          \
+    enum { SHIFT2 = (_Tpvec::nlanes - imm) };                                                                                              \
+    enum { MASK = ((1 << _Tpvec::nlanes) - 1) };                                                                                           \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm == _Tpvec::nlanes) return b;                                                                                                   \
+    if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero();                                                                                    \
+    return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_left(const _Tpvec& a)                                                                                               \
+{                                                                                                                                          \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
+    return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                              \
+}                                                                                                                                          \
+template<int imm>                                                                                                                          \
+inline _Tpvec v_rotate_right(const _Tpvec& a)                                                                                              \
+{                                                                                                                                          \
+    if (imm == 0) return a;                                                                                                                \
+    if (imm >= _Tpvec::nlanes) return _Tpvec::zero();                                                                                      \
+    return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val));                                            \
+}
+
+// 8- and 16-bit lane types go through the byte-rotate forwarding macro (PM);
+// 32- and 64-bit lane types, integer and floating point, use compress/expand (EC).
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64,   u8)
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32,  u16)
+OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32,   s16)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16,   epi32)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8,   epi64)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8,    epi64)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8,  pd)
+
+/** Reverse **/
+// Reverses the order of the 64 byte lanes of `a`.
+inline v_uint8x64 v_reverse(const v_uint8x64 &a)
+{
+#if CV_AVX_512VBMI
+    // One full-width byte permute: destination lane i reads source lane 63 - i.
+    static const __m512i rev_idx = _mm512_set_epi32(
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f,
+            0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f,
+            0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
+    const __m512i flipped = _mm512_permutexvar_epi8(rev_idx, a.val);
+    return v_uint8x64(flipped);
+#else
+    // Two steps: reverse the bytes inside every 128-bit lane, then reorder the
+    // 64-bit quarters so the four 128-bit lanes trade places end to end.
+    static const __m512i lane_flip = _mm512_set_epi32(
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f,
+            0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
+    static const __m512i qword_order = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+    const __m512i in_lane = _mm512_shuffle_epi8(a.val, lane_flip);
+    const __m512i flipped = _mm512_permutexvar_epi64(qword_order, in_lane);
+    return v_uint8x64(flipped);
+#endif
+}
+
+// Signed 8-bit overload: reverses the lanes through the unsigned implementation.
+inline v_int8x64 v_reverse(const v_int8x64 &a)
+{
+    const v_uint8x64 flipped = v_reverse(v_reinterpret_as_u8(a));
+    return v_reinterpret_as_s8(flipped);
+}
+
+// Reverses the order of the 32 16-bit lanes of `a`.
+inline v_uint16x32 v_reverse(const v_uint16x32 &a)
+{
+#if CV_AVX_512VBMI
+    // One full-width word permute: destination lane i reads source lane 31 - i.
+    static const __m512i rev_idx = _mm512_set_epi32(
+            0x00000001, 0x00020003, 0x00040005, 0x00060007,
+            0x00080009, 0x000a000b, 0x000c000d, 0x000e000f,
+            0x00100011, 0x00120013, 0x00140015, 0x00160017,
+            0x00180019, 0x001a001b, 0x001c001d, 0x001e001f);
+    const __m512i flipped = _mm512_permutexvar_epi16(rev_idx, a.val);
+    return v_uint16x32(flipped);
+#else
+    // Reverse the words inside every 128-bit lane with a byte shuffle (byte pairs stay
+    // together), then reorder the 64-bit quarters to swap the 128-bit lanes end to end.
+    static const __m512i lane_flip = _mm512_set_epi32(
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e,
+            0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
+    static const __m512i qword_order = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+    const __m512i in_lane = _mm512_shuffle_epi8(a.val, lane_flip);
+    const __m512i flipped = _mm512_permutexvar_epi64(qword_order, in_lane);
+    return v_uint16x32(flipped);
+#endif
+}
+
+// Signed 16-bit overload: reverses the lanes through the unsigned implementation.
+inline v_int16x32 v_reverse(const v_int16x32 &a)
+{
+    const v_uint16x32 flipped = v_reverse(v_reinterpret_as_u16(a));
+    return v_reinterpret_as_s16(flipped);
+}
+
+// Reverses the order of the 16 32-bit lanes of `a` with a single dword permute.
+inline v_uint32x16 v_reverse(const v_uint32x16 &a)
+{
+    static const __m512i rev_idx = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15);
+    const __m512i flipped = _mm512_permutexvar_epi32(rev_idx, a.val);
+    return v_uint32x16(flipped);
+}
+
+// Signed 32-bit overload: reverses the lanes through the unsigned implementation.
+inline v_int32x16 v_reverse(const v_int32x16 &a)
+{
+    const v_uint32x16 flipped = v_reverse(v_reinterpret_as_u32(a));
+    return v_reinterpret_as_s32(flipped);
+}
+
+// float32 overload: reverses the lanes on the 32-bit integer view of the same bits.
+inline v_float32x16 v_reverse(const v_float32x16 &a)
+{
+    const v_uint32x16 flipped = v_reverse(v_reinterpret_as_u32(a));
+    return v_reinterpret_as_f32(flipped);
+}
+
+// Reverses the order of the 8 64-bit lanes of `a` with a single qword permute.
+inline v_uint64x8 v_reverse(const v_uint64x8 &a)
+{
+    static const __m512i rev_idx = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+    const __m512i flipped = _mm512_permutexvar_epi64(rev_idx, a.val);
+    return v_uint64x8(flipped);
+}
+
+// Signed 64-bit overload: reverses the lanes through the unsigned implementation.
+inline v_int64x8 v_reverse(const v_int64x8 &a)
+{
+    const v_uint64x8 flipped = v_reverse(v_reinterpret_as_u64(a));
+    return v_reinterpret_as_s64(flipped);
+}
+
+// float64 overload: reverses the lanes on the 64-bit integer view of the same bits.
+inline v_float64x8 v_reverse(const v_float64x8 &a)
+{
+    const v_uint64x8 flipped = v_reverse(v_reinterpret_as_u64(a));
+    return v_reinterpret_as_f64(flipped);
+}
+
+////////// Reduce /////////
+
+/** Reduce **/
+// Scalar "add" combiner passed as `scop` to the 64-bit reduce macros. The replacement
+// list and both arguments are parenthesized so that the expansion stays a single
+// operand regardless of the surrounding expression or the argument expressions.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) ((a) + (b))
+// Reductions over 8 x 64-bit integer lanes: fold 512 -> 256 -> 128 bits with the vector
+// operation `ifunc`, store the two surviving lanes and combine them with the scalar `scop`.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop)            \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                    \
+    {                                                                                 \
+        const __m256i lo256 = _v512_extract_low(a.val);                               \
+        const __m256i hi256 = _v512_extract_high(a.val);                              \
+        const __m256i fold256 = _mm256_##ifunc(lo256, hi256);                         \
+        const __m128i fold128 = _mm_##ifunc(_mm256_castsi256_si128(fold256),          \
+                                            _mm256_extracti128_si256(fold256, 1));    \
+        sctype CV_DECL_ALIGNED(64) lanes[2];                                          \
+        _mm_store_si128((__m128i*)lanes, fold128);                                    \
+        return scop(lanes[0], lanes[1]);                                              \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  min, v_int64x8,  min_epi64, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  max, v_int64x8,  max_epi64, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64,  sum, v_int64x8,  add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+
+// Reductions over 8 x float64 lanes: same 512 -> 256 -> 128 fold as the integer
+// variant, finished by combining the last two doubles with the scalar `scop`.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop)                           \
+    inline double v_reduce_##func(const v_float64x8& a)                               \
+    {                                                                                 \
+        const __m256d lo256 = _v512_extract_low(a.val);                               \
+        const __m256d hi256 = _v512_extract_high(a.val);                              \
+        const __m256d fold256 = _mm256_##ifunc(lo256, hi256);                         \
+        const __m128d fold128 = _mm_##ifunc(_mm256_castpd256_pd128(fold256),          \
+                                            _mm256_extractf128_pd(fold256, 1));       \
+        double CV_DECL_ALIGNED(64) lanes[2];                                          \
+        _mm_store_pd(lanes, fold128);                                                 \
+        return scop(lanes[0], lanes[1]);                                              \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max)
+OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64)
+
+// Min/max reductions over 16 x 32-bit integer lanes: fold 512 -> 256 -> 128 bits, then
+// fold the 128-bit remainder against itself shifted right by 8 and by 4 bytes so that
+// lane 0 ends up holding the result.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc)                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                    \
+    {                                                                                 \
+        const __m256i lo256 = _v512_extract_low(a.val);                               \
+        const __m256i hi256 = _v512_extract_high(a.val);                              \
+        const __m256i fold256 = _mm256_##ifunc(lo256, hi256);                         \
+        __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(fold256),                    \
+                                  _mm256_extracti128_si256(fold256, 1));              \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4));                               \
+        return (sctype)_mm_cvtsi128_si32(acc);                                        \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  min, v_int32x16,  min_epi32)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16(int,  max, v_int32x16,  max_epi32)
+
+// Min/max reductions over 16 x float32 lanes: fold 512 -> 256 -> 128 bits, then fold
+// the four remaining floats using two in-register permutes; lane 0 holds the result.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc)                                \
+    inline float v_reduce_##func(const v_float32x16& a)                               \
+    {                                                                                 \
+        const __m256 lo256 = _v512_extract_low(a.val);                                \
+        const __m256 hi256 = _v512_extract_high(a.val);                               \
+        const __m256 fold256 = _mm256_##ifunc(lo256, hi256);                          \
+        __m128 acc = _mm_##ifunc(_mm256_castps256_ps128(fold256),                     \
+                                 _mm256_extractf128_ps(fold256, 1));                  \
+        acc = _mm_##ifunc(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 3, 2)));         \
+        acc = _mm_##ifunc(acc, _mm_permute_ps(acc, _MM_SHUFFLE(0, 0, 0, 1)));         \
+        return _mm_cvtss_f32(acc);                                                    \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps)
+OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps)
+
+// Horizontal sum of 16 float32 lanes: add the two 256-bit halves, then the two 128-bit
+// halves, then finish with two horizontal adds.
+inline float v_reduce_sum(const v_float32x16& a)
+{
+    const __m256 sum8 = _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val));
+    const __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(sum8), _mm256_extractf128_ps(sum8, 1));
+    const __m128 sum2 = _mm_hadd_ps(sum4, sum4);
+    const __m128 sum1 = _mm_hadd_ps(sum2, sum2);
+    return _mm_cvtss_f32(sum1);
+}
+// Horizontal sum of 16 int32 lanes: add the two 256-bit halves, then the two 128-bit
+// halves, then finish with two horizontal adds.
+inline int v_reduce_sum(const v_int32x16& a)
+{
+    const __m256i sum8 = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val));
+    const __m128i sum4 = _mm_add_epi32(_mm256_castsi256_si128(sum8), _mm256_extracti128_si256(sum8, 1));
+    const __m128i sum2 = _mm_hadd_epi32(sum4, sum4);
+    const __m128i sum1 = _mm_hadd_epi32(sum2, sum2);
+    return _mm_cvtsi128_si32(sum1);
+}
+// Unsigned 32-bit sum: computed on the signed view of the lanes and cast back to uint.
+inline uint v_reduce_sum(const v_uint32x16& a)
+{
+    const int signed_sum = v_reduce_sum(v_reinterpret_as_s32(a));
+    return (uint)signed_sum;
+}
+
+// Min/max reductions over 32 x 16-bit lanes: fold 512 -> 256 -> 128 bits, then fold the
+// 128-bit remainder against itself shifted right by 8, 4 and 2 bytes. The low bits of
+// lane 0 are returned through a cast to the scalar type.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc)                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                    \
+    {                                                                                 \
+        const __m256i lo256 = _v512_extract_low(a.val);                               \
+        const __m256i hi256 = _v512_extract_high(a.val);                              \
+        const __m256i fold256 = _mm256_##ifunc(lo256, hi256);                         \
+        __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(fold256),                    \
+                                  _mm256_extracti128_si256(fold256, 1));              \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 2));                               \
+        return (sctype)_mm_cvtsi128_si32(acc);                                        \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  min, v_int16x32,  min_epi16)
+OPENCV_HAL_IMPL_AVX512_REDUCE_32(short,  max, v_int16x32,  max_epi16)
+
+// Sum of 32 int16 lanes: widen both halves to 32 bits, add them lane-wise and reduce.
+inline int v_reduce_sum(const v_int16x32& a)
+{
+    const v_int32x16 lower = v_expand_low(a);
+    const v_int32x16 upper = v_expand_high(a);
+    return v_reduce_sum(lower + upper);
+}
+// Sum of 32 uint16 lanes: widen both halves to 32 bits, add them lane-wise and reduce.
+inline uint v_reduce_sum(const v_uint16x32& a)
+{
+    const v_uint32x16 lower = v_expand_low(a);
+    const v_uint32x16 upper = v_expand_high(a);
+    return v_reduce_sum(lower + upper);
+}
+
+// Min/max reductions over 64 x 8-bit lanes: fold 512 -> 256 -> 128 bits, then fold the
+// 128-bit remainder against itself shifted right by 8, 4, 2 and 1 bytes. The low bits
+// of lane 0 are returned through a cast to the scalar type.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc)                 \
+    inline sctype v_reduce_##func(const _Tpvec& a)                                    \
+    {                                                                                 \
+        const __m256i lo256 = _v512_extract_low(a.val);                               \
+        const __m256i hi256 = _v512_extract_high(a.val);                              \
+        const __m256i fold256 = _mm256_##ifunc(lo256, hi256);                         \
+        __m128i acc = _mm_##ifunc(_mm256_castsi256_si128(fold256),                    \
+                                  _mm256_extracti128_si256(fold256, 1));              \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 8));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 4));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 2));                               \
+        acc = _mm_##ifunc(acc, _mm_srli_si128(acc, 1));                               \
+        return (sctype)_mm_cvtsi128_si32(acc);                                        \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64,  min_epi8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64,  max_epi8)
+
+// Sum of 64 x 8-bit lanes. Both 256-bit halves are widened to 16 bits and added, the
+// two halves of that result are added again in 16 bits and widened to 32 bits, and the
+// remaining 16 dwords are folded 512 -> 256 -> 128 and finished with two horizontal adds.
+#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix)                                  \
+    inline sctype v_reduce_sum(const _Tpvec& a)                                                       \
+    {                                                                                                 \
+        const __m512i lo_w16 = _mm512_cvt##suffix##_epi16(_v512_extract_low(a.val));                  \
+        const __m512i hi_w16 = _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val));                 \
+        const __m512i sum_w16 = _mm512_add_epi16(lo_w16, hi_w16);                                     \
+        const __m256i half_w16 = _mm256_add_epi16(_v512_extract_low(sum_w16),                         \
+                                                  _v512_extract_high(sum_w16));                       \
+        const __m512i sum_d32 = _mm512_cvtepi16_epi32(half_w16);                                      \
+        const __m256i sum8 = _mm256_add_epi32(_v512_extract_low(sum_d32),                             \
+                                              _v512_extract_high(sum_d32));                           \
+        __m128i sum4 = _mm_add_epi32(_mm256_castsi256_si128(sum8), _mm256_extracti128_si256(sum8, 1));\
+        sum4 = _mm_hadd_epi32(sum4, sum4);                                                            \
+        return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(sum4, sum4));                                 \
+    }
+OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8)
+OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int,  v_int8x64,  epi8)
+
+// Combines four float32 vectors through two levels of 256-bit horizontal adds: the low
+// halves of (a, b) and (c, d) feed the low half of the result, the high halves feed
+// the high half.
+inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b,
+                                  const v_float32x16& c, const v_float32x16& d)
+{
+    const __m256 ab_lo = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val));
+    const __m256 ab_hi = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val));
+    const __m256 cd_lo = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val));
+    const __m256 cd_hi = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val));
+    const __m256 out_lo = _mm256_hadd_ps(ab_lo, cd_lo);
+    const __m256 out_hi = _mm256_hadd_ps(ab_hi, cd_hi);
+    return v_float32x16(_v512_combine(out_lo, out_hi));
+}
+
+// Sum of absolute differences of 64 unsigned bytes: _mm512_sad_epu8 produces partial
+// sums per 64-bit lane, which are then folded 512 -> 256 -> 128 -> 64 bits.
+inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b)
+{
+    const __m512i partial = _mm512_sad_epu8(a.val, b.val);
+    const __m256i sum256 = _mm256_add_epi32(_v512_extract_low(partial), _v512_extract_high(partial));
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extracti128_si256(sum256, 1));
+    const __m128i total = _mm_add_epi32(sum128, _mm_unpackhi_epi64(sum128, sum128));
+    return (unsigned)_mm_cvtsi128_si32(total);
+}
+// Sum of absolute differences of 64 signed bytes: both inputs are offset by -128
+// (wrapping add) before the unsigned SAD instruction is applied, then the per-lane
+// partial sums are folded 512 -> 256 -> 128 -> 64 bits.
+inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b)
+{
+    const __m512i bias = _mm512_set1_epi8(-128);
+    const __m512i partial = _mm512_sad_epu8(_mm512_add_epi8(a.val, bias), _mm512_add_epi8(b.val, bias));
+    const __m256i sum256 = _mm256_add_epi32(_v512_extract_low(partial), _v512_extract_high(partial));
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extracti128_si256(sum256, 1));
+    const __m128i total = _mm_add_epi32(sum128, _mm_unpackhi_epi64(sum128, sum128));
+    return (unsigned)_mm_cvtsi128_si32(total);
+}
+// SAD of 32 uint16 lanes: wrap-adds (a - b) and (b - a) and sums the lanes.
+// NOTE(review): relies on operator- for v_uint16x32 clamping at zero so that one of the
+// two differences vanishes per lane - confirm against the operator's definition.
+inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b)
+{
+    const v_uint16x32 a_minus_b = a - b;
+    const v_uint16x32 b_minus_a = b - a;
+    return v_reduce_sum(v_add_wrap(a_minus_b, b_minus_a));
+}
+// SAD of 32 int16 lanes: wrap-subtracts min(a, b) from max(a, b), views the result as
+// unsigned and sums the lanes.
+inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 larger = v_max(a, b);
+    const v_int16x32 smaller = v_min(a, b);
+    const v_uint16x32 dist = v_reinterpret_as_u16(v_sub_wrap(larger, smaller));
+    return v_reduce_sum(dist);
+}
+// SAD of 16 uint32 lanes: sums max(a, b) - min(a, b) over all lanes.
+inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const v_uint32x16 larger = v_max(a, b);
+    const v_uint32x16 smaller = v_min(a, b);
+    return v_reduce_sum(larger - smaller);
+}
+// SAD of 16 int32 lanes: max(a, b) - min(a, b) per lane, viewed as unsigned and summed.
+inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 larger = v_max(a, b);
+    const v_int32x16 smaller = v_min(a, b);
+    const v_uint32x16 dist = v_reinterpret_as_u32(larger - smaller);
+    return v_reduce_sum(dist);
+}
+// SAD of 16 float32 lanes: clears the sign bit of (a - b) with a 0x7fffffff mask and
+// sums the lanes.
+inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b)
+{
+    const v_float32x16 sign_clear(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)));
+    return v_reduce_sum((a - b) & sign_clear);
+}
+// SAD of 8 float64 lanes: clears the sign bit of (a - b) with a 0x7fffffffffffffff mask
+// and sums the lanes.
+inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b)
+{
+    const v_float64x8 sign_clear(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)));
+    return v_reduce_sum((a - b) & sign_clear);
+}
+
+/** Popcount **/
+// Per-byte population count. One of three implementations is chosen at compile time:
+//  - BITALG: the native _mm512_popcnt_epi8;
+//  - VBMI:   a 128-entry table lookup on the low 7 bits of each byte, plus one for
+//            bytes whose top bit is set;
+//  - otherwise: two 16-entry nibble lookups (low and high nibble) added together.
+inline v_uint8x64 v_popcount(const v_int8x64& a)
+{
+#if CV_AVX_512BITALG
+    return v_uint8x64(_mm512_popcnt_epi8(a.val));
+#elif CV_AVX_512VBMI
+    // Tables are listed from entry 63 down to entry 0. table0 holds popcount(0..63) and
+    // table1 holds popcount(64..127); bit 6 of each source byte selects between them.
+    // The first row of table0 covers values 48..63, i.e. 2 + popcount(low nibble); it
+    // used to repeat table1's 112..127 row and therefore over-counted those bytes by one.
+    __m512i _popcnt_table0 = _v512_set_epu8(6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1,
+                                            4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
+    __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
+                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+                                            6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+                                            5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1);
+    // The mask round trip yields 0xFF (-1) for every byte with bit 7 set; subtracting it
+    // adds the one bit that the 7-bit table index cannot see.
+    return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val))));
+#else
+    // Nibble lookup: the same 16-entry table is replicated into every 128-bit lane and
+    // indexed once with the low nibble and once with the high nibble of each byte.
+    __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100);
+    __m512i _popcnt_mask = _mm512_set1_epi8(0x0F);
+
+    return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(                  a.val,     _popcnt_mask)),
+                                      _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask))));
+#endif
+}
+// Per-lane population count for 16-bit lanes. Uses the native 16-bit instruction with
+// BITALG, the 32-bit popcount on zero-interleaved halves with VPOPCNTDQ, and otherwise
+// adds each byte count to its neighbour and keeps the low byte of every lane.
+inline v_uint16x32 v_popcount(const v_int16x32& a)
+{
+#if CV_AVX_512BITALG
+    return v_uint16x32(_mm512_popcnt_epi16(a.val));
+#elif CV_AVX_512VPOPCNTDQ
+    const __m512i zeros = _mm512_setzero_si512();
+    const __m512i lo_counts = _mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zeros));
+    const __m512i hi_counts = _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zeros));
+    return v_uint16x32(_mm512_packs_epi32(lo_counts, hi_counts));
+#else
+    v_uint8x64 byte_counts = v_popcount(v_reinterpret_as_s8(a));
+    byte_counts += v_rotate_right<1>(byte_counts);
+    const v_uint16x32 low_byte = v512_setall_u16(0x00ff);
+    return v_reinterpret_as_u16(byte_counts) & low_byte;
+#endif
+}
+// Per-lane population count for 32-bit lanes. Native with VPOPCNTDQ; otherwise the
+// four byte counts of each lane are accumulated into its lowest byte by two
+// rotate-and-add steps and the remaining bytes are masked off.
+inline v_uint32x16 v_popcount(const v_int32x16& a)
+{
+#if CV_AVX_512VPOPCNTDQ
+    return v_uint32x16(_mm512_popcnt_epi32(a.val));
+#else
+    v_uint8x64 byte_counts = v_popcount(v_reinterpret_as_s8(a));
+    byte_counts += v_rotate_right<1>(byte_counts);
+    byte_counts += v_rotate_right<2>(byte_counts);
+    const v_uint32x16 low_byte = v512_setall_u32(0x000000ff);
+    return v_reinterpret_as_u32(byte_counts) & low_byte;
+#endif
+}
+// Per-lane population count for 64-bit lanes. Native with VPOPCNTDQ; otherwise the
+// eight byte counts of each lane are summed with a SAD against zero.
+inline v_uint64x8 v_popcount(const v_int64x8& a)
+{
+#if CV_AVX_512VPOPCNTDQ
+    return v_uint64x8(_mm512_popcnt_epi64(a.val));
+#else
+    const v_uint8x64 byte_counts = v_popcount(v_reinterpret_as_s8(a));
+    return v_uint64x8(_mm512_sad_epu8(byte_counts.val, _mm512_setzero_si512()));
+#endif
+}
+
+
+// Unsigned 8-bit overload: forwards to the signed implementation on the same bits.
+inline v_uint8x64 v_popcount(const v_uint8x64& a)
+{
+    return v_popcount(v_reinterpret_as_s8(a));
+}
+// Unsigned 16-bit overload: forwards to the signed implementation on the same bits.
+inline v_uint16x32 v_popcount(const v_uint16x32& a)
+{
+    return v_popcount(v_reinterpret_as_s16(a));
+}
+// Unsigned 32-bit overload: forwards to the signed implementation on the same bits.
+inline v_uint32x16 v_popcount(const v_uint32x16& a)
+{
+    return v_popcount(v_reinterpret_as_s32(a));
+}
+// Unsigned 64-bit overload: forwards to the signed implementation on the same bits.
+inline v_uint64x8 v_popcount(const v_uint64x8& a)
+{
+    return v_popcount(v_reinterpret_as_s64(a));
+}
+
+
+////////// Other math /////////
+
+/** Some frequent operations **/
+// v_fma / v_muladd for floating-point vectors: a * b + c per lane. With CV_FMA3 this is
+// a fused multiply-add instruction; otherwise a separate multiply followed by an add.
+// v_muladd simply forwards to v_fma in both configurations.
+#if CV_FMA3
+#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix)                         \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
+    {                                                                         \
+        return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val));            \
+    }                                                                         \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+    {                                                                         \
+        return v_fma(a, b, c);                                                \
+    }
+#else
+#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix)                                 \
+    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)            \
+    {                                                                                 \
+        return _Tpvec(_mm512_add_##suffix(_mm512_mul_##suffix(a.val, b.val), c.val)); \
+    }                                                                                 \
+    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)         \
+    {                                                                                 \
+        return v_fma(a, b, c);                                                        \
+    }
+#endif
+
+// Square root and 2-D magnitude helpers for floating-point vectors.
+// v_sqr_magnitude computes a*a + b*b through v_fma; v_magnitude is its square root.
+#define OPENCV_HAL_IMPL_AVX512_MISC(_Tpvec, suffix)                           \
+    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
+    {                                                                         \
+        return _Tpvec(_mm512_sqrt_##suffix(x.val));                           \
+    }                                                                         \
+    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
+    {                                                                         \
+        return v_fma(a, a, b * b);                                            \
+    }                                                                         \
+    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
+    {                                                                         \
+        return v_sqrt(v_sqr_magnitude(a, b));                                 \
+    }
+
+// Instantiate v_fma/v_muladd first: the MISC helpers (v_sqr_magnitude, v_magnitude)
+// call v_fma for the same vector type.
+OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8,  pd)
+OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps)
+OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8,  pd)
+
+// Integer multiply-add for 32-bit lanes, built from the vector operators: a * b + c.
+inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
+{
+    const v_int32x16 product = a * b;
+    return product + c;
+}
+// Alias of v_fma for 32-bit integer lanes.
+inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c)
+{
+    return v_fma(a, b, c);
+}
+
+// Reciprocal square root of 16 float32 lanes. With AVX-512ER the rsqrt28 instruction is
+// used directly; otherwise the rsqrt14 estimate t is refined once as
+// t * (1.5 - (t * t) * (x / 2)).
+inline v_float32x16 v_invsqrt(const v_float32x16& x)
+{
+#if CV_AVX_512ER
+    return v_float32x16(_mm512_rsqrt28_ps(x.val));
+#else
+    const v_float32x16 half_x = x * v512_setall_f32(0.5);
+    v_float32x16 estimate = v_float32x16(_mm512_rsqrt14_ps(x.val));
+    estimate *= v512_setall_f32(1.5) - ((estimate * estimate) * half_x);
+    return estimate;
+#endif
+}
+
+// Reciprocal square root of 8 float64 lanes. With AVX-512ER the rsqrt28 instruction is
+// used directly; otherwise it is computed exactly as 1 / sqrt(x).
+inline v_float64x8 v_invsqrt(const v_float64x8& x)
+{
+#if CV_AVX_512ER
+    return v_float64x8(_mm512_rsqrt28_pd(x.val));
+#else
+    // A variant based on _mm512_rsqrt14_pd with two refinement steps of the form
+    // t *= 1.5 - (t * t) * (x / 2) was sketched here earlier and left disabled;
+    // the division below is the implementation in use.
+    const v_float64x8 root = v_sqrt(x);
+    return v512_setall_f64(1.) / root;
+#endif
+}
+
+/** Absolute values **/
+// Integer absolute value: applies _mm512_abs_<suffix> to the signed input and returns
+// the result wrapped in the matching unsigned vector type.
+#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \
+    inline _Tpuvec v_abs(const _Tpvec& x)                   \
+    {                                                       \
+        const __m512i magnitude = _mm512_abs_##suffix(x.val); \
+        return _Tpuvec(magnitude);                          \
+    }
+
+OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64,    v_uint8x64,    epi8)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32,   v_uint16x32,  epi16)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16,   v_uint32x16,  epi32)
+OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8,    v_uint64x8,   epi64)
+
+// Absolute value of 16 float32 lanes.
+// Uses _mm512_abs_ps when the toolchain exposes it as a macro; otherwise clears the
+// sign bit of every lane with a 0x7FFFFFFF mask. The guard tests the very intrinsic that
+// the guarded branch calls (it previously tested _mm512_abs_pd, the float64 one).
+inline v_float32x16 v_abs(const v_float32x16& x)
+{
+#ifdef _mm512_abs_ps
+    return v_float32x16(_mm512_abs_ps(x.val));
+#else
+    return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val),
+                        _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF,
+                                        0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF))));
+#endif
+}
+
+// Absolute value of 8 float64 lanes.
+// Uses _mm512_abs_pd when the toolchain exposes it as a macro; otherwise clears the
+// sign bit of every lane with a 0x7FFFFFFFFFFFFFFF mask.
+inline v_float64x8 v_abs(const v_float64x8& x)
+{
+#ifdef _mm512_abs_pd
+    #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2))
+        // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476
+        // On the affected GCC versions the argument is passed through a pd -> ps cast.
+        return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val)));
+    #else
+        return v_float64x8(_mm512_abs_pd(x.val));
+    #endif
+#else
+    // Fallback: bitwise AND on the integer view of the lanes.
+    return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val),
+                       _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF,
+                                       0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF))));
+#endif
+}
+
+/** Absolute difference **/
+// |a - b| for uint8 lanes, formed as the wrap-add of (a - b) and (b - a).
+// NOTE(review): relies on operator- for v_uint8x64 clamping at zero - confirm.
+inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b)
+{
+    const v_uint8x64 a_minus_b = a - b;
+    const v_uint8x64 b_minus_a = b - a;
+    return v_add_wrap(a_minus_b, b_minus_a);
+}
+// |a - b| for uint16 lanes, formed as the wrap-add of (a - b) and (b - a).
+// NOTE(review): relies on operator- for v_uint16x32 clamping at zero - confirm.
+inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b)
+{
+    const v_uint16x32 a_minus_b = a - b;
+    const v_uint16x32 b_minus_a = b - a;
+    return v_add_wrap(a_minus_b, b_minus_a);
+}
+// |a - b| for uint32 lanes: max(a, b) - min(a, b).
+inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const v_uint32x16 larger = v_max(a, b);
+    const v_uint32x16 smaller = v_min(a, b);
+    return larger - smaller;
+}
+
+// |a - b| for int8 lanes, returned as unsigned. The wrapping difference is conditionally
+// negated where a < b using the identity (d ^ m) - m with the comparison mask m.
+inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b)
+{
+    const v_int8x64 diff = v_sub_wrap(a, b);
+    const v_int8x64 lt_mask = a < b;
+    const v_int8x64 flipped = diff ^ lt_mask;
+    return v_reinterpret_as_u8(v_sub_wrap(flipped, lt_mask));
+}
+
+// |a - b| for int16 lanes, returned as unsigned: wrapping max(a, b) - min(a, b).
+inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 larger = v_max(a, b);
+    const v_int16x32 smaller = v_min(a, b);
+    return v_reinterpret_as_u16(v_sub_wrap(larger, smaller));
+}
+
+// |a - b| for int32 lanes, returned as unsigned. The difference is conditionally
+// negated where a < b using the identity (d ^ m) - m with the comparison mask m.
+inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 diff = a - b;
+    const v_int32x16 lt_mask = a < b;
+    const v_int32x16 flipped = diff ^ lt_mask;
+    return v_reinterpret_as_u32(flipped - lt_mask);
+}
+
+// |a - b| for float32 lanes.
+inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b)
+{
+    const v_float32x16 diff = a - b;
+    return v_abs(diff);
+}
+
+// |a - b| for float64 lanes.
+inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b)
+{
+    const v_float64x8 diff = a - b;
+    return v_abs(diff);
+}
+
+/** Saturating absolute difference **/
+// Absolute difference of int8 lanes kept in the signed type. Built with the vector
+// operators: the difference is conditionally negated where a < b via (d ^ m) - m.
+inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b)
+{
+    const v_int8x64 diff = a - b;
+    const v_int8x64 lt_mask = a < b;
+    const v_int8x64 flipped = diff ^ lt_mask;
+    return flipped - lt_mask;
+}
+// Absolute difference of int16 lanes kept in the signed type: max(a, b) - min(a, b).
+inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 larger = v_max(a, b);
+    const v_int16x32 smaller = v_min(a, b);
+    return larger - smaller;
+}
+
+////////// Conversions /////////
+
+/** Rounding **/
+// Converts 16 float32 lanes to int32 with _mm512_cvtps_epi32.
+inline v_int32x16 v_round(const v_float32x16& a)
+{
+    const __m512i rounded = _mm512_cvtps_epi32(a.val);
+    return v_int32x16(rounded);
+}
+
+// Converts 8 float64 lanes to int32; the results occupy the low 8 lanes. The upper
+// 8 lanes come from a 256 -> 512 cast and carry no defined value.
+inline v_int32x16 v_round(const v_float64x8& a)
+{
+    const __m256i rounded = _mm512_cvtpd_epi32(a.val);
+    return v_int32x16(_mm512_castsi256_si512(rounded));
+}
+
+// Converts two float64 vectors to int32: a fills the low 8 lanes, b the high 8 lanes.
+inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b)
+{
+    const __m256i from_a = _mm512_cvtpd_epi32(a.val);
+    const __m256i from_b = _mm512_cvtpd_epi32(b.val);
+    return v_int32x16(_v512_combine(from_a, from_b));
+}
+
+// Converts 16 float32 lanes to int32 with truncation (_mm512_cvttps_epi32).
+inline v_int32x16 v_trunc(const v_float32x16& a)
+{
+    const __m512i truncated = _mm512_cvttps_epi32(a.val);
+    return v_int32x16(truncated);
+}
+
+// Converts 8 float64 lanes to int32 with truncation; the results occupy the low 8 lanes.
+// The upper 8 lanes come from a 256 -> 512 cast and carry no defined value.
+inline v_int32x16 v_trunc(const v_float64x8& a)
+{
+    const __m256i truncated = _mm512_cvttpd_epi32(a.val);
+    return v_int32x16(_mm512_castsi256_si512(truncated));
+}
+
+// Floor / ceil to int32. When CVT_ROUND_MODES_IMPLEMENTED is set, the conversion itself
+// is given an explicit rounding direction. Otherwise the input is first rounded to an
+// integral floating-point value with roundscale (immediate 1 for floor, 2 for ceil) and
+// then converted. The float64 overloads fill only the low 8 lanes of the result.
+#if CVT_ROUND_MODES_IMPLEMENTED
+inline v_int32x16 v_floor(const v_float32x16& a)
+{
+    const __m512i lanes = _mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    return v_int32x16(lanes);
+}
+
+inline v_int32x16 v_floor(const v_float64x8& a)
+{
+    const __m256i lanes = _mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    return v_int32x16(_mm512_castsi256_si512(lanes));
+}
+
+inline v_int32x16 v_ceil(const v_float32x16& a)
+{
+    const __m512i lanes = _mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+    return v_int32x16(lanes);
+}
+
+inline v_int32x16 v_ceil(const v_float64x8& a)
+{
+    const __m256i lanes = _mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+    return v_int32x16(_mm512_castsi256_si512(lanes));
+}
+#else
+inline v_int32x16 v_floor(const v_float32x16& a)
+{
+    const __m512 floored = _mm512_roundscale_ps(a.val, 1);
+    return v_int32x16(_mm512_cvtps_epi32(floored));
+}
+
+inline v_int32x16 v_floor(const v_float64x8& a)
+{
+    const __m512d floored = _mm512_roundscale_pd(a.val, 1);
+    return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(floored)));
+}
+
+inline v_int32x16 v_ceil(const v_float32x16& a)
+{
+    const __m512 raised = _mm512_roundscale_ps(a.val, 2);
+    return v_int32x16(_mm512_cvtps_epi32(raised));
+}
+
+inline v_int32x16 v_ceil(const v_float64x8& a)
+{
+    const __m512d raised = _mm512_roundscale_pd(a.val, 2);
+    return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(raised)));
+}
+#endif
+
+/** To float **/
+inline v_float32x16 v_cvt_f32(const v_int32x16& a) // 16 x int32 -> 16 x float
+{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); }
+
+inline v_float32x16 v_cvt_f32(const v_float64x8& a) // 8 doubles -> floats in the low 8 lanes (Intel documents cvtpd_pslo as zeroing the upper half)
+{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); }
+
+inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b) // converts both inputs and joins the two 256-bit results with _v512_combine
+{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); }
+
+inline v_float64x8 v_cvt_f64(const v_int32x16& a) // widens the half selected by _v512_extract_low: 8 x int32 -> 8 x double
+{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); }
+
+inline v_float64x8 v_cvt_f64_high(const v_int32x16& a) // same conversion for the half selected by _v512_extract_high
+{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); }
+
+inline v_float64x8 v_cvt_f64(const v_float32x16& a) // widens the half selected by _v512_extract_low: 8 x float -> 8 x double
+{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); }
+
+inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) // same conversion for the half selected by _v512_extract_high
+{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); }
+
+// from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x8 v_cvt_f64(const v_int64x8& v)
+{   // signed 64-bit integer lanes -> double lanes
+#if CV_AVX_512DQ
+    return v_float64x8(_mm512_cvtepi64_pd(v.val)); // AVX-512DQ offers the conversion as a single intrinsic
+#else
+    // constants encoded as floating-point bit patterns
+    __m512i magic_i_lo   = _mm512_set1_epi64(0x4330000000000000); // 2^52
+    __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63
+    __m512i magic_i_all  = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m512d magic_d_all  = _mm512_castsi512_pd(magic_i_all);
+
+    // Blend the 32 least significant bits of v with magic_i_lo (mask 0x5555: even dwords come from v)
+    __m512i v_lo         = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val);
+    // Extract the 32 most significant bits of v
+    __m512i v_hi         = _mm512_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm512_xor_si512(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m512d v_hi_dbl     = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo. Keep this order: floating-point addition is not associative
+    __m512d result       = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo));
+    return v_float64x8(result);
+#endif
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x64 v512_lut(const schar* tab, const int* idx)
+{   // 64 indices -> 64 bytes: each gather reads a 32-bit word at byte address tab + idx[i] (scale 1); cvtepi32_epi8 keeps its low byte
+    __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1)); // NOTE(review): reads 3 bytes past the selected element -- confirm tables are padded
+    __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
+    __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1));
+    __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1));
+    return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3)); // p0..p3 become 128-bit groups 0..3
+}
+inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx)
+{   // 32 indices: each selects the 2 consecutive bytes at tab + idx[i] (low 16 bits of every gathered 32-bit word)
+    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 1));
+    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1));
+    return v_int8x64(_v512_combine(p0, p1));
+}
+inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx)
+{   // 16 indices: each selects the 4 consecutive bytes at tab + idx[i]
+    return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1));
+}
+inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); } // unsigned overload: forwards to the schar version
+
+inline v_int16x32 v512_lut(const short* tab, const int* idx)
+{   // 32 indices -> 32 shorts: 32-bit gathers at tab + idx[i] (scale 2 bytes); cvtepi32_epi16 keeps the low 16 bits
+    __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx    ), (const int *)tab, 2)); // NOTE(review): each gather also reads the following short -- confirm tables are padded
+    __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2));
+    return v_int16x32(_v512_combine(p0, p1));
+}
+inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx)
+{   // 16 indices: each selects 2 consecutive shorts (one 32-bit gather per index)
+    return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2));
+}
+inline v_int16x32 v512_lut_quads(const short* tab, const int* idx)
+{   // 8 indices: each selects 4 consecutive shorts (one 64-bit gather per index)
+#if defined(__GNUC__) // the two branches differ only in the pointer type used for the base-address cast
+    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2));
+#else
+    return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2));
+#endif
+}
+inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); } // unsigned overload: forwards to the short version
+
+inline v_int32x16 v512_lut(const int* tab, const int* idx)
+{   // 16 indices -> tab[idx[i]] (32-bit gather, scale 4 bytes)
+    return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
+}
+inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx)
+{   // 8 indices: each selects 2 consecutive ints (one 64-bit gather per index)
+#if defined(__GNUC__) // the two branches differ only in the pointer type used for the base-address cast
+    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4));
+#else
+    return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4));
+#endif
+}
+inline v_int32x16 v512_lut_quads(const int* tab, const int* idx)
+{   // 4 indices: idx[k] selects 4 consecutive ints, loaded unaligned and placed in 128-bit group k
+    return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                          _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
+                          _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
+}
+inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); } // unsigned overload: forwards to the int version
+
+inline v_int64x8 v512_lut(const int64* tab, const int* idx)
+{   // 8 indices -> tab[idx[i]] (64-bit gather, scale 8 bytes)
+#if defined(__GNUC__) // the two branches differ only in the pointer type passed as the base address
+    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8));
+#else
+    return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8));
+#endif
+}
+inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx)
+{   // 4 indices: idx[k] selects 2 consecutive int64 values, loaded unaligned and placed in 128-bit group k
+    return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                         _mm_loadu_si128((const __m128i*)(tab + idx[0]))),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2),
+                         _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3));
+}
+inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); } // unsigned overload: forwards to the int64 version
+inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); } // unsigned overload: forwards to the int64 version
+
+inline v_float32x16 v512_lut(const float* tab, const int* idx)
+{   // 16 indices -> tab[idx[i]] (float gather, scale 4 bytes)
+    return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4));
+}
+inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); } // reuses the int overload; bits are only reinterpreted
+inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); } // reuses the int overload; bits are only reinterpreted
+
+inline v_float64x8 v512_lut(const double* tab, const int* idx)
+{   // 8 indices -> tab[idx[i]] (double gather, scale 8 bytes)
+    return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8));
+}
+inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx)
+{   // 4 indices: idx[k] selects 2 consecutive doubles, loaded unaligned and placed in 128-bit group k
+        return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512(
+                               _mm_loadu_pd(tab + idx[0])),
+                               _mm_loadu_pd(tab + idx[1]), 1),
+                               _mm_loadu_pd(tab + idx[2]), 2),
+                               _mm_loadu_pd(tab + idx[3]), 3));
+}
+
+inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec)
+{   // like v512_lut, but the 16 indices are taken from a vector register instead of memory
+    return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4));
+}
+
+inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec)
+{   // unsigned overload: forwards to the int version
+    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
+}
+
+inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec)
+{   // 16 floats gathered at tab[idxvec[i]]
+    return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4));
+}
+
+inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec)
+{   // only the 8 indices returned by _v512_extract_low(idxvec.val) are used
+    return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y)
+{   // x[i] = tab[idx[i]], y[i] = tab[idx[i] + 1] for the 16 indices in idxvec
+    x.val = _mm512_i32gather_ps(idxvec.val, tab, 4);
+    y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4); // same indices, base shifted by one element
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y)
+{   // double variant: only the 8 indices returned by _v512_extract_low(idxvec.val) are used
+    x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8);
+    y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8); // same indices, base shifted by one element
+}
+
+inline v_int8x64 v_interleave_pairs(const v_int8x64& vec)
+{   // byte shuffle: every group of 4 bytes 0,1,2,3 is reordered to 0,2,1,3
+    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200)));
+}
+inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+inline v_int8x64 v_interleave_quads(const v_int8x64& vec)
+{   // byte shuffle: every group of 8 bytes is reordered to 0,4,1,5,2,6,3,7
+    return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400)));
+}
+inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int16x32 v_interleave_pairs(const v_int16x32& vec)
+{   // byte shuffle moving whole 16-bit lanes: every group of 4 lanes 0,1,2,3 is reordered to 0,2,1,3
+    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100)));
+}
+inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+inline v_int16x32 v_interleave_quads(const v_int16x32& vec)
+{   // byte shuffle moving whole 16-bit lanes: every group of 8 lanes is reordered to 0,4,1,5,2,6,3,7
+    return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100)));
+}
+inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int32x16 v_interleave_pairs(const v_int32x16& vec)
+{   // per 128-bit group: 32-bit lanes 0,1,2,3 -> 0,2,1,3, the same order the 8/16-bit overloads produce
+    return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_DBCA)); // 0xD8 == _MM_SHUFFLE(3, 1, 2, 0); _MM_PERM_ACBD (0x27) would yield 3,1,2,0
+}
+inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned overload: forwards to the signed version
+inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float overload: forwards to the signed integer version
+
+inline v_int8x64 v_pack_triplets(const v_int8x64& vec)
+{   // drops every 4th byte: the byte shuffle compacts each 128-bit group to 12 bytes, then the dword permute packs the groups (dwords 0,1,2,4,5,6,8,9,10,12,13,14) to the front
+    return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
+                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000),
+                                              _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100))));
+}
+inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int16x32 v_pack_triplets(const v_int16x32& vec)
+{   // keeps 16-bit lanes 0,1,2, 4,5,6, 8,9,10, ... at the front; the trailing 8 lanes all take lane 31
+    return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015,
+                                                               0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val));
+}
+inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned overload: forwards to the signed version
+
+inline v_int32x16 v_pack_triplets(const v_int32x16& vec)
+{   // keeps 32-bit lanes 0,1,2, 4,5,6, 8,9,10, 12,13,14 at the front; the trailing 4 lanes all take lane 15
+    return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
+                                                               0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
+}
+inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } // unsigned overload: forwards to the signed version
+inline v_float32x16 v_pack_triplets(const v_float32x16& vec)
+{   // same lane selection as the 32-bit integer overload, using the float permute
+    return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a,
+                                                              0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val));
+}
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+
+// 16 >> 32: madd multiplies 16-bit lanes and adds each adjacent pair of products into one 32-bit lane
+inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b)
+{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); }
+inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) // accumulating form: dot product plus c
+{ return v_dotprod(a, b) + c; }
+
+// 32 >> 64: every 64-bit lane receives a[2i]*b[2i] + a[2i+1]*b[2i+1]
+inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b)
+{
+    const __m512i a_hi = _mm512_srli_epi64(a.val, 32), b_hi = _mm512_srli_epi64(b.val, 32);         // odd dwords moved into the low half
+    const __m512i prod_lo = _mm512_mul_epi32(a.val, b.val), prod_hi = _mm512_mul_epi32(a_hi, b_hi); // signed 32x32 -> 64 multiplies
+    return v_int64x8(_mm512_add_epi64(prod_lo, prod_hi));
+}
+inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
+{ return v_dotprod(a, b) + c; } // accumulating form: dot product plus c
+
+// 8 >> 32 (unsigned): each 32-bit lane is the sum of the four byte products it covers
+inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b)
+{
+    // Split each operand into zero-extended 16-bit lanes holding its even- and odd-indexed bytes.
+    const __m512i zero = _mm512_setzero_si512();
+    const __m512i a_even = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, zero), a_odd = _mm512_srli_epi16(a.val, 8);
+    const __m512i b_even = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, zero), b_odd = _mm512_srli_epi16(b.val, 8);
+
+    // madd multiplies 16-bit lanes and adds adjacent products into 32-bit lanes.
+    const __m512i sum_even = _mm512_madd_epi16(a_even, b_even);
+    const __m512i sum_odd  = _mm512_madd_epi16(a_odd, b_odd);
+    return v_uint32x16(_mm512_add_epi32(sum_even, sum_odd));
+}
+inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form: dot product plus c
+
+inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b)
+{
+    // Odd bytes are sign-extended by an arithmetic shift; even bytes are first moved into the
+    // high byte of each 16-bit lane by a one-byte left shift inside every 128-bit group.
+    const __m512i a_odd = _mm512_srai_epi16(a.val, 8), a_even = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8);
+    const __m512i b_odd = _mm512_srai_epi16(b.val, 8), b_even = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8);
+
+    // Each 32-bit lane accumulates the four signed byte products it covers.
+    const __m512i sum_even = _mm512_madd_epi16(a_even, b_even);
+    const __m512i sum_odd  = _mm512_madd_epi16(a_odd, b_odd);
+    return v_int32x16(_mm512_add_epi32(sum_even, sum_odd));
+}
+inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form: dot product plus c
+
+// 16 >> 64 (unsigned): each 64-bit lane is the sum of four consecutive 16x16 products
+inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b)
+{
+    __m512i mullo = _mm512_mullo_epi16(a.val, b.val);      // low 16 bits of every product
+    __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val);      // high 16 bits of every (unsigned) product
+    __m512i mul0  = _mm512_unpacklo_epi16(mullo, mulhi);   // full 32-bit products 0..3 of each 128-bit group
+    __m512i mul1  = _mm512_unpackhi_epi16(mullo, mulhi);   // full 32-bit products 4..7 of each 128-bit group
+
+    __m512i p02   = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); // even dwords, zero-extended to 64 bits
+    __m512i p13   = _mm512_srli_epi64(mul0, 32);                                   // odd dwords, zero-extended to 64 bits
+    __m512i p46   = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512());
+    __m512i p57   = _mm512_srli_epi64(mul1, 32);
+
+    __m512i p15_  = _mm512_add_epi64(p02, p13);
+    __m512i p9d_  = _mm512_add_epi64(p46, p57);
+
+    return v_uint64x8(_mm512_add_epi64(          // regroup so each result lane sums four consecutive products
+        _mm512_unpacklo_epi64(p15_, p9d_),
+        _mm512_unpackhi_epi64(p15_, p9d_)
+    ));
+}
+inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) // accumulating form: dot product plus c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b)
+{   // madd yields 32-bit pair sums; neighbouring pair sums are then sign-extended and added into 64-bit lanes
+    const __m512i pair_sums = _mm512_madd_epi16(a.val, b.val);
+    const __m512i hi_dwords = _mm512_srai_epi64(pair_sums, 32);                         // odd dwords, sign-extended
+    const __m512i lo_dwords = _mm512_srai_epi64(_mm512_bslli_epi128(pair_sums, 4), 32); // even dwords, shifted up by 4 bytes first
+    return v_int64x8(_mm512_add_epi64(lo_dwords, hi_dwords));
+}
+inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
+{ return v_dotprod_expand(a, b) + c; } // accumulating form: dot product plus c
+
+// 32 >> 64f: the 64-bit integer dot product (v_dotprod) converted to double lanes
+inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) // accumulating form: the addition of c happens in double precision
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32 (the "fast" variants below simply forward to the regular implementations)
+inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b)
+{ return v_dotprod(a, b); }
+inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64 (forwards to v_dotprod)
+inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b)
+{ return v_dotprod(a, b); }
+inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32 (forwards to v_dotprod_expand)
+inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c)
+{ return v_dotprod_expand(a, b, c); }
+
+inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64 (the grouping of products per result lane differs from v_dotprod_expand)
+inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // Full 32-bit products assembled from the low and high 16-bit halves of each multiply.
+    const __m512i lo16 = _mm512_mullo_epi16(a.val, b.val), hi16 = _mm512_mulhi_epu16(a.val, b.val);
+    const __m512i prod_lo = _mm512_unpacklo_epi16(lo16, hi16);
+    const __m512i prod_hi = _mm512_unpackhi_epi16(lo16, hi16);
+
+    // Zero-extend even dwords (blend with zero) and odd dwords (logical shift) to 64 bits, then add them.
+    const __m512i zero = _mm512_setzero_si512();
+    const __m512i lo_sum = _mm512_add_epi64(_mm512_mask_blend_epi32(0xAAAA, prod_lo, zero), _mm512_srli_epi64(prod_lo, 32));
+    const __m512i hi_sum = _mm512_add_epi64(_mm512_mask_blend_epi32(0xAAAA, prod_hi, zero), _mm512_srli_epi64(prod_hi, 32));
+
+    // The two partial sums are added lane by lane as they are; the unpacklo/unpackhi_epi64
+    // regrouping that v_dotprod_expand performs is skipped here.
+    return v_uint64x8(_mm512_add_epi64(lo_sum, hi_sum));
+}
+inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // accumulating form: fast dot product plus c
+
+inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) // signed 16 >> 64: forwards to v_dotprod_expand
+{ return v_dotprod_expand(a, b); }
+inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f (forwards to v_dotprod_expand)
+inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) // accumulating form: dot product plus c
+{ return v_dotprod_expand(a, b) + c; }
+
+
+#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \
+    v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) // broadcasts element `im` (0..3) of every 128-bit group across that group
+
+inline v_float32x16 v_matmul(const v_float32x16& v,
+                             const v_float32x16& m0, const v_float32x16& m1,
+                             const v_float32x16& m2, const v_float32x16& m3)
+{   // for every 128-bit group: v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, with v[k] broadcast inside the group
+    const v_float32x16 x = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
+    const v_float32x16 y = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
+    const v_float32x16 z = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
+    const v_float32x16 w = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3);
+    return v_fma(x, m0, v_fma(y, m1, v_fma(z, m2, w * m3)));
+}
+
+inline v_float32x16 v_matmuladd(const v_float32x16& v,
+                                const v_float32x16& m0, const v_float32x16& m1,
+                                const v_float32x16& m2, const v_float32x16& a)
+{   // same scheme as v_matmul, except that the last term is the vector a itself (no multiply)
+    const v_float32x16 x = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0);
+    const v_float32x16 y = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1);
+    const v_float32x16 z = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2);
+    return v_fma(x, m0, v_fma(y, m1, v_fma(z, m2, a)));
+}
+/* v_transpose4x4: in every 128-bit group, transposes the 4x4 matrix of 32-bit elements whose rows are a0..a3 (two unpack stages: 32-bit, then 64-bit) */
+#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
+    inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
+                               const _Tpvec& a2, const _Tpvec& a3,              \
+                               _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
+    {                                                                           \
+        __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val));       \
+        __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val));       \
+        __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val));       \
+        __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val));       \
+        b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1));                        \
+        b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1));                        \
+        b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3));                        \
+        b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3));                        \
+    }
+
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) // float rows are cast to integer registers for the 64-bit unpack stage and back
+
+//////////////// Value reordering ///////////////
+
+/* Expand: widen every lane to twice its width with `intrin`; v_expand / v_expand_low / v_expand_high work on the 256-bit halves of a register, v512_load_expand on 256 bits loaded (unaligned) from ptr */
+#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+    {                                                               \
+        b0.val = intrin(_v512_extract_low(a.val));                  \
+        b1.val = intrin(_v512_extract_high(a.val));                 \
+    }                                                               \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
+    { return _Tpwvec(intrin(_v512_extract_low(a.val))); }           \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
+    { return _Tpwvec(intrin(_v512_extract_high(a.val))); }          \
+    inline _Tpwvec v512_load_expand(const _Tp* ptr)                 \
+    {                                                               \
+        __m256i a = _mm256_loadu_si256((const __m256i*)ptr);        \
+        return _Tpwvec(intrin(a));                                  \
+    }
+
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64,  v_uint16x32, uchar,    _mm512_cvtepu8_epi16)  // unsigned types use the zero-extending conversions (cvtepu*)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64,   v_int16x32,  schar,    _mm512_cvtepi8_epi16)  // signed types use the sign-extending conversions (cvtepi*)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort,   _mm512_cvtepu16_epi32)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32,  v_int32x16,  short,    _mm512_cvtepi16_epi32)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8,  unsigned, _mm512_cvtepu32_epi64)
+OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16,  v_int64x8,   int,      _mm512_cvtepi32_epi64)
+/* Quad expand: v512_load_expand_q loads 16 bytes (unaligned) from ptr and widens each 8-bit element to a 32-bit lane */
+#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \
+    inline _Tpvec v512_load_expand_q(const _Tp* ptr)         \
+    {                                                        \
+        __m128i a = _mm_loadu_si128((const __m128i*)ptr);    \
+        return _Tpvec(intrin(a));                            \
+    }
+
+OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) // zero-extending
+OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16,  schar, _mm512_cvtepi8_epi32) // sign-extending
+
+/* pack: narrow two vectors into one vector of half-width lanes (a's lanes first, then b's) */
+// 16 -> 8
+inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) // signed saturation; packs works per 128-bit group, so the qword permute restores "all of a, then all of b"
+{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b)
+{   // unsigned saturation done by hand: clamp to 255, then use the truncating 16 -> 8 conversion
+    const __m512i t = _mm512_set1_epi16(255);
+    return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t))));
+}
+
+inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b)
+{   // signed input -> unsigned output with saturation (packus), same qword fix-up as the signed v_pack
+    return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val)));
+}
+
+inline void v_pack_store(schar* ptr, const v_int16x32& a) // packs a with itself and stores the low half of the result through v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(uchar* ptr, const v_uint16x32& a)
+{   // clamp to 255, narrow, and write 32 bytes with an unaligned store
+    const __m512i m = _mm512_set1_epi16(255);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x32& a) // packs a with itself and stores the low half of the result through v_store_low
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // Requires n > 0: after the shift the top bit is clear, so the lanes can be handed to the signed-input pack.
+    const v_uint16x32 half = v512_setall_u16((short)(1 << (n-1)));  // rounding bias 2^(n-1)
+    const v_int16x32 lo = v_reinterpret_as_s16((a + half) >> n);
+    return v_pack_u(lo, v_reinterpret_as_s16((b + half) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    const v_uint16x32 half = v512_setall_u16((short)(1 << (n-1)));  // rounding bias 2^(n-1)
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + half) >> n));
+}
+
+template<int n> inline
+v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1)));   // rounding bias 2^(n-1)
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1)));   // rounding bias 2^(n-1)
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1)));   // rounding bias 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
+{
+    const v_int16x32 half = v512_setall_s16((short)(1 << (n-1)));   // rounding bias 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 32 -> 16 (same scheme as the 16 -> 8 packs)
+inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b) // signed saturation; the qword permute restores "all of a, then all of b"
+{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+
+inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
+{   // unsigned saturation done by hand: clamp to 65535, then use the truncating 32 -> 16 conversion
+    const __m512i m = _mm512_set1_epi32(65535);
+    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
+}
+
+inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b) // signed input -> unsigned output with saturation (packus)
+{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x16& a) // packs a with itself and stores the low half of the result through v_store_low
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
+{   // clamp to 65535, narrow, and write 16 values with an unaligned store
+    const __m512i m = _mm512_set1_epi32(65535);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x16& a) // packs a with itself and stores the low half of the result through v_store_low
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const v_uint32x16 half = v512_setall_u32(1 << (n-1));   // rounding bias 2^(n-1)
+    const v_int32x16 lo = v_reinterpret_as_s32((a + half) >> n);
+    return v_pack_u(lo, v_reinterpret_as_s32((b + half) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    const v_uint32x16 half = v512_setall_u32(1 << (n-1));   // rounding bias 2^(n-1)
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + half) >> n));
+}
+
+template<int n> inline
+v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1));    // rounding bias 2^(n-1)
+    return v_pack_u((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1));    // rounding bias 2^(n-1)
+    v_pack_u_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1));    // rounding bias 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x16& a)
+{
+    const v_int32x16 half = v512_setall_s32(1 << (n-1));    // rounding bias 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// 64 -> 32
+// Non-saturating pack: cvtepi64_epi32 truncates, keeping the low 32 bits of every lane
+inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
+{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
+
+inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b) // signed overload: forwards to the unsigned (truncating) version
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x8& a) // truncate to 32 bits and write 8 values with an unaligned store
+{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
+
+inline void v_pack_store(int* ptr, const v_int64x8& b) // signed overload: forwards to the unsigned version
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
+{
+    const v_uint64x8 half = v512_setall_u64((uint64)1 << (n-1));    // rounding bias 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
+{
+    const v_uint64x8 half = v512_setall_u64((uint64)1 << (n-1));    // rounding bias 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+template<int n> inline
+v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
+{
+    const v_int64x8 half = v512_setall_s64((int64)1 << (n-1));      // rounding bias 2^(n-1)
+    return v_pack((a + half) >> n, (b + half) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x8& a)
+{
+    const v_int64x8 half = v512_setall_s64((int64)1 << (n-1));      // rounding bias 2^(n-1)
+    v_pack_store(ptr, (a + half) >> n);
+}
+
+// pack boolean
+inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
+{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
+                           const v_uint32x16& c, const v_uint32x16& d)
+{
+    // Narrow 32 -> 16 -> 8 bits; the packs work within 128-bit lanes, so a dword permute reorders the combined result.
+    __m512i bytes = _mm512_packs_epi16(_mm512_packs_epi32(a.val, b.val), _mm512_packs_epi32(c.val, d.val));
+    __m512i order = _v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+    return v_uint8x64(_mm512_permutexvar_epi32(order, bytes));
+}
+
+inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
+                           const v_uint64x8& g, const v_uint64x8& h)
+{
+    // Two rounds of packs_epi32 followed by one packs_epi16 narrow the eight inputs into 64 bytes.
+    __m512i abcd = _mm512_packs_epi32(_mm512_packs_epi32(a.val, b.val),
+                                      _mm512_packs_epi32(c.val, d.val));
+    __m512i efgh = _mm512_packs_epi32(_mm512_packs_epi32(e.val, f.val),
+                                      _mm512_packs_epi32(g.val, h.val));
+    __m512i bytes = _mm512_packs_epi16(abcd, efgh);
+    // The packs work within 128-bit lanes; this word permute reorders their combined output.
+    __m512i order = _v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
+                                    27, 19, 11, 3, 26, 18, 10, 2, 25, 17,  9, 1, 24, 16,  8, 0);
+    return v_uint8x64(_mm512_permutexvar_epi16(order, bytes));
+}
+
+/* Recombine */
+// it's up there with the load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec)                \
+    template<int s>                                           \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); } // v_extract<s>(a, b) simply forwards to the two-vector v_rotate_right<s>
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64) // one v_extract overload is generated per vector type below
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
+
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); } // get0() of v rotated right by i; presumably yields element i - confirm against v_rotate_right
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar) // (vector type, scalar return type) pairs
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
+
+template<int i>
+inline v_uint32x16 v_broadcast_element(v_uint32x16 a) // replicates 32-bit element i of a into every element of the result
+{
+    static const __m512i laneIdx = _mm512_set1_epi32((char)i); // index i repeated in all 16 positions
+    return v_uint32x16(_mm512_permutexvar_epi32(laneIdx, a.val));
+}
+
+template<int i>
+inline v_int32x16 v_broadcast_element(const v_int32x16 &a) // signed overload: broadcast on the reinterpreted unsigned vector
+{ v_uint32x16 spread = v_broadcast_element<i>(v_reinterpret_as_u32(a)); return v_reinterpret_as_s32(spread); }
+
+template<int i>
+inline v_float32x16 v_broadcast_element(const v_float32x16 &a) // float overload: broadcast on the reinterpreted unsigned vector
+{ v_uint32x16 spread = v_broadcast_element<i>(v_reinterpret_as_u32(a)); return v_reinterpret_as_f32(spread); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b ) // a gathers the even-numbered, b the odd-numbered bytes of the 128 loaded
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i lo = _mm512_loadu_si512(src), hi = _mm512_loadu_si512(src + 1);
+#if CV_AVX_512VBMI
+    __m512i evIdx = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
+                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
+                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
+    __m512i odIdx = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
+                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
+                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);
+    a = v_uint8x64(_mm512_permutex2var_epi8(lo, evIdx, hi)); // single two-source byte permute per output
+    b = v_uint8x64(_mm512_permutex2var_epi8(lo, odIdx, hi));
+#else
+    __m512i split = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200); // per 128-bit lane: even bytes to the low qword, odd bytes to the high qword
+    __m512i loSplit = _mm512_shuffle_epi8(lo, split);
+    __m512i hiSplit = _mm512_shuffle_epi8(hi, split);
+    __m512i evenQ = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i oddQ = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint8x64(_mm512_permutex2var_epi64(loSplit, evenQ, hiSplit)); // collect the "even" qwords
+    b = v_uint8x64(_mm512_permutex2var_epi64(loSplit, oddQ, hiSplit)); // collect the "odd" qwords
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b ) // a gathers the even-numbered, b the odd-numbered of the 64 loaded 16-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i lo = _mm512_loadu_si512(src), hi = _mm512_loadu_si512(src + 1);
+    __m512i evIdx = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
+    __m512i odIdx = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
+    a = v_uint16x32(_mm512_permutex2var_epi16(lo, evIdx, hi));
+    b = v_uint16x32(_mm512_permutex2var_epi16(lo, odIdx, hi));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b ) // a gathers the even-numbered, b the odd-numbered of the 32 loaded 32-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i lo = _mm512_loadu_si512(src), hi = _mm512_loadu_si512(src + 1);
+    __m512i evIdx = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i odIdx = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint32x16(_mm512_permutex2var_epi32(lo, evIdx, hi));
+    b = v_uint32x16(_mm512_permutex2var_epi32(lo, odIdx, hi));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) // a gathers the even-numbered, b the odd-numbered of the 16 loaded 64-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i lo = _mm512_loadu_si512(src), hi = _mm512_loadu_si512(src + 1);
+    __m512i evIdx = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i odIdx = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint64x8(_mm512_permutex2var_epi64(lo, evIdx, hi));
+    b = v_uint64x8(_mm512_permutex2var_epi64(lo, odIdx, hi));
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c ) // 3-way byte deinterleave; local names suggest a/b/c are the B/G/R planes
+{
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); // three consecutive unaligned 64-byte loads (192 bytes in total)
+
+#if CV_AVX_512VBMI2
+    __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102,  99,  96,  93,  90,  87,  84, 81,
+                                    78,  75,  72,  69,  66,  63,  60,  57,  54,  51,  48,  45,  42,  39,  36, 33,
+                                    30,  27,  24,  21,  18,  15,  12,   9,   6,   3,   0,  62,  59,  56,  53, 50,
+                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,  2);
+    __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); // mask0 is reused for the (bgr0,bgr1) and (bgr1,bgr2) pairs
+    __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2);
+    __m512i r12b2 = _mm512_permutex2var_epi8(bgr1,
+                    _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83, 80,
+                                    77,  74,  71,  68,  65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97,
+                                    94,  91,  88,  85,  82,  79,  76,  73,  70,  67,  64,  61,  58,  55,  52, 49,
+                                    46,  43,  40,  37,  34,  31,  28,  25,  22,  19,  16,  13,  10,   7,   4,  1), bgr2);
+    a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); // VBMI2 byte compress/expand stitch the partial permutes together
+    b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0));
+    c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2));
+#elif CV_AVX_512VBMI
+    __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); // VBMI path: blend the inputs pairwise, then one two-source byte permute per output
+    __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1);
+    __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2);
+    a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101,  98,  95,  92,  89,  86,  83,  80,
+                                                                    77,  74,  71,  68,  65,  63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,
+                                                                    46,  45,  43,  42,  40,  39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,
+                                                                    23,  21,  20,  18,  17,  15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0), bgr2));
+    b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63,  61,  60,  58,  57,  55,  54,  52,  51,  49,  48,  46,  45,  43,  42,  40,
+                                                                    39,  37,  36,  34,  33,  31,  30,  28,  27,  25,  24,  23,  21,  20,  18,  17,
+                                                                    15,  14,  12,  11,   9,   8,   6,   5,   3,   2,   0, 126, 123, 120, 117, 114,
+                                                                   111, 108, 105, 102,  99,  96,  93,  90,  87,  84,  81,  78,  75,  72,  69,  66), bgr0));
+    c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63,  60,  57,  54,  51,  48,  45,  42,  39,  36,  33,  30,  27,  24,  21,  18,
+                                                                    15,  12,   9,   6,   3,   0, 125, 122, 119, 116, 113, 110, 107, 104, 101,  98,
+                                                                    95,  92,  89,  86,  83,  80,  77,  74,  71,  68,  65,  62,  59,  56,  53,  50,
+                                                                    47,  44,  41,  38,  35,  32,  29,  26,  23,  20,  17,  14,  11,   8,   5,   2), bgr1));
+#else
+    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
+                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0); // generic path: rearrange 16-bit pairs first, separate bytes afterwards
+    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1);
+    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
+    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
+
+    __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2);
+    __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+                                                                   14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0);
+    __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11);
+    a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); // 0xAAAA... mask: alternate bytes come from the two operands
+    c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1));
+    b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); // b also swaps the two bytes inside every 16-bit pair
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c ) // 3-way deinterleave of 16-bit values; local names suggest B/G/R planes
+{
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); // three consecutive unaligned 512-bit loads (96 ushort values)
+
+    __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48,
+                                    45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0); // every third 16-bit index of a two-vector pair
+    __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); // the same mask is applied to the three cyclic input pairs
+    __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2);
+    __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0);
+
+    a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); // top five dwords come from r12b2, the rest from b01g1
+    b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17,
+                                                                    14, 11,  8,  5,  2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0));
+    c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); // dword-granular shift across the r12b2:g20r0 concatenation
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c ) // 3-way deinterleave of 32-bit values (48 values read from ptr)
+{
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16));
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); // three consecutive unaligned 512-bit loads
+
+    __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); // every third dword index of a two-vector pair
+    __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); // the same mask is applied to the three cyclic input pairs
+    __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2);
+    __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0);
+
+    a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); // top five dwords come from g12b2, the rest from b01r1
+    b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); // dword-granular shift across the g12b2:r20g0 concatenation
+    c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c ) // 3-way deinterleave of 64-bit values (24 values read from ptr)
+{
+    __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8));
+    __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); // three consecutive unaligned 512-bit loads
+
+    __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); // every third qword index of a two-vector pair
+    __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); // the same mask is applied to the three cyclic input pairs
+    __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2);
+    __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0);
+
+    a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); // top two qwords come from r12b2, the rest from b01g1
+    c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); // note: c is written before b in this overload
+    b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0));
+}
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d ) // 4-way byte deinterleave of 256 loaded bytes
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i q0 = _mm512_loadu_si512(src),     q1 = _mm512_loadu_si512(src + 1);
+    __m512i q2 = _mm512_loadu_si512(src + 2), q3 = _mm512_loadu_si512(src + 3);
+
+    // Both paths apply an even/odd separation twice to turn 4-way interleaving into four planes.
+#if CV_AVX_512VBMI
+    __m512i evIdx = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                    94,  92,  90,  88,  86,  84,  82,  80,  78,  76,  74,  72,  70,  68, 66, 64,
+                                    62,  60,  58,  56,  54,  52,  50,  48,  46,  44,  42,  40,  38,  36, 34, 32,
+                                    30,  28,  26,  24,  22,  20,  18,  16,  14,  12,  10,   8,   6,   4,  2,  0);
+    __m512i odIdx = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                    95,  93,  91,  89,  87,  85,  83,  81,  79,  77,  75,  73,  71,  69, 67, 65,
+                                    63,  61,  59,  57,  55,  53,  51,  49,  47,  45,  43,  41,  39,  37, 35, 33,
+                                    31,  29,  27,  25,  23,  21,  19,  17,  15,  13,  11,   9,   7,   5,  3,  1);
+
+    __m512i ev01 = _mm512_permutex2var_epi8(q0, evIdx, q1);
+    __m512i od01 = _mm512_permutex2var_epi8(q0, odIdx, q1);
+    __m512i ev23 = _mm512_permutex2var_epi8(q2, evIdx, q3);
+    __m512i od23 = _mm512_permutex2var_epi8(q2, odIdx, q3);
+
+    a = v_uint8x64(_mm512_permutex2var_epi8(ev01, evIdx, ev23));
+    c = v_uint8x64(_mm512_permutex2var_epi8(ev01, odIdx, ev23));
+    b = v_uint8x64(_mm512_permutex2var_epi8(od01, evIdx, od23));
+    d = v_uint8x64(_mm512_permutex2var_epi8(od01, odIdx, od23));
+#else
+    __m512i gather = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); // per 128-bit lane: every 4th byte grouped into one dword
+    __m512i g0 = _mm512_shuffle_epi8(q0, gather);
+    __m512i g1 = _mm512_shuffle_epi8(q1, gather);
+    __m512i g2 = _mm512_shuffle_epi8(q2, gather);
+    __m512i g3 = _mm512_shuffle_epi8(q3, gather);
+
+    __m512i evIdx = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i odIdx = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+
+    __m512i ev01 = _mm512_permutex2var_epi32(g0, evIdx, g1);
+    __m512i od01 = _mm512_permutex2var_epi32(g0, odIdx, g1);
+    __m512i ev23 = _mm512_permutex2var_epi32(g2, evIdx, g3);
+    __m512i od23 = _mm512_permutex2var_epi32(g2, odIdx, g3);
+
+    a = v_uint8x64(_mm512_permutex2var_epi32(ev01, evIdx, ev23));
+    c = v_uint8x64(_mm512_permutex2var_epi32(ev01, odIdx, ev23));
+    b = v_uint8x64(_mm512_permutex2var_epi32(od01, evIdx, od23));
+    d = v_uint8x64(_mm512_permutex2var_epi32(od01, odIdx, od23));
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d ) // 4-way deinterleave of 128 loaded 16-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i q0 = _mm512_loadu_si512(src),     q1 = _mm512_loadu_si512(src + 1);
+    __m512i q2 = _mm512_loadu_si512(src + 2), q3 = _mm512_loadu_si512(src + 3);
+
+    __m512i evIdx = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10,  8,  6,  4,  2,  0);
+    __m512i odIdx = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11,  9,  7,  5,  3,  1);
+
+    // Round 1: even/odd split of each input pair.
+    __m512i ev01 = _mm512_permutex2var_epi16(q0, evIdx, q1);
+    __m512i od01 = _mm512_permutex2var_epi16(q0, odIdx, q1);
+    __m512i ev23 = _mm512_permutex2var_epi16(q2, evIdx, q3);
+    __m512i od23 = _mm512_permutex2var_epi16(q2, odIdx, q3);
+    // Round 2: splitting the round-1 results again yields the four outputs.
+    a = v_uint16x32(_mm512_permutex2var_epi16(ev01, evIdx, ev23));
+    c = v_uint16x32(_mm512_permutex2var_epi16(ev01, odIdx, ev23));
+    b = v_uint16x32(_mm512_permutex2var_epi16(od01, evIdx, od23));
+    d = v_uint16x32(_mm512_permutex2var_epi16(od01, odIdx, od23));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d ) // 4-way deinterleave of 64 loaded 32-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i q0 = _mm512_loadu_si512(src),     q1 = _mm512_loadu_si512(src + 1);
+    __m512i q2 = _mm512_loadu_si512(src + 2), q3 = _mm512_loadu_si512(src + 3);
+
+    __m512i evIdx = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i odIdx = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+
+    // Round 1: even/odd split of each input pair.
+    __m512i ev01 = _mm512_permutex2var_epi32(q0, evIdx, q1);
+    __m512i od01 = _mm512_permutex2var_epi32(q0, odIdx, q1);
+    __m512i ev23 = _mm512_permutex2var_epi32(q2, evIdx, q3);
+    __m512i od23 = _mm512_permutex2var_epi32(q2, odIdx, q3);
+    // Round 2: splitting the round-1 results again yields the four outputs.
+    a = v_uint32x16(_mm512_permutex2var_epi32(ev01, evIdx, ev23));
+    c = v_uint32x16(_mm512_permutex2var_epi32(ev01, odIdx, ev23));
+    b = v_uint32x16(_mm512_permutex2var_epi32(od01, evIdx, od23));
+    d = v_uint32x16(_mm512_permutex2var_epi32(od01, odIdx, od23));
+}
+
+inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d ) // 4-way deinterleave of 32 loaded 64-bit values
+{
+    const __m512i* src = (const __m512i*)ptr;
+    __m512i q0 = _mm512_loadu_si512(src),     q1 = _mm512_loadu_si512(src + 1);
+    __m512i q2 = _mm512_loadu_si512(src + 2), q3 = _mm512_loadu_si512(src + 3);
+
+    __m512i evIdx = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i odIdx = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+
+    // Round 1: even/odd split of each input pair.
+    __m512i ev01 = _mm512_permutex2var_epi64(q0, evIdx, q1);
+    __m512i od01 = _mm512_permutex2var_epi64(q0, odIdx, q1);
+    __m512i ev23 = _mm512_permutex2var_epi64(q2, evIdx, q3);
+    __m512i od23 = _mm512_permutex2var_epi64(q2, odIdx, q3);
+    // Round 2: splitting the round-1 results again yields the four outputs.
+    a = v_uint64x8(_mm512_permutex2var_epi64(ev01, evIdx, ev23));
+    c = v_uint64x8(_mm512_permutex2var_epi64(ev01, odIdx, ev23));
+    b = v_uint64x8(_mm512_permutex2var_epi64(od01, evIdx, od23));
+    d = v_uint64x8(_mm512_permutex2var_epi64(od01, odIdx, od23));
+}
+
+///////////////////////////// store interleave /////////////////////////////////////
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // v_zip(x, y) written as two back-to-back 64-byte vectors
+{
+    v_uint8x64 first, second;
+    v_zip(x, y, first, second);
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, first.val);
+        _mm512_store_si512((__m512i*)(ptr + 64), second.val);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, first.val); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 64), second.val);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, first.val); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 64), second.val);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // v_zip(x, y) written as two back-to-back 512-bit vectors
+{
+    v_uint16x32 first, second;
+    v_zip(x, y, first, second);
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, first.val);
+        _mm512_store_si512((__m512i*)(ptr + 32), second.val);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, first.val); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 32), second.val);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, first.val); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 32), second.val);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // v_zip(x, y) written as two back-to-back 512-bit vectors
+{
+    v_uint32x16 first, second;
+    v_zip(x, y, first, second);
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, first.val);
+        _mm512_store_si512((__m512i*)(ptr + 16), second.val);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, first.val); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 16), second.val);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, first.val); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 16), second.val);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // v_zip(x, y) written as two back-to-back 512-bit vectors
+{
+    v_uint64x8 first, second;
+    v_zip(x, y, first, second);
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, first.val);
+        _mm512_store_si512((__m512i*)(ptr + 8), second.val);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, first.val); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 8), second.val);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, first.val); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 8), second.val);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // 3-way byte interleave of a, b, c; writes 192 bytes at ptr
+{
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8(127,  84,  20, 126,  83,  19, 125,  82,  18, 124,  81,  17, 123,  80,  16, 122,
+                                    79,  15, 121,  78,  14, 120,  77,  13, 119,  76,  12, 118,  75,  11, 117,  74,
+                                    10, 116,  73,   9, 115,  72,   8, 114,  71,   7, 113,  70,   6, 112,  69,   5,
+                                   111,  68,   4, 110,  67,   3, 109,  66,   2, 108,  65,   1, 107,  64,   0, 106);
+    __m512i mask1 = _v512_set_epu8( 21,  42, 105,  20,  41, 104,  19,  40, 103,  18,  39, 102,  17,  38, 101,  16,
+                                    37, 100,  15,  36,  99,  14,  35,  98,  13,  34,  97,  12,  33,  96,  11,  32,
+                                    95,  10,  31,  94,   9,  30,  93,   8,  29,  92,   7,  28,  91,   6,  27,  90,
+                                     5,  26,  89,   4,  25,  88,   3,  24,  87,   2,  23,  86,   1,  22,  85,   0);
+    __m512i mask2 = _v512_set_epu8(106, 127,  63, 105, 126,  62, 104, 125,  61, 103, 124,  60, 102, 123,  59, 101,
+                                   122,  58, 100, 121,  57,  99, 120,  56,  98, 119,  55,  97, 118,  54,  96, 117,
+                                    53,  95, 116,  52,  94, 115,  51,  93, 114,  50,  92, 113,  49,  91, 112,  48,
+                                    90, 111,  47,  89, 110,  46,  88, 109,  45,  87, 108,  44,  86, 107,  43,  85);
+    __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val); // one two-source byte permute per input pair: (b,c), (a,c), (a,b)
+    __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val);
+    __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val);
+
+    __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); // every third byte is taken from the second operand
+    __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2);
+    __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0);
+#else
+    __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); // b with the two bytes of every 16-bit pair swapped
+    __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0); // 0xAAAA... mask: alternate bytes come from the two operands
+    __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val);
+    __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val);
+
+    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
+                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0);
+    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
+                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
+    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
+                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1); // from here on the byte pairs are handled as 16-bit units
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1);
+
+    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0);
+    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
+    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
+#endif
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, bgr0); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 128), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, bgr0); // aligned stores
+        _mm512_store_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 128), bgr2);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, bgr0); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // 3-way interleave of 16-bit values; writes 96 ushort values at ptr
+{
+    __m512i mask0 = _v512_set_epu16(42, 10, 31, 41,  9, 30, 40,  8, 29, 39,  7, 28, 38,  6, 27, 37,
+                                     5, 26, 36,  4, 25, 35,  3, 24, 34,  2, 23, 33,  1, 22, 32,  0);
+    __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16,
+                                    47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42);
+    __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58,
+                                    26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21);
+    __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val); // one two-source word permute per input pair: (a,b), (a,c), (b,c)
+    __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val);
+
+    __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); // every third word is taken from the second operand
+    __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1);
+    __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, bgr0); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 64), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, bgr0); // aligned stores
+        _mm512_store_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 64), bgr2);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, bgr0); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // 3-way interleave of 32-bit values; writes 48 values at ptr
+{
+    __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21);
+    __m512i mask1 = _v512_set_epu32(31, 10, 25, 30,  9, 24, 29,  8, 23, 28,  7, 22, 27,  6, 21, 26);
+    __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val);
+    __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val);
+
+    __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val); // first output: three nested expands scatter the leading elements of a, b, c to every third dword
+    __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); // the other two outputs blend the two permutes with complementary operand order
+    __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, bgr0); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 32), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, bgr0); // aligned stores
+        _mm512_store_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 32), bgr2);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, bgr0); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2);
+    }
+}
+
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) // 3-way interleave of 64-bit values; writes 24 values at ptr
+{
+    __m512i mask0 = _v512_set_epu64( 5, 12,  7,  4, 11,  6,  3, 10);
+    __m512i mask1 = _v512_set_epu64(15,  7,  4, 14,  6,  3, 13,  5);
+    __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val);
+    __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val);
+
+    __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val); // first output: three nested expands scatter the leading elements of a, b, c to every third qword
+    __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); // the other two outputs blend the two permutes with complementary operand order
+    __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1);
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm512_stream_si512((__m512i*)ptr, bgr0); // non-temporal stores
+        _mm512_stream_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_stream_si512((__m512i*)(ptr + 16), bgr2);
+    }
+    else if( mode == hal::STORE_ALIGNED )
+    {
+        _mm512_store_si512((__m512i*)ptr, bgr0); // aligned stores
+        _mm512_store_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_store_si512((__m512i*)(ptr + 16), bgr2);
+    }
+    else
+    {
+        _mm512_storeu_si512((__m512i*)ptr, bgr0); // any other mode: unaligned stores
+        _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1);
+        _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2);
+    }
+}
+
+/** Zips four uchar channel vectors (a, b, c, d) together and writes the four resulting 512-bit vectors to ptr. */
+inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b,
+                                const v_uint8x64& c, const v_uint8x64& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint8x64 ac0, ac1, bd0, bd1;
+    v_zip(a, c, ac0, ac1);
+    v_zip(b, d, bd0, bd1);
+    v_uint8x64 out0, out1, out2, out3;
+    v_zip(ac0, bd0, out0, out1);
+    v_zip(ac1, bd1, out2, out3);
+    __m512i* const dst = (__m512i*)ptr; // one __m512i slot spans 64 uchar elements
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm512_stream_si512(dst, out0.val);
+        _mm512_stream_si512(dst + 1, out1.val);
+        _mm512_stream_si512(dst + 2, out2.val);
+        _mm512_stream_si512(dst + 3, out3.val);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm512_store_si512(dst, out0.val);
+        _mm512_store_si512(dst + 1, out1.val);
+        _mm512_store_si512(dst + 2, out2.val);
+        _mm512_store_si512(dst + 3, out3.val);
+        break;
+    default:
+        _mm512_storeu_si512(dst, out0.val);
+        _mm512_storeu_si512(dst + 1, out1.val);
+        _mm512_storeu_si512(dst + 2, out2.val);
+        _mm512_storeu_si512(dst + 3, out3.val);
+    }
+}
+
+/** Zips four ushort channel vectors (a, b, c, d) together and writes the four resulting 512-bit vectors to ptr. */
+inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b,
+                                const v_uint16x32& c, const v_uint16x32& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint16x32 ac0, ac1, bd0, bd1;
+    v_zip(a, c, ac0, ac1);
+    v_zip(b, d, bd0, bd1);
+    v_uint16x32 out0, out1, out2, out3;
+    v_zip(ac0, bd0, out0, out1);
+    v_zip(ac1, bd1, out2, out3);
+    __m512i* const dst = (__m512i*)ptr; // one __m512i slot spans 32 ushort elements
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm512_stream_si512(dst, out0.val);
+        _mm512_stream_si512(dst + 1, out1.val);
+        _mm512_stream_si512(dst + 2, out2.val);
+        _mm512_stream_si512(dst + 3, out3.val);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm512_store_si512(dst, out0.val);
+        _mm512_store_si512(dst + 1, out1.val);
+        _mm512_store_si512(dst + 2, out2.val);
+        _mm512_store_si512(dst + 3, out3.val);
+        break;
+    default:
+        _mm512_storeu_si512(dst, out0.val);
+        _mm512_storeu_si512(dst + 1, out1.val);
+        _mm512_storeu_si512(dst + 2, out2.val);
+        _mm512_storeu_si512(dst + 3, out3.val);
+    }
+}
+
+/** Zips four unsigned channel vectors (a, b, c, d) together and writes the four resulting 512-bit vectors to ptr. */
+inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b,
+                                const v_uint32x16& c, const v_uint32x16& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint32x16 ac0, ac1, bd0, bd1;
+    v_zip(a, c, ac0, ac1);
+    v_zip(b, d, bd0, bd1);
+    v_uint32x16 out0, out1, out2, out3;
+    v_zip(ac0, bd0, out0, out1);
+    v_zip(ac1, bd1, out2, out3);
+    __m512i* const dst = (__m512i*)ptr; // one __m512i slot spans 16 unsigned elements
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm512_stream_si512(dst, out0.val);
+        _mm512_stream_si512(dst + 1, out1.val);
+        _mm512_stream_si512(dst + 2, out2.val);
+        _mm512_stream_si512(dst + 3, out3.val);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm512_store_si512(dst, out0.val);
+        _mm512_store_si512(dst + 1, out1.val);
+        _mm512_store_si512(dst + 2, out2.val);
+        _mm512_store_si512(dst + 3, out3.val);
+        break;
+    default:
+        _mm512_storeu_si512(dst, out0.val);
+        _mm512_storeu_si512(dst + 1, out1.val);
+        _mm512_storeu_si512(dst + 2, out2.val);
+        _mm512_storeu_si512(dst + 3, out3.val);
+    }
+}
+
+/** Zips four uint64 channel vectors (a, b, c, d) together and writes the four resulting 512-bit vectors to ptr. */
+inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b,
+                                const v_uint64x8& c, const v_uint64x8& d,
+                                hal::StoreMode mode=hal::STORE_UNALIGNED )
+{
+    v_uint64x8 ac0, ac1, bd0, bd1;
+    v_zip(a, c, ac0, ac1);
+    v_zip(b, d, bd0, bd1);
+    v_uint64x8 out0, out1, out2, out3;
+    v_zip(ac0, bd0, out0, out1);
+    v_zip(ac1, bd1, out2, out3);
+    __m512i* const dst = (__m512i*)ptr; // one __m512i slot spans 8 uint64 elements
+    switch( mode )
+    {
+    case hal::STORE_ALIGNED_NOCACHE:
+        _mm512_stream_si512(dst, out0.val);
+        _mm512_stream_si512(dst + 1, out1.val);
+        _mm512_stream_si512(dst + 2, out2.val);
+        _mm512_stream_si512(dst + 3, out3.val);
+        break;
+    case hal::STORE_ALIGNED:
+        _mm512_store_si512(dst, out0.val);
+        _mm512_store_si512(dst + 1, out1.val);
+        _mm512_store_si512(dst + 2, out2.val);
+        _mm512_store_si512(dst + 3, out3.val);
+        break;
+    default:
+        _mm512_storeu_si512(dst, out0.val);
+        _mm512_storeu_si512(dst + 1, out1.val);
+        _mm512_storeu_si512(dst + 2, out2.val);
+        _mm512_storeu_si512(dst + 3, out3.val);
+    }
+}
+
+#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+/* Loads: deinterleave through the _Tpvec1/_Tp1 overloads, then reinterpret every */ \
+/* channel back to _Tpvec0 (2-, 3- and 4-channel variants). */ \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 ch0, ch1; \
+    v_load_deinterleave((const _Tp1*)ptr, ch0, ch1); \
+    a0 = v_reinterpret_as_##suffix0(ch0); \
+    b0 = v_reinterpret_as_##suffix0(ch1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 ch0, ch1, ch2; \
+    v_load_deinterleave((const _Tp1*)ptr, ch0, ch1, ch2); \
+    a0 = v_reinterpret_as_##suffix0(ch0); \
+    b0 = v_reinterpret_as_##suffix0(ch1); \
+    c0 = v_reinterpret_as_##suffix0(ch2); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 ch0, ch1, ch2, ch3; \
+    v_load_deinterleave((const _Tp1*)ptr, ch0, ch1, ch2, ch3); \
+    a0 = v_reinterpret_as_##suffix0(ch0); \
+    b0 = v_reinterpret_as_##suffix0(ch1); \
+    c0 = v_reinterpret_as_##suffix0(ch2); \
+    d0 = v_reinterpret_as_##suffix0(ch3); \
+} \
+/* Stores: reinterpret every channel as _Tpvec1 and forward to the _Tp1 overload, */ \
+/* passing the caller's store mode through unchanged. */ \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    v_store_interleave((_Tp1*)ptr, v_reinterpret_as_##suffix1(a0), \
+                       v_reinterpret_as_##suffix1(b0), mode); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    v_store_interleave((_Tp1*)ptr, v_reinterpret_as_##suffix1(a0), \
+                       v_reinterpret_as_##suffix1(b0), \
+                       v_reinterpret_as_##suffix1(c0), mode); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode=hal::STORE_UNALIGNED ) \
+{ \
+    v_store_interleave((_Tp1*)ptr, \
+                       v_reinterpret_as_##suffix1(a0), v_reinterpret_as_##suffix1(b0), \
+                       v_reinterpret_as_##suffix1(c0), v_reinterpret_as_##suffix1(d0), mode); \
+}
+// Signed-integer and floating-point vectors reuse the unsigned overloads of the same lane width.
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64)
+OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64)
+
+////////// Mask and checks /////////
+
+/** Mask **/
+inline int64 v_signmask(const v_int8x64& a) { return static_cast<int64>(_mm512_movepi8_mask(a.val)); }
+inline int v_signmask(const v_int16x32& a) { return static_cast<int>(_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); }
+inline int v_signmask(const v_int32x16& a) { return static_cast<int>(_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); }
+inline int v_signmask(const v_int64x8& a) { return static_cast<int>(_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT)); }
+// Unsigned and floating-point vectors are reinterpreted as same-width signed integers and forwarded.
+inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); }
+inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); }
+inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
+inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); }
+
+/** Checks **/
+inline bool v_check_all(const v_int8x64& a) { return _mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT) == 0; }
+inline bool v_check_any(const v_int8x64& a) { return _mm512_movepi8_mask(a.val) != 0; }
+inline bool v_check_all(const v_int16x32& a) { return _mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT) == 0; }
+inline bool v_check_any(const v_int16x32& a) { return _mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT) != 0; }
+inline bool v_check_all(const v_int32x16& a) { return _mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT) == 0; }
+inline bool v_check_any(const v_int32x16& a) { return _mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT) != 0; }
+inline bool v_check_all(const v_int64x8& a) { return _mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT) == 0; }
+inline bool v_check_any(const v_int64x8& a) { return _mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT) != 0; }
+// "all": no lane compares as not-less-than zero; "any": at least one lane is negative. Other types forward to these.
+inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); }
+inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); }
+inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); }
+inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); }
+inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); }
+inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); }
+inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
+inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); }
+inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); }
+
+inline int v_scan_forward(const v_int8x64& a)
+{
+    const int64 bits = _mm512_movepi8_mask(a.val);
+    if( (int)bits != 0 ) return trailingZeros32((int)bits); // first set bit lies in the low 32 mask bits
+    return bits != 0 ? 32 + trailingZeros32((int)(bits >> 32)) : 0; // high 32 mask bits, or 0 when the mask is empty
+}
+inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); }
+inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); }
+inline int v_scan_forward(const v_uint16x32& a) { return v_scan_forward(v_reinterpret_as_s16(a)); }
+inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } // two 16-bit mask bits per lane
+inline int v_scan_forward(const v_uint32x16& a) { return v_scan_forward(v_reinterpret_as_s32(a)); }
+inline int v_scan_forward(const v_float32x16& a) { return v_scan_forward(v_reinterpret_as_s32(a)); }
+inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } // four 16-bit mask bits per lane
+inline int v_scan_forward(const v_uint64x8& a) { return v_scan_forward(v_reinterpret_as_s64(a)); }
+inline int v_scan_forward(const v_float64x8& a) { return v_scan_forward(v_reinterpret_as_s64(a)); }
+
+inline void v512_cleanup() { _mm256_zeroall(); } // only action: the AVX "zero all" intrinsic, which clears the vector registers
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_AVX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_cpp.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_cpp.hpp
new file mode 100644
index 0000000..4622214
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_cpp.hpp
@@ -0,0 +1,3320 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_CPP_HPP
+#define OPENCV_HAL_INTRIN_CPP_HPP
+
+#include <limits>
+#include <cstring>
+#include <algorithm>
+#include "opencv2/core/saturate.hpp"
+
+//! @cond IGNORED
+#define CV_SIMD128_CPP 1
+#if defined(CV_FORCE_SIMD128_CPP)
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#endif
+#if defined(CV_DOXYGEN)
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD256 1
+#define CV_SIMD256_64F 1
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#else
+#define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
+#define CV_SIMD512 0 // to avoid warnings during compilation
+#endif
+//! @endcond
+
+namespace cv
+{
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+#endif
+
+/** @addtogroup core_hal_intrin
+
+"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
+different platforms. Currently a few different SIMD extensions on different architectures are supported.
+128 bit registers of various types support is implemented for a wide range of architectures
+including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
+256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
+In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
+will be chosen and code will work as expected although it could be slower.
+
+### Types
+
+There are several types representing vector registers of packed values; each type is
+implemented as a structure based on one SIMD register.
+
+- cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32: 32-bit floating point values (signed) - float
+- cv::v_float64: 64-bit floating point values (signed) - double
+
+Exact bit length(and value quantity) of listed types is compile time deduced and depends on architecture SIMD
+capabilities chosen as available during compilation of the library. All the types contain the __nlanes__ enumeration
+to check for exact value quantity of the type.
+
+In case the exact bit length of the type is important it is possible to use specific fixed length register types.
+
+There are several types representing 128-bit registers.
+
+- cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x4: four 32-bit floating point values (signed) - float
+- cv::v_float64x2: two 64-bit floating point values (signed) - double
+
+There are several types representing 256-bit registers.
+
+- cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x8: eight 32-bit floating point values (signed) - float
+- cv::v_float64x4: four 64-bit floating point values (signed) - double
+
+@note
+256-bit registers are at the moment implemented for the AVX2 SIMD extension only; if you want to use this type directly,
+don't forget to check the CV_SIMD256 preprocessor definition:
+@code
+#if CV_SIMD256
+//...
+#endif
+@endcode
+
+There are several types representing 512-bit registers.
+
+- cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
+- cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
+- cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
+- cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
+- cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
+- cv::v_float64x8: eight 64-bit floating point values (signed) - double
+@note
+512-bit registers are at the moment implemented for the AVX512 SIMD extension only; if you want to use this type directly,
+don't forget to check the CV_SIMD512 preprocessor definition.
+
+@note
+cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
+check the CV_SIMD128_64F preprocessor definition.
+
+### Load and store operations
+
+These operations allow to set contents of the register explicitly or by loading it from some memory
+block and to save contents of the register to memory block.
+
+There are variable size register load operations that provide result of maximum available size
+depending on chosen platform capabilities.
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+- Other create methods:
+vx_setall_s8, vx_setall_u8, ...,
+vx_setzero_u8, vx_setzero_s8, ...
+- Memory load operations:
+vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
+- Memory operations with expansion of values:
+vx_load_expand, vx_load_expand_q
+
+Also there are fixed size register load/store operations.
+
+For 128 bit registers
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
+- Other create methods:
+@ref v_setall_s8, @ref v_setall_u8, ...,
+@ref v_setzero_u8, @ref v_setzero_s8, ...
+- Memory load operations:
+@ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
+- Memory operations with expansion of values:
+@ref v_load_expand, @ref v_load_expand_q
+
+For 256 bit registers(check CV_SIMD256 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
+- Other create methods:
+@ref v256_setall_s8, @ref v256_setall_u8, ...,
+@ref v256_setzero_u8, @ref v256_setzero_s8, ...
+- Memory load operations:
+@ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
+- Memory operations with expansion of values:
+@ref v256_load_expand, @ref v256_load_expand_q
+
+For 512 bit registers(check CV_SIMD512 preprocessor definition)
+- Constructors:
+@ref v_reg::v_reg(const _Tp *ptr) "from memory",
+@ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
+- Other create methods:
+@ref v512_setall_s8, @ref v512_setall_u8, ...,
+@ref v512_setzero_u8, @ref v512_setzero_s8, ...
+- Memory load operations:
+@ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
+- Memory operations with expansion of values:
+@ref v512_load_expand, @ref v512_load_expand_q
+
+Store to memory operations are similar across different platform capabilities:
+@ref v_store, @ref v_store_aligned,
+@ref v_store_high, @ref v_store_low
+
+### Value reordering
+
+These operations allow to reorder or recombine elements in one or multiple vectors.
+
+- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
+- Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
+- Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
+@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
+- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
+- Reverse: @ref v_reverse
+- Extract: @ref v_extract
+
+
+### Arithmetic, bitwise and comparison operations
+
+Element-wise binary and unary operations.
+
+- Arithmetics:
+@ref operator +(const v_reg &a, const v_reg &b) "+",
+@ref operator -(const v_reg &a, const v_reg &b) "-",
+@ref operator *(const v_reg &a, const v_reg &b) "*",
+@ref operator /(const v_reg &a, const v_reg &b) "/",
+@ref v_mul_expand
+
+- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
+
+- Bitwise shifts:
+@ref operator <<(const v_reg &a, int s) "<<",
+@ref operator >>(const v_reg &a, int s) ">>",
+@ref v_shl, @ref v_shr
+
+- Bitwise logic:
+@ref operator &(const v_reg &a, const v_reg &b) "&",
+@ref operator |(const v_reg &a, const v_reg &b) "|",
+@ref operator ^(const v_reg &a, const v_reg &b) "^",
+@ref operator ~(const v_reg &a) "~"
+
+- Comparison:
+@ref operator >(const v_reg &a, const v_reg &b) ">",
+@ref operator >=(const v_reg &a, const v_reg &b) ">=",
+@ref operator <(const v_reg &a, const v_reg &b) "<",
+@ref operator <=(const v_reg &a, const v_reg &b) "<=",
+@ref operator ==(const v_reg &a, const v_reg &b) "==",
+@ref operator !=(const v_reg &a, const v_reg &b) "!="
+
+- min/max: @ref v_min, @ref v_max
+
+### Reduce and mask
+
+Most of these operations return only one value.
+
+- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
+- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
+
+### Other math
+
+- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
+- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
+
+### Conversions
+
+Different type conversions and casts:
+
+- Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
+- To float: @ref v_cvt_f32, @ref v_cvt_f64
+- Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
+
+### Matrix operations
+
+In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
+@ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4
+
+### Usability
+
+Most operations are implemented only for some subset of the available types; the following matrices
+show the applicability of different operations to the types.
+
+Regular integers:
+
+| Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
+|-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
+|load, store        | x | x | x | x | x | x |
+|interleave         | x | x | x | x | x | x |
+|expand             | x | x | x | x | x | x |
+|expand_low         | x | x | x | x | x | x |
+|expand_high        | x | x | x | x | x | x |
+|expand_q           | x | x |   |   |   |   |
+|add, sub           | x | x | x | x | x | x |
+|add_wrap, sub_wrap | x | x | x | x |   |   |
+|mul_wrap           | x | x | x | x |   |   |
+|mul                | x | x | x | x | x | x |
+|mul_expand         | x | x | x | x | x |   |
+|compare            | x | x | x | x | x | x |
+|shift              |   |   | x | x | x | x |
+|dotprod            |   |   |   | x |   | x |
+|dotprod_fast       |   |   |   | x |   | x |
+|dotprod_expand     | x | x | x | x |   | x |
+|dotprod_expand_fast| x | x | x | x |   | x |
+|logical            | x | x | x | x | x | x |
+|min, max           | x | x | x | x | x | x |
+|absdiff            | x | x | x | x | x | x |
+|absdiffs           |   | x |   | x |   |   |
+|reduce             | x | x | x | x | x | x |
+|mask               | x | x | x | x | x | x |
+|pack               | x | x | x | x | x | x |
+|pack_u             | x |   | x |   |   |   |
+|pack_b             | x |   |   |   |   |   |
+|unpack             | x | x | x | x | x | x |
+|extract            | x | x | x | x | x | x |
+|rotate (lanes)     | x | x | x | x | x | x |
+|cvt_flt32          |   |   |   |   |   | x |
+|cvt_flt64          |   |   |   |   |   | x |
+|transpose4x4       |   |   |   |   | x | x |
+|reverse            | x | x | x | x | x | x |
+|extract_n          | x | x | x | x | x | x |
+|broadcast_element  |   |   |   |   | x | x |
+
+Big integers:
+
+| Operations\\Types | uint 64 | int 64 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|add, sub           | x | x |
+|shift              | x | x |
+|logical            | x | x |
+|reverse            | x | x |
+|extract            | x | x |
+|rotate (lanes)     | x | x |
+|cvt_flt64          |   | x |
+|extract_n          | x | x |
+
+Floating point:
+
+| Operations\\Types | float 32 | float 64 |
+|-------------------|:-:|:-:|
+|load, store        | x | x |
+|interleave         | x |   |
+|add, sub           | x | x |
+|mul                | x | x |
+|div                | x | x |
+|compare            | x | x |
+|min, max           | x | x |
+|absdiff            | x | x |
+|reduce             | x |   |
+|mask               | x | x |
+|unpack             | x | x |
+|cvt_flt32          |   | x |
+|cvt_flt64          | x |   |
+|sqrt, abs          | x | x |
+|float math         | x | x |
+|transpose4x4       | x |   |
+|extract            | x | x |
+|rotate (lanes)     | x | x |
+|reverse            | x | x |
+|extract_n          | x | x |
+|broadcast_element  | x |   |
+
+ @{ */
+
+template<typename _Tp, int n> struct v_reg
+{
+//! @cond IGNORED
+    typedef _Tp lane_type;
+    enum { nlanes = n };
+// !@endcond
+
+    /** @brief Constructor
+
+    Initializes register with data from memory
+    @param ptr pointer to memory block with data for register */
+    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
+
+    /** @brief Constructor
+
+    Initializes register with two 64-bit values */
+    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
+
+    /** @brief Constructor
+
+    Initializes register with four 32-bit values */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
+
+    /** @brief Constructor
+
+    Initializes register with eight 16-bit values */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+    }
+
+    /** @brief Constructor
+
+    Initializes register with sixteen 8-bit values */
+    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
+           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
+           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
+           _Tp s12, _Tp s13, _Tp s14, _Tp s15)
+    {
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
+        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
+        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
+        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
+    }
+
+    /** @brief Default constructor
+
+    Does not initialize anything*/
+    v_reg() {}
+
+    /** @brief Copy constructor */
+    v_reg(const v_reg<_Tp, n> & r)
+    {
+        for( int i = 0; i < n; i++ )
+            s[i] = r.s[i];
+    }
+    /** @brief Access first value
+
+    Returns value of the first lane according to register type, for example:
+    @code{.cpp}
+    v_int32x4 r(1, 2, 3, 4);
+    int v = r.get0(); // returns 1
+    v_uint64x2 r(1, 2);
+    uint64_t v = r.get0(); // returns 1
+    @endcode
+    */
+    _Tp get0() const { return s[0]; }
+
+//! @cond IGNORED
+    _Tp get(const int i) const { return s[i]; }
+    v_reg<_Tp, n> high() const
+    {
+        v_reg<_Tp, n> c;
+        int i;
+        for( i = 0; i < n/2; i++ )
+        {
+            c.s[i] = s[i+(n/2)];
+            c.s[i+(n/2)] = 0;
+        }
+        return c;
+    }
+
+    static v_reg<_Tp, n> zero()
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = (_Tp)0;
+        return c;
+    }
+
+    static v_reg<_Tp, n> all(_Tp s)
+    {
+        v_reg<_Tp, n> c;
+        for( int i = 0; i < n; i++ )
+            c.s[i] = s;
+        return c;
+    }
+
+    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
+    {
+        v_reg<_Tp2, n2> dst;
+        const size_t srcBytes = sizeof(_Tp)*n, dstBytes = sizeof(_Tp2)*n2;
+        std::memcpy(&dst.s[0], &s[0], dstBytes < srcBytes ? dstBytes : srcBytes);  // raw byte copy of the overlapping part only
+        return dst;
+    }
+
+    v_reg& operator=(const v_reg<_Tp, n> & r)
+    {
+        for( int lane = 0; lane != n; ++lane )  // lane-by-lane copy; self-assignment is harmless
+            s[lane] = r.s[lane];
+        return *this;
+    }
+
+    _Tp s[n];
+//! @endcond
+};
+
+/** @brief Sixteen 8-bit unsigned integer values */
+typedef v_reg<uchar, 16> v_uint8x16;
+/** @brief Sixteen 8-bit signed integer values */
+typedef v_reg<schar, 16> v_int8x16;
+/** @brief Eight 16-bit unsigned integer values */
+typedef v_reg<ushort, 8> v_uint16x8;
+/** @brief Eight 16-bit signed integer values */
+typedef v_reg<short, 8> v_int16x8;
+/** @brief Four 32-bit unsigned integer values */
+typedef v_reg<unsigned, 4> v_uint32x4;
+/** @brief Four 32-bit signed integer values */
+typedef v_reg<int, 4> v_int32x4;
+/** @brief Four 32-bit floating point values (single precision) */
+typedef v_reg<float, 4> v_float32x4;
+/** @brief Two 64-bit floating point values (double precision) */
+typedef v_reg<double, 2> v_float64x2;
+/** @brief Two 64-bit unsigned integer values */
+typedef v_reg<uint64, 2> v_uint64x2;
+/** @brief Two 64-bit signed integer values */
+typedef v_reg<int64, 2> v_int64x2;
+
+#if CV_SIMD256
+/** @brief Thirty two 8-bit unsigned integer values */
+typedef v_reg<uchar, 32> v_uint8x32;
+/** @brief Thirty two 8-bit signed integer values */
+typedef v_reg<schar, 32> v_int8x32;
+/** @brief Sixteen 16-bit unsigned integer values */
+typedef v_reg<ushort, 16> v_uint16x16;
+/** @brief Sixteen 16-bit signed integer values */
+typedef v_reg<short, 16> v_int16x16;
+/** @brief Eight 32-bit unsigned integer values */
+typedef v_reg<unsigned, 8> v_uint32x8;
+/** @brief Eight 32-bit signed integer values */
+typedef v_reg<int, 8> v_int32x8;
+/** @brief Eight 32-bit floating point values (single precision) */
+typedef v_reg<float, 8> v_float32x8;
+/** @brief Four 64-bit floating point values (double precision) */
+typedef v_reg<double, 4> v_float64x4;
+/** @brief Four 64-bit unsigned integer values */
+typedef v_reg<uint64, 4> v_uint64x4;
+/** @brief Four 64-bit signed integer values */
+typedef v_reg<int64, 4> v_int64x4;
+#endif
+
+#if CV_SIMD512
+/** @brief Sixty four 8-bit unsigned integer values */
+typedef v_reg<uchar, 64> v_uint8x64;
+/** @brief Sixty four 8-bit signed integer values */
+typedef v_reg<schar, 64> v_int8x64;
+/** @brief Thirty two 16-bit unsigned integer values */
+typedef v_reg<ushort, 32> v_uint16x32;
+/** @brief Thirty two 16-bit signed integer values */
+typedef v_reg<short, 32> v_int16x32;
+/** @brief Sixteen 32-bit unsigned integer values */
+typedef v_reg<unsigned, 16> v_uint32x16;
+/** @brief Sixteen 32-bit signed integer values */
+typedef v_reg<int, 16> v_int32x16;
+/** @brief Sixteen 32-bit floating point values (single precision) */
+typedef v_reg<float, 16> v_float32x16;
+/** @brief Eight 64-bit floating point values (double precision) */
+typedef v_reg<double, 8> v_float64x8;
+/** @brief Eight 64-bit unsigned integer values */
+typedef v_reg<uint64, 8> v_uint64x8;
+/** @brief Eight 64-bit signed integer values */
+typedef v_reg<int64, 8> v_int64x8;
+#endif
+
+enum {
+    simd128_width = 16,  // 128-bit register: 16 bytes
+#if CV_SIMD256
+    simd256_width = 32,  // 256-bit register: 32 bytes
+#endif
+#if CV_SIMD512
+    simd512_width = 64,  // 512-bit register: 64 bytes
+    simdmax_width = simd512_width
+#elif CV_SIMD256
+    simdmax_width = simd256_width
+#else
+    simdmax_width = simd128_width  // simdmax_width is the widest width enabled by the CV_SIMD* flags
+#endif
+};
+
+/** @brief Add values
+
+For all types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Subtract values
+
+For all types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Multiply values
+
+For 16- and 32-bit integer types and floating types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Divide values
+
+For floating types only. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+
+/** @brief Bitwise AND
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise OR
+
+Only for integer types. */
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise XOR
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
+
+/** @brief Bitwise NOT
+
+Only for integer types.*/
+template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
+
+
+#ifndef CV_DOXYGEN
+
+#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) /* expands macro_name(T, ...) once for each of the eight integer lane types */ \
+__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) /* expands macro_name(T, ...) for float and double */ \
+__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
+__CV_EXPAND(macro_name(double, __VA_ARGS__)) \
+
+#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) /* integer expansion followed by floating-point expansion */ \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) /* defines lane-wise `bin_op` and `bin_op=` for v_reg<_Tp, n>; each result lane goes through saturate_cast<_Tp> */ \
+template<int n> inline \
+v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return c; \
+} \
+template<int n> inline \
+v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
+    return a; \
+}
+
+#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
+
+CV__HAL_INTRIN_IMPL_BIN_OP(+)
+CV__HAL_INTRIN_IMPL_BIN_OP(-)
+CV__HAL_INTRIN_IMPL_BIN_OP(*)
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)  // division is generated for float and double only
+
+#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) /* defines lane-wise `bit_op` and `bit_op=`, applied to the integer view of each lane (V_TypeTraits reinterpret_int / reinterpret_from_int) */ \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return c; \
+} \
+template<int n> CV_INLINE \
+v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; \
+    for( int i = 0; i < n; i++ ) \
+        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
+                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
+    return a; \
+}
+
+#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
+CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */
+
+
+CV__HAL_INTRIN_IMPL_BIT_OP(&)  // instantiated for integer and (per the TODO above, temporarily) floating-point lanes
+CV__HAL_INTRIN_IMPL_BIT_OP(|)
+CV__HAL_INTRIN_IMPL_BIT_OP(^)
+
+#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) /* defines lane-wise operator~; `dummy` is unused and only satisfies the two-argument expansion helpers */ \
+template<int n> CV_INLINE \
+v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
+    return c; \
+} \
+
+CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)  // integer lane types only
+
+#endif  // !CV_DOXYGEN
+
+
+//! @brief Helper macro: defines `func(a)` that applies `cfunc` to every lane of `a` and returns a v_reg with lane type `_Tp2`
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
+template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp2, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cfunc(a.s[i]); \
+    return c; \
+}
+
+/** @brief Square root of elements
+
+Only for floating point types.*/
+OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
+
+//! @cond IGNORED
+OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
+OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
+//! @endcond
+
+/** @brief Absolute value of elements
+
+Only for floating point types.*/
+OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
+                          typename V_TypeTraits<_Tp>::abs_type)
+
+//! @brief Helper macro: defines `func(a, b)` whose lane i is `cfunc(a[i], b[i])`
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cfunc(a.s[i], b.s[i]); \
+    return c; \
+}
+
+//! @brief Helper macro: defines `func(a)` that folds all lanes of `a` with `cfunc`, starting from lane 0, and returns the scalar result
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
+template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
+{ \
+    _Tp c = a.s[0]; \
+    for( int i = 1; i < n; i++ ) \
+        c = cfunc(c, a.s[i]); \
+    return c; \
+}
+
+/** @brief Choose min values for each pair
+
+Scheme:
+@code
+{A1 A2 ...}
+{B1 B2 ...}
+--------------
+{min(A1,B1) min(A2,B2) ...}
+@endcode
+For all types except 64-bit integer. */
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)
+
+/** @brief Choose max values for each pair
+
+Scheme:
+@code
+{A1 A2 ...}
+{B1 B2 ...}
+--------------
+{max(A1,B1) max(A2,B2) ...}
+@endcode
+For all types except 64-bit integer. */
+OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)
+
+/** @brief Find one min value
+
+Scheme:
+@code
+{A1 A2 A3 ...} => min(A1,A2,A3,...)
+@endcode
+For all types except 64-bit integer and 64-bit floating point types. */
+OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)
+
+/** @brief Find one max value
+
+Scheme:
+@code
+{A1 A2 A3 ...} => max(A1,A2,A3,...)
+@endcode
+For all types except 64-bit integer and 64-bit floating point types. */
+OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
+
+static const unsigned char popCountTable[] =  // 256 entries: popCountTable[b] is the number of set bits in the byte value b
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+/** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type
+
+Scheme:
+@code
+{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
+@endcode
+For all integer types. */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> counts = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
+    for (int byteIdx = 0; byteIdx < n*(int)sizeof(_Tp); byteIdx++)  // walk the source bytes; each one adds its bit count to its owning lane
+        counts.s[byteIdx/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[byteIdx]];
+    return counts;
+}
+
+
+//! @cond IGNORED
+template<typename _Tp, int n>
+inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                      v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
+{
+    for( int i = 0; i < n; i++ )
+    {
+        const _Tp lo = std::min(a.s[i], b.s[i]), hi = std::max(a.s[i], b.s[i]);  // read both inputs before any store: minval/maxval may alias a or b
+        minval.s[i] = lo; maxval.s[i] = hi;  // lane-wise minimum and maximum of the pair
+    }
+}
+//! @endcond
+
+//! @brief Helper macro: defines a lane-wise comparison operator; a result lane is the reinterpretation of integer -1 when the comparison holds and of 0 otherwise
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
+template<typename _Tp, int n> \
+inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef typename V_TypeTraits<_Tp>::int_type itype; \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
+    return c; \
+}
+
+/** @brief Less-than comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(<)
+
+/** @brief Greater-than comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(>)
+
+/** @brief Less-than or equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(<=)
+
+/** @brief Greater-than or equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(>=)
+
+/** @brief Equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(==)
+
+/** @brief Not equal comparison
+
+For all types except 64-bit integer values. */
+OPENCV_HAL_IMPL_CMP_OP(!=)
+
+template<int n>
+inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
+{
+    typedef V_TypeTraits<float> Traits;
+    v_reg<float, n> mask;
+    for (int k = 0; k < n; k++)  // a lane compares equal to itself unless it is NaN
+        mask.s[k] = Traits::reinterpret_from_int((typename Traits::int_type)-(int)(a.s[k] == a.s[k]));
+    return mask;
+}
+template<int n>
+inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
+{
+    typedef V_TypeTraits<double> Traits;
+    v_reg<double, n> mask;
+    for (int k = 0; k < n; k++)  // a lane compares equal to itself unless it is NaN
+        mask.s[k] = Traits::reinterpret_from_int((typename Traits::int_type)-(int)(a.s[k] == a.s[k]));
+    return mask;
+}
+
+//! @brief Helper macro: defines `func(a, b)` whose lane i is `cast_op(a[i] bin_op b[i])`, stored in a v_reg with lane type `_Tp2`
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
+template<typename _Tp, int n> \
+inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    typedef _Tp2 rtype; \
+    v_reg<rtype, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
+    return c; \
+}
+
+/** @brief Add values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)
+
+/** @brief Subtract values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)
+
+/** @brief Multiply values without saturation
+
+For 8- and 16-bit integer values. */
+OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
+
+//! @cond IGNORED
+template<typename T> inline T _absdiff(T a, T b)
+{
+    if (a > b) return a - b; else return b - a;  // always subtracts the smaller operand from the larger one
+}
+//! @endcond
+
+/** @brief Absolute difference
+
+Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
+Example:
+@code{.cpp}
+v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
+v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
+@endcode
+For 8-, 16-, 32-bit integer source types. */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
+{
+    typedef typename V_TypeTraits<_Tp>::abs_type rtype;
+    v_reg<rtype, n> c;
+    const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? ((rtype)1 << (sizeof(rtype)*8 - 1)) : (rtype)0);  // sign bit of the lane; shifted in rtype (not plain int) so 32- and 64-bit lanes stay well-defined
+    for( int i = 0; i < n; i++ )
+    {
+        rtype ua = a.s[i] ^ mask;  // flipping the sign bit maps signed ordering onto unsigned ordering
+        rtype ub = b.s[i] ^ mask;  // (mask is 0 for unsigned _Tp, so values pass through unchanged)
+        c.s[i] = _absdiff(ua, ub);  // unsigned difference of the biased values equals |a - b|
+    }
+    return c;
+}
+
+/** @overload
+
+For 32-bit floating point values */
+template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
+{
+    v_reg<float, n> diff;
+    for( int k = 0; k < diff.nlanes; k++ )  // per-lane |a - b| via the scalar helper
+        diff.s[k] = _absdiff(a.s[k], b.s[k]);
+    return diff;
+}
+
+/** @overload
+
+For 64-bit floating point values */
+template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<double, n> diff;
+    for( int k = 0; k < diff.nlanes; k++ )  // per-lane |a - b| via the scalar helper
+        diff.s[k] = _absdiff(a.s[k], b.s[k]);
+    return diff;
+}
+
+/** @brief Saturating absolute difference
+
+Returns \f$ saturate(|a - b|) \f$ .
+For 8-, 16-bit signed integer source types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; k++ )  // |a - b| per lane, passed through saturate_cast<_Tp>
+        res.s[k] = saturate_cast<_Tp>(std::abs(a.s[k] - b.s[k]));
+    return res;
+}
+
+/** @brief Inversed square root
+
+Returns \f$ 1/sqrt(a) \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> inv;
+    for( int k = 0; k < n; k++ )  // reciprocal of each lane's square root
+        inv.s[k] = 1.f/std::sqrt(a.s[k]);
+    return inv;
+}
+
+/** @brief Magnitude
+
+Returns \f$ sqrt(a^2 + b^2) \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> mag;
+    for( int k = 0; k < n; k++ )  // per-lane Euclidean length of the pair (a, b)
+        mag.s[k] = std::sqrt(a.s[k]*a.s[k] + b.s[k]*b.s[k]);
+    return mag;
+}
+
+/** @brief Square of the magnitude
+
+Returns \f$ a^2 + b^2 \f$
+For floating point types only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> sq;
+    for( int k = 0; k < n; k++ )  // sum of squares per lane, no square root taken
+        sq.s[k] = a.s[k]*a.s[k] + b.s[k]*b.s[k];
+    return sq;
+}
+
+/** @brief Multiply and add
+
+ Returns \f$ a*b + c \f$
+ For floating point types and signed 32bit int only. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                           const v_reg<_Tp, n>& c)
+{
+    v_reg<_Tp, n> res;
+    for( int k = 0; k < n; k++ )  // plain multiply followed by add, evaluated per lane
+        res.s[k] = a.s[k]*b.s[k] + c.s[k];
+    return res;
+}
+
+/** @brief A synonym for v_fma */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                              const v_reg<_Tp, n>& c)
+{
+    return v_fma(a, b, c);  // pure forwarding: a*b + c per lane
+}
+
+/** @brief Dot product of elements
+
+Multiply values in two registers and sum adjacent result pairs.
+
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+{A1B1+A2B2 ...} // 32-bit
+
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, n/2> acc;
+    for( int pair = 0, lo = 0; pair < (n/2); pair++, lo += 2 )  // lo is the even lane of the current pair
+        acc.s[pair] = (w_type)a.s[lo]*b.s[lo] + (w_type)a.s[lo+1]*b.s[lo+1];
+    return acc;
+}
+
+/** @brief Dot product of elements
+
+Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
+Scheme:
+@code
+  {A1 A2 ...} // 16-bit
+x {B1 B2 ...} // 16-bit
+-------------
+  {A1B1+A2B2+C1 ...} // 32-bit
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+          const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<w_type, n/2> acc;
+    for( int pair = 0, lo = 0; pair < (n/2); pair++, lo += 2 )  // pair product sum first, then the matching lane of c
+        acc.s[pair] = (w_type)a.s[lo]*b.s[lo] + (w_type)a.s[lo+1]*b.s[lo+1] + c.s[pair];
+    return acc;
+}
+
+/** @brief Fast Dot product of elements
+
+Same as cv::v_dotprod, but on some platforms the result pairs may be summed in a different order.
+Use this intrinsic when only the sum across all lanes matters;
+it should yield better performance on the affected platforms.
+
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{ return v_dotprod(a, b); }  // this implementation simply forwards to v_dotprod
+
+/** @brief Fast Dot product of elements
+
+Same as cv::v_dotprod_fast, but add a third element to the sum of adjacent pairs.
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
+{ return v_dotprod(a, b, c); }  // this implementation simply forwards to the three-argument v_dotprod
+
+/** @brief Dot product of elements and expand
+
+Multiply values in two registers and expand the sum of adjacent result pairs.
+
+Scheme:
+@code
+  {A1 A2 A3 A4 ...} // 8-bit
+x {B1 B2 B3 B4 ...} // 8-bit
+-------------
+  {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit
+
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> acc;
+    for( int quad = 0, base = 0; quad < (n/4); quad++, base += 4 )  // base is the first lane of the current group of four
+        acc.s[quad] = (q_type)a.s[base    ]*b.s[base    ] + (q_type)a.s[base + 1]*b.s[base + 1] +
+                      (q_type)a.s[base + 2]*b.s[base + 2] + (q_type)a.s[base + 3]*b.s[base + 3];
+    return acc;
+}
+
+/** @brief Dot product of elements and expand
+
+Same as cv::v_dotprod_expand, but add a third element to the sum of adjacent pairs.
+Scheme:
+@code
+  {A1 A2 A3 A4 ...} // 8-bit
+x {B1 B2 B3 B4 ...} // 8-bit
+-------------
+  {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
+@endcode
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::q_type q_type;
+    v_reg<q_type, n/4> acc;
+    for( int quad = 0, base = 0; quad < (n/4); quad++, base += 4 )  // four products are summed, then the matching lane of c is added
+        acc.s[quad] = (q_type)a.s[base    ]*b.s[base    ] + (q_type)a.s[base + 1]*b.s[base + 1] +
+                      (q_type)a.s[base + 2]*b.s[base + 2] + (q_type)a.s[base + 3]*b.s[base + 3] + c.s[quad];
+    return acc;
+}
+
+/** @brief Fast Dot product of elements and expand
+
+Multiply values in two registers and expand the sum of adjacent result pairs.
+
+Same as cv::v_dotprod_expand, but on some platforms the result pairs may be summed in a different order.
+Use this intrinsic when only the sum across all lanes matters;
+it should yield better performance on the affected platforms.
+
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{ return v_dotprod_expand(a, b); }  // this implementation simply forwards to v_dotprod_expand
+
+/** @brief Fast Dot product of elements and expand
+
+Same as cv::v_dotprod_expand_fast, but add a third element to the sum of adjacent pairs.
+*/
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
+v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
+{ return v_dotprod_expand(a, b, c); }  // this implementation simply forwards to the three-argument v_dotprod_expand
+
+/** @brief Multiply and expand
+
+Multiply values in two registers and store results in two registers with wider pack type.
+Scheme:
+@code
+  {A B C D} // 32-bit
+x {E F G H} // 32-bit
+---------------
+{AE BF}         // 64-bit
+        {CG DH} // 64-bit
+@endcode
+Example:
+@code{.cpp}
+v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
+v_uint64x2 c, d; // results
+v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
+@endcode
+Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
+*/
+template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
+                                                       v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    const int half = n/2;  // c takes the products of lanes [0, half), d those of lanes [half, 2*half)
+    for( int k = 0; k < half; k++ )
+    {
+        c.s[k] = (w_type)a.s[k]*b.s[k]; d.s[k] = (w_type)a.s[k+half]*b.s[k+half];
+    }
+}
+
+/** @brief Multiply and extract high part
+
+Multiply values in two registers and store the high part of the results.
+Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    v_reg<_Tp, n> hi;
+    for (int k = 0; k < n; k++)  // product formed in w_type, then shifted right by the bit width of _Tp
+        hi.s[k] = (_Tp)(((w_type)a.s[k] * b.s[k]) >> sizeof(_Tp)*8);
+    return hi;
+}
+
+//! @cond IGNORED
+template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
+                                                 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
+{
+    typedef typename V_TypeTraits<_Tp>::w_type w_type;
+    const int pairs = n/2;
+    for( int k = 0; k < pairs; k++ )
+        // only the first addend is cast explicitly; the sum of each adjacent pair lands in lane k of c
+        c.s[k] = (w_type)a.s[k*2] + a.s[k*2+1];
+}
+//! @endcond
+
+//! @brief Helper macro: defines `operator shift_op(v_reg, int imm)` that shifts every lane by `imm` and casts the result back to `_Tp`
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
+template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
+    return c; \
+}
+
+/** @brief Bitwise shift left
+
+For 16-, 32- and 64-bit integer values. */
+OPENCV_HAL_IMPL_SHIFT_OP(<< )
+
+/** @brief Bitwise shift right
+
+For 16-, 32- and 64-bit integer values. */
+OPENCV_HAL_IMPL_SHIFT_OP(>> )
+
+//! @brief Helper macro: defines v_rotate_<suffix>. Result lane i is read from source index (i opA imm); out-of-range indexes give 0 (one-register form) or are looked up in the other register at (i opA imm opB n) (two-register form)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
+template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> b; \
+    for (int i = 0; i < n; i++) \
+    { \
+        int sIndex = i opA imm; \
+        if (0 <= sIndex && sIndex < n) \
+        { \
+            b.s[i] = a.s[sIndex]; \
+        } \
+        else \
+        { \
+            b.s[i] = 0; \
+        } \
+    } \
+    return b; \
+} \
+template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tp, n> c; \
+    for (int i = 0; i < n; i++) \
+    { \
+        int aIndex = i opA imm; \
+        int bIndex = i opA imm opB n; \
+        if (0 <= bIndex && bIndex < n) \
+        { \
+            c.s[i] = b.s[bIndex]; \
+        } \
+        else if (0 <= aIndex && aIndex < n) \
+        { \
+            c.s[i] = a.s[aIndex]; \
+        } \
+        else \
+        { \
+            c.s[i] = 0; \
+        } \
+    } \
+    return c; \
+}
+
+/** @brief Element shift left among vector
+
+For all types */
+OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)
+
+/** @brief Element shift right among vector
+
+For all types */
+OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
+
+/** @brief Sum packed values
+
+Scheme:
+@code
+{A1 A2 A3 ...} => sum{A1,A2,A3,...}
+@endcode
+*/
+template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
+{
+    typename V_TypeTraits<_Tp>::sum_type total = a.s[0];
+    for( int k = 1; k < n; k++ )  // accumulate lanes 1..n-1 in index order
+        total += a.s[k];
+    return total;
+}
+
+/** @brief Sums all elements of each input vector, returns the vector of sums
+
+ Scheme:
+ @code
+ result[0] = a[0] + a[1] + a[2] + a[3]
+ result[1] = b[0] + b[1] + b[2] + b[3]
+ result[2] = c[0] + c[1] + c[2] + c[3]
+ result[3] = d[0] + d[1] + d[2] + d[3]
+ @endcode
+*/
+template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
+    const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    v_reg<float, n> sums;
+    for(int grp = 0, base = 0; grp < (n/4); grp++, base += 4)
+    {
+        sums.s[base + 0] = a.s[base + 0] + a.s[base + 1] + a.s[base + 2] + a.s[base + 3];  // each group of four lanes
+        sums.s[base + 1] = b.s[base + 0] + b.s[base + 1] + b.s[base + 2] + b.s[base + 3];  // holds the four-lane sums of
+        sums.s[base + 2] = c.s[base + 0] + c.s[base + 1] + c.s[base + 2] + c.s[base + 3];  // a, b, c and d (in that order)
+        sums.s[base + 3] = d.s[base + 0] + d.s[base + 1] + d.s[base + 2] + d.s[base + 3];  // for the same group
+    }
+    return sums;
+}
+
+/** @brief Sum absolute differences of values
+
+Scheme:
+@code
+{A1 A2 A3 ...} {B1 B2 B3 ...} => sum{abs(A1-B1),abs(A2-B2),abs(A3-B3),...}
+@endcode
+For all types except 64-bit types.*/
+template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type sad = _absdiff(a.s[0], b.s[0]);
+    for (int k = 1; k < n; k++)  // add the absolute difference of every remaining lane pair
+        sad += _absdiff(a.s[k], b.s[k]);
+    return sad;
+}
+
+/** @brief Get negative values mask
+@deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough
+
+Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {-1, -1, 1, 1}
+int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
+@endcode
+*/
+template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
+{
+    int bits = 0;
+    for( int lane = 0; lane < n; lane++ )  // bit number `lane` is set when that lane's integer view is negative
+        bits |= (int)(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0) << lane;
+    return bits;
+}
+
+/** @brief Get first negative lane index
+
+Returned value is an index of first negative lane (undefined for input of all positive values)
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {0, 0, -1, -1}
+int idx = v_scan_forward(r); // idx = 2
+@endcode
+*/
+template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
+{
+    int lane = 0;
+    while (lane < n && !(V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0))
+        lane++;
+    return lane < n ? lane : 0;  // 0 is also the result when no lane is negative
+}
+
+/** @brief Check if all packed values are less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+*/
+template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
+{
+    bool allNegative = true;
+    for( int lane = 0; allNegative && lane < n; lane++ )  // stops at the first non-negative lane
+        allNegative = V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0;
+    return allNegative;
+}
+
+/** @brief Check if any of packed values is less than zero
+
+Unsigned values will be cast to signed: `uchar 254 => char -2`.
+*/
+template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
+{
+    bool found = false;
+    for( int lane = 0; !found && lane < n; lane++ )  // stops at the first negative lane
+        found = V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]) < 0;
+    return found;
+}
+
+/** @brief Per-element select (blend operation)
+
+Each lane of the result is taken from _a_ or _b_ depending on the matching _mask_ lane:
+    result[i] = mask[i] ? a[i] : b[i];
+
+@note: every _mask_ lane must hold one of two bit patterns:
+- 0: select element from _b_
+- 0xff/0xffff/etc (all bits set): select element from _a_
+(fully compatible with bitwise-based operator)
+*/
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
+                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    typedef V_TypeTraits<_Tp> traits_t;
+    typedef typename traits_t::int_type int_type;
+    v_reg<_Tp, n> blended;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        const int_type m = traits_t::reinterpret_int(mask.s[lane]);
+        CV_DbgAssert(m == 0 || m == (~(int_type)0));  // debug builds reject partially-set mask lanes
+        blended.s[lane] = (m != 0) ? a.s[lane] : b.s[lane];
+    }
+    return blended;
+}
+
+/** @brief Expand values to the wider pack type
+
+Splits one register into two registers with 2x wider lanes: the lower half of _a_ goes to _b0_, the upper half to _b1_.
+Scheme:
+@code
+ int32x4     int64x2 int64x2
+{A B C D} ==> {A B} , {C D}
+@endcode */
+template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
+                            v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
+{
+    for( int k = 0, half = n/2; k < half; ++k )
+    {
+        b0.s[k] = a.s[k];
+        b1.s[k] = a.s[k+half];
+    }
+}
+
+/** @brief Expand lower values to the wider pack type
+
+Same as cv::v_expand, but only the widened lower half of the vector is produced, and it is returned by value.
+
+Scheme:
+@code
+ int32x4     int64x2
+{A B C D} ==> {A B}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_low(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> wide;
+    for( int k = 0, half = n/2; k < half; ++k )
+        wide.s[k] = a.s[k];
+    return wide;
+}
+
+/** @brief Expand higher values to the wider pack type
+
+Same as cv::v_expand_low, but the upper half of the vector is widened instead of the lower half.
+
+Scheme:
+@code
+ int32x4     int64x2
+{A B C D} ==> {C D}
+@endcode */
+template<typename _Tp, int n>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
+v_expand_high(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> wide;
+    for( int k = 0, half = n/2; k < half; ++k )
+        wide.s[k] = a.s[k+half];
+    return wide;
+}
+
+//! @cond IGNORED
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
+    v_reinterpret_as_int(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::int_type, n> asInt;  // each lane goes through V_TypeTraits::reinterpret_int
+    for( int lane = 0; lane < n; ++lane )
+        asInt.s[lane] = V_TypeTraits<_Tp>::reinterpret_int(a.s[lane]);
+    return asInt;
+}
+
+template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
+    v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
+{
+    v_reg<typename V_TypeTraits<_Tp>::uint_type, n> asUint;  // each lane goes through V_TypeTraits::reinterpret_uint
+    for( int lane = 0; lane < n; ++lane )
+        asUint.s[lane] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[lane]);
+    return asUint;
+}
+//! @endcond
+
+/** @brief Interleave two vectors
+
+Scheme (pairs built from the lower halves go to b0, pairs from the upper halves go to b1):
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 B1 A2 B2} and {A3 B3 A4 B4}
+@endcode
+For all types except 64-bit.
+*/
+template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                                               v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
+{
+    int src;
+    for( src = 0; src < n/2; ++src )
+    {
+        b0.s[2*src] = a0.s[src];
+        b0.s[2*src+1] = a1.s[src];
+    }
+    for( ; src < n; ++src )
+    {
+        b1.s[2*src-n] = a0.s[src];
+        b1.s[2*src-n+1] = a1.s[src];
+    }
+}
+
+/** @brief Load register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note The register type is deduced from the pointer type: uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
+
+@note Prefer the vx_load version when the maximum available register length is wanted
+
+@note Alignment requirement: if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_t;
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    return reg_t(ptr);
+}
+
+#if CV_SIMD256
+/** @brief Load 256-bit length register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note The register type is deduced from the pointer type: uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc.
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Prefer the vx_load version when the maximum available register length is wanted
+
+@note Alignment requirement: if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_t;
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    return reg_t(ptr);
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load 512-bit length register contents from memory
+
+@param ptr pointer to memory block with data
+@return register object
+
+@note The register type is deduced from the pointer type: uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc.
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Prefer the vx_load version when the maximum available register length is wanted
+
+@note Alignment requirement: if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
+Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
+ */
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_t;
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    return reg_t(ptr);
+}
+#endif
+
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v_load, but source memory block must be aligned to the register size (16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
+@note Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd128_width / sizeof(_Tp)> reg_t;
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
+    return reg_t(ptr);
+}
+
+#if CV_SIMD256
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v256_load, but source memory block must be aligned to the register size (32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc)
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd256_width / sizeof(_Tp)> reg_t;
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
+    return reg_t(ptr);
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from memory (aligned)
+
+similar to cv::v512_load, but source memory block must be aligned to the register size (64-byte boundary in case of SIMD512, etc)
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_aligned version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
+{
+    typedef v_reg<_Tp, simd512_width / sizeof(_Tp)> reg_t;
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
+    return reg_t(ptr);
+}
+#endif
+
+/** @brief Load 64-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2); only nlanes/2 elements are read
+
+@code{.cpp}
+int lo[2] = { 1, 2 };
+v_int32x4 r = v_load_low(lo);
+@endcode
+
+@note Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> lowPart;
+    for( int k = 0, half = lowPart.nlanes/2; k < half; ++k )
+    {
+        lowPart.s[k] = ptr[k];
+    }
+    return lowPart;
+}
+
+#if CV_SIMD256
+/** @brief Load 128-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2); only nlanes/2 elements are read
+
+@code{.cpp}
+int lo[4] = { 1, 2, 3, 4 };
+v_int32x8 r = v256_load_low(lo);
+@endcode
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd256_width / sizeof(_Tp)> lowPart;
+    for (int k = 0, half = lowPart.nlanes / 2; k < half; ++k)
+    {
+        lowPart.s[k] = ptr[k];
+    }
+    return lowPart;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load 256-bits of data to lower part (high part is undefined).
+
+@param ptr memory block containing data for first half (0..n/2); only nlanes/2 elements are read
+
+@code{.cpp}
+int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+v_int32x16 r = v512_load_low(lo);
+@endcode
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_low version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> lowPart;
+    for (int k = 0, half = lowPart.nlanes / 2; k < half; ++k)
+    {
+        lowPart.s[k] = ptr[k];
+    }
+    return lowPart;
+}
+#endif
+
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
+v_int32x4 r = v_load_halves(lo, hi); // r = {1, 2, 3, 4}
+@endcode
+
+@note Use vx_load_halves version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> joined;
+    for( int k = 0, half = joined.nlanes/2; k < half; ++k )
+    {
+        joined.s[k] = loptr[k];
+        joined.s[k+half] = hiptr[k];
+    }
+    return joined;
+}
+
+#if CV_SIMD256
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
+v_int32x8 r = v256_load_halves(lo, hi); // r = {1, 2, 3, 4, 5, 6, 7, 8}
+@endcode
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_halves version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd256_width / sizeof(_Tp)> joined;
+    for (int k = 0, half = joined.nlanes / 2; k < half; ++k)
+    {
+        joined.s[k] = loptr[k];
+        joined.s[k + half] = hiptr[k];
+    }
+    return joined;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from two memory blocks
+
+@param loptr memory block containing data for first half (0..n/2)
+@param hiptr memory block containing data for second half (n/2..n)
+
+@code{.cpp}
+int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
+v_int32x16 r = v512_load_halves(lo, hi);
+@endcode
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_halves version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(loptr));
+    CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
+#endif
+    v_reg<_Tp, simd512_width / sizeof(_Tp)> joined;
+    for (int k = 0, half = joined.nlanes / 2; k < half; ++k)
+    {
+        joined.s[k] = loptr[k];
+        joined.s[k + half] = hiptr[k];
+    }
+    return joined;
+}
+#endif
+
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v_load, but the lanes of the result are 2x wider than the memory type, so half as many elements are read.
+
+@code{.cpp}
+short buf[4] = {1, 2, 3, 4}; // type is int16
+v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Use vx_load_expand version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, simd128_width / sizeof(wide_t)> widened;
+    for( int lane = 0; lane < widened.nlanes; ++lane )
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+
+#if CV_SIMD256
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v256_load, but the lanes of the result are 2x wider than the memory type, so half as many elements are read.
+
+@code{.cpp}
+short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
+v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_expand version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v256_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, simd256_width / sizeof(wide_t)> widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from memory with double expand
+
+Same as cv::v512_load, but the lanes of the result are 2x wider than the memory type, so half as many elements are read.
+
+@code{.cpp}
+short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
+v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
+@endcode
+For 8-, 16-, 32-bit integer source types.
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_expand version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
+v512_load_expand(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::w_type wide_t;
+    v_reg<wide_t, simd512_width / sizeof(wide_t)> widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+#endif
+
+/** @brief Load register contents from memory with quad expand
+
+Same as cv::v_load_expand, but the lanes of the result are 4 times wider than the source elements.
+@code{.cpp}
+char buf[4] = {1, 2, 3, 4}; // type is int8
+v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Use vx_load_expand_q version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    v_reg<quad_t, simd128_width / sizeof(quad_t)> widened;
+    for( int lane = 0; lane < widened.nlanes; ++lane )
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+
+#if CV_SIMD256
+/** @brief Load register contents from memory with quad expand
+
+Same as cv::v256_load_expand, but the lanes of the result are 4 times wider than the source elements.
+@code{.cpp}
+char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8
+v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Check CV_SIMD256 preprocessor definition prior to use.
+Use vx_load_expand_q version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v256_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    v_reg<quad_t, simd256_width / sizeof(quad_t)> widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+#endif
+
+#if CV_SIMD512
+/** @brief Load register contents from memory with quad expand
+
+Same as cv::v512_load_expand, but the lanes of the result are 4 times wider than the source elements.
+@code{.cpp}
+char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8
+v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
+@endcode
+For 8-bit integer source types.
+
+@note Check CV_SIMD512 preprocessor definition prior to use.
+Use vx_load_expand_q version to get maximum available register length result
+*/
+template<typename _Tp>
+inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
+v512_load_expand_q(const _Tp* ptr)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    typedef typename V_TypeTraits<_Tp>::q_type quad_t;
+    v_reg<quad_t, simd512_width / sizeof(quad_t)> widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+    {
+        widened.s[lane] = ptr[lane];
+    }
+    return widened;
+}
+#endif
+
+/** @brief Load and deinterleave (2 channels)
+
+Reads 2*n interleaved elements from memory and distributes them over 2 registers.
+Scheme:
+@code
+{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    const _Tp* src = ptr;
+    for( int lane = 0; lane < n; ++lane, src += 2 )
+    {
+        a.s[lane] = src[0];
+        b.s[lane] = src[1];
+    }
+}
+
+/** @brief Load and deinterleave (3 channels)
+
+Reads 3*n interleaved elements from memory and distributes them over 3 registers.
+Scheme:
+@code
+{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                                            v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    const _Tp* src = ptr;
+    for( int lane = 0; lane < n; ++lane, src += 3 )
+    {
+        a.s[lane] = src[0];
+        b.s[lane] = src[1];
+        c.s[lane] = src[2];
+    }
+}
+
+/** @brief Load and deinterleave (4 channels)
+
+Reads 4*n interleaved elements from memory and distributes them over 4 registers.
+Scheme:
+@code
+{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
+                                v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
+                                v_reg<_Tp, n>& d)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    const _Tp* src = ptr;
+    for( int lane = 0; lane < n; ++lane, src += 4 )
+    {
+        a.s[lane] = src[0];
+        b.s[lane] = src[1];
+        c.s[lane] = src[2];
+        d.s[lane] = src[3];
+    }
+}
+
+/** @brief Interleave and store (2 channels)
+
+Writes 2*n elements to memory, alternating between the lanes of 2 registers. The store mode argument is ignored here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                               const v_reg<_Tp, n>& b,
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    _Tp* dst = ptr;
+    for( int lane = 0; lane < n; ++lane, dst += 2 )
+    {
+        dst[0] = a.s[lane];
+        dst[1] = b.s[lane];
+    }
+}
+
+/** @brief Interleave and store (3 channels)
+
+Writes 3*n elements to memory, cycling through the lanes of 3 registers. The store mode argument is ignored here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    _Tp* dst = ptr;
+    for( int lane = 0; lane < n; ++lane, dst += 3 )
+    {
+        dst[0] = a.s[lane];
+        dst[1] = b.s[lane];
+        dst[2] = c.s[lane];
+    }
+}
+
+/** @brief Interleave and store (4 channels)
+
+Writes 4*n elements to memory, cycling through the lanes of 4 registers. The store mode argument is ignored here.
+Scheme:
+@code
+{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
+                                                            const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
+                                                            const v_reg<_Tp, n>& d,
+                                                            hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    _Tp* dst = ptr;
+    for( int lane = 0; lane < n; ++lane, dst += 4 )
+    {
+        dst[0] = a.s[lane];
+        dst[1] = b.s[lane];
+        dst[2] = c.s[lane];
+        dst[3] = d.s[lane];
+    }
+}
+
+/** @brief Store data to memory
+
+Copies all n lanes of the register to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer need not be register-aligned (with CV_STRONG_ALIGNMENT=1 it must still be aligned to the lane size). */
+template<typename _Tp, int n>
+inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int lane = 0; lane < n; ++lane )
+        ptr[lane] = a.s[lane];
+}
+
+template<typename _Tp, int n>
+inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)  // the store mode is accepted but not used here
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    v_store(ptr, a);  // forwards to the two-argument overload
+}
+
+/** @brief Store data to memory (lower half)
+
+Copies the first n/2 lanes of the register to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B}
+@endcode */
+template<typename _Tp, int n>
+inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int k = 0, half = n/2; k < half; ++k )
+        ptr[k] = a.s[k];
+}
+
+/** @brief Store data to memory (higher half)
+
+Copies the last n/2 lanes of the register to the start of the memory block.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {C D}
+@endcode */
+template<typename _Tp, int n>
+inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+#if CV_STRONG_ALIGNMENT
+    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
+#endif
+    for( int k = 0, half = n/2; k < half; ++k )
+        ptr[k] = a.s[k+half];
+}
+
+/** @brief Store data to memory (aligned)
+
+Store register contents to memory.
+Scheme:
+@code
+  REG {A B C D} ==> MEM {A B C D}
+@endcode
+Pointer __must__ be aligned to the register size (`sizeof(v_reg<_Tp, n>)`, i.e. a 16-byte boundary for 128-bit registers); this is asserted. */
+template<typename _Tp, int n>
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
+    v_store(ptr, a);
+}
+
+template<typename _Tp, int n>
+inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)  // in this implementation: same body as v_store_aligned
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));  // pointer must be aligned to the full register size
+    v_store(ptr, a);
+}
+
+template<typename _Tp, int n>
+inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)  // the store mode is accepted but not used here
+{
+    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));  // pointer must be aligned to the full register size
+    v_store(ptr, a);
+}
+
+/** @brief Combine vector from first elements of two vectors
+
+Scheme (lower half of a followed by lower half of b):
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A1 A2 B1 B2}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> merged;
+    for( int k = 0, half = n/2; k < half; ++k )
+    {
+        merged.s[k] = a.s[k];
+        merged.s[k+half] = b.s[k];
+    }
+    return merged;
+}
+
+/** @brief Combine vector from last elements of two vectors
+
+Scheme (upper half of a followed by upper half of b):
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+---------------
+  {A3 A4 B3 B4}
+@endcode
+For all types except 64-bit. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> merged;
+    for( int k = 0, half = n/2; k < half; ++k )
+    {
+        merged.s[k] = a.s[k+half];
+        merged.s[k+half] = b.s[k+half];
+    }
+    return merged;
+}
+
+/** @brief Combine two vectors from lower and higher parts of two other vectors
+
+@code{.cpp}
+low = cv::v_combine_low(a, b);
+high = cv::v_combine_high(a, b);
+@endcode */
+template<typename _Tp, int n>
+inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
+                        v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
+{
+    for( int k = 0, half = n/2; k < half; ++k )
+    {
+        low.s[k] = a.s[k];
+        low.s[k+half] = b.s[k];
+        high.s[k] = a.s[k+half];
+        high.s[k+half] = b.s[k+half];
+    }
+}
+
+/** @brief Vector reverse order
+
+Returns a copy of the vector with its lanes in the opposite order
+Scheme:
+@code
+  REG {A1 ... An} ==> REG {An ... A1}
+@endcode
+For all types. */
+template<typename _Tp, int n>
+inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
+{
+    v_reg<_Tp, n> flipped;
+    for( int lane = 0; lane < n; ++lane )
+        flipped.s[lane] = a.s[n-1-lane];
+    return flipped;
+}
+
+/** @brief Vector extract
+
+Scheme (the last n - shift lanes of a followed by the first shift lanes of b):
+@code
+  {A1 A2 A3 A4}
+  {B1 B2 B3 B4}
+========================
+shift = 1  {A2 A3 A4 B1}
+shift = 2  {A3 A4 B1 B2}
+shift = 3  {A4 B1 B2 B3}
+@endcode
+Restriction: 0 <= shift < nlanes
+
+Usage:
+@code
+v_int32x4 a, b, c;
+c = v_extract<2>(a, b);
+@endcode
+For all types. */
+template<int s, typename _Tp, int n>
+inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    v_reg<_Tp, n> out;
+    const int fromA = n - s;  // how many lanes come from the tail of a
+    int dst = 0;
+    for (; dst < fromA; ++dst)
+        out.s[dst] = a.s[s+dst];
+    for (; dst < n; ++dst)
+        out.s[dst] = b.s[dst-fromA];
+    return out;
+}
+
+/** @brief Vector extract (single lane)
+
+Scheme:
+Return the s-th element of v.
+Restriction: 0 <= s < nlanes (checked with CV_DbgAssert)
+
+Usage:
+@code
+v_int32x4 a;
+int r;
+r = v_extract_n<2>(a);
+@endcode
+For all types. */
+template<int s, typename _Tp, int n>
+inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
+{
+    CV_DbgAssert(s >= 0 && s < n);
+    return v.s[s];
+}
+
+/** @brief Broadcast i-th element of vector
+
+Scheme:
+@code
+{ v[0] v[1] v[2] ... v[SZ-1] } => { v[i], v[i], v[i] ... v[i] }
+@endcode
+Restriction: 0 <= i < nlanes (checked with CV_DbgAssert)
+Supported types: 32-bit integers and floats (s32/u32/f32)
+ */
+template<int i, typename _Tp, int n>
+inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
+{
+    CV_DbgAssert(i >= 0 && i < n);
+    return v_reg<_Tp, n>::all(a.s[i]);
+}
+
+/** @brief Round elements
+
+Applies cvRound to each lane. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
+{
+    v_reg<int, n> rounded;
+    for( int lane = 0; lane < n; ++lane )
+        rounded.s[lane] = cvRound(a.s[lane]);
+    return rounded;
+}
+
+/** @overload Rounds two double vectors into one int vector: lanes of a first, then lanes of b. */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<int, n*2> rounded;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        rounded.s[lane] = cvRound(a.s[lane]);
+        rounded.s[lane+n] = cvRound(b.s[lane]);
+    }
+    return rounded;
+}
+
+/** @brief Floor elements
+
+Applies cvFloor to each lane. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
+{
+    v_reg<int, n> floored;
+    for( int lane = 0; lane < n; ++lane )
+        floored.s[lane] = cvFloor(a.s[lane]);
+    return floored;
+}
+
+/** @brief Ceil elements
+
+Applies cvCeil to each lane. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
+{
+    v_reg<int, n> ceiled;
+    for( int lane = 0; lane < n; ++lane )
+        ceiled.s[lane] = cvCeil(a.s[lane]);
+    return ceiled;
+}
+
+/** @brief Truncate elements
+
+Converts each lane to int with a plain cast. Input type is float vector ==> output type is int vector.
+@note Only for floating point types.
+*/
+template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
+{
+    v_reg<int, n> truncated;
+    for( int lane = 0; lane < n; ++lane )
+        truncated.s[lane] = static_cast<int>(a.s[lane]);
+    return truncated;
+}
+
+/** @overload Rounds one double vector into the lower half of an int vector; the upper half is set to 0. */
+template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> rounded;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        rounded.s[lane] = cvRound(a.s[lane]);
+        rounded.s[lane+n] = 0;
+    }
+    return rounded;
+}
+
+/** @overload Floors one double vector into the lower half of an int vector; the upper half is set to 0. */
+template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> floored;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        floored.s[lane] = cvFloor(a.s[lane]);
+        floored.s[lane+n] = 0;
+    }
+    return floored;
+}
+
+/** @overload Ceils one double vector into the lower half of an int vector; the upper half is set to 0. */
+template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> ceiled;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        ceiled.s[lane] = cvCeil(a.s[lane]);
+        ceiled.s[lane+n] = 0;
+    }
+    return ceiled;
+}
+
+/** @overload Truncates one double vector into the lower half of an int vector; the upper half is set to 0. */
+template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
+{
+    v_reg<int, n*2> truncated;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        truncated.s[lane] = static_cast<int>(a.s[lane]);
+        truncated.s[lane+n] = 0;
+    }
+    return truncated;
+}
+
+/** @brief Convert to float
+
+Casts every int lane to float. Supported input type is cv::v_int32. */
+template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
+{
+    v_reg<float, n> asFloat;
+    for( int lane = 0; lane < n; ++lane )
+        asFloat.s[lane] = static_cast<float>(a.s[lane]);
+    return asFloat;
+}
+
+/** @brief Convert lower half to float
+
+The converted doubles fill the lower half of the result; the upper half is set to 0. Supported input type is cv::v_float64. */
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
+{
+    v_reg<float, n*2> asFloat;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        asFloat.s[lane] = static_cast<float>(a.s[lane]);
+        asFloat.s[lane+n] = 0;
+    }
+    return asFloat;
+}
+
+/** @brief Convert to float
+
+Converts two double vectors into one float vector: lanes of a first, then lanes of b. Supported input type is cv::v_float64. */
+template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
+{
+    v_reg<float, n*2> asFloat;
+    for( int lane = 0; lane < n; ++lane )
+    {
+        asFloat.s[lane] = static_cast<float>(a.s[lane]);
+        asFloat.s[lane+n] = static_cast<float>(b.s[lane]);
+    }
+    return asFloat;
+}
+
+/** @brief Convert lower half to double
+
+Casts the first n/2 int lanes to double. Supported input type is cv::v_int32. */
+template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
+{
+    v_reg<double, (n/2)> asDouble;
+    for( int k = 0, half = n/2; k < half; ++k )
+        asDouble.s[k] = static_cast<double>(a.s[k]);
+    return asDouble;
+}
+
+/** @brief Convert to double high part of vector
+
+Casts the last n/2 int lanes to double. Supported input type is cv::v_int32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
+{
+    v_reg<double, (n/2)> asDouble;
+    for( int k = 0, half = n/2; k < half; ++k )
+        asDouble.s[k] = static_cast<double>(a.s[k + half]);
+    return asDouble;
+}
+
+/** @brief Convert lower half to double
+
+Input lanes [0, n/2) are widened to double. Supported input type is cv::v_float32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
+{
+    v_reg<double, (n/2)> widened;
+    for( int lane = 0; lane < n/2; ++lane )
+        widened.s[lane] = static_cast<double>(a.s[lane]);
+    return widened;
+}
+
+/** @brief Convert the high part of the vector to double
+
+The n/2 input lanes starting at index n/2 are widened. Supported input type is cv::v_float32. */
+template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
+{
+    v_reg<double, (n/2)> widened;
+    for( int lane = 0; lane < n/2; ++lane )
+        widened.s[lane] = static_cast<double>(a.s[n/2 + lane]);
+    return widened;
+}
+
+/** @brief Convert to double
+
+Every 64-bit integer lane is cast to double, lane for lane. Supported input type is cv::v_int64. */
+template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
+{
+    v_reg<double, n> result;
+    for( int lane = 0; lane < n; ++lane )
+        result.s[lane] = static_cast<double>(a.s[lane]);
+    return result;
+}
+
+
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
+{
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> gathered;
+    // lane k is fetched from tab at the offset stored in idx[k]
+    for (int k = 0; k < gathered.nlanes; ++k) gathered.s[k] = tab[idx[k]];
+    return gathered;
+}
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
+{
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> pairs;
+    // lane k takes element (k % 2) of the pair that starts at tab[idx[k / 2]]
+    for (int k = 0; k < pairs.nlanes; ++k) pairs.s[k] = tab[idx[k / 2] + k % 2];
+    return pairs;
+}
+template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
+{
+    v_reg<_Tp, simd128_width / sizeof(_Tp)> quads;
+    // lane k takes element (k % 4) of the quad that starts at tab[idx[k / 4]]
+    for (int k = 0; k < quads.nlanes; ++k) quads.s[k] = tab[idx[k / 4] + k % 4];
+    return quads;
+}
+
+template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
+{
+    // Gather: result lane k is tab[idx.s[k]].
+    v_reg<int, n> gathered;
+    for( int k = 0; k < n; ++k ) gathered.s[k] = tab[idx.s[k]];
+    return gathered;
+}
+
+template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
+{
+    // Gather: lane i of the result is tab[idx.s[i]]; idx lanes are element offsets into tab.
+    v_reg<unsigned, n> c;  // accumulator now matches the declared return type (was v_reg<int, n>)
+    for (int i = 0; i < n; i++) c.s[i] = tab[idx.s[i]];
+    return c;
+}
+
+template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
+{
+    // Gather: result lane k is tab[idx.s[k]].
+    v_reg<float, n> gathered;
+    for( int k = 0; k < n; ++k ) gathered.s[k] = tab[idx.s[k]];
+    return gathered;
+}
+
+template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
+{
+    // Gather using only the first n/2 index lanes: result lane k is tab[idx.s[k]].
+    v_reg<double, n/2> gathered;
+    for( int k = 0; k < n/2; ++k ) gathered.s[k] = tab[idx.s[k]];
+    return gathered;
+}
+
+
+template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
+                                               v_reg<float, n>& x, v_reg<float, n>& y)
+{
+    // Each idx lane is the offset of an (x, y) pair stored contiguously in tab.
+    for( int k = 0; k < n; ++k )
+    {
+        const float* pair = tab + idx.s[k];
+        x.s[k] = pair[0]; y.s[k] = pair[1];
+    }
+}
+
+template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
+                                               v_reg<double, n>& x, v_reg<double, n>& y)
+{
+    // Only the first n of idx's 2*n lanes are used; each is the offset of an (x, y) pair in tab.
+    for( int k = 0; k < n; ++k )
+    {
+        const double* pair = tab + idx.s[k];
+        x.s[k] = pair[0]; y.s[k] = pair[1];
+    }
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
+{
+    // Within each full group of four lanes the two middle lanes trade places.
+    v_reg<_Tp, n> out;
+    for (int base = 0; base + 4 <= n; base += 4)
+    {
+        const _Tp* src = vec.s + base;
+        _Tp* dst = out.s + base;
+        dst[0] = src[0]; dst[1] = src[2]; dst[2] = src[1]; dst[3] = src[3];
+    }
+    return out;
+}
+
+// Per full 8-lane group, interleaves the low quad with the high quad:
+// source lanes {0,1,2,3,4,5,6,7} are emitted in the order {0,4,1,5,2,6,3,7}.
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
+{
+    v_reg<_Tp, n> out;
+    // lanes beyond the last full group of eight are not written by this loop
+    for (int base = 0; base + 8 <= n; base += 8)
+    {
+        for (int k = 0; k < 4; k++)
+        {
+            out.s[base + 2*k    ] = vec.s[base + k    ];
+            out.s[base + 2*k + 1] = vec.s[base + k + 4];
+        }
+    }
+    return out;
+}
+
+template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
+{
+    // Copies the first three lanes of every full group of four, back to back.
+    // Result lanes from index 3*(n/4) onward are not written by this function.
+    v_reg<_Tp, n> packed;
+    int w = 0;
+    for (int r = 0; r + 4 <= n; r += 4)
+        for (int k = 0; k < 3; k++)
+            packed.s[w++] = vec.s[r + k];
+    return packed;
+}
+
+/** @brief Transpose 4x4 matrix
+
+The inputs are only read; outputs must not alias them (b0 is written before a0..a3 are fully read). Scheme:
+@code
+a0  {A1 A2 A3 A4}
+a1  {B1 B2 B3 B4}
+a2  {C1 C2 C3 C4}
+a3  {D1 D2 D3 D4}
+===============
+b0  {A1 B1 C1 D1}
+b1  {A2 B2 C2 D2}
+b2  {A3 B3 C3 D3}
+b3  {A4 B4 C4 D4}
+@endcode
+*/
+template<typename _Tp, int n>
+inline void v_transpose4x4( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
+                            const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
+                            v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
+                            v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
+{
+    for (int i = 0; i < n / 4; i++)  // every group of four lanes is transposed independently
+    {
+        b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
+        b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
+        b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
+        b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
+        b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
+        b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
+        b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
+        b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
+    }
+}
+
+//! @brief Helper macro: defines prefix##_setzero_##suffix(), which returns _Tpvec::zero()
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
+inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }
+
+//! @name Init with zero
+//! @{
+//! @brief Create new vector with zero elements
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)
+
+#if CV_SIMD256
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
+#endif
+
+#if CV_SIMD512
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
+OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
+#endif
+//! @}
+
+//! @brief Helper macro: defines prefix##_setall_##suffix(val), which returns _Tpvec::all(val)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
+inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
+
+//! @name Init with value
+//! @{
+//! @brief Create new vector with elements set to a specific value
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)
+
+#if CV_SIMD256
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
+#endif
+
+#if CV_SIMD512
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
+OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
+#endif
+//! @}
+
+//! @brief Helper macro: defines v_reinterpret_as_##suffix(a), returning a _Tp register with n0*sizeof(_Tp0)/sizeof(_Tp) lanes
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
+template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
+    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
+{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }
+
+//! @name Reinterpret
+//! @{
+//! @brief Convert vector to different type without modifying underlying data.
+OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
+OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
+OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
+OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
+OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
+OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
+OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
+OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
+OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
+//! @}
+
+//! @brief Helper macro: defines v_shl<shift>(a) for lane type _Tp as a << shift
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
+{ return a << shift; }
+
+//! @name Left shift
+//! @{
+//! @brief Shift left
+OPENCV_HAL_IMPL_C_SHIFTL(ushort)
+OPENCV_HAL_IMPL_C_SHIFTL(short)
+OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTL(int)
+OPENCV_HAL_IMPL_C_SHIFTL(uint64)
+OPENCV_HAL_IMPL_C_SHIFTL(int64)
+//! @}
+
+//! @brief Helper macro: defines v_shr<shift>(a) for lane type _Tp as a >> shift
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
+{ return a >> shift; }
+
+//! @name Right shift
+//! @{
+//! @brief Shift right
+OPENCV_HAL_IMPL_C_SHIFTR(ushort)
+OPENCV_HAL_IMPL_C_SHIFTR(short)
+OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_SHIFTR(int)
+OPENCV_HAL_IMPL_C_SHIFTR(uint64)
+OPENCV_HAL_IMPL_C_SHIFTR(int64)
+//! @}
+
+//! @brief Helper macro: defines v_rshr<shift>(a), which adds 1 << (shift - 1) to each lane before shifting right (so shift must be >= 1)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
+template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
+{ \
+    v_reg<_Tp, n> c; \
+    for( int i = 0; i < n; i++ ) \
+        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
+    return c; \
+}
+
+//! @name Rounding shift
+//! @{
+//! @brief Rounding shift right
+OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
+OPENCV_HAL_IMPL_C_RSHIFTR(short)
+OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
+OPENCV_HAL_IMPL_C_RSHIFTR(int)
+OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
+OPENCV_HAL_IMPL_C_RSHIFTR(int64)
+//! @}
+
+//! @brief Helper macro: defines v_##pack_suffix(a, b); a fills result lanes [0, n), b fills [n, 2n), each lane converted with cast<_Tpn>
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) \
+    { \
+        c.s[i] = cast<_Tpn>(a.s[i]); \
+        c.s[i+n] = cast<_Tpn>(b.s[i]); \
+    } \
+    return c; \
+}
+
+//! @name Pack
+//! @{
+//! @brief Pack values from two vectors to one
+//!
+//! The returned vector has twice as many elements as each input vector. The variant with the _u_ suffix also
+//! converts to the corresponding unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines v_rshr_##pack_suffix<shift>(a, b); each lane gets 1 << (shift - 1) added, is shifted right, then converted with cast<_Tpn> (shift must be >= 1)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
+{ \
+    v_reg<_Tpn, 2*n> c; \
+    for( int i = 0; i < n; i++ ) \
+    { \
+        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
+        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
+    } \
+    return c; \
+}
+
+//! @name Pack with rounding shift
+//! @{
+//! @brief Pack values from two vectors to one with rounding shift
+//!
+//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
+//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines v_##pack_suffix##_store(ptr, a), which writes n lanes to ptr, each converted with cast<_Tpn>
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        ptr[i] = cast<_Tpn>(a.s[i]); \
+}
+
+//! @name Pack and store
+//! @{
+//! @brief Store values from the input vector into memory with pack
+//!
+//! Values will be stored into memory with conversion to narrower type.
+//! Variant with _u_ suffix converts to corresponding unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @brief Helper macro: defines v_rshr_##pack_suffix##_store<shift>(ptr, a); each lane is rounded-shifted right, converted with cast<_Tpn> and written to ptr (shift must be >= 1)
+//! @ingroup core_hal_intrin_impl
+#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
+template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
+{ \
+    for( int i = 0; i < n; i++ ) \
+        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
+}
+
+//! @name Pack and store with rounding shift
+//! @{
+//! @brief Store values from the input vector into memory with pack
+//!
+//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
+//! memory. Variant with _u_ suffix converts to unsigned type.
+//!
+//! - pack: for 16-, 32- and 64-bit integer input types
+//! - pack_u: for 16- and 32-bit signed integer input types
+//!
+//! @note All variants except 64-bit use saturation.
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
+OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
+//! @}
+
+//! @cond IGNORED
+template<typename _Tpm, typename _Tp, int n>
+inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
+{
+    // Casts the lanes of a into mptr[0..n) and the lanes of b into mptr[n..2n).
+    _Tpm* const lo = mptr;
+    _Tpm* const hi = mptr + n;
+    for (int k = 0; k < n; ++k)
+    { lo[k] = (_Tpm)a.s[k]; hi[k] = (_Tpm)b.s[k]; }
+}
+//! @endcond
+
+//! @name Pack boolean values
+//! @{
+//! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
+//!
+//! @note Must provide valid boolean values to guarantee same result for all architectures.
+
+/** @brief Pack boolean values
+For 16-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
+b  {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+
+template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
+{
+    v_reg<uchar, 2*n> packed;
+    for (int k = 0; k < n; ++k) { packed.s[k] = (uchar)a.s[k]; packed.s[k + n] = (uchar)b.s[k]; }
+    return packed;
+}
+
+/** @overload
+For 32-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF.. 0 0 0xFFFF..}
+b  {0 0xFFFF.. 0xFFFF.. 0}
+c  {0xFFFF.. 0 0xFFFF.. 0}
+d  {0 0xFFFF.. 0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0 0xFF 0xFF 0
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+
+template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
+                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
+{
+    v_reg<uchar, 4*n> packed;
+    uchar* dst = packed.s;  // (a, b) fill the first 2*n bytes, (c, d) the next 2*n
+    _pack_b(dst, a, b); _pack_b(dst + 2*n, c, d);
+    return packed;
+}
+
+/** @overload
+For 64-bit boolean values
+
+Scheme:
+@code
+a  {0xFFFF.. 0}
+b  {0 0xFFFF..}
+c  {0xFFFF.. 0}
+d  {0 0xFFFF..}
+
+e  {0xFFFF.. 0}
+f  {0xFFFF.. 0}
+g  {0 0xFFFF..}
+h  {0 0xFFFF..}
+===============
+{
+   0xFF 0 0 0xFF 0xFF 0 0 0xFF
+   0xFF 0 0xFF 0 0 0xFF 0 0xFF
+}
+@endcode */
+template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
+                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
+                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
+                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
+{
+    // Each _pack_b call narrows one pair of inputs into 2*n consecutive bytes,
+    // so the four pairs land at byte offsets 0, 2n, 4n and 6n of the result.
+    v_reg<uchar, 8*n> packed;
+    _pack_b(packed.s + 0*n, a, b);  _pack_b(packed.s + 2*n, c, d);
+    _pack_b(packed.s + 4*n, e, f);  _pack_b(packed.s + 6*n, g, h);
+    return packed;
+}
+//! @}
+
+/** @brief Matrix multiplication
+
+Scheme:
+@code
+{A0 A1 A2 A3}   |V0|
+{B0 B1 B2 B3}   |V1|
+{C0 C1 C2 C3}   |V2|
+{D0 D1 D2 D3} x |V3|
+====================
+{R0 R1 R2 R3}, where:
+R0 = A0V0 + B0V1 + C0V2 + D0V3,
+R1 = A1V0 + B1V1 + C1V2 + D1V3
+...
+@endcode
+*/
+template<int n>
+inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
+                                const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    v_reg<float, n> out;
+    for (int base = 0; base + 4 <= n; base += 4)
+    {
+        const float v0 = v.s[base], v1 = v.s[base + 1], v2 = v.s[base + 2], v3 = v.s[base + 3];
+        for (int col = base; col < base + 4; col++)
+            out.s[col] = v0 * a.s[col] + v1 * b.s[col] + v2 * c.s[col] + v3 * d.s[col];
+        // each output lane combines this group's four v lanes with the same lane of a, b, c, d
+    }
+    return out;
+}
+
+/** @brief Matrix multiplication and add
+
+Scheme:
+@code
+{A0 A1 A2 A3}   |V0|   |D0|
+{B0 B1 B2 B3}   |V1|   |D1|
+{C0 C1 C2 C3} x |V2| + |D2|
+====================   |D3|
+{R0 R1 R2 R3}, where:
+R0 = A0V0 + B0V1 + C0V2 + D0,
+R1 = A1V0 + B1V1 + C1V2 + D1
+...
+@endcode
+*/
+template<int n>
+inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
+                                   const v_reg<float, n>& a, const v_reg<float, n>& b,
+                                   const v_reg<float, n>& c, const v_reg<float, n>& d)
+{
+    v_reg<float, n> out;
+    for (int base = 0; base + 4 <= n; base += 4)
+    {
+        const float v0 = v.s[base], v1 = v.s[base + 1], v2 = v.s[base + 2];
+        for (int col = base; col < base + 4; col++)
+            out.s[col] = v0 * a.s[col] + v1 * b.s[col] + v2 * c.s[col] + d.s[col];
+        // the fourth lane of v in each group is unused; d is added as-is
+    }
+    return out;
+}
+
+
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } // low halves go through v_fma, with the product of the high halves as its third operand
+template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                           const v_reg<double, n/2>& c)
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } // same, with c passed through the inner v_fma
+// The _fast overloads below forward unchanged to v_dotprod_expand.
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
+{ return v_dotprod_expand(a, b); }
+template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
+                                                                const v_reg<double, n/2>& c)
+{ return v_dotprod_expand(a, b, c); }
+
+////// FP16 support ///////
+
+inline v_reg<float, simd128_width / sizeof(float)>
+v_load_expand(const float16_t* ptr)
+{
+    // Reads one float16_t per result lane from ptr and stores it as float.
+    typedef v_reg<float, simd128_width / sizeof(float)> wide_t;
+    wide_t widened;
+    for( int lane = 0; lane < widened.nlanes; ++lane )
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#if CV_SIMD256
+inline v_reg<float, simd256_width / sizeof(float)>
+v256_load_expand(const float16_t* ptr)
+{
+    // Reads one float16_t per result lane from ptr and stores it as float.
+    typedef v_reg<float, simd256_width / sizeof(float)> wide_t;
+    wide_t widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#endif
+#if CV_SIMD512
+inline v_reg<float, simd512_width / sizeof(float)>
+v512_load_expand(const float16_t* ptr)
+{
+    // Reads one float16_t per result lane from ptr and stores it as float.
+    typedef v_reg<float, simd512_width / sizeof(float)> wide_t;
+    wide_t widened;
+    for (int lane = 0; lane < widened.nlanes; ++lane)
+        widened.s[lane] = ptr[lane];
+    return widened;
+}
+#endif
+
+template<int n> inline void
+v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
+{
+    // Converts every float lane to float16_t and writes it to ptr in lane order.
+    const int lanes = v.nlanes;
+    for( int k = 0; k < lanes; ++k )
+        ptr[k] = float16_t(v.s[k]);
+}
+
+inline void v_cleanup() {} // empty body: nothing is done here
+#if CV_SIMD256
+inline void v256_cleanup() {} // empty body, same as v_cleanup
+#endif
+#if CV_SIMD512
+inline void v512_cleanup() {} // empty body, same as v_cleanup
+#endif
+
+//! @}
+
+#ifndef CV_DOXYGEN
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+#endif
+}
+
+#if !defined(CV_DOXYGEN)
+#undef CV_SIMD256
+#undef CV_SIMD512
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_forward.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_forward.hpp
new file mode 100644
index 0000000..979f15a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_forward.hpp
@@ -0,0 +1,191 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef CV__SIMD_FORWARD
+#error "Need to pre-define forward width"
+#endif
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+/** Types **/
+#if CV__SIMD_FORWARD == 1024
+// [todo] 1024
+#error "1024-long ops not implemented yet"
+#elif CV__SIMD_FORWARD == 512
+// 512
+#define __CV_VX(fun)   v512_##fun
+#define __CV_V_UINT8   v_uint8x64
+#define __CV_V_INT8    v_int8x64
+#define __CV_V_UINT16  v_uint16x32
+#define __CV_V_INT16   v_int16x32
+#define __CV_V_UINT32  v_uint32x16
+#define __CV_V_INT32   v_int32x16
+#define __CV_V_UINT64  v_uint64x8
+#define __CV_V_INT64   v_int64x8
+#define __CV_V_FLOAT32 v_float32x16
+#define __CV_V_FLOAT64 v_float64x8
+struct v_uint8x64;
+struct v_int8x64;
+struct v_uint16x32;
+struct v_int16x32;
+struct v_uint32x16;
+struct v_int32x16;
+struct v_uint64x8;
+struct v_int64x8;
+struct v_float32x16;
+struct v_float64x8;
+#elif CV__SIMD_FORWARD == 256
+// 256
+#define __CV_VX(fun)   v256_##fun
+#define __CV_V_UINT8   v_uint8x32
+#define __CV_V_INT8    v_int8x32
+#define __CV_V_UINT16  v_uint16x16
+#define __CV_V_INT16   v_int16x16
+#define __CV_V_UINT32  v_uint32x8
+#define __CV_V_INT32   v_int32x8
+#define __CV_V_UINT64  v_uint64x4
+#define __CV_V_INT64   v_int64x4
+#define __CV_V_FLOAT32 v_float32x8
+#define __CV_V_FLOAT64 v_float64x4
+struct v_uint8x32;
+struct v_int8x32;
+struct v_uint16x16;
+struct v_int16x16;
+struct v_uint32x8;
+struct v_int32x8;
+struct v_uint64x4;
+struct v_int64x4;
+struct v_float32x8;
+struct v_float64x4;
+#else
+// 128
+#define __CV_VX(fun)   v_##fun
+#define __CV_V_UINT8   v_uint8x16
+#define __CV_V_INT8    v_int8x16
+#define __CV_V_UINT16  v_uint16x8
+#define __CV_V_INT16   v_int16x8
+#define __CV_V_UINT32  v_uint32x4
+#define __CV_V_INT32   v_int32x4
+#define __CV_V_UINT64  v_uint64x2
+#define __CV_V_INT64   v_int64x2
+#define __CV_V_FLOAT32 v_float32x4
+#define __CV_V_FLOAT64 v_float64x2
+struct v_uint8x16;
+struct v_int8x16;
+struct v_uint16x8;
+struct v_int16x8;
+struct v_uint32x4;
+struct v_int32x4;
+struct v_uint64x2;
+struct v_int64x2;
+struct v_float32x4;
+struct v_float64x2;
+#endif
+
+/** Value reordering **/
+
+// Expansion
+void v_expand(const __CV_V_UINT8&,  __CV_V_UINT16&, __CV_V_UINT16&);
+void v_expand(const __CV_V_INT8&,   __CV_V_INT16&,  __CV_V_INT16&);
+void v_expand(const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
+void v_expand(const __CV_V_INT16&,  __CV_V_INT32&,  __CV_V_INT32&);
+void v_expand(const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
+void v_expand(const __CV_V_INT32&,  __CV_V_INT64&,  __CV_V_INT64&);
+// Low Expansion
+__CV_V_UINT16 v_expand_low(const __CV_V_UINT8&);
+__CV_V_INT16  v_expand_low(const __CV_V_INT8&);
+__CV_V_UINT32 v_expand_low(const __CV_V_UINT16&);
+__CV_V_INT32  v_expand_low(const __CV_V_INT16&);
+__CV_V_UINT64 v_expand_low(const __CV_V_UINT32&);
+__CV_V_INT64  v_expand_low(const __CV_V_INT32&);
+// High Expansion
+__CV_V_UINT16 v_expand_high(const __CV_V_UINT8&);
+__CV_V_INT16  v_expand_high(const __CV_V_INT8&);
+__CV_V_UINT32 v_expand_high(const __CV_V_UINT16&);
+__CV_V_INT32  v_expand_high(const __CV_V_INT16&);
+__CV_V_UINT64 v_expand_high(const __CV_V_UINT32&);
+__CV_V_INT64  v_expand_high(const __CV_V_INT32&);
+// Load & Low Expansion
+__CV_V_UINT16 __CV_VX(load_expand)(const uchar*);
+__CV_V_INT16  __CV_VX(load_expand)(const schar*);
+__CV_V_UINT32 __CV_VX(load_expand)(const ushort*);
+__CV_V_INT32  __CV_VX(load_expand)(const short*);
+__CV_V_UINT64 __CV_VX(load_expand)(const uint*);
+__CV_V_INT64  __CV_VX(load_expand)(const int*);
+// Load lower 8-bit and expand into 32-bit
+__CV_V_UINT32 __CV_VX(load_expand_q)(const uchar*);
+__CV_V_INT32  __CV_VX(load_expand_q)(const schar*);
+
+// Saturating Pack
+__CV_V_UINT8  v_pack(const __CV_V_UINT16&, const __CV_V_UINT16&);
+__CV_V_INT8   v_pack(const __CV_V_INT16&,  const __CV_V_INT16&);
+__CV_V_UINT16 v_pack(const __CV_V_UINT32&, const __CV_V_UINT32&);
+__CV_V_INT16  v_pack(const __CV_V_INT32&,  const __CV_V_INT32&);
+// Non-saturating Pack
+__CV_V_UINT32 v_pack(const __CV_V_UINT64&, const __CV_V_UINT64&);
+__CV_V_INT32  v_pack(const __CV_V_INT64&,  const __CV_V_INT64&);
+// Pack signed integers with unsigned saturation
+__CV_V_UINT8  v_pack_u(const __CV_V_INT16&, const __CV_V_INT16&);
+__CV_V_UINT16 v_pack_u(const __CV_V_INT32&, const __CV_V_INT32&);
+
+/** Arithmetic, bitwise and comparison operations **/
+
+// Non-saturating multiply
+#if CV_VSX
+template<typename Tvec>
+Tvec v_mul_wrap(const Tvec& a, const Tvec& b);
+#else
+__CV_V_UINT8  v_mul_wrap(const __CV_V_UINT8&,  const __CV_V_UINT8&);
+__CV_V_INT8   v_mul_wrap(const __CV_V_INT8&,   const __CV_V_INT8&);
+__CV_V_UINT16 v_mul_wrap(const __CV_V_UINT16&, const __CV_V_UINT16&);
+__CV_V_INT16  v_mul_wrap(const __CV_V_INT16&,  const __CV_V_INT16&);
+#endif
+
+//  Multiply and expand
+#if CV_VSX
+template<typename Tvec, typename Twvec>
+void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d);
+#else
+void v_mul_expand(const __CV_V_UINT8&,  const __CV_V_UINT8&,  __CV_V_UINT16&, __CV_V_UINT16&);
+void v_mul_expand(const __CV_V_INT8&,   const __CV_V_INT8&,   __CV_V_INT16&,  __CV_V_INT16&);
+void v_mul_expand(const __CV_V_UINT16&, const __CV_V_UINT16&, __CV_V_UINT32&, __CV_V_UINT32&);
+void v_mul_expand(const __CV_V_INT16&,  const __CV_V_INT16&,  __CV_V_INT32&,  __CV_V_INT32&);
+void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __CV_V_UINT64&);
+void v_mul_expand(const __CV_V_INT32&,  const __CV_V_INT32&,  __CV_V_INT64&,  __CV_V_INT64&);
+#endif
+
+// Conversions
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
+__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
+__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
+__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
+__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);
+
+/** Cleanup **/
+#undef CV__SIMD_FORWARD
+#undef __CV_VX
+#undef __CV_V_UINT8
+#undef __CV_V_INT8
+#undef __CV_V_UINT16
+#undef __CV_V_INT16
+#undef __CV_V_UINT32
+#undef __CV_V_INT32
+#undef __CV_V_UINT64
+#undef __CV_V_INT64
+#undef __CV_V_FLOAT32
+#undef __CV_V_FLOAT64
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_msa.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_msa.hpp
new file mode 100644
index 0000000..c035fda
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_msa.hpp
@@ -0,0 +1,1887 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_MSA_HPP
+#define OPENCV_HAL_INTRIN_MSA_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+
+// MSA implements 128-bit wide vector registers that are shared with the 64-bit wide floating-point unit registers.
+// MSA and the FPU cannot both be present unless the FPU has 64-bit floating-point registers.
+#define CV_SIMD128_64F 1
+
+struct v_uint8x16
+{
+    // Wrapper around one v16u8 value: sixteen lanes of uchar.
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {}  // val is not initialized here
+    explicit v_uint8x16(v16u8 raw) : val(raw) {}
+    // Collects the sixteen scalars into a temporary array (argument order) and loads it.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_u8(lanes);
+    }
+
+    // Reads lane 0 of val.
+    uchar get0() const { return msa_getq_lane_u8(val, 0); }
+
+    v16u8 val;
+};
+
+struct v_int8x16
+{
+    // Wrapper around one v16i8 value: sixteen lanes of schar.
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {}  // val is not initialized here
+    explicit v_int8x16(v16i8 raw) : val(raw) {}
+    // Collects the sixteen scalars into a temporary array (argument order) and loads it.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = msa_ld1q_s8(lanes);
+    }
+
+    // Reads lane 0 of val.
+    schar get0() const { return msa_getq_lane_s8(val, 0); }
+
+    v16i8 val;
+};
+
+struct v_uint16x8
+{
+    // Wrapper around one v8u16 value: eight lanes of ushort.
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {}  // val is not initialized here
+    explicit v_uint16x8(v8u16 raw) : val(raw) {}
+    // Collects the eight scalars into a temporary array (argument order) and loads it.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_u16(lanes);
+    }
+
+    // Reads lane 0 of val.
+    ushort get0() const { return msa_getq_lane_u16(val, 0); }
+
+    v8u16 val;
+};
+
+struct v_int16x8
+{
+    // Wrapper around one v8i16 value: eight lanes of short.
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {}  // val is not initialized here
+    explicit v_int16x8(v8i16 raw) : val(raw) {}
+    // Collects the eight scalars into a temporary array (argument order) and loads it.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = msa_ld1q_s16(lanes);
+    }
+
+    // Reads lane 0 of val.
+    short get0() const { return msa_getq_lane_s16(val, 0); }
+
+    v8i16 val;
+};
+
+struct v_uint32x4
+{
+    // Wrapper around one v4u32 value: four lanes of unsigned int.
+    typedef unsigned int lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {}  // val is not initialized here
+    explicit v_uint32x4(v4u32 raw) : val(raw) {}
+    // Collects the four scalars into a temporary array (argument order) and loads it.
+    v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3)
+    {
+        unsigned int lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_u32(lanes);
+    }
+
+    // Reads lane 0 of val.
+    unsigned int get0() const { return msa_getq_lane_u32(val, 0); }
+
+    v4u32 val;
+};
+
+struct v_int32x4
+{
+    // Wrapper around one v4i32 value: four lanes of int.
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {}  // val is not initialized here
+    explicit v_int32x4(v4i32 raw) : val(raw) {}
+    // Collects the four scalars into a temporary array (argument order) and loads it.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_s32(lanes);
+    }
+
+    // Reads lane 0 of val.
+    int get0() const { return msa_getq_lane_s32(val, 0); }
+
+    v4i32 val;
+};
+
+struct v_float32x4
+{
+    // Wrapper around one v4f32 value: four lanes of float.
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {}  // val is not initialized here
+    explicit v_float32x4(v4f32 raw) : val(raw) {}
+    // Collects the four scalars into a temporary array (argument order) and loads it.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float lanes[] = {v0, v1, v2, v3};
+        val = msa_ld1q_f32(lanes);
+    }
+
+    // Reads lane 0 of val.
+    float get0() const { return msa_getq_lane_f32(val, 0); }
+
+    v4f32 val;
+};
+
+struct v_uint64x2
+{
+    // Wrapper around one v2u64 value: two lanes of uint64.
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {}  // val is not initialized here
+    explicit v_uint64x2(v2u64 raw) : val(raw) {}
+    // Collects the two scalars into a temporary array (argument order) and loads it.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 lanes[] = {v0, v1};
+        val = msa_ld1q_u64(lanes);
+    }
+
+    // Reads lane 0 of val.
+    uint64 get0() const { return msa_getq_lane_u64(val, 0); }
+
+    v2u64 val;
+};
+
+struct v_int64x2
+{
+    // Wrapper around one v2i64 value: two lanes of int64.
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {}  // val is not initialized here
+    explicit v_int64x2(v2i64 raw) : val(raw) {}
+    // Collects the two scalars into a temporary array (argument order) and loads it.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 lanes[] = {v0, v1};
+        val = msa_ld1q_s64(lanes);
+    }
+
+    // Reads lane 0 of val.
+    int64 get0() const { return msa_getq_lane_s64(val, 0); }
+
+    v2i64 val;
+};
+
+struct v_float64x2
+{
+    // Wrapper around one v2f64 value: two lanes of double.
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {}  // val is not initialized here
+    explicit v_float64x2(v2f64 raw) : val(raw) {}
+    // Collects the two scalars into a temporary array (argument order) and loads it.
+    v_float64x2(double v0, double v1)
+    {
+        double lanes[] = {v0, v1};
+        val = msa_ld1q_f64(lanes);
+    }
+
+    // Reads lane 0 of val.
+    double get0() const { return msa_getq_lane_f64(val, 0); }
+
+    v2f64 val;
+};
+
+#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) /* for vector type v_<_Tpv> with lane type _Tp: defines v_setzero_<suffix>, v_setall_<suffix> and the ten v_reinterpret_as_ overloads */ \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); }
+
+OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8) // one instantiation per vector struct declared above
+OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32)
+OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64)
+
+#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) /* defines v_<pack>(a, b) as mov(a, b) and the template v_rshr_<pack><n>(a, b) as rshr(a, b, n); inputs are _Tpwvec, result is _Tpvec */ \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(mov(a.val, b.val)); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    return _Tpvec(rshr(a.val, b.val, n)); \
+}
+
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16) // two u16 vectors into one u8 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16) // two s16 vectors into one s8 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32) // two u32 vectors into one u16 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32) // two s32 vectors into one s16 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64, msa_rpackr_u64) // two u64 vectors into one u32 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64) // two s64 vectors into one s32 vector
+OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16) // v_pack_u: signed s16 input, unsigned u8 result
+OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32) // v_pack_u: signed s32 input, unsigned u16 result
+
+#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) /* defines v_<pack>_store(ptr, a) and v_rshr_<pack>_store<n>(ptr, a): convert a with mov / rshr into an hreg value and store it through msa_st1_<suffix>; _Tpvec is not referenced in the body */ \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = mov(a.val); \
+    msa_st1_##suffix(ptr, a1); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    msa_st1_##suffix(ptr, a1); \
+}
+
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16) // u16 vector stored as uchar
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16) // s16 vector stored as schar
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32) // u32 vector stored as ushort
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32) // s32 vector stored as short
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64) // u64 vector stored as unsigned
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64) // s64 vector stored as int
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16) // v_pack_u_store: s16 vector stored as uchar
+OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32) // v_pack_u_store: s32 vector stored as ushort
+
+// pack boolean: reduce 2, 4 or 8 wider unsigned vectors to a single v_uint8x16
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_uint8x16(msa_pack_u16(a.val, b.val)); }
+
+// Four 32-bit vectors: msa_pack_u32 on each pair, then msa_pack_u16 on the two results.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    v8u16 ab = msa_pack_u32(a.val, b.val);
+    v8u16 cd = msa_pack_u32(c.val, d.val);
+    return v_uint8x16(msa_pack_u16(ab, cd));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{   // Eight 64-bit vectors: msa_pack_u64 per pair, then msa_pack_u32, then msa_pack_u16.
+    return v_uint8x16(msa_pack_u16(msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val)),
+                                   msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val))));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    // m0..m3 are combined with lanes 0..3 of v through the mul-lane / mla-lane intrinsics.
+    v4f32 coeffs = v.val;
+    v4f32 acc = msa_mulq_lane_f32(m0.val, coeffs, 0);
+    acc = msa_mlaq_lane_f32(acc, m1.val, coeffs, 1);
+    acc = msa_mlaq_lane_f32(acc, m2.val, coeffs, 2);
+    return v_float32x4(msa_mlaq_lane_f32(acc, m3.val, coeffs, 3));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    // m0..m2 are combined with lanes 0..2 of v; a is then added with msa_addq_f32.
+    v4f32 coeffs = v.val;
+    v4f32 acc = msa_mulq_lane_f32(m0.val, coeffs, 0);
+    acc = msa_mlaq_lane_f32(acc, m1.val, coeffs, 1);
+    acc = msa_mlaq_lane_f32(acc, m2.val, coeffs, 2);
+    return v_float32x4(msa_addq_f32(acc, a.val));
+}
+
+#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) /* defines operator bin_op and the compound operator bin_op= for _Tpvec, both forwarding to intrin(a.val, b.val) */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) // 8- and 16-bit + and - use the msa_qaddq / msa_qsubq intrinsics
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32) // 32-bit, 64-bit and floating-point types use msa_addq / msa_subq / msa_mulq / msa_divq
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) // 64-bit integer vectors get + and - only in this list
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64)
+
+// saturating multiply 8-bit, 16-bit: operator * widens via v_mul_expand into two _Tpwvec values and narrows them back with v_pack; operator *= reuses operator *
+#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec)         \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+{                                                            \
+    _Tpwvec c, d;                                            \
+    v_mul_expand(a, b, c, d);                                \
+    return v_pack(c, d);                                     \
+}                                                            \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+{a = a * b; return a; }
+
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16,  v_int16x8) // s8 product goes through s16
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) // u8 product goes through u16
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8,  v_int32x4) // s16 product goes through s32
+OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4) // u16 product goes through u32
+
+// Multiply and expand: c and d receive the 16-bit products of the 8-bit signed inputs
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    v16i8 a0, a1, b0, b1;
+    // Each operand goes through ILVRL_B2_SB together with an all-zero vector, producing two vectors.
+    ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a0, a1);
+    ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b0, b1);
+    c.val = msa_mulq_s16(msa_paddlq_s8(a0), msa_paddlq_s8(b0));
+    d.val = msa_mulq_s16(msa_paddlq_s8(a1), msa_paddlq_s8(b1));
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    v16u8 a0, a1, b0, b1;
+    // Each operand goes through ILVRL_B2_UB together with an all-zero vector, producing two vectors.
+    ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a0, a1);
+    ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b0, b1);
+    c.val = msa_mulq_u16(msa_paddlq_u8(a0), msa_paddlq_u8(b0));
+    d.val = msa_mulq_u16(msa_paddlq_u8(a1), msa_paddlq_u8(b1));
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    v8i16 a0, a1, b0, b1;
+    // Each operand goes through ILVRL_H2_SH together with an all-zero vector, producing two vectors.
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a0, a1);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b0, b1);
+    c.val = msa_mulq_s32(msa_paddlq_s16(a0), msa_paddlq_s16(b0));
+    d.val = msa_mulq_s32(msa_paddlq_s16(a1), msa_paddlq_s16(b1));
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    v8u16 a0, a1, b0, b1;
+    // Each operand goes through ILVRL_H2_UH together with an all-zero vector, producing two vectors.
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a0, a1);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b0, b1);
+    c.val = msa_mulq_u32(msa_paddlq_u16(a0), msa_paddlq_u16(b0));
+    d.val = msa_mulq_u32(msa_paddlq_u16(a1), msa_paddlq_u16(b1));
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    v4u32 a0, a1, b0, b1;
+    // Each operand goes through ILVRL_W2_UW together with an all-zero vector, producing two vectors.
+    ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a0, a1);
+    ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b0, b1);
+    c.val = msa_mulq_u64(msa_paddlq_u32(a0), msa_paddlq_u32(b0));
+    d.val = msa_mulq_u64(msa_paddlq_u32(a1), msa_paddlq_u32(b1));
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    v8i16 a0, a1, b0, b1;
+    // Same widening as the 16-bit v_mul_expand: ILVRL_H2_SH with a zero vector, then msa_paddlq_s16.
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a0, a1);
+    ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b0, b1);
+    // The two 32-bit product vectors are combined by msa_packr_s32 with a shift argument of 16.
+    return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a0), msa_paddlq_s16(b0)),
+                                   msa_mulq_s32(msa_paddlq_s16(a1), msa_paddlq_s16(b1)), 16));
+}
+
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v8u16 a0, a1, b0, b1;
+    // Same widening as the 16-bit v_mul_expand: ILVRL_H2_UH with a zero vector, then msa_paddlq_u16.
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a0, a1);
+    ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b0, b1);
+    // The two 32-bit product vectors are combined by msa_packr_u32 with a shift argument of 16.
+    return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a0), msa_paddlq_u16(b0)),
+                                    msa_mulq_u32(msa_paddlq_u16(a1), msa_paddlq_u16(b1)), 16));
+}
+
+//////// Dot Product ////////
+
+// 16 >> 32: signed 16-bit inputs, signed 32-bit result; the three-argument overload passes c as the first operand of msa_dpadd_s_w
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); }
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); }
+
+// 32 >> 64: signed 32-bit inputs, signed 64-bit result; the three-argument overload passes c as the first operand of msa_dpadd_s_d
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); }
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); }
+
+// 8 >> 32: unsigned 8-bit inputs, unsigned 32-bit result
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    v8u16 a_hi8 = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);  // upper byte of every 16-bit lane
+    v8u16 b_hi8 = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
+    v8u16 a_lo8 = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);  // shift up then down by 8: lower byte
+    v8u16 b_lo8 = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
+    v4u32 acc   = msa_dotp_u_w(a_lo8, b_lo8);
+    return v_uint32x4(msa_dpadd_u_w(acc, a_hi8, b_hi8));
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    v8u16 a_hi8 = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8);  // upper byte of every 16-bit lane
+    v8u16 b_hi8 = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8);
+    v8u16 a_lo8 = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8);  // shift up then down by 8: lower byte
+    v8u16 b_lo8 = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8);
+    v4u32 acc   = msa_dpadd_u_w(c.val, a_lo8, b_lo8);  // starts from the caller's accumulator c
+    return v_uint32x4(msa_dpadd_u_w(acc, a_hi8, b_hi8));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    v8i16 pair_sums = msa_dotp_s_h(a.val, b.val);  // 16-bit intermediate from the 8-bit dot product
+    return v_int32x4(msa_hadd_s32(pair_sums, pair_sums));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{ v_int32x4 total = v_dotprod_expand(a, b); total += c; return total; }
+
+// 16 >> 64: unsigned 16-bit inputs, unsigned 64-bit result
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v4u32 a_hi16 = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);  // upper half of every 32-bit lane
+    v4u32 b_hi16 = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
+    v4u32 a_lo16 = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);  // shift up then down by 16: lower half
+    v4u32 b_lo16 = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
+    v2u64 acc    = msa_dotp_u_d(a_lo16, b_lo16);
+    return v_uint64x2(msa_dpadd_u_d(acc, a_hi16, b_hi16));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
+                                   const v_uint64x2& c)
+{
+    v4u32 a_hi16 = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16);  // upper half of every 32-bit lane
+    v4u32 b_hi16 = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16);
+    v4u32 a_lo16 = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16);  // shift up then down by 16: lower half
+    v4u32 b_lo16 = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16);
+    v2u64 acc    = msa_dpadd_u_d(c.val, a_lo16, b_lo16);  // starts from the caller's accumulator c
+    return v_uint64x2(msa_dpadd_u_d(acc, a_hi16, b_hi16));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    v4i32 pair_sums = msa_dotp_s_w(a.val, b.val);  // 32-bit intermediate from the 16-bit dot product
+    return v_int64x2(msa_hadd_s64(pair_sums, pair_sums));
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ v_int64x2 total = v_dotprod_expand(a, b); total += c; return total; }
+
+// 32 >> 64f: signed 32-bit inputs, float64 result obtained by passing v_dotprod(a, b) to v_cvt_f64
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+
+//////// Fast Dot Product: every overload below simply forwards to the regular v_dotprod / v_dotprod_expand ////////
+
+// 16 >> 32: forwards to v_dotprod
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64: forwards to v_dotprod
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32: forwards to v_dotprod_expand (unsigned and signed 8-bit inputs)
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64: forwards to v_dotprod_expand (unsigned and signed 16-bit inputs)
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f: forwards to v_dotprod_expand (signed 32-bit inputs, float64 result)
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) /* defines &, |, ^ (with their compound forms) via OPENCV_HAL_IMPL_MSA_BIN_OP, plus unary ~ done on the v16u8 view of the value */ \
+OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix)   \
+OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix)   \
+OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix)   \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \
+}
+
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8) // instantiated for the eight integer vector types only
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64)
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64)
+
+#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) /* bitwise operator and compound operator for v_float32x4: operands are reinterpreted as v4i32, passed to intrin, and the result reinterpreted back to v4f32 */ \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) // float32 &, &=
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) // float32 |, |=
+OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) // float32 ^, ^=
+
+inline v_float32x4 operator ~ (const v_float32x4& a) // unary ~ through the same v4i32 view, using msa_mvnq_s32
+{
+    return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+/* v_abs for signed integer vectors: applies msa_absq_<ssuffix> and returns the result reinterpreted as the unsigned vector type */
+#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) \
+{ \
+    return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8) // v_abs(v_int8x16) returns v_uint8x16
+OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16) // v_abs(v_int16x8) returns v_uint16x8
+OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32) // v_abs(v_int32x4) returns v_uint32x4
+
+/* v_abs(float), v_sqrt, v_invsqrt: one-argument functions of the form func(a) = _Tpvec(intrin(a.val)) */
+#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a) \
+{ \
+    return _Tpvec(intrin(a.val)); \
+}
+
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32) // float32 and float64 each get v_abs, v_sqrt and v_invsqrt
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64)
+OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64)
+
+#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) /* bitwise operator and compound operator for v_float64x2: operands are reinterpreted as v2i64, passed to intrin, and the result reinterpreted back to v2f64 */ \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) // float64 &, &=
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) // float64 |, |=
+OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) // float64 ^, ^=
+
+inline v_float64x2 operator ~ (const v_float64x2& a) // unary ~ goes through the v4i32 view and msa_mvnq_s32, unlike the v2i64 view used by the binary operators
+{
+    return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))));
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) /* defines the two-argument function func(a, b) = _Tpvec(intrin(a.val, b.val)) */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8) // v_min / v_max for 8-, 16- and 32-bit integers and both float types
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64)
+
+#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) /* defines ==, !=, <, >, <=, >= for _Tpvec; each result is the compare intrinsic's output reinterpreted as _Tpv, and != is msa_mvnq_<not_suffix> applied to the == result */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); }
+
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) // despite the INT in the macro name, the float types are instantiated below as well
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64)
+OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return a == a; }  // operator== expands to the same msa_ceqq_f32(a.val, a.val) reinterpret
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return a == a; }  // operator== expands to the same msa_ceqq_f64(a.val, a.val) reinterpret
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8) // v_add_wrap uses msa_addq, whereas operator + for these types uses msa_qaddq
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8) // v_sub_wrap uses msa_subq, whereas operator - for these types uses msa_qsubq
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8) // v_mul_wrap uses msa_mulq directly, whereas operator * for these types widens and packs
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16)
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8) // v_absdiff for unsigned and floating-point types returns the input type
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, msa_abdq_f64)
+
+/** Saturating absolute difference: v_absdiffs keeps the signed input type **/
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16)
+
+#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) /* like OPENCV_HAL_IMPL_MSA_BIN_FUNC, but the intrinsic result is reinterpreted as _Tpv and returned as a different vector type _Tpvec2 */ \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8) // v_absdiff on signed inputs returns the unsigned vector type
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16)
+OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32)
+
+/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd (float32 and int32 variants) */
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    // v_sqrt applied to the same msa_mlaq_f32 expression that v_sqr_magnitude returns.
+    return v_sqrt(v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)));
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val));  // a*a fed to mla with b, b
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));  // c is the accumulator operand
+}
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));  // c is the accumulator operand
+}
+
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val));  // same expression as v_fma
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val));  // same expression as v_fma
+}
+
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
+    return v_sqrt(x);
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val));
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) {  // multiply-add per lane; c is passed as the accumulator operand
+    const v2f64 acc = msa_mlaq_f64(c.val, a.val, b.val);
+    return v_float64x2(acc);
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) {  // forwards to v_fma(a, b, c)
+    const v_float64x2 result = v_fma(a, b, c);
+    return result;
+}
+
+// trade efficiency for convenience
+#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(vec_t, sfx, lane_t, cnt_sfx) /* Shift family for vec_t: run-time counts are cast to lane_t and broadcast with msa_dupq_n_<cnt_sfx>; the templates pass n straight to the _n intrinsics. */ \
+inline vec_t operator << (const vec_t& a, int n) \
+{ const lane_t count = (lane_t)n; return vec_t(msa_shlq_##sfx(a.val, msa_dupq_n_##cnt_sfx(count))); } \
+inline vec_t operator >> (const vec_t& a, int n) \
+{ const lane_t count = (lane_t)n; return vec_t(msa_shrq_##sfx(a.val, msa_dupq_n_##cnt_sfx(count))); } \
+template<int n> inline vec_t v_shl(const vec_t& a) /* compile-time count */ \
+{ return vec_t(msa_shlq_n_##sfx(a.val, n)); } \
+template<int n> inline vec_t v_shr(const vec_t& a) /* compile-time count */ \
+{ return vec_t(msa_shrq_n_##sfx(a.val, n)); } \
+template<int n> inline vec_t v_rshr(const vec_t& a) /* msa_rshrq_n_*: presumably the rounding right shift - confirm against msa_macros.h */ \
+{ return vec_t(msa_rshrq_n_##sfx(a.val, n)); }
+
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+/* v_rotate_right, v_rotate_left */
+#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) /* Emits the one- and two-vector v_rotate_right / v_rotate_left templates for _Tpvec, all built on msa_extq_<suffix>; _Tpvs is the signed vector type the intrinsic is fed. */ \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); /* second source is an all-zero vector */ \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); /* sources swapped, offset nlanes - n */ \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) /* n == 0 is special-cased; the generic form would request offset nlanes */ \
+{ \
+    return a; \
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) /* two-vector form: b takes the place of the zero vector */ \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) /* two-vector form: b takes the place of the zero vector */ \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) /* n == 0: a is returned untouched and b is ignored */ \
+{ \
+    CV_UNUSED(b); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32) // float lanes are rotated through the same-width signed integer view
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64) // double lanes are rotated through the same-width signed integer view
+
+#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(vec_t, lane_t, sfx) /* Load/store family for vec_t over lane_t memory; every full-width variant maps to the same msa_ld1q_ / msa_st1q_ intrinsic. */ \
+inline vec_t v_load(const lane_t* ptr) \
+{ return vec_t(msa_ld1q_##sfx(ptr)); } \
+inline vec_t v_load_aligned(const lane_t* ptr) /* same intrinsic as v_load */ \
+{ return vec_t(msa_ld1q_##sfx(ptr)); } \
+inline vec_t v_load_low(const lane_t* ptr) /* one half-width load combined with a zero-filled half */ \
+{ return vec_t(msa_combine_##sfx(msa_ld1_##sfx(ptr), msa_dup_n_##sfx((lane_t)0))); } \
+inline vec_t v_load_halves(const lane_t* ptr0, const lane_t* ptr1) /* two independent half-width loads combined */ \
+{ return vec_t(msa_combine_##sfx(msa_ld1_##sfx(ptr0), msa_ld1_##sfx(ptr1))); } \
+inline void v_store(lane_t* ptr, const vec_t& a) \
+{ msa_st1q_##sfx(ptr, a.val); } \
+inline void v_store_aligned(lane_t* ptr, const vec_t& a) /* same intrinsic as v_store */ \
+{ msa_st1q_##sfx(ptr, a.val); } \
+inline void v_store_aligned_nocache(lane_t* ptr, const vec_t& a) /* same intrinsic as v_store */ \
+{ msa_st1q_##sfx(ptr, a.val); } \
+inline void v_store(lane_t* ptr, const vec_t& a, hal::StoreMode /*mode*/) /* the store mode is accepted but not used */ \
+{ msa_st1q_##sfx(ptr, a.val); } \
+inline void v_store_low(lane_t* ptr, const vec_t& a) /* scalar copy of lanes [0, nlanes/2) */ \
+{ \
+    const int half = vec_t::nlanes / 2; \
+    for( int lane = 0; lane < half; ++lane ) \
+        ptr[lane] = a.val[lane]; \
+} \
+inline void v_store_high(lane_t* ptr, const vec_t& a) /* scalar copy of lanes [nlanes/2, nlanes) */ \
+{ \
+    const int half = vec_t::nlanes / 2; \
+    for( int lane = 0; lane < half; ++lane ) \
+        ptr[lane] = a.val[half + lane]; \
+}
+
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64)
+
+
+/** Reverse **/
+inline v_uint8x16 v_reverse(const v_uint8x16 &a) // byte-lane reversal, done with a single byte shuffle (vshf_b)
+{
+    v_uint8x16 c = v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val)); // shuffle control: read as little-endian bytes the two constants spell 15, 14, ..., 0; sources are a zero vector and a
+    return c;
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a) // signed variant: reinterpret as unsigned, reverse, reinterpret back
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a) // halfword-lane reversal, done with a single halfword shuffle (vshf_h)
+{
+    v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val)); // shuffle control: read as little-endian halfwords the two constants spell 7, 6, ..., 0; sources are a zero vector and a
+    return c;
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) // signed variant: reinterpret as unsigned, reverse, reinterpret back
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a) {  // output lane i is input lane 3 - i
+    const v4u32 swapped = {
+        a.val[3],
+        a.val[2],
+        a.val[1],
+        a.val[0]
+    };
+    return v_uint32x4(swapped);
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)  // signed lanes: reverse through the unsigned view
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_reinterpret_as_s32(v_reverse(bits)); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)  // float lanes: reverse through the unsigned view
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_reinterpret_as_f32(v_reverse(bits)); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a) {  // the two 64-bit lanes trade places
+    const v2u64 swapped = {
+        a.val[1],
+        a.val[0]
+    };
+    return v_uint64x2(swapped);
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)  // signed lanes: reverse through the unsigned view
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_reinterpret_as_s64(v_reverse(bits)); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)  // double lanes: reverse through the unsigned view
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_reinterpret_as_f64(v_reverse(bits)); }
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) /* Emits v_reduce_<func> for v_uint16x8 (eight unsigned lanes): func picks the msa_<func>q_* intrinsics, cfunc is the scalar combiner for the final pair. */ \
+inline unsigned short v_reduce_##func(const v_uint16x8& a) \
+{ \
+    v8u16 a_lo, a_hi; \
+    ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); /* interleave a with an all-zero vector into two halves */ \
+    v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); /* pairwise widening add (value + 0) lifts lanes to 32 bits; the halves are then combined lane-wise */ \
+    v4u32 b_lo, b_hi; \
+    ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); /* same interleave-with-zero step, one lane width up */ \
+    v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \
+    return (unsigned short)cfunc(c[0], c[1]); /* the last two candidates are combined in scalar code */ \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max) // defines v_reduce_max(const v_uint16x8&)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min) // defines v_reduce_min(const v_uint16x8&)
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) /* Emits v_reduce_<func> for v_int16x8 (eight signed lanes): func picks the msa_<func>q_* intrinsics, cfunc is the scalar combiner for the final pair. */ \
+inline short v_reduce_##func(const v_int16x8& a) \
+{ \
+    v8i16 a_lo, a_hi; \
+    ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); /* interleave a with an all-zero vector into two halves */ \
+    v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); /* signed pairwise widening add (value + 0) lifts lanes to 32 bits; the halves are then combined lane-wise */ \
+    v4i32 b_lo, b_hi; \
+    ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); /* same interleave-with-zero step, one lane width up */ \
+    v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \
+    return (short)cfunc(c[0], c[1]); /* the last two candidates are combined in scalar code */ \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max) // defines v_reduce_max(const v_int16x8&)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min) // defines v_reduce_min(const v_int16x8&)
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(vec_t, scalar_t, op, pick) /* v_reduce_<op> over four lanes in scalar code: pick is applied to lanes (0,1) and (2,3), then to the two partial results. */ \
+inline scalar_t v_reduce_##op(const vec_t& a) { \
+    const scalar_t front = (scalar_t)pick(a.val[0], a.val[1]), back = (scalar_t)pick(a.val[2], a.val[3]); \
+    return (scalar_t)pick(front, back); \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min)
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) /* Emits v_reduce_<func> for a 16-lane vector by delegating to the 8-lane reduction of the wider type _Tpvec2. */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpvec2 a1, a2; \
+    v_expand(a, a1, a2); /* split a into two vectors of twice the lane width */ \
+    return (scalartype)v_reduce_##func(v_##func(a1, a2)); /* combine the halves lane-wise, reduce the survivor, narrow the scalar back */ \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, max)
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min) // NOTE(review): returns plain char, whose signedness is implementation-defined; the lane type is schar - confirm this is intended
+OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max) // NOTE(review): same plain-char return type as the min variant
+
+
+
+#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(vec_t, scalar_t, sfx) /* v_reduce_sum(a): msa_sum_<sfx> over the lanes of a, cast to scalar_t. */ \
+inline scalar_t v_reduce_sum(const vec_t& a) { \
+    const scalar_t total = (scalar_t)msa_sum_##sfx(a.val); \
+    return total; \
+}
+
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned short, u8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, short, s8)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned, u16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, int, s16)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, uint64_t, u32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int64_t, s32)
+OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32)
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)  // lane 0 + lane 1
+{ const uint64 lane0 = msa_getq_lane_u64(a.val, 0); return (uint64)(lane0 + msa_getq_lane_u64(a.val, 1)); }
+inline int64 v_reduce_sum(const v_int64x2& a)  // lane 0 + lane 1
+{ const int64 lane0 = msa_getq_lane_s64(a.val, 0); return (int64)(lane0 + msa_getq_lane_s64(a.val, 1)); }
+inline double v_reduce_sum(const v_float64x2& a) {  // lane 0 + lane 1
+    const double lane0 = msa_getq_lane_f64(a.val, 0);
+    return lane0 + msa_getq_lane_f64(a.val, 1);
+}
+
+/* v_reduce_sum4, v_reduce_sad */
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, // reduces four vectors at once; presumably lane k of the result is the lane sum of the k-th argument - confirm
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))),
+                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3
+    v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))),
+                            MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3
+
+    return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))), // second stage: the 64-bit halves of u0 and u1 are interleaved
+                                    MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))))); // and the two interleavings added, joining the partial sums above
+}
+
+// Sum of absolute differences over 16 unsigned byte lanes, widened 8 -> 16 -> 32 bits before the final sum.
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) {
+    const v16u8 abs_diff = msa_abdq_u8(a.val, b.val);
+    const v8u16 pairs16 = msa_paddlq_u8(abs_diff);
+    const v4u32 pairs32 = msa_paddlq_u16(pairs16);
+    return msa_sum_u32(pairs32);
+}
+// Signed-byte variant: the signed absolute difference is viewed as unsigned bytes, then summed the same way.
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) {
+    const v16u8 abs_diff = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val));
+    const v8u16 pairs16 = msa_paddlq_u8(abs_diff);
+    const v4u32 pairs32 = msa_paddlq_u16(pairs16);
+    return msa_sum_u32(pairs32);
+}
+// Sum of absolute differences over 8 unsigned 16-bit lanes, widened to 32 bits before the final sum.
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) {
+    const v8u16 abs_diff = msa_abdq_u16(a.val, b.val);
+    const v4u32 pairs32 = msa_paddlq_u16(abs_diff);
+    return msa_sum_u32(pairs32);
+}
+// Signed 16-bit variant: the signed absolute difference is viewed as unsigned lanes, then summed the same way.
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) {
+    const v8u16 abs_diff = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val));
+    const v4u32 pairs32 = msa_paddlq_u16(abs_diff);
+    return msa_sum_u32(pairs32);
+}
+// Sum of absolute differences over 4 unsigned 32-bit lanes.
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) {
+    const v4u32 abs_diff = msa_abdq_u32(a.val, b.val);
+    return msa_sum_u32(abs_diff);
+}
+// Signed 32-bit variant: the signed absolute difference is viewed as unsigned lanes before summing.
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) {
+    const v4u32 abs_diff = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val));
+    return msa_sum_u32(abs_diff);
+}
+// Float variant: absolute differences are summed in floating point.
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) {
+    const v4f32 abs_diff = msa_abdq_f32(a.val, b.val);
+    return msa_sum_f32(abs_diff);
+}
+
+/* v_popcount */
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(vec_t) /* v_popcount for 8-bit lanes: msa_cntq_s8 on the signed view, result returned as v_uint8x16. */ \
+inline v_uint8x16 v_popcount(const vec_t& a) \
+{ \
+    const v16i8 bits = MSA_TPV_REINTERPRET(v16i8, a.val); \
+    return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(bits))); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(vec_t) /* v_popcount for 16-bit lanes: msa_cntq_s16 on the signed view, result returned as v_uint16x8. */ \
+inline v_uint16x8 v_popcount(const vec_t& a) \
+{ \
+    const v8i16 bits = MSA_TPV_REINTERPRET(v8i16, a.val); \
+    return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(bits))); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(vec_t) /* v_popcount for 32-bit lanes: msa_cntq_s32 on the signed view, result returned as v_uint32x4. */ \
+inline v_uint32x4 v_popcount(const vec_t& a) \
+{ \
+    const v4i32 bits = MSA_TPV_REINTERPRET(v4i32, a.val); \
+    return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(bits))); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4)
+
+#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(vec_t) /* v_popcount for 64-bit lanes: msa_cntq_s64 on the signed view, result returned as v_uint64x2. */ \
+inline v_uint64x2 v_popcount(const vec_t& a) \
+{ \
+    const v2i64 bits = MSA_TPV_REINTERPRET(v2i64, a.val); \
+    return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(bits))); \
+}
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2)
+OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2)
+
+inline int v_signmask(const v_uint8x16& a) // packs the top bit of each of the 16 byte lanes into bits 0..15 of the result
+{
+    v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100)); // shift amounts 0..7, one per lane of an 8-lane half
+    v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0)); // isolate each lane's top bit (>> 7), then shift it left by that lane's amount
+    v8u16 v1 = msa_paddlq_u8(v0); // three pairwise widening adds fold each half into one 64-bit lane
+    v4u32 v2 = msa_paddlq_u16(v1);
+    v2u64 v3 = msa_paddlq_u32(v2);
+    return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8); // lane 0 supplies bits 0..7, lane 1 bits 8..15
+}
+inline int v_signmask(const v_int8x16& a) // signed variant: same bits, routed through the unsigned view
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_uint16x8& a) // packs the top bit of each of the 8 halfword lanes into bits 0..7 of the result
+{
+    v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000)); // shift amounts 0..3, one per lane of a 4-lane half
+    v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0)); // isolate each lane's top bit (>> 15), then shift it left by that lane's amount
+    v4u32 v1 = msa_paddlq_u16(v0); // two pairwise widening adds fold each half into one 64-bit lane
+    v2u64 v2 = msa_paddlq_u32(v1);
+    return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4); // lane 0 supplies bits 0..3, lane 1 bits 4..7
+}
+inline int v_signmask(const v_int16x8& a) // signed variant: same bits, routed through the unsigned view
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+
+inline int v_signmask(const v_uint32x4& a) // packs the top bit of each of the 4 word lanes into bits 0..3 of the result
+{
+    v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000)); // shift amounts 0 and 1 for the two lanes of a half
+    v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0)); // isolate each lane's top bit (>> 31), then shift it left by that lane's amount
+    v2u64 v1 = msa_paddlq_u32(v0); // one pairwise widening add folds each half into one 64-bit lane
+    return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2); // lane 0 supplies bits 0..1, lane 1 bits 2..3
+}
+inline int v_signmask(const v_int32x4& a) // signed variant: same bits, routed through the unsigned view
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a) // float variant: the top bit of each lane, read through the unsigned view
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+
+inline int v_signmask(const v_uint64x2& a) // packs the top bit of each of the 2 doubleword lanes into bits 0..1 of the result
+{
+    v2u64 v0 = msa_shrq_n_u64(a.val, 63); // each lane becomes 0 or 1: its former top bit
+    return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1); // lane 0 is bit 0, lane 1 is bit 1
+}
+inline int v_signmask(const v_int64x2& a) // signed variant: same bits, routed through the unsigned view
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+inline int v_signmask(const v_float64x2& a) // double variant: the top bit of each lane, read through the unsigned view
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+
+inline int v_scan_forward(const v_int8x16& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); } // trailing-zero count of the lane sign mask
+inline int v_scan_forward(const v_uint8x16& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_int16x8& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_uint16x8& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_int32x4& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_uint32x4& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_float32x4& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_int64x2& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_uint64x2& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+inline int v_scan_forward(const v_float64x2& a) { const int sign_bits = v_signmask(a); return trailingZeros32(sign_bits); }
+
+#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) /* Emits v_check_all / v_check_any for v_<_Tpvec>; both tests look only at the top bit of each lane (shift = lane width - 1 in the instantiations below). */ \
+inline bool v_check_all(const v_##_Tpvec& a) \
+{ \
+    _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); /* msa_mvnq: presumably bitwise NOT, so a lane stays non-zero only if its top bit was clear */ \
+    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
+    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; /* true when nothing survived in either 64-bit half */ \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) \
+{ \
+    _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); /* keep only the top bit of each lane */ \
+    v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \
+    return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; /* true when at least one lane had its top bit set */ \
+}
+
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31)
+OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63)
+
+inline bool v_check_all(const v_int8x16& a)  // signed and float overloads defer to the unsigned test of the same lane width
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int16x8& a)
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+inline bool v_check_all(const v_float32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+
+inline bool v_check_any(const v_int8x16& a)
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_check_any(bits); }
+inline bool v_check_any(const v_int16x8& a)
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_check_any(bits); }
+inline bool v_check_any(const v_int32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_any(bits); }
+inline bool v_check_any(const v_float32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_any(bits); }
+
+inline bool v_check_all(const v_int64x2& a)
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_all(bits); }
+inline bool v_check_all(const v_float64x2& a)
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_all(bits); }
+inline bool v_check_any(const v_int64x2& a)
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_any(bits); }
+inline bool v_check_any(const v_float64x2& a)
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_any(bits); }
+
+/* v_select */
+#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) /* Emits v_select(mask, a, b) for _Tpvec: all three operands are viewed as _Tpvu and blended with the byte-wise msa_bslq_u8. */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), /* note the operand order handed to the intrinsic: mask, b, a */ \
+                  MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \
+}
+
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8)
+OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8) // no 64-bit integer instantiations appear in this list
+
+#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) /* Emits v_expand, v_expand_low, v_expand_high and v_load_expand, which widen _Tpvec lanes into the double-width vector _Tpwvec. */ \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); /* interleave a with zeros (ilvr form) */ \
+    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); /* interleave a with zeros (ilvl form) */ \
+    b0.val = msa_paddlq_##suffix(a_lo); /* pairwise widening add: each (lane, 0) pair becomes one double-width lane */ \
+    b1.val = msa_paddlq_##suffix(a_hi); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) /* same computation as b0 in v_expand */ \
+{ \
+    _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    return _Tpwvec(msa_paddlq_##suffix(a_lo)); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) /* same computation as b1 in v_expand */ \
+{ \
+    _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \
+    return _Tpwvec(msa_paddlq_##suffix(a_hi)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) /* half-width load (msa_ld1_) followed by msa_movl_ widening */ \
+{ \
+    return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \
+}
+
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32)
+OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32)
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr) {  // four bytes at ptr, each widened into its own 32-bit lane
+    const v4u32 lanes = {ptr[0], ptr[1], ptr[2], ptr[3]};
+    return v_uint32x4(lanes);
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr) {  // four signed bytes at ptr, each widened into its own 32-bit lane
+    const v4i32 lanes = {ptr[0], ptr[1], ptr[2], ptr[3]};
+    return v_int32x4(lanes);
+}
+
+/* v_zip, v_combine_low, v_combine_high, v_recombine */
+#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) /* Emits v_zip, v_combine_low, v_combine_high and v_recombine for _Tpvec; zip interleaves at lane width, the combine forms interleave whole 64-bit halves. */ \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); /* ilvr form; a1 is passed first, a0 second */ \
+    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); /* ilvl form, same operand order */ \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) /* 64-bit ilvr of (b, a) */ \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) /* 64-bit ilvl of (b, a) */ \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* c gets the v_combine_low expression, d the v_combine_high expression */ \
+{ \
+    c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+    d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \
+}
+
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64)
+
+/* v_extract */
+#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) /* Emits v_extract<s>(a, b) for _Tpvec: msa_extq_<suffix> applied to (a, b) with the compile-time offset s. */ \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); /* both inputs go through the signed view _Tpvs; the result is viewed back as _Tpv */ \
+}
+
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64)
+
+/* v_round, v_floor, v_ceil, v_trunc */
+inline v_int32x4 v_round(const v_float32x4& a) {  // float lanes -> int lanes via msa_cvttintq_s32_f32
+    const v4i32 as_int = msa_cvttintq_s32_f32(a.val);
+    return v_int32x4(as_int);
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a) {  // convert, then add the compare mask wherever the converted value exceeds a
+    const v4i32 rounded = msa_cvttintq_s32_f32(a.val);
+    const v4i32 too_big = MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(rounded), a.val));
+    return v_int32x4(msa_addq_s32(rounded, too_big));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a) {  // convert, then subtract the compare mask wherever a exceeds the converted value
+    const v4i32 rounded = msa_cvttintq_s32_f32(a.val);
+    const v4i32 too_small = MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(rounded)));
+    return v_int32x4(msa_subq_s32(rounded, too_small));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) {  // float lanes -> int lanes via msa_cvttruncq_s32_f32
+    const v4i32 as_int = msa_cvttruncq_s32_f32(a.val);
+    return v_int32x4(as_int);
+}
+
+inline v_int32x4 v_round(const v_float64x2& a) {  // two doubles -> int64 lanes, packed with a zero vector into four int32 lanes
+    const v2i64 as_int = msa_cvttintq_s64_f64(a.val);
+    return v_int32x4(msa_pack_s64(as_int, msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) {  // both inputs converted to int64 lanes, then packed into one int32 vector
+    const v2i64 a_int = msa_cvttintq_s64_f64(a.val), b_int = msa_cvttintq_s64_f64(b.val);
+    return v_int32x4(msa_pack_s64(a_int, b_int));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a) {  // round to an integral double, add the compare mask where that exceeds a, pack with zeros
+    const v2f64 rounded = msa_cvtrintq_f64(a.val);
+    const v2i64 too_big = MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(rounded, a.val));
+    return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(rounded), too_big), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) {  // round to an integral double, subtract the compare mask where a exceeds it, pack with zeros
+    const v2f64 rounded = msa_cvtrintq_f64(a.val);
+    const v2i64 too_small = MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, rounded));
+    return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(rounded), too_small), msa_dupq_n_s64(0)));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) {  // two doubles -> int64 lanes via msa_cvttruncq_s64_f64, packed with a zero vector
+    const v2i64 as_int = msa_cvttruncq_s64_f64(a.val);
+    return v_int32x4(msa_pack_s64(as_int, msa_dupq_n_s64(0)));
+}
+
+#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) /* Emits v_transpose4x4 for four-lane _Tpvec: rows a0..a3 in, b0..b3 out, built from two interleave stages. */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); /* stage 1: lane-width interleave of rows (a0, a1) ... */ \
+    _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \
+    _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); /* ... and of rows (a2, a3) */ \
+    _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \
+    b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); /* stage 2: 64-bit interleave of the stage-1 results */ \
+    b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \
+    b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
+    b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \
+}
+
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32)
+OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32)
+
+#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) /* Emits the 2-, 3- and 4-channel v_load_deinterleave / v_store_interleave overloads for v_<_Tpvec>, forwarding to msa_ld{2,3,4}q_ / msa_st{2,3,4}q_. */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    msa_ld2q_##suffix(ptr, &a.val, &b.val); /* outputs are written through pointers to the raw vectors */ \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* the store mode is accepted but not used */ \
+{ \
+    msa_st2q_##suffix(ptr, a.val, b.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* the store mode is accepted but not used */ \
+{ \
+    msa_st3q_##suffix(ptr, a.val, b.val, c.val); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) /* the store mode is accepted but not used */ \
+{ \
+    msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \
+}
+
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64)
+OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64)
+
+/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */
+inline v_float32x4 v_cvt_f32(const v_int32x4& a) {  // int32 lanes -> float32 lanes
+    const v4f32 as_float = msa_cvtfintq_f32_s32(a.val);
+    return v_float32x4(as_float);
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a) {  // narrows a together with an all-zero second input
+    const v2f64 zeros = msa_dupq_n_f64(0.0f);
+    return v_float32x4(msa_cvtfq_f32_f64(a.val, zeros));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) {  // narrows two double vectors into one float vector
+    const v4f32 narrowed = msa_cvtfq_f32_f64(a.val, b.val);
+    return v_float32x4(narrowed);
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)  // converts the lanes selected by v_expand_low(a) to double
+{   // Widen to int64 and convert once: the former int32 -> float32 -> float64 route rounded every |x| > 2^24.
+    return v_float64x2(msa_cvtfintq_f64_s64(v_expand_low(a).val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)  // converts the lanes selected by v_expand_high(a) to double
+{   // Widen to int64 and convert once: the former int32 -> float32 -> float64 route rounded every |x| > 2^24.
+    return v_float64x2(msa_cvtfintq_f64_s64(v_expand_high(a).val));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a) {  // float32 -> float64 via msa_cvtflq_f64_f32
+    const v2f64 widened = msa_cvtflq_f64_f32(a.val);
+    return v_float64x2(widened);
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) {  // float32 -> float64 via msa_cvtfhq_f64_f32
+    const v2f64 widened = msa_cvtfhq_f64_f32(a.val);
+    return v_float64x2(widened);
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) {  // int64 lanes -> float64 lanes
+    const v2f64 as_double = msa_cvtfintq_f64_s64(a.val);
+    return v_float64x2(as_double);
+}
+
+////////////// Lookup table access ////////////////////
+inline v_int8x16 v_lut(const schar* tab, const int* idx) {
+    // Gather: lane i of the result is tab[idx[i]] for i = 0..15; the vector is initialised lane by lane.
+    const v16i8 gathered =
+    {
+        tab[idx[ 0]],
+        tab[idx[ 1]],
+        tab[idx[ 2]],
+        tab[idx[ 3]],
+        tab[idx[ 4]],
+        tab[idx[ 5]],
+        tab[idx[ 6]],
+        tab[idx[ 7]],
+        tab[idx[ 8]],
+        tab[idx[ 9]],
+        tab[idx[10]],
+        tab[idx[11]],
+        tab[idx[12]],
+        tab[idx[13]],
+        tab[idx[14]],
+        tab[idx[15]]
+    };
+    return v_int8x16(gathered);
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) {
+    // Gather of pairs: idx[k] (k = 0..7) names the first of two consecutive table entries filling lanes 2k and 2k+1.
+    const v16i8 gathered =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[4]],
+        tab[idx[4] + 1],
+        tab[idx[5]],
+        tab[idx[5] + 1],
+        tab[idx[6]],
+        tab[idx[6] + 1],
+        tab[idx[7]],
+        tab[idx[7] + 1]
+    };
+    return v_int8x16(gathered);
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) {
+    // Gather of quads: idx[k] (k = 0..3) names the first of four consecutive table entries filling lanes 4k..4k+3.
+    const v16i8 gathered =
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[0] + 2],
+        tab[idx[0] + 3],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[1] + 2],
+        tab[idx[1] + 3],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[2] + 2],
+        tab[idx[2] + 3],
+        tab[idx[3]],
+        tab[idx[3] + 1],
+        tab[idx[3] + 2],
+        tab[idx[3] + 3]
+    };
+    return v_int8x16(gathered);
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } // unsigned table: gathered through the signed overload; the cast keeps tab const
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); } // same delegation for pairs; the cast keeps tab const
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); } // same delegation for quads; the cast keeps tab const
+
+
+inline v_int16x8 v_lut(const short* tab, const int* idx) {
+    // Gather: lane i of the result is tab[idx[i]] for i = 0..7; the vector is initialised lane by lane.
+    const v8i16 gathered =
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]],
+        tab[idx[4]],
+        tab[idx[5]],
+        tab[idx[6]],
+        tab[idx[7]]
+    };
+    return v_int16x8(gathered);
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{ // Gathers 4 adjacent pairs: lanes 2k and 2k+1 receive tab[idx[k]] and tab[idx[k] + 1] (k = 0..3).
+    short CV_DECL_ALIGNED(32) elems[8] = // scalar staging buffer, loaded as one vector by the return statement
+    {
+        tab[idx[0]],
+        tab[idx[0] + 1],
+        tab[idx[1]],
+        tab[idx[1] + 1],
+        tab[idx[2]],
+        tab[idx[2] + 1],
+        tab[idx[3]],
+        tab[idx[3] + 1]
+    };
+    return v_int16x8(msa_ld1q_s16(elems));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{ // Two 64-bit loads (4 shorts each) starting at tab + idx[0] and tab + idx[1], combined into one vector; only idx[0] and idx[1] are read.
+    return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { const short* stab = reinterpret_cast<const short*>(tab); return v_reinterpret_as_u16(v_lut(stab, idx)); } // unsigned gather via the short overload
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { const short* stab = reinterpret_cast<const short*>(tab); return v_reinterpret_as_u16(v_lut_pairs(stab, idx)); } // unsigned pair gather via the short overload
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { const short* stab = reinterpret_cast<const short*>(tab); return v_reinterpret_as_u16(v_lut_quads(stab, idx)); } // unsigned quad gather via the short overload
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{ // Gathers 4 independent elements: lane k receives tab[idx[k]] (k = 0..3).
+    int CV_DECL_ALIGNED(32) elems[4] = // scalar staging buffer, loaded as one vector by the return statement
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_int32x4(msa_ld1q_s32(elems));
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{ // Two 64-bit loads (2 ints each) starting at tab + idx[0] and tab + idx[1], combined into one vector; only idx[0] and idx[1] are read.
+    return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{ // A single 128-bit load of the 4 consecutive ints starting at tab + idx[0]; only idx[0] is read.
+    return v_int32x4(msa_ld1q_s32(tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { const int* stab = reinterpret_cast<const int*>(tab); return v_reinterpret_as_u32(v_lut(stab, idx)); } // unsigned gather via the int overload
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { const int* stab = reinterpret_cast<const int*>(tab); return v_reinterpret_as_u32(v_lut_pairs(stab, idx)); } // unsigned pair gather via the int overload
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { const int* stab = reinterpret_cast<const int*>(tab); return v_reinterpret_as_u32(v_lut_quads(stab, idx)); } // unsigned quad gather via the int overload
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{ // Gathers 2 independent elements, tab[idx[0]] and tab[idx[1]], each wrapped by msa_create_s64 and then combined.
+    return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]])));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{ // A single 128-bit load of the 2 consecutive elements starting at tab + idx[0]; only idx[0] is read.
+    return v_int64x2(msa_ld1q_s64(tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { const int64_t* stab = reinterpret_cast<const int64_t*>(tab); return v_reinterpret_as_u64(v_lut(stab, idx)); } // unsigned gather via the int64_t overload
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { const int64_t* stab = reinterpret_cast<const int64_t*>(tab); return v_reinterpret_as_u64(v_lut_pairs(stab, idx)); } // unsigned pair load via the int64_t overload
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{ // Gathers 4 independent elements: lane k receives tab[idx[k]] (k = 0..3).
+    float CV_DECL_ALIGNED(32) elems[4] = // scalar staging buffer, loaded as one vector by the return statement
+    {
+        tab[idx[0]],
+        tab[idx[1]],
+        tab[idx[2]],
+        tab[idx[3]]
+    };
+    return v_float32x4(msa_ld1q_f32(elems));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    // Gathers two adjacent float pairs, starting at tab + idx[0] and at
+    // tab + idx[1], into one vector; only idx[0] and idx[1] are read.
+    // Uses the 64-bit float loads (the same form v_lut_deinterleave uses)
+    // instead of reading the floats through a uint64 pointer, which cast
+    // away const and relied on integer/float type punning.
+    return v_float32x4(msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[1])));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{ // A single 128-bit load of the 4 consecutive floats starting at tab + idx[0]; only idx[0] is read.
+    return v_float32x4(msa_ld1q_f32(tab + idx[0]));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    // Spill the index vector to a scalar array, then gather lane k from tab[lanes[k]].
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+    return v_int32x4(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    // Read each index straight out of the vector register, then gather
+    // lane k from tab[index k] through a scalar staging buffer.
+    const int i0 = msa_getq_lane_s32(idxvec.val, 0);
+    const int i1 = msa_getq_lane_s32(idxvec.val, 1);
+    const int i2 = msa_getq_lane_s32(idxvec.val, 2);
+    const int i3 = msa_getq_lane_s32(idxvec.val, 3);
+    unsigned CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_uint32x4(msa_ld1q_u32(gathered));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    // Spill the index vector to a scalar array, then gather lane k from tab[lanes[k]].
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+    return v_float32x4(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{ // Loads a 2-float pair at each of the 4 indices and splits them into x (even-position values) and y (odd-position values).
+    int CV_DECL_ALIGNED(32) idx[4]; // scalar copy of the index vector
+    v_store_aligned(idx, idxvec);
+
+    v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2])); // pairs for indices 0 and 2
+    v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3])); // pairs for indices 1 and 3
+    x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); // interleave of the even lanes; presumably tab[idx[k]] in lane k -- confirm against msa_ilvevq_s32
+    y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); // interleave of the odd lanes; presumably tab[idx[k] + 1] in lane k -- confirm against msa_ilvodq_s32
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    // Byte shuffle by index table: in every 4-byte group the source lanes are taken in the order 0,2,1,3.
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // same shuffle applied through the signed view
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    // Byte shuffle by index table: in every 8-byte group the source lanes are taken in the order 0,4,1,5,2,6,3,7.
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // same shuffle applied through the signed view
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    // Halfword shuffle by index table: source lanes are taken in the order 0,2,1,3, 4,6,5,7.
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // same shuffle applied through the signed view
+
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    // Halfword shuffle by index table: source lanes are taken in the order 0,4,1,5,2,6,3,7.
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // same shuffle applied through the signed view
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    // Lane order 0,1,2,3 -> 0,2,1,3: start from a copy of the input and
+    // overwrite each of the two middle lanes with the other's source value.
+    v_int32x4 c(vec);
+    c.val[1] = vec.val[2];
+    c.val[2] = vec.val[1];
+    return c;
+}
+
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // same lane swap through the s32 view
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // same lane swap through the s32 view
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    // Source lanes 0-2, 4-6, 8-10, 12-14 are packed contiguously; the last four indices (0x10..0x13) lie outside vec -- presumably they select the zero operand, confirm against VSHF.B.
+    return v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val));
+}
+
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // same shuffle applied through the signed view
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    // Source lanes 0-2 and 4-6 are packed contiguously; the last two indices (8, 9) lie outside vec -- presumably they select the zero operand, confirm against VSHF.H.
+    return v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val));
+}
+
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // same shuffle applied through the signed view
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4-lane input is returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // 4-lane input is returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // 4-lane input is returned unchanged
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{ // Gathers 2 independent elements: lane k receives tab[idx[k]] (k = 0, 1).
+    double CV_DECL_ALIGNED(32) elems[2] = // scalar staging buffer, loaded as one vector by the return statement
+    {
+        tab[idx[0]],
+        tab[idx[1]]
+    };
+    return v_float64x2(msa_ld1q_f64(elems));
+}
+
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{ // A single 128-bit load of the 2 consecutive doubles starting at tab + idx[0]; only idx[0] is read.
+    return v_float64x2(msa_ld1q_f64(tab + idx[0]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    // All four indices are spilled to a scalar array, but only the first two are used for the gather.
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+    return v_float64x2(tab[lanes[0]], tab[lanes[1]]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{ // Loads a 2-double pair at each of the first 2 indices and splits them into x (first values) and y (second values).
+    int CV_DECL_ALIGNED(32) idx[4]; // scalar copy of the index vector; only idx[0] and idx[1] are used
+    v_store_aligned(idx, idxvec);
+
+    v2f64 xy0 = msa_ld1q_f64(tab + idx[0]); // tab[idx[0]], tab[idx[0] + 1]
+    v2f64 xy1 = msa_ld1q_f64(tab + idx[1]); // tab[idx[1]], tab[idx[1] + 1]
+    x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); // even lanes of both loads; presumably {tab[idx[0]], tab[idx[1]]} -- confirm against msa_ilvevq_s64
+    y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); // odd lanes of both loads; presumably {tab[idx[0] + 1], tab[idx[1] + 1]} -- confirm against msa_ilvodq_s64
+}
+
+template<int i, typename _Tp> // i: compile-time lane index; _Tp: any vector type providing lane_type and get0()
+inline typename _Tp::lane_type v_extract_n(const _Tp& a)
+{ // Returns one scalar lane of a: rotates a right by i lanes via v_rotate_right, then reads lane 0.
+    return v_rotate_right<i>(a).get0();
+}
+
+template<int i> // i: compile-time index of the lane to replicate
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{ // Extracts lane i as a scalar and fills every lane of the result with it.
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i> // i: compile-time index of the lane to replicate
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{ // Extracts lane i as a scalar and fills every lane of the result with it.
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i> // i: compile-time index of the lane to replicate
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{ // Extracts lane i as a scalar and fills every lane of the result with it.
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+////// FP16 support ///////
+#if CV_FP16 // vector half<->float conversions available: use msa_cvt_f32_f16 / msa_cvt_f16_f32
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{ // Loads 4 half-precision values from ptr and widens them to a float32 vector.
+#ifndef msa_ld1_f16
+    v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr); // no f16 load macro: load the 64 bits as shorts and cast the vector type
+#else
+    v4f16 v = msa_ld1_f16((const __fp16*)ptr);
+#endif
+    return v_float32x4(msa_cvt_f32_f16(v));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{ // Narrows the 4 floats of v to half precision and stores them at ptr.
+    v4f16 hv = msa_cvt_f16_f32(v.val);
+
+#ifndef msa_st1_f16
+    msa_st1_s16((short*)ptr, (int16x4_t)hv); // no f16 store macro: store the 64 bits through the short store
+#else
+    msa_st1_f16((__fp16*)ptr, hv);
+#endif
+}
+#else // scalar fallback: convert element by element through float16_t's conversions
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{ // Converts ptr[0..3] to float one at a time, then loads the 4 floats as a vector.
+    float buf[4];
+    for( int i = 0; i < 4; i++ )
+        buf[i] = (float)ptr[i];
+    return v_load(buf);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{ // Stores v to a float buffer, then converts each element to float16_t into ptr[0..3].
+    float buf[4];
+    v_store(buf, v);
+    for( int i = 0; i < 4; i++ )
+        ptr[i] = (float16_t)buf[i];
+}
+#endif
+
+inline void v_cleanup() {} // no-op in this backend (empty body)
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_neon.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_neon.hpp
new file mode 100644
index 0000000..28cf813
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_neon.hpp
@@ -0,0 +1,2615 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_INTRIN_NEON_HPP
+#define OPENCV_HAL_INTRIN_NEON_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1 // 128-bit universal intrinsics are always provided by this header
+#if defined(__aarch64__) || defined(_M_ARM64) // 64-bit Arm target (GCC/Clang or MSVC spelling)
+#define CV_SIMD128_64F 1 // enables v_float64x2 and the float64x2_t-based helpers below
+#else
+#define CV_SIMD128_64F 0 // no v_float64x2 in this build
+#endif
+
+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1 // selects the AArch64-only intrinsics (e.g. vmull_high_*) further down
+#else
+#define CV_NEON_AARCH64 0 // falls back to vget_high_* + the 64-bit-wide intrinsic forms
+#endif
+
+// TODO: CV_NEON_DOT is hard-wired to 0 for now.
+#define CV_NEON_DOT 0
+
+//////////// Utils ////////////
+
+#if CV_SIMD128_64F // 64-bit Arm build: vuzp1*/vuzp2* each produce one half of the unzip directly
+#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) /* 128-bit registers */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
+#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) /* 64-bit registers */ \
+    inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
+    { c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
+#else // otherwise: vuzpq_*/vuzp_* return both halves in one x2 struct, read back as val[0] and val[1]
+#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) /* 128-bit registers */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
+#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) /* 64-bit registers */ \
+    inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
+    { _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
+#endif
+
+#if CV_SIMD128_64F // float64x2_t exists only in this configuration
+#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) /* defines vreinterpretq_<suffix>_f64 and vreinterpretq_f64_<suffix> */ \
+    template <typename T> static inline \
+    _Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
+    template <typename T> static inline \
+    float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; } // both are plain C-style vector casts
+#else
+#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) // expands to nothing without 64-bit float support
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) /* unzip for the 128-bit and 64-bit register types, plus f64 reinterprets */ \
+    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
+    OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
+    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) /* 64-bit integer lanes: f64 reinterprets only, no unzip */ \
+    OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
+
+#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) /* float64x2: 128-bit unzip only */ \
+    OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
+
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8,  u8) // arguments: 128-bit type stem, 64-bit type stem, intrinsic suffix
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16,  int8x8,   s8)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8,  int16x4,  s16)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4,  int32x2,  s32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2,  int64x1,  s64)
+#if CV_SIMD128_64F // float64x2_t helpers exist only when 64-bit float vectors are enabled
+OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
+#endif
+
+//////////// Types ////////////
+
+struct v_uint8x16 // wrapper around a uint8x16_t NEON register: 16 unsigned 8-bit lanes
+{
+    typedef uchar lane_type; // scalar type of one lane
+    enum { nlanes = 16 }; // number of lanes
+
+    v_uint8x16() {} // leaves val uninitialized
+    explicit v_uint8x16(uint8x16_t v) : val(v) {} // wraps an existing register
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    { // builds the vector from 16 scalars via a temporary array; v0 is the first element loaded
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_u8(v);
+    }
+    uchar get0() const // returns lane 0
+    {
+        return vgetq_lane_u8(val, 0);
+    }
+
+    uint8x16_t val; // the underlying register
+};
+
+struct v_int8x16 // wrapper around an int8x16_t NEON register: 16 signed 8-bit lanes
+{
+    typedef schar lane_type; // scalar type of one lane
+    enum { nlanes = 16 }; // number of lanes
+
+    v_int8x16() {} // leaves val uninitialized
+    explicit v_int8x16(int8x16_t v) : val(v) {} // wraps an existing register
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    { // builds the vector from 16 scalars via a temporary array; v0 is the first element loaded
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = vld1q_s8(v);
+    }
+    schar get0() const // returns lane 0
+    {
+        return vgetq_lane_s8(val, 0);
+    }
+
+    int8x16_t val; // the underlying register
+};
+
+struct v_uint16x8 // wrapper around a uint16x8_t NEON register: 8 unsigned 16-bit lanes
+{
+    typedef ushort lane_type; // scalar type of one lane
+    enum { nlanes = 8 }; // number of lanes
+
+    v_uint16x8() {} // leaves val uninitialized
+    explicit v_uint16x8(uint16x8_t v) : val(v) {} // wraps an existing register
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    { // builds the vector from 8 scalars via a temporary array; v0 is the first element loaded
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_u16(v);
+    }
+    ushort get0() const // returns lane 0
+    {
+        return vgetq_lane_u16(val, 0);
+    }
+
+    uint16x8_t val; // the underlying register
+};
+
+struct v_int16x8 // wrapper around an int16x8_t NEON register: 8 signed 16-bit lanes
+{
+    typedef short lane_type; // scalar type of one lane
+    enum { nlanes = 8 }; // number of lanes
+
+    v_int16x8() {} // leaves val uninitialized
+    explicit v_int16x8(int16x8_t v) : val(v) {} // wraps an existing register
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    { // builds the vector from 8 scalars via a temporary array; v0 is the first element loaded
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = vld1q_s16(v);
+    }
+    short get0() const // returns lane 0
+    {
+        return vgetq_lane_s16(val, 0);
+    }
+
+    int16x8_t val; // the underlying register
+};
+
+struct v_uint32x4 // wrapper around a uint32x4_t NEON register: 4 unsigned 32-bit lanes
+{
+    typedef unsigned lane_type; // scalar type of one lane
+    enum { nlanes = 4 }; // number of lanes
+
+    v_uint32x4() {} // leaves val uninitialized
+    explicit v_uint32x4(uint32x4_t v) : val(v) {} // wraps an existing register
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    { // builds the vector from 4 scalars via a temporary array; v0 is the first element loaded
+        unsigned v[] = {v0, v1, v2, v3};
+        val = vld1q_u32(v);
+    }
+    unsigned get0() const // returns lane 0
+    {
+        return vgetq_lane_u32(val, 0);
+    }
+
+    uint32x4_t val; // the underlying register
+};
+
+struct v_int32x4 // wrapper around an int32x4_t NEON register: 4 signed 32-bit lanes
+{
+    typedef int lane_type; // scalar type of one lane
+    enum { nlanes = 4 }; // number of lanes
+
+    v_int32x4() {} // leaves val uninitialized
+    explicit v_int32x4(int32x4_t v) : val(v) {} // wraps an existing register
+    v_int32x4(int v0, int v1, int v2, int v3)
+    { // builds the vector from 4 scalars via a temporary array; v0 is the first element loaded
+        int v[] = {v0, v1, v2, v3};
+        val = vld1q_s32(v);
+    }
+    int get0() const // returns lane 0
+    {
+        return vgetq_lane_s32(val, 0);
+    }
+    int32x4_t val; // the underlying register
+};
+
+struct v_float32x4 // wrapper around a float32x4_t NEON register: 4 single-precision lanes
+{
+    typedef float lane_type; // scalar type of one lane
+    enum { nlanes = 4 }; // number of lanes
+
+    v_float32x4() {} // leaves val uninitialized
+    explicit v_float32x4(float32x4_t v) : val(v) {} // wraps an existing register
+    v_float32x4(float v0, float v1, float v2, float v3)
+    { // builds the vector from 4 scalars via a temporary array; v0 is the first element loaded
+        float v[] = {v0, v1, v2, v3};
+        val = vld1q_f32(v);
+    }
+    float get0() const // returns lane 0
+    {
+        return vgetq_lane_f32(val, 0);
+    }
+    float32x4_t val; // the underlying register
+};
+
+struct v_uint64x2 // wrapper around a uint64x2_t NEON register: 2 unsigned 64-bit lanes
+{
+    typedef uint64 lane_type; // scalar type of one lane
+    enum { nlanes = 2 }; // number of lanes
+
+    v_uint64x2() {} // leaves val uninitialized
+    explicit v_uint64x2(uint64x2_t v) : val(v) {} // wraps an existing register
+    v_uint64x2(uint64 v0, uint64 v1)
+    { // builds the vector from 2 scalars via a temporary array; v0 is the first element loaded
+        uint64 v[] = {v0, v1};
+        val = vld1q_u64(v);
+    }
+    uint64 get0() const // returns lane 0
+    {
+        return vgetq_lane_u64(val, 0);
+    }
+    uint64x2_t val; // the underlying register
+};
+
+struct v_int64x2 // wrapper around an int64x2_t NEON register: 2 signed 64-bit lanes
+{
+    typedef int64 lane_type; // scalar type of one lane
+    enum { nlanes = 2 }; // number of lanes
+
+    v_int64x2() {} // leaves val uninitialized
+    explicit v_int64x2(int64x2_t v) : val(v) {} // wraps an existing register
+    v_int64x2(int64 v0, int64 v1)
+    { // builds the vector from 2 scalars via a temporary array; v0 is the first element loaded
+        int64 v[] = {v0, v1};
+        val = vld1q_s64(v);
+    }
+    int64 get0() const // returns lane 0
+    {
+        return vgetq_lane_s64(val, 0);
+    }
+    int64x2_t val; // the underlying register
+};
+
+#if CV_SIMD128_64F
+struct v_float64x2 // wrapper around a float64x2_t NEON register: 2 double-precision lanes (compiled only when CV_SIMD128_64F is set)
+{
+    typedef double lane_type; // scalar type of one lane
+    enum { nlanes = 2 }; // number of lanes
+
+    v_float64x2() {} // leaves val uninitialized
+    explicit v_float64x2(float64x2_t v) : val(v) {} // wraps an existing register
+    v_float64x2(double v0, double v1)
+    { // builds the vector from 2 scalars via a temporary array; v0 is the first element loaded
+        double v[] = {v0, v1};
+        val = vld1q_f64(v);
+    }
+    double get0() const // returns lane 0
+    {
+        return vgetq_lane_f64(val, 0);
+    }
+    float64x2_t val; // the underlying register
+};
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) /* per-type v_setzero_*, v_setall_* and v_reinterpret_as_* (all targets except f64) */ \
+inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
+inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
+inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } /* identity cast, so the same-type v_reinterpret_as_* line below has something to call */ \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
+
+OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8) // arguments: register type stem, scalar lane type, intrinsic suffix
+OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
+#if CV_SIMD128_64F
+#define OPENCV_HAL_IMPL_NEON_INIT_64(_Tpv, suffix) /* adds the v_reinterpret_as_f64 overload for one source type */ \
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(vreinterpretq_f64_##suffix(v.val)); }
+OPENCV_HAL_IMPL_NEON_INIT(float64x2, double, f64) // v_setzero_f64, v_setall_f64 and reinterprets from v_float64x2
+OPENCV_HAL_IMPL_NEON_INIT_64(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_INIT_64(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_INIT_64(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_INIT_64(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_INIT_64(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_INIT_64(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_INIT_64(float32x4, f32)
+OPENCV_HAL_IMPL_NEON_INIT_64(float64x2, f64)
+#endif
+
+#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) /* narrowing family: v_<pack>, v_<pack>_store, v_rshr_<pack><n>, v_rshr_<pack>_store<n> */ \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    hreg a1 = mov(a.val), b1 = mov(b.val); /* narrow each wide input to a 64-bit half register */ \
+    return _Tpvec(vcombine_##suffix(a1, b1)); /* a supplies the first half, b the second */ \
+} \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = mov(a.val); \
+    vst1_##suffix(ptr, a1); /* writes only the nlanes/2 narrowed elements */ \
+} \
+template<int n> inline \
+_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
+{ \
+    hreg a1 = rshr(a.val, n); /* shift right by the compile-time amount n while narrowing */ \
+    hreg b1 = rshr(b.val, n); \
+    return _Tpvec(vcombine_##suffix(a1, b1)); \
+} \
+template<int n> inline \
+void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
+{ \
+    hreg a1 = rshr(a.val, n); \
+    vst1_##suffix(ptr, a1); \
+}
+
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, pack, vqmovn_u16, vqrshrn_n_u16) // vqmovn_*/vqrshrn_n_*: the saturating narrowing intrinsics
+OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, pack, vqmovn_s16, vqrshrn_n_s16)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, pack, vqmovn_u32, vqrshrn_n_u32)
+OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, pack, vqmovn_s32, vqrshrn_n_s32)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, pack, vmovn_u64, vrshrn_n_u64) // 64 -> 32 bit uses the non-saturating vmovn_*/vrshrn_n_* forms
+OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, pack, vmovn_s64, vrshrn_n_s64)
+
+OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, pack_u, vqmovun_s16, vqrshrun_n_s16) // pack_u: signed input, unsigned output (vqmovun_*/vqrshrun_n_*)
+OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, pack_u, vqmovun_s32, vqrshrun_n_s32)
+
+// pack boolean
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Narrow both 16-bit inputs to 8 bits with vmovn_u16 and join them: a gives the first 8 lanes, b the last 8.
+    return v_uint8x16(vcombine_u8(vmovn_u16(a.val), vmovn_u16(b.val)));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    const uint16x8_t firstHalf = vcombine_u16(vmovn_u32(a.val), vmovn_u32(b.val));  // a, b narrowed 32 -> 16 bits
+    const uint16x8_t secondHalf = vcombine_u16(vmovn_u32(c.val), vmovn_u32(d.val)); // c, d narrowed 32 -> 16 bits
+    return v_uint8x16(vcombine_u8(vmovn_u16(firstHalf), vmovn_u16(secondHalf)));    // second step, 16 -> 8 bits
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    const uint32x4_t ab32 = vcombine_u32(vmovn_u64(a.val), vmovn_u64(b.val)); // step 1: 64 -> 32 bits, pairwise
+    const uint32x4_t cd32 = vcombine_u32(vmovn_u64(c.val), vmovn_u64(d.val));
+    const uint32x4_t ef32 = vcombine_u32(vmovn_u64(e.val), vmovn_u64(f.val));
+    const uint32x4_t gh32 = vcombine_u32(vmovn_u64(g.val), vmovn_u64(h.val));
+    // step 2: 32 -> 16 bits; step 3 (in the return): 16 -> 8 bits, inputs kept in a..h order
+    const uint16x8_t abcd16 = vcombine_u16(vmovn_u32(ab32), vmovn_u32(cd32));
+    const uint16x8_t efgh16 = vcombine_u16(vmovn_u32(ef32), vmovn_u32(gh32));
+    return v_uint8x16(vcombine_u8(vmovn_u16(abcd16), vmovn_u16(efgh16)));
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{ // Returns m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], each term a vector scaled by one lane of v.
+    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val); // vl = lanes 0,1 of v; vh = lanes 2,3
+    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
+    res = vmlaq_lane_f32(res, m1.val, vl, 1); // multiply-accumulate with v[1]
+    res = vmlaq_lane_f32(res, m2.val, vh, 0); // multiply-accumulate with v[2]
+    res = vmlaq_lane_f32(res, m3.val, vh, 1); // multiply-accumulate with v[3]
+    return v_float32x4(res);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{ // Returns m0*v[0] + m1*v[1] + m2*v[2] + a; lane 3 of v is not used.
+    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val); // vl = lanes 0,1 of v; vh = lanes 2,3
+    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
+    res = vmlaq_lane_f32(res, m1.val, vl, 1); // multiply-accumulate with v[1]
+    res = vmlaq_lane_f32(res, m2.val, vh, 0); // multiply-accumulate with v[2]
+    res = vaddq_f32(res, a.val); // plain addition of the offset vector
+    return v_float32x4(res);
+}
+
+#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) /* defines operator bin_op and its compound-assignment form for _Tpvec via one intrinsic */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); /* updates a in place */ \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) // 8- and 16-bit + and - use the saturating vqaddq_*/vqsubq_* intrinsics
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) // 32- and 64-bit integers use the plain (non-saturating) vaddq_*/vsubq_*/vmulq_* intrinsics
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64) // 64-bit integers: only + and - are defined here
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
+#if CV_SIMD128_64F // 64-bit Arm build: hardware vector division and v_float64x2 arithmetic
+OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64)
+#else
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{ // Division without vdivq_f32: computes a * (1/b) from a refined reciprocal estimate, so the result is an approximation.
+    float32x4_t reciprocal = vrecpeq_f32(b.val); // initial reciprocal estimate of b
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); // first refinement step (vrecpsq_f32 supplies the correction factor)
+    reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); // second refinement step
+    return v_float32x4(vmulq_f32(a.val, reciprocal));
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{
+    // In-place form of the approximate division: the quotient comes from
+    // operator / defined just above (reciprocal estimate refined twice,
+    // then multiplied by a) and is written back into a.
+    a = a / b;
+    return a;
+}
+#endif
+
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec)            \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d); /* full-width products */      \
+        return v_pack(c, d); /* narrowed back by v_pack */       \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16,  v_int16x8) // arguments: vector type, widened vector type used for the intermediate products
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint16x8, v_uint32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{ // Widening multiply: c receives the 16-bit products of the low 8 lanes, d those of the high 8 lanes.
+    c.val = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s8(a.val, b.val); // AArch64 form: multiplies the high halves directly
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); // extract the high halves first
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{ // Widening multiply: c receives the 16-bit products of the low 8 lanes, d those of the high 8 lanes.
+    c.val = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u8(a.val, b.val); // AArch64 form: multiplies the high halves directly
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); // extract the high halves first
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{ // Widening multiply: c receives the 32-bit products of the low 4 lanes, d those of the high 4 lanes.
+    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_s16(a.val, b.val); // AArch64 form: multiplies the high halves directly
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val)); // extract the high halves first
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{ // Widening multiply: c receives the 32-bit products of the low 4 lanes, d those of the high 4 lanes.
+    c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u16(a.val, b.val); // AArch64 form: multiplies the high halves directly
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val)); // extract the high halves first
+#endif // #if CV_NEON_AARCH64
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
+#if CV_NEON_AARCH64
+    d.val = vmull_high_u32(a.val, b.val);
+#else // #if CV_NEON_AARCH64
+    d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
+#endif // #if CV_NEON_AARCH64
+}
+
+// Per-lane signed 16x16 multiply that keeps only the upper 16 bits of each
+// 32-bit product.
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    const int32x4_t lo = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    const int32x4_t hi = vmull_high_s16(a.val, b.val);
+#else
+    const int32x4_t hi = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+#endif
+    // Shift each product right by 16 while narrowing back to 16-bit lanes.
+    const int16x4_t lo_top = vshrn_n_s32(lo, 16);
+    const int16x4_t hi_top = vshrn_n_s32(hi, 16);
+    return v_int16x8(vcombine_s16(lo_top, hi_top));
+}
+// Unsigned counterpart: upper 16 bits of each 16x16 -> 32-bit product.
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const uint32x4_t lo = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
+#if CV_NEON_AARCH64
+    const uint32x4_t hi = vmull_high_u16(a.val, b.val);
+#else
+    const uint32x4_t hi = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+#endif
+    const uint16x4_t lo_top = vshrn_n_u32(lo, 16);
+    const uint16x4_t hi_top = vshrn_n_u32(hi, 16);
+    return v_uint16x8(vcombine_u16(lo_top, hi_top));
+}
+
+//////// Dot Product ////////
+
+// 16 >> 32
+// Pairwise dot product of 16-bit lanes accumulated into 32-bit lanes.
+// _v128_unzip is defined outside this chunk; presumably it de-interleaves
+// (a, b) into even and odd lanes so each output keeps a's lanes in its low
+// half and b's lanes in its high half -- TODO confirm.
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    int16x8_t first, second;
+    _v128_unzip(a.val, b.val, first, second);
+    const int32x4_t first_prod = vmull_s16(vget_low_s16(first), vget_high_s16(first));
+    return v_int32x4(vmlal_s16(first_prod, vget_low_s16(second), vget_high_s16(second)));
+}
+// Same as above, with the products accumulated on top of c.
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    int16x8_t first, second;
+    _v128_unzip(a.val, b.val, first, second);
+    const int32x4_t acc = vmlal_s16(c.val, vget_low_s16(first), vget_high_s16(first));
+    return v_int32x4(vmlal_s16(acc, vget_low_s16(second), vget_high_s16(second)));
+}
+
+// 32 >> 64
+// Pairwise dot product of 32-bit lanes accumulated into 64-bit lanes.
+// _v128_unzip is defined outside this chunk; presumably it de-interleaves
+// (a, b) into even and odd lanes -- TODO confirm.
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    int32x4_t first, second;
+    _v128_unzip(a.val, b.val, first, second);
+    const int64x2_t first_prod = vmull_s32(vget_low_s32(first), vget_high_s32(first));
+    return v_int64x2(vmlal_s32(first_prod, vget_low_s32(second), vget_high_s32(second)));
+}
+// Same as above, with the products accumulated on top of c.
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    int32x4_t first, second;
+    _v128_unzip(a.val, b.val, first, second);
+    const int64x2_t acc = vmlal_s32(c.val, vget_low_s32(first), vget_high_s32(first));
+    return v_int64x2(vmlal_s32(acc, vget_low_s32(second), vget_high_s32(second)));
+}
+
+// 8 >> 32
+// Unsigned 8-bit dot product widened to 32 bits: each 32-bit output lane
+// accumulates the products of four neighbouring byte lanes.
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+#if CV_NEON_DOT
+    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
+#else
+    // mask selects the low byte of every 16-bit lane; mask32 selects the low
+    // 16 bits of every 32-bit lane.
+    const uint8x16_t zero   = vreinterpretq_u8_u32(vdupq_n_u32(0));
+    const uint8x16_t mask   = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
+    const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
+    const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
+
+    // even: products of the low bytes of each 16-bit lane;
+    // odd: products of the high bytes (brought down with a shift by 8).
+    // 255 * 255 = 65025 still fits in a 16-bit lane.
+    uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
+                                vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
+    uint16x8_t odd  = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
+                                vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
+
+    // s0 adds the products held in the low 16 bits of each 32-bit lane,
+    // s1 those held in the high 16 bits; both sums are carried out in 32 bits.
+    uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
+                              vreinterpretq_u32_u16(vbslq_u16(mask32, odd,  zero32)));
+    uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
+                              vshrq_n_u32(vreinterpretq_u32_u16(odd),  16));
+    return v_uint32x4(vaddq_u32(s0, s1));
+#endif
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+#if CV_NEON_DOT
+    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
+#else
+    return v_dotprod_expand(a, b) + c;
+#endif
+}
+
+// Signed 8-bit dot product widened to 32 bits: output lane i is the sum of
+// the four products a[4i+k] * b[4i+k], k = 0..3.
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_NEON_DOT
+    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
+#else
+    // 16-bit products of the low eight and the high eight lanes.
+    int16x8_t p0  = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+    int16x8_t p1  = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+    // Add adjacent products while widening to 32 bits. Summing them in
+    // 16 bits would wrap when both products are (-128) * (-128) = 16384,
+    // since 32768 does not fit in int16.
+    int32x4_t s0 = vpaddlq_s16(p0);
+    int32x4_t s1 = vpaddlq_s16(p1);
+    // A second pairwise add folds every group of four products into one lane.
+    return v_int32x4(vcombine_s32(
+        vpadd_s32(vget_low_s32(s0), vget_high_s32(s0)),
+        vpadd_s32(vget_low_s32(s1), vget_high_s32(s1))));
+#endif
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c)
+{
+#if CV_NEON_DOT
+    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
+#else
+    return v_dotprod_expand(a, b) + c;
+#endif
+}
+
+// 16 >> 64
+// Unsigned 16-bit dot product widened to 64 bits.
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // mask keeps the low 16 bits of every 32-bit lane.
+    const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
+    const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
+
+    // even: products of the low 16-bit halves of each 32-bit lane;
+    // odd: products of the high halves (brought down with a shift by 16).
+    uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
+                                vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
+    uint32x4_t odd  = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
+                                vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
+    // _v128_unzip is defined outside this chunk; presumably it splits the
+    // concatenation (even, odd) into even- and odd-indexed lanes -- TODO confirm.
+    uint32x4_t uzp1, uzp2;
+    _v128_unzip(even, odd, uzp1, uzp2);
+    // Widening adds, so the final sums are accumulated in 64 bits.
+    uint64x2_t s0  = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
+    uint64x2_t s1  = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
+    return v_uint64x2(vaddq_u64(s0, s1));
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+// Signed 16-bit dot product widened to 64 bits: lane 0 is the sum of the
+// products of lanes 0..3, lane 1 the sum of the products of lanes 4..7.
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    // 32-bit products of the low four and the high four lanes.
+    int32x4_t p0  = vmull_s16(vget_low_s16(a.val),  vget_low_s16(b.val));
+    int32x4_t p1  = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+
+    // Add adjacent products while widening to 64 bits. Summing them in
+    // 32 bits would wrap when both products are (-32768) * (-32768) = 2^30,
+    // since 2^31 does not fit in int32.
+    int64x2_t s0 = vpaddlq_s32(p0);
+    int64x2_t s1 = vpaddlq_s32(p1);
+
+    // Fold each pair of partial sums into a single 64-bit lane.
+    return v_int64x2(vcombine_s64(
+        vadd_s64(vget_low_s64(s0), vget_high_s64(s0)),
+        vadd_s64(vget_low_s64(s1), vget_high_s64(s1))));
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                  const v_int64x2& c)
+{ return v_dotprod_expand(a, b) + c; }
+
+// 32 >> 64f
+#if CV_SIMD128_64F
+// 32-bit dot product returned as doubles: the 64-bit integer result of
+// v_dotprod is converted with v_cvt_f64.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{
+    const v_int64x2 dot = v_dotprod(a, b);
+    return v_cvt_f64(dot);
+}
+// Accumulating variant: adds the converted dot product to c.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c)
+{
+    const v_float64x2 dot = v_dotprod_expand(a, b);
+    return dot + c;
+}
+#endif
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+// Fast 16 -> 32 dot product: output lane i is a[i]*b[i] + a[i+4]*b[i+4]
+// (low-half products plus high-half products, no lane shuffling).
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    const int32x4_t low_prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    return v_int32x4(vmlal_high_s16(low_prod, a.val, b.val));
+#else
+    return v_int32x4(vmlal_s16(low_prod, vget_high_s16(a.val), vget_high_s16(b.val)));
+#endif
+}
+// Accumulating variant: both half-products are added on top of c.
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    const int32x4_t acc = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+#if CV_NEON_AARCH64
+    return v_int32x4(vmlal_high_s16(acc, a.val, b.val));
+#else
+    return v_int32x4(vmlal_s16(acc, vget_high_s16(a.val), vget_high_s16(b.val)));
+#endif
+}
+
+// 32 >> 64
+// Fast 32 -> 64 dot product: output lane i is a[i]*b[i] + a[i+2]*b[i+2]
+// (low-half products plus high-half products, no lane shuffling).
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    const int64x2_t low_prod = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+#if CV_NEON_AARCH64
+    return v_int64x2(vmlal_high_s32(low_prod, a.val, b.val));
+#else
+    return v_int64x2(vmlal_s32(low_prod, vget_high_s32(a.val), vget_high_s32(b.val)));
+#endif
+}
+// Accumulating variant: both half-products are added on top of c.
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    const int64x2_t acc = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+#if CV_NEON_AARCH64
+    return v_int64x2(vmlal_high_s32(acc, a.val, b.val));
+#else
+    return v_int64x2(vmlal_s32(acc, vget_high_s32(a.val), vget_high_s32(b.val)));
+#endif
+}
+
+// 8 >> 32
+// Fast unsigned 8 -> 32 dot product. Without the dot-product extension the
+// 16-bit products are widened to 32 bits before any addition takes place.
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+#if CV_NEON_DOT
+    return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
+#else
+    const uint16x8_t lo_prod = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
+    const uint16x8_t hi_prod = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
+    const uint32x4_t front = vaddl_u16(vget_low_u16(lo_prod), vget_low_u16(hi_prod));
+    const uint32x4_t back  = vaddl_u16(vget_high_u16(lo_prod), vget_high_u16(hi_prod));
+    return v_uint32x4(vaddq_u32(front, back));
+#endif
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+#if CV_NEON_DOT
+    return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
+#else
+    const v_uint32x4 dot = v_dotprod_expand_fast(a, b);
+    return dot + c;
+#endif
+}
+
+// Fast signed 8 -> 32 dot product. In the fallback path output lane j is
+// a[j]*b[j] + a[j+4]*b[j+4] + a[j+8]*b[j+8] + a[j+12]*b[j+12].
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_NEON_DOT
+    return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
+#else
+    // Keep the two sets of 16-bit products separate: accumulating them in
+    // 16 bits would wrap when both are (-128) * (-128) = 16384.
+    int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
+    int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
+    // Widen to 32 bits before adding, as the unsigned overload does.
+    int32x4_t s0 = vaddl_s16(vget_low_s16(p0), vget_low_s16(p1));
+    int32x4_t s1 = vaddl_s16(vget_high_s16(p0), vget_high_s16(p1));
+    return v_int32x4(vaddq_s32(s0, s1));
+#endif
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+#if CV_NEON_DOT
+    return v_int32x4(vdotq_s32(c.val, a.val, b.val));
+#else
+    return v_dotprod_expand_fast(a, b) + c;
+#endif
+}
+
+// 16 >> 64
+// Fast unsigned 16 -> 64 dot product: lane 0 sums the products of the low
+// four lanes, lane 1 those of the high four lanes, in a convenient order.
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const uint32x4_t lo_prod = vmull_u16(vget_low_u16(a.val),  vget_low_u16(b.val));
+    const uint32x4_t hi_prod = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
+    // Widening adds keep every partial sum in 64 bits.
+    const uint64x2_t lo_sum = vaddl_u32(vget_low_u32(lo_prod), vget_high_u32(lo_prod));
+    const uint64x2_t hi_sum = vaddl_u32(vget_low_u32(hi_prod), vget_high_u32(hi_prod));
+    return v_uint64x2(vaddq_u64(lo_sum, hi_sum));
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    const v_uint64x2 dot = v_dotprod_expand_fast(a, b);
+    return dot + c;
+}
+
+// Fast signed 16 -> 64 dot product: output lane j is
+// a[j]*b[j] + a[j+2]*b[j+2] + a[j+4]*b[j+4] + a[j+6]*b[j+6].
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // Keep the two sets of 32-bit products separate: accumulating them in
+    // 32 bits would wrap when both are (-32768) * (-32768) = 2^30.
+    int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
+    // Widen to 64 bits before adding, as the unsigned overload does.
+    int64x2_t s0 = vaddl_s32(vget_low_s32(p0), vget_high_s32(p0));
+    int64x2_t s1 = vaddl_s32(vget_low_s32(p1), vget_high_s32(p1));
+    return v_int64x2(vaddq_s64(s0, s1));
+}
+// Accumulating variant: adds the widened dot product to c.
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+// 32 >> 64f
+#if CV_SIMD128_64F
+// Fast 32-bit dot product returned as doubles: the 64-bit integer result of
+// v_dotprod_fast is converted with v_cvt_f64.
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    const v_int64x2 dot = v_dotprod_fast(a, b);
+    return v_cvt_f64(dot);
+}
+// Accumulating variant: adds the converted dot product to c.
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{
+    const v_float64x2 dot = v_dotprod_expand_fast(a, b);
+    return dot + c;
+}
+#endif
+
+
+// Defines the bitwise operators &, | and ^ (through OPENCV_HAL_IMPL_NEON_BIN_OP)
+// plus a unary ~ for one integer vector type. The ~ operator reinterprets the
+// register as u8 lanes so that a single vmvnq_u8 serves every lane width.
+#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
+    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
+    }
+
+// Bitwise operators for every integer lane type.
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
+
+// Defines a bitwise binary operator and its compound-assignment form for
+// v_float32x4. The float lanes are reinterpreted as s32, combined with the
+// given integer intrinsic, and reinterpreted back; no numeric conversion occurs.
+#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
+    return a; \
+}
+
+// Bit-pattern AND / OR / XOR on single-precision vectors.
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
+OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
+
+// Bitwise NOT of the raw bit pattern of each single-precision lane.
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    const int32x4_t bits = vreinterpretq_s32_f32(a.val);
+    const int32x4_t flipped = vmvnq_s32(bits);
+    return v_float32x4(vreinterpretq_f32_s32(flipped));
+}
+
+// Single-precision square root and inverse square root.
+// With CV_SIMD128_64F the vsqrtq_f32 instruction is used directly; otherwise
+// both are approximated from the vrsqrteq_f32 estimate refined by two
+// vrsqrtsq_f32 steps.
+#if CV_SIMD128_64F
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    return v_float32x4(vsqrtq_f32(x.val));
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    v_float32x4 one = v_setall_f32(1.0f);
+    return one / v_sqrt(x);
+}
+#else
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    // Clamp the estimate's input to FLT_MIN; presumably this keeps the
+    // estimate finite for x == 0 so that x * e yields 0 -- TODO confirm.
+    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
+    float32x4_t e = vrsqrteq_f32(x1);
+    // Two refinement steps of the reciprocal-square-root estimate.
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
+    // sqrt(x) = x * (1 / sqrt(x)); note the unclamped x is used here.
+    return v_float32x4(vmulq_f32(x.val, e));
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    // Estimate followed by two refinement steps; the input is not clamped.
+    float32x4_t e = vrsqrteq_f32(x.val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, e), e), e);
+    return v_float32x4(e);
+}
+#endif
+
+// Defines v_abs for a signed integer vector type. The result of vabsq is
+// reinterpreted (not converted) as the unsigned vector type of the same width.
+#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
+
+// v_abs for 8-, 16- and 32-bit signed lanes.
+OPENCV_HAL_IMPL_NEON_ABS(v_uint8x16, v_int8x16, u8, s8)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint16x8, v_int16x8, u16, s16)
+OPENCV_HAL_IMPL_NEON_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+// Per-lane absolute value of a single-precision vector.
+inline v_float32x4 v_abs(v_float32x4 x)
+{
+    const float32x4_t magnitude = vabsq_f32(x.val);
+    return v_float32x4(magnitude);
+}
+
+#if CV_SIMD128_64F
+// Defines a bitwise binary operator and its compound-assignment form for
+// v_float64x2. The double lanes are reinterpreted as s64, combined with the
+// given integer intrinsic, and reinterpreted back; no numeric conversion occurs.
+#define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \
+    return a; \
+}
+
+// Bit-pattern AND / OR / XOR on double-precision vectors.
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64)
+OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64)
+
+// Bitwise NOT of the raw bit pattern of each double-precision lane. The
+// inversion is done on s32 lanes, which flips every bit of the register.
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{
+    const int32x4_t bits = vreinterpretq_s32_f64(a.val);
+    const int32x4_t flipped = vmvnq_s32(bits);
+    return v_float64x2(vreinterpretq_f64_s32(flipped));
+}
+
+// Per-lane square root of a double-precision vector.
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    const float64x2_t root = vsqrtq_f64(x.val);
+    return v_float64x2(root);
+}
+
+// Per-lane 1 / sqrt(x) for a double-precision vector, computed as an exact
+// division by v_sqrt(x) rather than from a reciprocal estimate.
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    // Use a double literal: the vector holds doubles, so no float -> double
+    // conversion of the constant is needed.
+    v_float64x2 one = v_setall_f64(1.0);
+    return one / v_sqrt(x);
+}
+
+// Per-lane absolute value of a double-precision vector.
+inline v_float64x2 v_abs(v_float64x2 x)
+{
+    const float64x2_t magnitude = vabsq_f64(x.val);
+    return v_float64x2(magnitude);
+}
+#endif
+
+// TODO: exp, log, sin, cos
+
+// Defines a named binary function `func(a, b)` for one vector type by
+// forwarding both operands to the given NEON intrinsic.
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+// Per-lane minimum / maximum for every 8-, 16- and 32-bit lane type and for
+// float; the double versions exist only when CV_SIMD128_64F is set.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64)
+#endif
+
+#if CV_SIMD128_64F
+// Bitwise NOT for signed 64-bit lanes, built by XOR-ing with an all-ones
+// register (written as four 0xFFFFFFFF words).
+inline int64x2_t vmvnq_s64(int64x2_t a)
+{
+    return veorq_s64(a, vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF)));
+}
+// Bitwise NOT for unsigned 64-bit lanes, built the same way.
+inline uint64x2_t vmvnq_u64(uint64x2_t a)
+{
+    return veorq_u64(a, vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)));
+}
+#endif
+// Defines the six comparison operators for one vector type. Each NEON
+// compare returns an unsigned mask vector, which `cast` converts back to the
+// operand's own lane type. `!=` is built as the bitwise NOT (vmvnq on the
+// unsigned `not_suffix` type) of the equality mask.
+#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
+
+// Despite the macro's name it is also instantiated for the float types; their
+// results are bit masks reinterpreted as floating-point lanes. The 64-bit lane
+// types are only available under CV_SIMD128_64F.
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
+OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
+#endif
+
+// Mask of lanes that are not NaN: a lane compares equal to itself only when
+// it is not NaN.
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    const uint32x4_t self_equal = vceqq_f32(a.val, a.val);
+    return v_float32x4(vreinterpretq_f32_u32(self_equal));
+}
+#if CV_SIMD128_64F
+// Double-precision counterpart of the self-comparison test.
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    const uint64x2_t self_equal = vceqq_f64(a.val, a.val);
+    return v_float64x2(vreinterpretq_f64_u64(self_equal));
+}
+#endif
+
+// Wrapping arithmetic for 8- and 16-bit lanes: these map straight to the
+// plain vaddq / vsubq / vmulq intrinsics rather than the saturating
+// (vq-prefixed) ones.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_mul_wrap, vmulq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_mul_wrap, vmulq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_mul_wrap, vmulq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_mul_wrap, vmulq_s16)
+
+// Absolute difference for unsigned and floating-point lanes (same type in
+// and out), via the vabdq intrinsics.
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_absdiff, vabdq_f64)
+#endif
+
+/** Saturating absolute difference: saturating subtract, then saturating abs **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    const int8x16_t diff = vqsubq_s8(a.val, b.val);
+    return v_int8x16(vqabsq_s8(diff));
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{
+    const int16x8_t diff = vqsubq_s16(a.val, b.val);
+    return v_int16x8(vqabsq_s16(diff));
+}
+
+// Like OPENCV_HAL_IMPL_NEON_BIN_FUNC, but the result has a different vector
+// type (_Tpvec2) than the operands; `cast` converts the intrinsic's result.
+#define OPENCV_HAL_IMPL_NEON_BIN_FUNC2(_Tpvec, _Tpvec2, cast, func, intrin) \
+inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec2(cast(intrin(a.val, b.val))); \
+}
+
+// Absolute difference of signed lanes, returned as the unsigned type of the
+// same width (the vabdq result is reinterpreted, not converted).
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int8x16, v_uint8x16, vreinterpretq_u8_s8, v_absdiff, vabdq_s8)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int16x8, v_uint16x8, vreinterpretq_u16_s16, v_absdiff, vabdq_s16)
+OPENCV_HAL_IMPL_NEON_BIN_FUNC2(v_int32x4, v_uint32x4, vreinterpretq_u32_s32, v_absdiff, vabdq_s32)
+
+// Per-lane sqrt(a*a + b*b) for single-precision vectors.
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    const float32x4_t a_sq = vmulq_f32(a.val, a.val);
+    const float32x4_t sum_sq = vmlaq_f32(a_sq, b.val, b.val);
+    return v_sqrt(v_float32x4(sum_sq));
+}
+
+// Per-lane a*a + b*b for single-precision vectors.
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    const float32x4_t a_sq = vmulq_f32(a.val, a.val);
+    const float32x4_t sum_sq = vmlaq_f32(a_sq, b.val, b.val);
+    return v_float32x4(sum_sq);
+}
+
+// Per-lane a * b + c for single-precision vectors.
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+#if CV_SIMD128_64F
+    // ARMv8 (the configuration in which CV_SIMD128_64F is defined) provides
+    // the vfmaq family for both single- and double-precision vectors.
+    const float32x4_t result = vfmaq_f32(c.val, a.val, b.val);
+#else
+    const float32x4_t result = vmlaq_f32(c.val, a.val, b.val);
+#endif
+    return v_float32x4(result);
+}
+
+// Per-lane a * b + c for signed 32-bit integer vectors.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    const int32x4_t result = vmlaq_s32(c.val, a.val, b.val);
+    return v_int32x4(result);
+}
+
+// v_muladd forwards to v_fma (single-precision overload).
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{ return v_fma(a, b, c); }
+
+// v_muladd forwards to v_fma (signed 32-bit overload).
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{ return v_fma(a, b, c); }
+
+#if CV_SIMD128_64F
+// Per-lane sqrt(a*a + b*b) for double-precision vectors.
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    const float64x2_t a_sq = vmulq_f64(a.val, a.val);
+    const float64x2_t b_sq = vmulq_f64(b.val, b.val);
+    return v_sqrt(v_float64x2(vaddq_f64(a_sq, b_sq)));
+}
+
+// Per-lane a*a + b*b for double-precision vectors.
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    const float64x2_t a_sq = vmulq_f64(a.val, a.val);
+    const float64x2_t b_sq = vmulq_f64(b.val, b.val);
+    return v_float64x2(vaddq_f64(a_sq, b_sq));
+}
+
+// Per-lane a * b + c for double-precision vectors (vfmaq_f64).
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    const float64x2_t result = vfmaq_f64(c.val, a.val, b.val);
+    return v_float64x2(result);
+}
+
+// v_muladd forwards to v_fma (double-precision overload).
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{ return v_fma(a, b, c); }
+#endif
+
+// trade efficiency for convenience
+// operator<< / operator>> accept a run-time shift count: the count is
+// broadcast into a signed vector and passed to vshlq, with >> implemented as
+// a shift by the negated count. The template forms v_shl<n>, v_shr<n> and
+// v_rshr<n> use the immediate-operand intrinsics (vshlq_n, vshrq_n, and the
+// rounding vrshrq_n) and need a compile-time n.
+#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
+
+// The third and fourth arguments give the signed scalar type and suffix
+// used for the broadcast shift-count vector.
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
+OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
+
+// Lane-wise rotation built on vextq, which extracts a full vector from the
+// concatenation of its two operands starting at a given lane.
+//  - one-argument forms pair `a` with an all-zero vector, so vacated lanes
+//    are filled with zeros;
+//  - two-argument forms take the fill lanes from `b`.
+// v_rotate_left<0> is specialised to return `a` unchanged; presumably this is
+// because the generic form would pass index nlanes to vextq, which is outside
+// the valid immediate range -- TODO confirm.
+#define OPENCV_HAL_IMPL_NEON_ROTATE_OP(_Tpvec, suffix) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+// Rotation helpers for every lane type; the double version exists only when
+// CV_SIMD128_64F is set.
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int8x16, s8)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int16x8, s16)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int32x4, s32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float32x4, f32)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
+#endif
+
+// v_load_low: loads 64 bits from ptr into the low half of a 128-bit vector.
+// Two implementations exist:
+//  - clang on AArch64: the 8 bytes are read through a 1-byte-aligned uint64
+//    typedef and placed in lane 0 of a v_uint64x2 whose lane 1 is the
+//    constant 123456, then reinterpreted to the target type.
+//  - everywhere else: vld1 loads the low half and the high half is zero.
+// NOTE(review): the high half therefore differs between the two paths
+// (123456 vs. 0); callers presumably must not rely on it -- confirm.
+#if defined(__clang__) && defined(__aarch64__)
+// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
+uint64 v = *(unaligned_uint64*)ptr; \
+return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
+}
+#else
+#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
+#endif
+
+// Defines the load/store family for one vector type:
+//  - v_load / v_load_aligned: both use vld1q (no separate aligned form here);
+//  - v_load_low: provided by OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP;
+//  - v_load_halves: low half from ptr0, high half from ptr1;
+//  - v_store, v_store_aligned, v_store_aligned_nocache and the StoreMode
+//    overload: all use vst1q, and the mode argument is ignored;
+//  - v_store_low / v_store_high: write only the low or high 64 bits.
+#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(vld1q_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vld1q_##suffix(ptr)); } \
+OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vst1q_##suffix(ptr, a.val); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
+
+// Load/store helpers for every lane type; the double version exists only
+// when CV_SIMD128_64F is set.
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
+#endif
+
+// Sum of all sixteen unsigned 8-bit lanes, widened so it cannot overflow.
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{
+#if CV_NEON_AARCH64
+    const uint16_t total = vaddlvq_u8(a.val);
+    return total;
+#else
+    // Two widening pairwise adds (8 -> 16 -> 32 bits), then fold four lanes.
+    const uint32x4_t quads = vpaddlq_u16(vpaddlq_u8(a.val));
+    const uint32x2_t halves = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(halves, halves), 0);
+#endif
+}
+// Sum of all sixteen signed 8-bit lanes.
+inline int v_reduce_sum(const v_int8x16& a)
+{
+#if CV_NEON_AARCH64
+    const int16_t total = vaddlvq_s8(a.val);
+    return total;
+#else
+    const int32x4_t quads = vpaddlq_s16(vpaddlq_s8(a.val));
+    const int32x2_t halves = vpadd_s32(vget_low_s32(quads), vget_high_s32(quads));
+    return vget_lane_s32(vpadd_s32(halves, halves), 0);
+#endif
+}
+// Sum of all eight unsigned 16-bit lanes.
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{
+#if CV_NEON_AARCH64
+    const uint32_t total = vaddlvq_u16(a.val);
+    return total;
+#else
+    const uint32x4_t pairs = vpaddlq_u16(a.val);
+    const uint32x2_t halves = vpadd_u32(vget_low_u32(pairs), vget_high_u32(pairs));
+    return vget_lane_u32(vpadd_u32(halves, halves), 0);
+#endif
+}
+// Sum of all eight signed 16-bit lanes.
+inline int v_reduce_sum(const v_int16x8& a)
+{
+#if CV_NEON_AARCH64
+    const int32_t total = vaddlvq_s16(a.val);
+    return total;
+#else
+    const int32x4_t pairs = vpaddlq_s16(a.val);
+    const int32x2_t halves = vpadd_s32(vget_low_s32(pairs), vget_high_s32(pairs));
+    return vget_lane_s32(vpadd_s32(halves, halves), 0);
+#endif
+}
+
+// Horizontal reduction (v_reduce_<func>) over sixteen 8-bit lanes.
+// On AArch64 a single across-vector intrinsic (v<op>vq) is used. Otherwise
+// the low and high halves are combined with the pairwise intrinsic vp<op>,
+// which is then applied three more times (16 -> 8 -> 4 -> 2 -> 1 candidates)
+// before lane 0 is read out.
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
+}
+#endif // #if CV_NEON_AARCH64
+
+// v_reduce_max / v_reduce_min for 8-bit lanes.
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
+
+// Horizontal reduction (v_reduce_<func>) over eight 16-bit lanes.
+// On AArch64 a single across-vector intrinsic (v<op>vq) is used. Otherwise
+// the low and high halves are combined with the pairwise intrinsic vp<op>,
+// which is then applied twice more (8 -> 4 -> 2 -> 1 candidates) before
+// lane 0 is read out.
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    a0 = vp##vectorfunc##_##suffix(a0, a0); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
+}
+#endif // #if CV_NEON_AARCH64
+
+// v_reduce_max / v_reduce_min for 16-bit lanes.
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
+
+// Horizontal reduction (v_reduce_<func>) over four 32-bit lanes.
+// On AArch64 a single across-vector intrinsic (v<op>vq) is used. Otherwise
+// the low and high halves are combined pairwise into a0, and a second
+// pairwise step reduces a0's two lanes into lane 0. The second operand of
+// that step (the high half of a) only influences lane 1, which is discarded.
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    return v##vectorfunc##vq_##suffix(a.val); \
+}
+#else // #if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    _Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
+    return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, vget_high_##suffix(a.val)),0); \
+}
+#endif // #if CV_NEON_AARCH64
+
+// v_reduce_sum / v_reduce_max / v_reduce_min for 32-bit integer and float lanes.
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, sum, add, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, max, max, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, uint32x2, unsigned, min, min, u32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, sum, add, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, max, max, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int32x2, int, min, min, s32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
+OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
+
+// Horizontal sum of the two unsigned 64-bit lanes of `a`.
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+#if CV_NEON_AARCH64
+    // Across-vector add.
+    const uint64_t total = vaddvq_u64(a.val);
+    return total;
+#else
+    // Fallback: add the low half to the high half and read the single lane.
+    const uint64x1_t lo = vget_low_u64(a.val);
+    const uint64x1_t hi = vget_high_u64(a.val);
+    const uint64x1_t both = vadd_u64(lo, hi);
+    return vget_lane_u64(both, 0);
+#endif
+}
+// Horizontal sum of the two signed 64-bit lanes of `a`.
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+#if CV_NEON_AARCH64
+    // Across-vector add.
+    const int64_t total = vaddvq_s64(a.val);
+    return total;
+#else
+    // Fallback: add the low half to the high half and read the single lane.
+    const int64x1_t lo = vget_low_s64(a.val);
+    const int64x1_t hi = vget_high_s64(a.val);
+    const int64x1_t both = vadd_s64(lo, hi);
+    return vget_lane_s64(both, 0);
+#endif
+}
+#if CV_SIMD128_64F
+// Horizontal sum of the two double-precision lanes of `a`.
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    const double total = vaddvq_f64(a.val);
+    return total;
+}
+#endif
+
+// Returns a vector whose four lanes are the horizontal sums of a, b, c and d,
+// in that order: {sum(a), sum(b), sum(c), sum(d)}.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+#if CV_NEON_AARCH64
+    float32x4_t ab = vpaddq_f32(a.val, b.val); // a0+a1 a2+a3 b0+b1 b2+b3
+    float32x4_t cd = vpaddq_f32(c.val, d.val); // c0+c1 c2+c3 d0+d1 d2+d3
+    return v_float32x4(vpaddq_f32(ab, cd));  // sumA sumB sumC sumD
+#else // #if CV_NEON_AARCH64
+    // Fallback: transpose pairs so that matching lanes of a/b (and c/d) can be
+    // added with plain vector adds.
+    float32x4x2_t ab = vtrnq_f32(a.val, b.val);
+    float32x4x2_t cd = vtrnq_f32(c.val, d.val);
+
+    float32x4_t u0 = vaddq_f32(ab.val[0], ab.val[1]); // a0+a1 b0+b1 a2+a3 b2+b3
+    float32x4_t u1 = vaddq_f32(cd.val[0], cd.val[1]); // c0+c1 d0+d1 c2+c3 d2+d3
+
+    // v0 gathers the "lanes 0+1" partial sums, v1 the "lanes 2+3" partial sums.
+    float32x4_t v0 = vcombine_f32(vget_low_f32(u0), vget_low_f32(u1));
+    float32x4_t v1 = vcombine_f32(vget_high_f32(u0), vget_high_f32(u1));
+
+    return v_float32x4(vaddq_f32(v0, v1));
+#endif // #if CV_NEON_AARCH64
+}
+
+// Sum of absolute differences over all 16 unsigned 8-bit lanes.
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const uint8x16_t absdiff = vabdq_u8(a.val, b.val);
+#if CV_NEON_AARCH64
+    // Widening across-vector add.
+    const uint16_t total = vaddlvq_u8(absdiff);
+    return total;
+#else
+    // Widen pairwise to 32-bit lanes, then fold the four lanes down to one.
+    const uint32x4_t quads = vpaddlq_u16(vpaddlq_u8(absdiff));
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 16 signed 8-bit lanes; the per-lane
+// differences are reinterpreted as unsigned before being accumulated.
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const uint8x16_t absdiff = vreinterpretq_u8_s8(vabdq_s8(a.val, b.val));
+#if CV_NEON_AARCH64
+    // Widening across-vector add.
+    const uint16_t total = vaddlvq_u8(absdiff);
+    return total;
+#else
+    // Widen pairwise to 32-bit lanes, then fold the four lanes down to one.
+    const uint32x4_t quads = vpaddlq_u16(vpaddlq_u8(absdiff));
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 8 unsigned 16-bit lanes.
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const uint16x8_t absdiff = vabdq_u16(a.val, b.val);
+#if CV_NEON_AARCH64
+    // Widening across-vector add.
+    const uint32_t total = vaddlvq_u16(absdiff);
+    return total;
+#else
+    // Widen pairwise to 32-bit lanes, then fold the four lanes down to one.
+    const uint32x4_t quads = vpaddlq_u16(absdiff);
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 8 signed 16-bit lanes; the per-lane
+// differences are reinterpreted as unsigned before being accumulated.
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    const uint16x8_t absdiff = vreinterpretq_u16_s16(vabdq_s16(a.val, b.val));
+#if CV_NEON_AARCH64
+    // Widening across-vector add.
+    const uint32_t total = vaddlvq_u16(absdiff);
+    return total;
+#else
+    // Widen pairwise to 32-bit lanes, then fold the four lanes down to one.
+    const uint32x4_t quads = vpaddlq_u16(absdiff);
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(quads), vget_high_u32(quads));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 4 unsigned 32-bit lanes.
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const uint32x4_t absdiff = vabdq_u32(a.val, b.val);
+#if CV_NEON_AARCH64
+    // Across-vector add.
+    const uint32_t total = vaddvq_u32(absdiff);
+    return total;
+#else
+    // Fold the four lanes down to one with two pairwise adds.
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(absdiff), vget_high_u32(absdiff));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 4 signed 32-bit lanes; the per-lane
+// differences are reinterpreted as unsigned before being accumulated.
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    const uint32x4_t absdiff = vreinterpretq_u32_s32(vabdq_s32(a.val, b.val));
+#if CV_NEON_AARCH64
+    // Across-vector add.
+    const uint32_t total = vaddvq_u32(absdiff);
+    return total;
+#else
+    // Fold the four lanes down to one with two pairwise adds.
+    const uint32x2_t pairs = vpadd_u32(vget_low_u32(absdiff), vget_high_u32(absdiff));
+    return vget_lane_u32(vpadd_u32(pairs, pairs), 0);
+#endif
+}
+// Sum of absolute differences over all 4 single-precision lanes.
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const float32x4_t absdiff = vabdq_f32(a.val, b.val);
+#if CV_NEON_AARCH64
+    // Across-vector add.
+    return vaddvq_f32(absdiff);
+#else
+    // Fold the four lanes down to one with two pairwise adds.
+    const float32x2_t pairs = vpadd_f32(vget_low_f32(absdiff), vget_high_f32(absdiff));
+    return vget_lane_f32(vpadd_f32(pairs, pairs), 0);
+#endif
+}
+
+// Per-lane population count. Every overload counts bits per byte with
+// vcntq_u8 and then widens with pairwise adds (vpaddl) until the counts match
+// the lane width of the input. The result is always an unsigned vector.
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(a.val);
+    return v_uint8x16(per_byte);
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_s8(a.val));
+    return v_uint8x16(per_byte);
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_u16(a.val));
+    return v_uint16x8(vpaddlq_u8(per_byte));
+}
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_s16(a.val));
+    return v_uint16x8(vpaddlq_u8(per_byte));
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_u32(a.val));
+    const uint16x8_t per_half = vpaddlq_u8(per_byte);
+    return v_uint32x4(vpaddlq_u16(per_half));
+}
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_s32(a.val));
+    const uint16x8_t per_half = vpaddlq_u8(per_byte);
+    return v_uint32x4(vpaddlq_u16(per_half));
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_u64(a.val));
+    const uint16x8_t per_half = vpaddlq_u8(per_byte);
+    const uint32x4_t per_word = vpaddlq_u16(per_half);
+    return v_uint64x2(vpaddlq_u32(per_word));
+}
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{
+    const uint8x16_t per_byte = vcntq_u8(vreinterpretq_u8_s64(a.val));
+    const uint16x8_t per_half = vpaddlq_u8(per_byte);
+    const uint32x4_t per_word = vpaddlq_u16(per_half);
+    return v_uint64x2(vpaddlq_u32(per_word));
+}
+
+// Packs the most significant bit of each of the 16 byte lanes into an integer:
+// bit i of the result is the top bit of lane i.
+inline int v_signmask(const v_uint8x16& a)
+{
+#if CV_NEON_AARCH64
+    // Each lane's top bit is moved to bit (lane index mod 8) of that lane.
+    const int8x16_t signPosition = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7};
+    // Table lookup pairs byte i with byte i+8, so each 16-bit lane carries one
+    // bit for the low mask byte and one for the high mask byte
+    // (NOTE(review): relies on little-endian lane layout - confirm).
+    const uint8x16_t byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), signPosition);
+    uint8x16_t v1 = vqtbl1q_u8(v0, byteOrder);
+    // All set bits are distinct, so the across-vector sum acts as a bitwise OR.
+    uint32_t t0 = vaddlvq_u16(vreinterpretq_u16_u8(v1));
+    return t0;
+#else // #if CV_NEON_AARCH64
+    // Per-lane left-shift amounts 0..7, repeated for both halves.
+    int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
+    uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
+    // Sum each 8-byte half into one 64-bit lane (an 8-bit mask per half).
+    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
+#endif // #if CV_NEON_AARCH64
+}
+
+// Signed overload: reinterprets the bits as unsigned and forwards.
+inline int v_signmask(const v_int8x16& a)
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+// Packs the most significant bit of each of the 8 16-bit lanes into an
+// integer: bit i of the result is the top bit of lane i.
+inline int v_signmask(const v_uint16x8& a)
+{
+#if CV_NEON_AARCH64
+    // Move each lane's top bit to the bit position equal to its lane index.
+    const int16x8_t signPosition = {0,1,2,3,4,5,6,7};
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), signPosition);
+    // All set bits are distinct, so the across-vector sum acts as a bitwise OR.
+    uint32_t t0 = vaddlvq_u16(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
+    // Per-lane left-shift amounts 0..3, repeated for both halves.
+    int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
+    uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
+    // Sum each 4-lane half into one 64-bit lane (a 4-bit mask per half).
+    uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
+#endif // #if CV_NEON_AARCH64
+}
+// Signed overload: reinterprets the bits as unsigned and forwards.
+inline int v_signmask(const v_int16x8& a)
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+
+// Packs the most significant bit of each of the 4 32-bit lanes into an
+// integer: bit i of the result is the top bit of lane i.
+inline int v_signmask(const v_uint32x4& a)
+{
+#if CV_NEON_AARCH64
+    // Move each lane's top bit to the bit position equal to its lane index.
+    const int32x4_t signPosition = {0,1,2,3};
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), signPosition);
+    // All set bits are distinct, so the across-vector sum acts as a bitwise OR.
+    uint32_t t0 = vaddvq_u32(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
+    // Per-lane left-shift amounts {0, 1}, repeated for both halves.
+    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
+    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
+    // Sum each 2-lane half into one 64-bit lane (a 2-bit mask per half).
+    uint64x2_t v1 = vpaddlq_u32(v0);
+    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
+#endif // #if CV_NEON_AARCH64
+}
+// Signed and float overloads: reinterpret the bits as unsigned and forward.
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+// Packs the most significant bit of each of the 2 64-bit lanes into an
+// integer: bit 0 comes from lane 0, bit 1 from lane 1.
+inline int v_signmask(const v_uint64x2& a)
+{
+#if CV_NEON_AARCH64
+    // Move each lane's top bit to the bit position equal to its lane index.
+    const int64x2_t signPosition = {0,1};
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition);
+    // Result is at most 3; the uint64_t is narrowed to int on return.
+    uint64_t t0 = vaddvq_u64(v0);
+    return t0;
+#else // #if CV_NEON_AARCH64
+    // Shift amounts are all zero here; lane 1 is moved into bit 1 by the
+    // scalar shift in the return expression instead.
+    int64x1_t m0 = vdup_n_s64(0);
+    uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
+    return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
+#endif // #if CV_NEON_AARCH64
+}
+// Signed and double overloads: reinterpret the bits as unsigned and forward.
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif
+
+// v_scan_forward: number of trailing zero bits in the lane sign mask of `a`,
+// computed as trailingZeros32(v_signmask(a)) for every vector type.
+inline int v_scan_forward(const v_int8x16& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_uint8x16& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_int16x8& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_uint16x8& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_int32x4& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_uint32x4& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_float32x4& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_int64x2& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+inline int v_scan_forward(const v_uint64x2& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+#if CV_SIMD128_64F
+inline int v_scan_forward(const v_float64x2& a)
+{
+    const int mask = v_signmask(a);
+    return trailingZeros32(mask);
+}
+#endif
+
+// Defines v_check_all / v_check_any for an unsigned vector type. `shift` is
+// the index of each lane's most significant bit, so the checks look only at
+// that bit.
+// With CV_NEON_AARCH64: the across-vector minimum has its top bit set only if
+// every lane does (check_all); the across-vector maximum has it set if any
+// lane does (check_any).
+#if CV_NEON_AARCH64
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        return (vminvq_##suffix(a.val) >> shift) != 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        return (vmaxvq_##suffix(a.val) >> shift) != 0; \
+    }
+#else // #if CV_NEON_AARCH64
+// Fallback: reduce every lane to its top bit (check_all does so on the
+// bitwise complement), then OR the two 64-bit halves of the register.
+// check_all passes when no complemented top bit survives; check_any passes
+// when at least one top bit survives.
+    #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
+    inline bool v_check_all(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
+    } \
+    inline bool v_check_any(const v_##_Tpvec& a) \
+    { \
+        _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
+        uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
+        return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
+    }
+#endif // #if CV_NEON_AARCH64
+
+// Instantiate for 8-, 16- and 32-bit unsigned lanes (shift = lane width - 1).
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
+OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
+
+// True when the most significant bit is set in both 64-bit lanes.
+inline bool v_check_all(const v_uint64x2& a)
+{
+    const uint64x2_t top_bits = vshrq_n_u64(a.val, 63);
+    const uint64_t lane0 = vgetq_lane_u64(top_bits, 0);
+    const uint64_t lane1 = vgetq_lane_u64(top_bits, 1);
+    return (lane0 & lane1) == 1;
+}
+// True when the most significant bit is set in at least one 64-bit lane.
+inline bool v_check_any(const v_uint64x2& a)
+{
+    const uint64x2_t top_bits = vshrq_n_u64(a.val, 63);
+    const uint64_t lane0 = vgetq_lane_u64(top_bits, 0);
+    const uint64_t lane1 = vgetq_lane_u64(top_bits, 1);
+    return (lane0 | lane1) != 0;
+}
+
+// Signed and floating-point overloads of v_check_all: reinterpret the lanes as
+// the unsigned type of the same width and defer to that implementation.
+inline bool v_check_all(const v_int8x16& a)
+{
+    const v_uint8x16 bits = v_reinterpret_as_u8(a);
+    return v_check_all(bits);
+}
+inline bool v_check_all(const v_int16x8& a)
+{
+    const v_uint16x8 bits = v_reinterpret_as_u16(a);
+    return v_check_all(bits);
+}
+inline bool v_check_all(const v_int32x4& a)
+{
+    const v_uint32x4 bits = v_reinterpret_as_u32(a);
+    return v_check_all(bits);
+}
+inline bool v_check_all(const v_float32x4& a)
+{
+    const v_uint32x4 bits = v_reinterpret_as_u32(a);
+    return v_check_all(bits);
+}
+
+// Same forwarding scheme for v_check_any.
+inline bool v_check_any(const v_int8x16& a)
+{
+    const v_uint8x16 bits = v_reinterpret_as_u8(a);
+    return v_check_any(bits);
+}
+inline bool v_check_any(const v_int16x8& a)
+{
+    const v_uint16x8 bits = v_reinterpret_as_u16(a);
+    return v_check_any(bits);
+}
+inline bool v_check_any(const v_int32x4& a)
+{
+    const v_uint32x4 bits = v_reinterpret_as_u32(a);
+    return v_check_any(bits);
+}
+inline bool v_check_any(const v_float32x4& a)
+{
+    const v_uint32x4 bits = v_reinterpret_as_u32(a);
+    return v_check_any(bits);
+}
+
+// 64-bit lanes.
+inline bool v_check_all(const v_int64x2& a)
+{
+    const v_uint64x2 bits = v_reinterpret_as_u64(a);
+    return v_check_all(bits);
+}
+inline bool v_check_any(const v_int64x2& a)
+{
+    const v_uint64x2 bits = v_reinterpret_as_u64(a);
+    return v_check_any(bits);
+}
+#if CV_SIMD128_64F
+inline bool v_check_all(const v_float64x2& a)
+{
+    const v_uint64x2 bits = v_reinterpret_as_u64(a);
+    return v_check_all(bits);
+}
+inline bool v_check_any(const v_float64x2& a)
+{
+    const v_uint64x2 bits = v_reinterpret_as_u64(a);
+    return v_check_any(bits);
+}
+#endif
+
+// Defines v_select(mask, a, b): a bitwise select implemented with vbslq, which
+// takes bits from `a` where the mask bit is 1 and from `b` where it is 0.
+// `usuffix` names the unsigned type of the same lane width, because vbslq
+// expects its mask operand as an unsigned vector.
+#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
+}
+
+// Instantiate for every 8/16/32-bit lane type, plus double when available.
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
+OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
+OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
+#endif
+
+// Defines the widening helpers for an integer vector type:
+//   v_expand(a, b0, b1) - widen the low half into b0 and the high half into b1
+//   v_expand_low(a)     - widen only the low half
+//   v_expand_high(a)    - widen only the high half
+//   v_load_expand(ptr)  - load a 64-bit vector from ptr (vld1) and widen it
+// With CV_NEON_AARCH64 the high half is widened directly with vmovl_high.
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
+    b1.val = vmovl_high_##suffix(a.val); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
+}
+#else
+// Fallback: the high half is extracted with vget_high before widening.
+#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
+    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
+}
+#endif
+
+// Instantiate for 8->16, 16->32 and 32->64 bit widening, signed and unsigned.
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_uint32x4, v_uint64x2, uint, u32)
+OPENCV_HAL_IMPL_NEON_EXPAND(v_int32x4, v_int64x2, int, s32)
+
+// Loads four bytes from `ptr` (read as one possibly-unaligned 32-bit word)
+// and zero-extends each byte to a 32-bit lane.
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
+    const unsigned int packed = *(unaligned_uint*)ptr;
+    // Only the low four bytes of the 64-bit vector carry data.
+    const uint16x8_t widened16 = vmovl_u8(vcreate_u8(packed));
+    return v_uint32x4(vmovl_u16(vget_low_u16(widened16)));
+}
+
+// Loads four signed bytes from `ptr` (read as one possibly-unaligned 32-bit
+// word) and sign-extends each byte to a 32-bit lane.
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    typedef unsigned int CV_DECL_ALIGNED(1) unaligned_uint;
+    const unsigned int packed = *(unaligned_uint*)ptr;
+    // Only the low four bytes of the 64-bit vector carry data.
+    const int16x8_t widened16 = vmovl_s8(vcreate_s8(packed));
+    return v_int32x4(vmovl_s16(vget_low_s16(widened16)));
+}
+
+// Defines the lane-shuffling helpers for a vector type:
+//   v_zip(a0, a1, b0, b1)   - interleave lanes of a0 and a1; b0 receives the
+//                             low-half interleave, b1 the high-half interleave
+//   v_combine_low(a, b)     - {low half of a, low half of b}
+//   v_combine_high(a, b)    - {high half of a, high half of b}
+//   v_recombine(a, b, c, d) - c = v_combine_low(a, b), d = v_combine_high(a, b)
+// On AArch64 targets v_zip uses the separate vzip1q / vzip2q intrinsics.
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    b0.val = vzip1q_##suffix(a0.val, a1.val); \
+    b1.val = vzip2q_##suffix(a0.val, a1.val); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
+}
+#else
+// Fallback: v_zip uses vzipq, which returns both interleaved halves as a pair.
+#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
+    b0.val = p.val[0]; \
+    b1.val = p.val[1]; \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
+    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
+}
+#endif
+
+// Instantiate for every 8/16/32-bit lane type, plus double when available.
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
+#endif
+
+// Reverses the order of the 16 byte lanes.
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    // Reverse the bytes inside each 64-bit half, then swap the two halves.
+    const uint8x16_t half_reversed = vrev64q_u8(a.val);
+    const uint8x16_t fully_reversed = vextq_u8(half_reversed, half_reversed, 8);
+    return v_uint8x16(fully_reversed);
+}
+
+// Signed overload: forwards to the unsigned implementation.
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{
+    const v_uint8x16 reversed = v_reverse(v_reinterpret_as_u8(a));
+    return v_reinterpret_as_s8(reversed);
+}
+
+// Reverses the order of the 8 16-bit lanes.
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    // Reverse the lanes inside each 64-bit half, then swap the two halves.
+    const uint16x8_t half_reversed = vrev64q_u16(a.val);
+    const uint16x8_t fully_reversed = vextq_u16(half_reversed, half_reversed, 4);
+    return v_uint16x8(fully_reversed);
+}
+
+// Signed overload: forwards to the unsigned implementation.
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{
+    const v_uint16x8 reversed = v_reverse(v_reinterpret_as_u16(a));
+    return v_reinterpret_as_s16(reversed);
+}
+
+// Reverses the order of the 4 32-bit lanes.
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    // Reverse the lanes inside each 64-bit half, then swap the two halves.
+    const uint32x4_t half_reversed = vrev64q_u32(a.val);
+    const uint32x4_t fully_reversed = vextq_u32(half_reversed, half_reversed, 2);
+    return v_uint32x4(fully_reversed);
+}
+
+// Signed overload: forwards to the unsigned implementation.
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{
+    const v_uint32x4 reversed = v_reverse(v_reinterpret_as_u32(a));
+    return v_reinterpret_as_s32(reversed);
+}
+
+// Float overload: forwards to the unsigned implementation.
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{
+    const v_uint32x4 reversed = v_reverse(v_reinterpret_as_u32(a));
+    return v_reinterpret_as_f32(reversed);
+}
+
+// Swaps the two 64-bit lanes.
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    const uint64x1_t upper = vget_high_u64(a.val);
+    const uint64x1_t lower = vget_low_u64(a.val);
+    return v_uint64x2(vcombine_u64(upper, lower));
+}
+
+// Signed overload: forwards to the unsigned implementation.
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{
+    const v_uint64x2 reversed = v_reverse(v_reinterpret_as_u64(a));
+    return v_reinterpret_as_s64(reversed);
+}
+
+#if CV_SIMD128_64F
+// Double overload: forwards to the unsigned implementation.
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{
+    const v_uint64x2 reversed = v_reverse(v_reinterpret_as_u64(a));
+    return v_reinterpret_as_f64(reversed);
+}
+#endif
+
+// Defines v_extract<s>(a, b): a thin wrapper over vextq, which yields a vector
+// taken from the concatenation of a and b starting at lane `s` of a. `s` is a
+// template parameter because vextq requires a compile-time constant.
+#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
+template <int s> \
+inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    return v_##_Tpvec(vextq_##suffix(a.val, b.val, s)); \
+}
+
+// Instantiate for every integer and float lane type.
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint8x16, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int8x16, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint16x8, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int16x8, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT(uint64x2, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(int64x2, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
+#endif
+
+// Defines v_extract_n<i>(v): returns lane `i` of v as a scalar via
+// vgetq_lane. `i` is a template parameter because vgetq_lane requires a
+// compile-time constant lane index.
+#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
+
+// Instantiate for every integer and float lane type.
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
+#endif
+
+// Defines v_broadcast_element<i>(v): reads lane `i` with v_extract_n and
+// returns a vector with every lane set to that value (via v_setall_<suffix>).
+#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
+
+// Instantiate for every integer and float lane type.
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
+#endif
+
+// Rounds each float lane to the nearest 32-bit integer.
+// NOTE(review): CV_SIMD128_64F is used here as the switch for the AArch64
+// code path (the inline asm below is A64 syntax) - confirm that the macro is
+// only set on AArch64 builds.
+#if CV_SIMD128_64F
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    float32x4_t a_ = a.val;
+    int32x4_t result;
+#if defined _MSC_VER
+    result = vcvtnq_s32_f32(a_);
+#else
+    // FCVTNS: convert to signed integer, rounding to nearest with ties to even.
+    __asm__ ("fcvtns %0.4s, %1.4s"
+             : "=w"(result)
+             : "w"(a_)
+             : /* No clobbers */);
+#endif
+    return v_int32x4(result);
+}
+#else
+// Fallback: add 0.5 carrying the sign of each lane, then truncate toward
+// zero. Unlike the path above, exact .5 ties therefore round away from zero.
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
+        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
+
+    // Bit pattern of copysign(0.5f, a): 0.5f OR-ed with the sign bit of a.
+    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
+    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
+}
+#endif
+// Rounds each float lane toward negative infinity and returns 32-bit ints.
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    // Truncate toward zero first.
+    const int32x4_t truncated = vcvtq_s32_f32(a.val);
+    const float32x4_t truncated_f = vcvtq_f32_s32(truncated);
+    // Lanes where truncation went above the input compare as all-ones (-1),
+    // so adding the mask subtracts one exactly in those lanes.
+    const uint32x4_t went_up = vcgtq_f32(truncated_f, a.val);
+    return v_int32x4(vaddq_s32(truncated, vreinterpretq_s32_u32(went_up)));
+}
+
+// Rounds each float lane toward positive infinity and returns 32-bit ints.
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    // Truncate toward zero first.
+    const int32x4_t truncated = vcvtq_s32_f32(a.val);
+    const float32x4_t truncated_f = vcvtq_f32_s32(truncated);
+    // Lanes where truncation went below the input compare as all-ones (-1),
+    // so subtracting the mask adds one exactly in those lanes.
+    const uint32x4_t went_down = vcgtq_f32(a.val, truncated_f);
+    return v_int32x4(vsubq_s32(truncated, vreinterpretq_s32_u32(went_down)));
+}
+
+// Converts each float lane to a 32-bit integer, rounding toward zero.
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+    const int32x4_t truncated = vcvtq_s32_f32(a.val);
+    return v_int32x4(truncated);
+}
+
+#if CV_SIMD128_64F
+// Rounds the two double lanes of `a` to the nearest integers and returns them
+// in lanes 0-1 of the result; lanes 2-3 are zero.
+// Uses round-to-nearest with ties to even (vcvtnq), matching the float32
+// v_round above (FCVTNS / vcvtnq_s32_f32). The previous vcvtaq variant
+// rounded ties away from zero, so exact .5 inputs gave different results for
+// float and double.
+inline v_int32x4 v_round(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    // Convert to 64-bit ints, then narrow each lane to 32 bits.
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero));
+}
+
+// Rounds the double lanes of `a` and `b` to the nearest integers (ties to
+// even) and packs them as {a0, a1, b0, b1}.
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val))));
+}
+
+// Rounds the two double lanes toward negative infinity; the results occupy
+// lanes 0-1 of the returned vector and lanes 2-3 are zero.
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    // Truncate toward zero first.
+    const int64x2_t truncated = vcvtq_s64_f64(a.val);
+    // All-ones (-1) in lanes where truncation went above the input; adding
+    // the mask subtracts one in exactly those lanes.
+    const uint64x2_t went_up = vcgtq_f64(vcvtq_f64_s64(truncated), a.val);
+    const int64x2_t floored = vaddq_s64(truncated, vreinterpretq_s64_u64(went_up));
+    return v_int32x4(vcombine_s32(vmovn_s64(floored), zero));
+}
+
+// Rounds the two double lanes toward positive infinity; the results occupy
+// lanes 0-1 of the returned vector and lanes 2-3 are zero.
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    // Truncate toward zero first.
+    const int64x2_t truncated = vcvtq_s64_f64(a.val);
+    // All-ones (-1) in lanes where truncation went below the input;
+    // subtracting the mask adds one in exactly those lanes.
+    const uint64x2_t went_down = vcgtq_f64(a.val, vcvtq_f64_s64(truncated));
+    const int64x2_t ceiled = vsubq_s64(truncated, vreinterpretq_s64_u64(went_down));
+    return v_int32x4(vcombine_s32(vmovn_s64(ceiled), zero));
+}
+
+// Converts the two double lanes of `a` to integers, rounding toward zero; the
+// results occupy lanes 0-1 of the returned vector and lanes 2-3 are zero.
+// Uses vcvtq_s64_f64 (round toward zero). The previous vcvtaq_s64_f64 rounds
+// to nearest with ties away from zero, so e.g. 1.7 became 2 instead of 1.
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{
+    static const int32x2_t zero = vdup_n_s32(0);
+    // Convert to 64-bit ints, then narrow each lane to 32 bits.
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtq_s64_f64(a.val)), zero));
+}
+#endif
+
+// Defines v_transpose4x4(a0..a3, b0..b3): treats the four input vectors as the
+// rows of a 4x4 matrix of 32-bit elements and writes its transpose to b0..b3.
+// With CV_NEON_AARCH64, `suffix` is only the type letter (u/s/f); the macro
+// appends 32 or 64 itself because it works on both 32-bit and 64-bit views.
+#if CV_NEON_AARCH64
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* -- Pass 1: 64b transpose */ \
+    _Tpvec##_t t0 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t1 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn1q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    _Tpvec##_t t2 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a0.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a2.val))); \
+    _Tpvec##_t t3 = vreinterpretq_##suffix##32_##suffix##64( \
+                        vtrn2q_##suffix##64(vreinterpretq_##suffix##64_##suffix##32(a1.val), \
+                                            vreinterpretq_##suffix##64_##suffix##32(a3.val))); \
+    /* -- Pass 2: 32b transpose */ \
+    b0.val = vtrn1q_##suffix##32(t0, t1); \
+    b1.val = vtrn2q_##suffix##32(t0, t1); \
+    b2.val = vtrn1q_##suffix##32(t2, t3); \
+    b3.val = vtrn2q_##suffix##32(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f)
+#else // #if CV_NEON_AARCH64
+// Fallback: `suffix` is the full 32-bit suffix (u32/s32/f32). vtrnq transposes
+// the 2x2 sub-blocks of each row pair; the halves are then recombined.
+#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    /* m00 m01 m02 m03 */ \
+    /* m10 m11 m12 m13 */ \
+    /* m20 m21 m22 m23 */ \
+    /* m30 m31 m32 m33 */ \
+    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
+    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
+    /* m00 m10 m02 m12 */ \
+    /* m01 m11 m03 m13 */ \
+    /* m20 m30 m22 m32 */ \
+    /* m21 m31 m23 m33 */ \
+    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
+    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
+    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
+    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
+}
+
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
+OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
+#endif // #if CV_NEON_AARCH64
+
+// Defines v_load_deinterleave / v_store_interleave for 2, 3 and 4 channels.
+// Loads use the structure-load intrinsics vld2q / vld3q / vld4q, which split
+// interleaved memory into one vector per channel. Stores use vst2q / vst3q /
+// vst4q to write the channels back interleaved. The hal::StoreMode argument
+// of the store overloads is accepted but not used.
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
+{ \
+    _Tpvec##x2_t v = vld2q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
+{ \
+    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
+    a.val = v.val[0]; \
+    b.val = v.val[1]; \
+    c.val = v.val[2]; \
+    d.val = v.val[3]; \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec##x2_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    vst2q_##suffix(ptr, v); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    _Tpvec##x3_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    vst3q_##suffix(ptr, v); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec##x4_t v; \
+    v.val[0] = a.val; \
+    v.val[1] = b.val; \
+    v.val[2] = c.val; \
+    v.val[3] = d.val; \
+    vst4q_##suffix(ptr, v); \
+}
+
+// 64-bit integer variant of the interleave helpers. It does not use the
+// vldNq / vstNq structure intrinsics: each 64-bit element is loaded or stored
+// individually with vld1 / vst1 and the two-lane vectors are assembled with
+// vcombine. For N channels, element k of channel c is at ptr[k*N + c]
+// (k = 0, 1). The hal::StoreMode argument is accepted but not used.
+#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 2); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 3); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+} \
+ \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \
+                                 v_##tp##x2& b, v_##tp##x2& c ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 3); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 4); \
+    tp##x1_t c1 = vld1_##suffix(ptr + 5); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
+} \
+ \
+inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \
+                                 v_##tp##x2& c, v_##tp##x2& d ) \
+{ \
+    tp##x1_t a0 = vld1_##suffix(ptr); \
+    tp##x1_t b0 = vld1_##suffix(ptr + 1); \
+    tp##x1_t c0 = vld1_##suffix(ptr + 2); \
+    tp##x1_t d0 = vld1_##suffix(ptr + 3); \
+    tp##x1_t a1 = vld1_##suffix(ptr + 4); \
+    tp##x1_t b1 = vld1_##suffix(ptr + 5); \
+    tp##x1_t c1 = vld1_##suffix(ptr + 6); \
+    tp##x1_t d1 = vld1_##suffix(ptr + 7); \
+    a = v_##tp##x2(vcombine_##suffix(a0, a1)); \
+    b = v_##tp##x2(vcombine_##suffix(b0, b1)); \
+    c = v_##tp##x2(vcombine_##suffix(c0, c1)); \
+    d = v_##tp##x2(vcombine_##suffix(d0, d1)); \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \
+                                const v_##tp##x2& b, const v_##tp##x2& c, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
+    vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \
+} \
+ \
+inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \
+                                const v_##tp##x2& c, const v_##tp##x2& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    vst1_##suffix(ptr, vget_low_##suffix(a.val)); \
+    vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \
+    vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \
+    vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \
+    vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \
+    vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \
+    vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \
+    vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \
+}
+
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)     // 16 x uchar lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)      // 16 x schar lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)   // 8 x ushort lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)     // 8 x short lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32) // 4 x unsigned lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)       // 4 x int lanes
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)   // 4 x float lanes
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64)  // 2 x double lanes, only with 64-bit float SIMD
+#endif // CV_SIMD128_64F
+// 64-bit integer lanes use the dedicated macro, which works on 64-bit halves (vld1_/vst1_ plus vcombine/vget_low/vget_high).
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64)
+OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{   // Lane-wise conversion of four signed 32-bit integers to float32.
+    return v_float32x4(vcvtq_f32_s32(a.val));
+}
+
+#if CV_SIMD128_64F
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{   // Narrow both doubles into the low half; the two upper lanes become 0.0f.
+    const float32x2_t narrowed = vcvt_f32_f64(a.val);
+    return v_float32x4(vcombine_f32(narrowed, vdup_n_f32(0.0f)));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{   // Narrows a into the two low lanes and b into the two high lanes.
+    return v_float32x4(vcombine_f32(vcvt_f32_f64(a.val), vcvt_f32_f64(b.val)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{   // Low two int32 lanes -> float64. Widen to int64 first: a float32 intermediate rounds |values| above 2^24.
+    return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_low_s32(a.val))));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{   // High two int32 lanes -> float64. Widen to int64 first: a float32 intermediate rounds |values| above 2^24.
+    return v_float64x2(vcvtq_f64_s64(vmovl_s32(vget_high_s32(a.val))));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{   // Widens the two low float32 lanes to float64.
+    return v_float64x2(vcvt_f64_f32(vget_low_f32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{   // Widens the two high float32 lanes to float64.
+    return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) // lane-wise signed 64-bit integer -> float64
+{  return v_float64x2(vcvtq_f64_s64(a.val)); }
+
+#endif
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    // Element-by-element table lookup for 16 signed 8-bit lanes.
+    //
+    //   tab  base pointer of the lookup table
+    //   idx  16 element offsets into tab, one per output lane
+    //
+    // Lane k of the result is tab[idx[k]]. The values are first collected
+    // in an aligned scratch array and then loaded into a vector register
+    // with a single vld1q_s8.
+    //
+    // No bounds checking is done on idx.
+
+    enum { LANES = 16 };
+
+    schar CV_DECL_ALIGNED(32) gathered[LANES];
+    for (int lane = 0; lane < LANES; ++lane)
+    {
+        gathered[lane] = tab[idx[lane]];
+    }
+
+    return v_int8x16(vld1q_s8(gathered));
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    // Pairwise table lookup for 16 signed 8-bit lanes.
+    //
+    //   tab  base pointer of the lookup table
+    //   idx  8 element offsets into tab, one per output pair
+    //
+    // Output lanes 2k and 2k+1 receive tab[idx[k]] and tab[idx[k] + 1].
+    // The values are collected in an aligned scratch array and then loaded
+    // into a vector register with a single vld1q_s8.
+    //
+    // No bounds checking is done on idx.
+    enum { PAIRS = 8 };
+
+    schar CV_DECL_ALIGNED(32) gathered[2 * PAIRS];
+    for (int pair = 0; pair < PAIRS; ++pair)
+    {
+        gathered[2 * pair]     = tab[idx[pair]];
+        gathered[2 * pair + 1] = tab[idx[pair] + 1];
+    }
+
+    return v_int8x16(vld1q_s8(gathered));
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    // Quad-wise table lookup for 16 signed 8-bit lanes.
+    //
+    //   tab  base pointer of the lookup table
+    //   idx  4 element offsets into tab, one per output quad
+    //
+    // Output lanes 4k .. 4k+3 receive tab[idx[k]] .. tab[idx[k] + 3].
+    // The values are collected in an aligned scratch array and then loaded
+    // into a vector register with a single vld1q_s8.
+    //
+    // No bounds checking is done on idx.
+    enum { QUADS = 4 };
+    schar CV_DECL_ALIGNED(32) gathered[4 * QUADS];
+    for (int quad = 0; quad < QUADS; ++quad)
+    {
+        const schar* src = tab + idx[quad];
+        for (int k = 0; k < 4; ++k)
+            gathered[4 * quad + k] = src[k];
+    }
+
+    return v_int8x16(vld1q_s8(gathered));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } // unsigned overload: signed lookup, bits reinterpreted; cast keeps const
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); } // same forwarding for the pairwise lookup
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); } // same forwarding for the quad-wise lookup
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    // Element-by-element table lookup for 8 signed 16-bit lanes:
+    // lane k of the result is tab[idx[k]] (idx holds 8 element offsets).
+    // Values are staged in an aligned scratch array, then loaded at once.
+    // No bounds checking is done on idx.
+    enum { LANES = 8 };
+
+    short CV_DECL_ALIGNED(32) gathered[LANES];
+    for (int lane = 0; lane < LANES; ++lane)
+    {
+        gathered[lane] = tab[idx[lane]];
+    }
+    return v_int16x8(vld1q_s16(gathered));
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    // Pairwise lookup for 8 signed 16-bit lanes: output lanes 2k and 2k+1
+    // receive tab[idx[k]] and tab[idx[k] + 1] (idx holds 4 element offsets).
+    // Values are staged in an aligned scratch array, then loaded at once.
+    // No bounds checking is done on idx.
+    enum { PAIRS = 4 };
+    short CV_DECL_ALIGNED(32) gathered[2 * PAIRS];
+    for (int pair = 0; pair < PAIRS; ++pair)
+    {
+        gathered[2 * pair]     = tab[idx[pair]];
+        gathered[2 * pair + 1] = tab[idx[pair] + 1];
+    }
+    return v_int16x8(vld1q_s16(gathered));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{   // Low half: four shorts starting at tab + idx[0]; high half: four shorts starting at tab + idx[1].
+    return v_int16x8(vcombine_s16(vld1_s16(tab + idx[0]), vld1_s16(tab + idx[1])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); } // unsigned overload: signed lookup, bits reinterpreted; cast keeps const
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); } // same forwarding for the pairwise lookup
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); } // same forwarding for the quad-wise lookup
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    // Lane k of the result is tab[idx[k]] for k = 0..3; values are staged in
+    // an aligned scratch array and loaded at once. idx is not bounds-checked.
+    int CV_DECL_ALIGNED(32) gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+    {
+        gathered[lane] = tab[idx[lane]];
+    }
+    return v_int32x4(vld1q_s32(gathered));
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{   // Low half: two ints starting at tab + idx[0]; high half: two ints starting at tab + idx[1].
+    return v_int32x4(vcombine_s32(vld1_s32(tab + idx[0]), vld1_s32(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{   // A quad fills the whole vector: one load of four consecutive ints at tab + idx[0].
+    return v_int32x4(vld1q_s32(tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); } // unsigned overload: signed lookup, bits reinterpreted; cast keeps const
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); } // same forwarding for the pairwise lookup
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); } // same forwarding for the quad-wise lookup
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{   // Lane 0 is tab[idx[0]], lane 1 is tab[idx[1]]; each value becomes one 64-bit half.
+    return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{   // A pair fills the whole vector: one load of two consecutive elements at tab + idx[0].
+    return v_int64x2(vld1q_s64(tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } // unsigned overload: signed lookup, bits reinterpreted
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } // same forwarding for the pairwise lookup
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    // Lane k of the result is tab[idx[k]] for k = 0..3; values are staged in
+    // an aligned scratch array and loaded at once. idx is not bounds-checked.
+    float CV_DECL_ALIGNED(32) gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+    {
+        gathered[lane] = tab[idx[lane]];
+    }
+    return v_float32x4(vld1q_f32(gathered));
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    // Pairwise lookup: the low half of the result is {tab[idx[0]], tab[idx[0]+1]}
+    // and the high half is {tab[idx[1]], tab[idx[1]+1]}.
+    //
+    // Each pair is fetched with vld1_f32, the same pattern the int overload of
+    // v_lut_pairs uses. This avoids reading float storage through a uint64
+    // lvalue (a strict-aliasing hazard) and avoids casting away const.
+    // idx is not bounds-checked.
+    return v_float32x4(vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[1])));
+}
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{   // A quad fills the whole vector: one load of four consecutive floats at tab + idx[0].
+    return v_float32x4(vld1q_f32(tab + idx[0]));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    // Lane k of the result is tab[i_k], where i_k is lane k of idxvec.
+    const int i0 = vgetq_lane_s32(idxvec.val, 0);
+    const int i1 = vgetq_lane_s32(idxvec.val, 1);
+    const int i2 = vgetq_lane_s32(idxvec.val, 2);
+    const int i3 = vgetq_lane_s32(idxvec.val, 3);
+    // Stage the four values in an aligned scratch array, then load them at once.
+    int CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_int32x4(vld1q_s32(gathered));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    // Lane k of the result is tab[i_k], where i_k is lane k of idxvec.
+    const int i0 = vgetq_lane_s32(idxvec.val, 0);
+    const int i1 = vgetq_lane_s32(idxvec.val, 1);
+    const int i2 = vgetq_lane_s32(idxvec.val, 2);
+    const int i3 = vgetq_lane_s32(idxvec.val, 3);
+    // Stage the four values in an aligned scratch array, then load them at once.
+    unsigned CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_uint32x4(vld1q_u32(gathered));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    // Lane k of the result is tab[i_k], where i_k is lane k of idxvec.
+    const int i0 = vgetq_lane_s32(idxvec.val, 0);
+    const int i1 = vgetq_lane_s32(idxvec.val, 1);
+    const int i2 = vgetq_lane_s32(idxvec.val, 2);
+    const int i3 = vgetq_lane_s32(idxvec.val, 3);
+    // Stage the four values in an aligned scratch array, then load them at once.
+    float CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_float32x4(vld1q_f32(gathered));
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    /* Disabled NEON variant, kept for reference:
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store(idx, idxvec);
+    float32x4_t xy02 = vcombine_f32(vld1_f32(tab + idx[0]), vld1_f32(tab + idx[2]));
+    float32x4_t xy13 = vcombine_f32(vld1_f32(tab + idx[1]), vld1_f32(tab + idx[3]));
+
+    float32x4x2_t xxyy = vuzpq_f32(xy02, xy13);
+    x = v_float32x4(xxyy.val[0]);
+    y = v_float32x4(xxyy.val[1]);*/
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec); // spill the four indices so they can be used as scalars
+    // Lane k: x receives tab[idx[k]] and y receives the element right after it.
+    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{   // Per 8-byte half, bytes are reordered to 0,2,1,3,4,6,5,7 (assuming lane 0 of vcreate_s8 is the least-significant byte).
+    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } // unsigned overload forwards to the signed one
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{   // Per 8-byte half, bytes are reordered to 0,4,1,5,2,6,3,7 (assuming lane 0 of vcreate_s8 is the least-significant byte).
+    return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } // unsigned overload forwards to the signed one
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{   // Byte table 0,1,4,5,2,3,6,7 per 8-byte half, i.e. 16-bit lanes 0,2,1,3 (assuming lane 0 of vcreate_s8 is the least-significant byte).
+    return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned overload forwards to the signed one
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{   // Zips the low four lanes with the high four lanes, then joins both zip outputs into one vector.
+    int16x4x2_t res = vzip_s16(vget_low_s16(vec.val), vget_high_s16(vec.val));
+    return v_int16x8(vcombine_s16(res.val[0], res.val[1]));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned overload forwards to the signed one
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{   // Zips the low two lanes with the high two lanes, then joins both zip outputs into one vector.
+    int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
+    return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned overload forwards to the signed one
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float overload reuses the integer shuffle on the raw bits
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{   // Result bytes are input bytes 0,1,2,4,5,6,8,9,10,12,13,14,15 followed by three zeros (assuming lane 0 of vcreate_s8 is the least-significant byte).
+    return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned overload forwards to the signed one
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{   // Result 16-bit lanes are input lanes 0,1,2,4,5,6,7 followed by one zero lane (assuming lane 0 of vcreate_s8 is the least-significant byte).
+    return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned overload forwards to the signed one
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4-lane vectors are returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // returned unchanged
+
+#if CV_SIMD128_64F
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    double CV_DECL_ALIGNED(32) elems[2] = // aligned staging buffer for the two looked-up values
+    {
+        tab[idx[0]], // lane 0
+        tab[idx[1]]  // lane 1
+    };
+    return v_float64x2(vld1q_f64(elems));
+}
+
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{   // A pair fills the whole vector: one load of two consecutive doubles at tab + idx[0].
+    return v_float64x2(vld1q_f64(tab + idx[0]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    double CV_DECL_ALIGNED(32) elems[2] = // only lanes 0 and 1 of idxvec are used
+    {
+        tab[vgetq_lane_s32(idxvec.val, 0)],
+        tab[vgetq_lane_s32(idxvec.val, 1)],
+    };
+    return v_float64x2(vld1q_f64(elems));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec); // spill the indices; only idx[0] and idx[1] are read below
+    // Lane k: x receives tab[idx[k]] and y receives the element right after it.
+    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
+    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
+}
+#endif
+
+////// FP16 support ///////
+#if CV_FP16
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    float16x4_t v = // four half-precision values read from ptr
+    #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
+        (float16x4_t)vld1_s16((const short*)ptr); // vld1_f16 not visible as a macro: load as 16-bit ints and cast
+    #else
+        vld1_f16((const __fp16*)ptr);
+    #endif
+    return v_float32x4(vcvt_f32_f16(v)); // widen the four halves to float32
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    float16x4_t hv = vcvt_f16_f32(v.val); // narrow the four floats to half precision
+
+    #ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
+        vst1_s16((short*)ptr, (int16x4_t)hv); // vst1_f16 not visible as a macro: store the same bits as 16-bit ints
+    #else
+        vst1_f16((__fp16*)ptr, hv);
+    #endif
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{   // Fallback when CV_FP16 is off: widen four elements one by one, then load them as a vector.
+    float widened[4];
+    for( int lane = 0; lane < 4; lane++ )
+        widened[lane] = (float)ptr[lane];
+    return v_load(widened);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{   // Fallback when CV_FP16 is off: spill the four floats, then narrow them one by one.
+    float spilled[4];
+    v_store(spilled, v);
+    for( int lane = 0; lane < 4; lane++ )
+        ptr[lane] = float16_t(spilled[lane]);
+}
+#endif
+
+inline void v_cleanup() {} // intentionally empty in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv.hpp
new file mode 100644
index 0000000..a592976
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv.hpp
@@ -0,0 +1,3320 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// The original implementation has been contributed by Yin Zhang.
+// Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_INTRIN_RVV_HPP
+#define OPENCV_HAL_INTRIN_RVV_HPP
+
+#include <algorithm>
+
+namespace cv
+{
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+//////////// Unsupported native intrinsics in C++ ////////////
+// The following types have been defined in clang, but not in GCC yet.
+#ifndef __clang__
+
+// Software stand-in for vuint8mf2_t on non-clang builds: eight uchar lanes
+// in a plain array, zero-filled by default.
+struct vuint8mf2_t
+{
+    uchar val[8] = {0};
+    vuint8mf2_t() {}
+    // Copies eight consecutive elements starting at ptr.
+    vuint8mf2_t(const uchar* ptr)
+    {
+        std::copy(ptr, ptr + 8, val);
+    }
+};
+// Software stand-in for vint8mf2_t on non-clang builds: eight schar lanes
+// in a plain array, zero-filled by default.
+struct vint8mf2_t
+{
+    schar val[8] = {0};
+    vint8mf2_t() {}
+    // Copies eight consecutive elements starting at ptr.
+    vint8mf2_t(const schar* ptr)
+    {
+        std::copy(ptr, ptr + 8, val);
+    }
+};
+// Software stand-in for vuint16mf2_t on non-clang builds: four ushort lanes
+// in a plain array, zero-filled by default.
+struct vuint16mf2_t
+{
+    ushort val[4] = {0};
+    vuint16mf2_t() {}
+    // Copies four consecutive elements starting at ptr.
+    vuint16mf2_t(const ushort* ptr)
+    {
+        std::copy(ptr, ptr + 4, val);
+    }
+};
+// Software stand-in for vint16mf2_t on non-clang builds: four short lanes
+// in a plain array, zero-filled by default.
+struct vint16mf2_t
+{
+    short val[4] = {0};
+    vint16mf2_t() {}
+    // Copies four consecutive elements starting at ptr.
+    vint16mf2_t(const short* ptr)
+    {
+        std::copy(ptr, ptr + 4, val);
+    }
+};
+struct vuint32mf2_t // stand-in for the half-register uint32 type on non-clang builds
+{
+    unsigned val[2] = {0}; // two lanes, zero by default
+    vuint32mf2_t() {}
+    vuint32mf2_t(const unsigned* ptr) // copies two consecutive elements from ptr
+    {
+        val[0] = ptr[0];
+        val[1] = ptr[1];
+    }
+};
+struct vint32mf2_t // stand-in for the half-register int32 type on non-clang builds
+{
+    int val[2] = {0}; // two lanes, zero by default
+    vint32mf2_t() {}
+    vint32mf2_t(const int* ptr) // copies two consecutive elements from ptr
+    {
+        val[0] = ptr[0];
+        val[1] = ptr[1];
+    }
+};
+struct vfloat32mf2_t // stand-in for the half-register float32 type on non-clang builds
+{
+    float val[2] = {0}; // two lanes, zero by default
+    vfloat32mf2_t() {}
+    vfloat32mf2_t(const float* ptr) // copies two consecutive elements from ptr
+    {
+        val[0] = ptr[0];
+        val[1] = ptr[1];
+    }
+};
+struct vuint64mf2_t // stand-in for the half-register uint64 type on non-clang builds
+{
+    uint64 val[1] = {0}; // single lane, zero by default
+    vuint64mf2_t() {}
+    vuint64mf2_t(const uint64* ptr) // copies one element from ptr
+    {
+        val[0] = ptr[0];
+    }
+};
+struct vint64mf2_t // stand-in for the half-register int64 type on non-clang builds
+{
+    int64 val[1] = {0}; // single lane, zero by default
+    vint64mf2_t() {}
+    vint64mf2_t(const int64* ptr) // copies one element from ptr
+    {
+        val[0] = ptr[0];
+    }
+};
+struct vfloat64mf2_t // stand-in for the half-register float64 type on non-clang builds
+{
+    double val[1] = {0}; // single lane, zero by default
+    vfloat64mf2_t() {}
+    vfloat64mf2_t(const double* ptr) // copies one element from ptr
+    {
+        val[0] = ptr[0];
+    }
+};
+// Software stand-in for vuint8mf4_t on non-clang builds: four uchar lanes
+// in a plain array, zero-filled by default.
+struct vuint8mf4_t
+{
+    uchar val[4] = {0};
+    vuint8mf4_t() {}
+    // Copies four consecutive elements starting at ptr.
+    vuint8mf4_t(const uchar* ptr)
+    {
+        std::copy(ptr, ptr + 4, val);
+    }
+};
+// Software stand-in for vint8mf4_t on non-clang builds: four schar lanes
+// in a plain array, zero-filled by default.
+struct vint8mf4_t
+{
+    schar val[4] = {0};
+    vint8mf4_t() {}
+    // Copies four consecutive elements starting at ptr.
+    vint8mf4_t(const schar* ptr)
+    {
+        std::copy(ptr, ptr + 4, val);
+    }
+};
+
+#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) /* emulated vle / vse for the mf2 stand-in types */ \
+inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
+{ \
+    CV_UNUSED(vl); /* the stand-in constructor copies a fixed number of lanes */ \
+    return _Tpvec(ptr); \
+} \
+inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
+{ \
+    CV_UNUSED(vl); /* always writes n lanes */ \
+    for (int i = 0; i < n; ++i) \
+    { \
+            ptr[i] = v.val[i]; \
+    } \
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)         // vle8_v_u8mf2 / vse8_v_u8mf2, 8 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)           // vle8_v_i8mf2 / vse8_v_i8mf2, 8 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)     // vle16_v_u16mf2 / vse16_v_u16mf2, 4 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)       // vle16_v_i16mf2 / vse16_v_i16mf2, 4 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)     // vle32_v_u32mf2 / vse32_v_u32mf2, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)       // vle32_v_i32mf2 / vse32_v_i32mf2, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)   // vle32_v_f32mf2 / vse32_v_f32mf2, 2 lanes
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)     // vle64_v_u64mf2 / vse64_v_u64mf2, 1 lane
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)       // vle64_v_i64mf2 / vse64_v_i64mf2, 1 lane
+OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)   // vle64_v_f64mf2 / vse64_v_f64mf2, 1 lane
+
+
+#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) /* emulated widening convert from a stand-in type to a full m1 vector */ \
+inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
+{ \
+    _wTp tmp[n]; /* scratch array holding the widened lanes */ \
+    for (int i = 0; i < n; ++i) \
+    { \
+            tmp[i] = (_wTp)v.val[i]; \
+    } \
+    return vle##width##_v_##suffix##m1(tmp, vl); /* vl is forwarded to the m1 load */ \
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)    // 8 x uchar  -> 8 x ushort
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)        // 8 x schar  -> 8 x short
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4) // 4 x ushort -> 4 x unsigned
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)         // 4 x short  -> 4 x int
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)   // 2 x unsigned -> 2 x uint64
+OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)       // 2 x int    -> 2 x int64
+
+inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
+{   // Emulated load: the stand-in constructor always copies four elements, so vl is ignored.
+    CV_UNUSED(vl);
+    return vuint8mf4_t(base);
+}
+inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
+{   // Emulated load: the stand-in constructor always copies four elements, so vl is ignored.
+    CV_UNUSED(vl);
+    return vint8mf4_t(base);
+}
+
+inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
+{
+    // Emulated widening convert: each of the four uchar lanes becomes a ushort.
+    ushort widened[4];
+    for (int lane = 0; lane < 4; ++lane)
+        widened[lane] = (ushort)src.val[lane];
+    // Reload through the emulated mf2 loader, forwarding vl unchanged.
+    return vle16_v_u16mf2(widened, vl);
+}
+inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
+{
+    // Emulated widening convert: each of the four schar lanes becomes a short.
+    short widened[4];
+    for (int lane = 0; lane < 4; ++lane)
+        widened[lane] = (short)src.val[lane];
+    // Reload through the emulated mf2 loader, forwarding vl unchanged.
+    return vle16_v_i16mf2(widened, vl);
+}
+#endif
+
+//////////// Types ////////////
+
+#ifndef __clang__
+struct v_uint8x16 // 16 x uchar vector wrapper (variant for compilers other than clang)
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {} // lanes are left uninitialised
+    explicit v_uint8x16(vuint8m1_t v)
+    {
+        vse8_v_u8m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v15 to lane 15
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vuint8m1_t() const
+    {
+        return vle8_v_u8m1(val, nlanes); // read the lane array back as a native vector
+    }
+    uchar get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    uchar val[16]; // lane storage
+};
+
+struct v_int8x16 // 16 x schar vector wrapper (variant for compilers other than clang)
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {} // lanes are left uninitialised
+    explicit v_int8x16(vint8m1_t v)
+    {
+        vse8_v_i8m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v15 to lane 15
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vint8m1_t() const
+    {
+        return vle8_v_i8m1(val, nlanes); // read the lane array back as a native vector
+    }
+    schar get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    schar val[16]; // lane storage
+};
+
+struct v_uint16x8 // 8 x ushort vector wrapper (variant for compilers other than clang)
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {} // lanes are left uninitialised
+    explicit v_uint16x8(vuint16m1_t v)
+    {
+        vse16_v_u16m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v7 to lane 7
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vuint16m1_t() const
+    {
+        return vle16_v_u16m1(val, nlanes); // read the lane array back as a native vector
+    }
+    ushort get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    ushort val[8]; // lane storage
+};
+
+struct v_int16x8 // 8 x short vector wrapper (variant for compilers other than clang)
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {} // lanes are left uninitialised
+    explicit v_int16x8(vint16m1_t v)
+    {
+        vse16_v_i16m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v7 to lane 7
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vint16m1_t() const
+    {
+        return vle16_v_i16m1(val, nlanes); // read the lane array back as a native vector
+    }
+    short get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    short val[8]; // lane storage
+};
+
+struct v_uint32x4 // 4 x unsigned vector wrapper (variant for compilers other than clang)
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {} // lanes are left uninitialised
+    explicit v_uint32x4(vuint32m1_t v)
+    {
+        vse32_v_u32m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned v[] = {v0, v1, v2, v3};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v3 to lane 3
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vuint32m1_t() const
+    {
+        return vle32_v_u32m1(val, nlanes); // read the lane array back as a native vector
+    }
+    unsigned get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    unsigned val[4]; // lane storage
+};
+
+struct v_int32x4 // 4 x int vector wrapper (variant for compilers other than clang)
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {} // lanes are left uninitialised
+    explicit v_int32x4(vint32m1_t v)
+    {
+        vse32_v_i32m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v3 to lane 3
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vint32m1_t() const
+    {
+        return vle32_v_i32m1(val, nlanes); // read the lane array back as a native vector
+    }
+    int get0() const
+    {
+        return val[0]; // lane 0
+    }
+    int val[4]; // lane storage
+};
+
+struct v_float32x4 // 4 x float vector wrapper (variant for compilers other than clang)
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {} // lanes are left uninitialised
+    explicit v_float32x4(vfloat32m1_t v)
+    {
+        vse32_v_f32m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v3 to lane 3
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vfloat32m1_t() const
+    {
+        return vle32_v_f32m1(val, nlanes); // read the lane array back as a native vector
+    }
+    float get0() const
+    {
+        return val[0]; // lane 0
+    }
+    float val[4]; // lane storage
+};
+
+struct v_uint64x2 // 2 x uint64 vector wrapper (variant for compilers other than clang)
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {} // lanes are left uninitialised
+    explicit v_uint64x2(vuint64m1_t v)
+    {
+        vse64_v_u64m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 v[] = {v0, v1};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v1 to lane 1
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vuint64m1_t() const
+    {
+        return vle64_v_u64m1(val, nlanes); // read the lane array back as a native vector
+    }
+    uint64 get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    uint64 val[2]; // lane storage
+};
+
+struct v_int64x2 // 2 x int64 vector wrapper (variant for compilers other than clang)
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {} // lanes are left uninitialised
+    explicit v_int64x2(vint64m1_t v)
+    {
+        vse64_v_i64m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 v[] = {v0, v1};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v1 to lane 1
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vint64m1_t() const
+    {
+        return vle64_v_i64m1(val, nlanes); // read the lane array back as a native vector
+    }
+    int64 get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    int64 val[2]; // lane storage
+};
+
+#if CV_SIMD128_64F
+struct v_float64x2 // 2 x double vector wrapper (variant for compilers other than clang)
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {} // lanes are left uninitialised
+    explicit v_float64x2(vfloat64m1_t v)
+    {
+        vse64_v_f64m1(val, v, nlanes); // write the native vector into the lane array
+    }
+    v_float64x2(double v0, double v1)
+    {
+        double v[] = {v0, v1};
+        for (int i = 0; i < nlanes; ++i) // v0 goes to lane 0, v1 to lane 1
+        {
+            val[i] = v[i];
+        }
+    }
+    operator vfloat64m1_t() const
+    {
+        return vle64_v_f64m1(val, nlanes); // read the lane array back as a native vector
+    }
+    double get0() const
+    {
+        return val[0]; // lane 0
+    }
+
+    double val[2]; // lane storage
+};
+#endif
+#else
+struct v_uint8x16 // 16 x uchar vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {} // lanes are left uninitialised; pval still points at val
+    explicit v_uint8x16(vuint8m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        *pval = vle8_v_u8m1(v, nlanes); // load the 16 values as a vector, then store into val
+    }
+    operator vuint8m1_t() const
+    {
+        return *pval;
+    }
+    uchar get0() const
+    {
+        return vmv_x(*pval);
+    }
+    inline v_uint8x16& operator=(const v_uint8x16& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_uint8x16(const v_uint8x16& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    uchar val[16]; // lane storage
+    vuint8m1_t* pval = (vuint8m1_t*)val; // view of val as a native vector
+};
+
+struct v_int8x16 // 16 x schar vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {} // lanes are left uninitialised; pval still points at val
+    explicit v_int8x16(vint8m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        *pval = vle8_v_i8m1(v, nlanes); // load the 16 values as a vector, then store into val
+    }
+    operator vint8m1_t() const
+    {
+        return *pval;
+    }
+    schar get0() const
+    {
+        return vmv_x(*pval);
+    }
+    inline v_int8x16& operator=(const v_int8x16& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_int8x16(const v_int8x16& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    schar val[16]; // lane storage
+    vint8m1_t* pval = (vint8m1_t*)val; // view of val as a native vector
+};
+
+struct v_uint16x8 // 8 x ushort vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {} // lanes are left uninitialised; pval still points at val
+    explicit v_uint16x8(vuint16m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        *pval = vle16_v_u16m1(v, nlanes); // load the 8 values as a vector, then store into val
+    }
+    operator vuint16m1_t() const
+    {
+        return *pval;
+    }
+    ushort get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    inline v_uint16x8& operator=(const v_uint16x8& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_uint16x8(const v_uint16x8& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    ushort val[8]; // lane storage
+    vuint16m1_t* pval = (vuint16m1_t*)val; // view of val as a native vector
+};
+
+struct v_int16x8 // 8 x short vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {} // lanes are left uninitialised; pval still points at val
+    explicit v_int16x8(vint16m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        *pval = vle16_v_i16m1(v, nlanes); // load the 8 values as a vector, then store into val
+    }
+    operator vint16m1_t() const
+    {
+        return *pval;
+    }
+    short get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    inline v_int16x8& operator=(const v_int16x8& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_int16x8(const v_int16x8& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    short val[8]; // lane storage
+    vint16m1_t* pval = (vint16m1_t*)val; // view of val as a native vector
+};
+
+struct v_uint32x4 // 4 x unsigned vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {} // lanes are left uninitialised; pval still points at val
+    explicit v_uint32x4(vuint32m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        unsigned v[] = {v0, v1, v2, v3};
+        *pval = vle32_v_u32m1(v, nlanes); // load the 4 values as a vector, then store into val
+    }
+    operator vuint32m1_t() const
+    {
+        return *pval;
+    }
+    unsigned get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    inline v_uint32x4& operator=(const v_uint32x4& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_uint32x4(const v_uint32x4& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    unsigned val[4]; // lane storage
+    vuint32m1_t* pval = (vuint32m1_t*)val; // view of val as a native vector
+};
+
+struct v_int32x4 // 4 x int vector wrapper (clang variant: val is accessed through pval as a native vector)
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {} // lanes are left uninitialised; pval still points at val
+    explicit v_int32x4(vint32m1_t v)
+    {
+        *pval = v; // store the native vector into val
+    }
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        int v[] = {v0, v1, v2, v3};
+        *pval = vle32_v_i32m1(v, nlanes); // load the 4 values as a vector, then store into val
+    }
+    operator vint32m1_t() const
+    {
+        return *pval;
+    }
+    int get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    inline v_int32x4& operator=(const v_int32x4& vec) {
+        *pval = *(vec.pval); // copy the contents only; this object's pval keeps pointing at its own val
+        return *this;
+    }
+    inline v_int32x4(const v_int32x4& vec) {
+        *pval = *(vec.pval); // copy the contents only; pval comes from the default member initialiser
+    }
+    int val[4]; // lane storage
+    vint32m1_t* pval = (vint32m1_t*)val; // view of val as a native vector
+};
+
+// Wrapper for a vector of 4 single-precision float lanes.
+// The lane data lives in the plain array `val`; `pval` points at `val` viewed as
+// a vfloat32m1_t, and every register-typed read or write goes through that pointer.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    // Default construction leaves `val` uninitialized.
+    v_float32x4() {}
+    // Wraps an RVV register value by storing it into `val`.
+    explicit v_float32x4(vfloat32m1_t v)
+    {
+        *pval = v;
+    }
+    // Builds the vector from 4 scalars, loaded in argument order from a temporary array.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        float v[] = {v0, v1, v2, v3};
+        *pval = vle32_v_f32m1(v, nlanes);
+    }
+    // Implicit conversion back to the underlying RVV register type.
+    operator vfloat32m1_t() const
+    {
+        return *pval;
+    }
+    // Returns the scalar produced by vfmv_f on the stored vector (presumably lane 0, per the name).
+    float get0() const
+    {
+        return vfmv_f(*pval);
+    }
+    // Copy operations transfer the lane data only; `pval` is never copied, so each
+    // object keeps pointing at its own `val`.
+    inline v_float32x4& operator=(const v_float32x4& vec) {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    inline v_float32x4(const v_float32x4& vec) {
+        *pval = *(vec.pval);
+    }
+    float val[4];
+    // Default member initializer: aliases this object's own `val`.
+    vfloat32m1_t* pval = (vfloat32m1_t*)val;
+};
+
+// Wrapper for a vector of 2 unsigned 64-bit lanes.
+// The lane data lives in the plain array `val`; `pval` points at `val` viewed as
+// a vuint64m1_t, and every register-typed read or write goes through that pointer.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    // Default construction leaves `val` uninitialized.
+    v_uint64x2() {}
+    // Wraps an RVV register value by storing it into `val`.
+    explicit v_uint64x2(vuint64m1_t v)
+    {
+        *pval = v;
+    }
+    // Builds the vector from 2 scalars, loaded in argument order from a temporary array.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        uint64 v[] = {v0, v1};
+        *pval = vle64_v_u64m1(v, nlanes);
+    }
+    // Implicit conversion back to the underlying RVV register type.
+    operator vuint64m1_t() const
+    {
+        return *pval;
+    }
+    // Returns the scalar produced by vmv_x on the stored vector (presumably lane 0, per the name).
+    uint64 get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    // Copy operations transfer the lane data only; `pval` is never copied, so each
+    // object keeps pointing at its own `val`.
+    inline v_uint64x2& operator=(const v_uint64x2& vec) {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    inline v_uint64x2(const v_uint64x2& vec) {
+        *pval = *(vec.pval);
+    }
+    uint64 val[2];
+    // Default member initializer: aliases this object's own `val`.
+    vuint64m1_t* pval = (vuint64m1_t*)val;
+};
+
+// Wrapper for a vector of 2 signed 64-bit lanes.
+// The lane data lives in the plain array `val`; `pval` points at `val` viewed as
+// a vint64m1_t, and every register-typed read or write goes through that pointer.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    // Default construction leaves `val` uninitialized.
+    v_int64x2() {}
+    // Wraps an RVV register value by storing it into `val`.
+    explicit v_int64x2(vint64m1_t v)
+    {
+        *pval = v;
+    }
+    // Builds the vector from 2 scalars, loaded in argument order from a temporary array.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        int64 v[] = {v0, v1};
+        *pval = vle64_v_i64m1(v, nlanes);
+    }
+    // Implicit conversion back to the underlying RVV register type.
+    operator vint64m1_t() const
+    {
+        return *pval;
+    }
+    // Returns the scalar produced by vmv_x on the stored vector (presumably lane 0, per the name).
+    int64 get0() const
+    {
+        return vmv_x(*pval);
+    }
+
+    // Copy operations transfer the lane data only; `pval` is never copied, so each
+    // object keeps pointing at its own `val`.
+    inline v_int64x2& operator=(const v_int64x2& vec) {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    inline v_int64x2(const v_int64x2& vec) {
+        *pval = *(vec.pval);
+    }
+    int64 val[2];
+    // Default member initializer: aliases this object's own `val`.
+    vint64m1_t* pval = (vint64m1_t*)val;
+};
+
+#if CV_SIMD128_64F
+// Wrapper for a vector of 2 double-precision float lanes (only compiled when
+// CV_SIMD128_64F is enabled).
+// The lane data lives in the plain array `val`; `pval` points at `val` viewed as
+// a vfloat64m1_t, and every register-typed read or write goes through that pointer.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    // Default construction leaves `val` uninitialized.
+    v_float64x2() {}
+    // Wraps an RVV register value by storing it into `val`.
+    explicit v_float64x2(vfloat64m1_t v)
+    {
+        *pval = v;
+    }
+    // Builds the vector from 2 scalars, loaded in argument order from a temporary array.
+    v_float64x2(double v0, double v1)
+    {
+        double v[] = {v0, v1};
+        *pval = vle64_v_f64m1(v, nlanes);
+    }
+    // Implicit conversion back to the underlying RVV register type.
+    operator vfloat64m1_t() const
+    {
+        return *pval;
+    }
+    // Returns the scalar produced by vfmv_f on the stored vector (presumably lane 0, per the name).
+    double get0() const
+    {
+        return vfmv_f(*pval);
+    }
+
+    // Copy operations transfer the lane data only; `pval` is never copied, so each
+    // object keeps pointing at its own `val`.
+    inline v_float64x2& operator=(const v_float64x2& vec) {
+        *pval = *(vec.pval);
+        return *this;
+    }
+    inline v_float64x2(const v_float64x2& vec) {
+        *pval = *(vec.pval);
+    }
+    double val[2];
+    // Default member initializer: aliases this object's own `val`.
+    vfloat64m1_t* pval = (vfloat64m1_t*)val;
+};
+#endif // CV_SIMD128_64F
+#endif // __clang__
+
+//////////// Initial ////////////
+
+// Generates v_setzero_<suffix1>() and v_setall_<suffix1>(v) for an integer vector type.
+//   _Tpvec  - vector type name without the "v_" prefix
+//   _Tp     - scalar lane type accepted by v_setall
+//   suffix1 - suffix used in the generated function names (u8, s8, ...)
+//   suffix2 - suffix used in the RVV intrinsic name (u8, i8, ...)
+//   vl      - element count passed to the intrinsic
+// Both functions fill the vector through vmv_v_x_<suffix2>m1 (with 0 or with v).
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
+
+// Floating-point counterpart of the integer init macro: generates
+// v_setzero_<suffix>() and v_setall_<suffix>(v) through vfmv_v_f_<suffix>m1.
+//   _Tpv   - vector type name without the "v_" prefix
+//   _Tp    - scalar lane type accepted by v_setall
+//   suffix - suffix shared by the function names and the intrinsic (f32, f64)
+//   vl     - element count passed to the intrinsic
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
+#endif
+
+//////////// Reinterpret ////////////
+
+// Generates the identity overload v_reinterpret_as_<suffix>(v) for the case where
+// the argument already has the target vector type; it simply returns a copy.
+#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
+inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
+
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
+#endif
+
+// Generates both directions of v_reinterpret_as_* between two vector types that
+// are connected by a single vreinterpret intrinsic.
+//   _Tpvec1/_Tpvec2   - vector type names without the "v_" prefix
+//   suffix1/suffix2   - suffixes of the generated function names (u8, s8, ...)
+//   nsuffix1/nsuffix2 - suffixes used in the intrinsic name (u8, i8, ...)
+// The instantiations below cover same-width signed/unsigned/float pairs and
+// same-signedness pairs of different lane width.
+#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\
+}
+
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64)
+#endif
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int16x8, s8, s16, i8, i16)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64)
+OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64)
+
+
+// Generates both directions of v_reinterpret_as_* between two vector types that
+// differ in BOTH lane kind and lane width, by chaining two vreinterpret steps:
+// first change the lane kind at the source width, then change the width.
+//   _Tpvec1/_Tpvec2   - vector type names without the "v_" prefix
+//   suffix1/suffix2   - suffixes of the generated function names (u8, s16, ...)
+//   nsuffix1/nsuffix2 - lane-kind letter used in the intrinsic names (u, i, f)
+//   width1/width2     - lane widths in bits of _Tpvec1/_Tpvec2
+#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
+{ \
+    return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\
+} \
+inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
+{ \
+    return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\
+}
+
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32)
+// v_float64x2 is only defined when CV_SIMD128_64F is enabled, so the
+// instantiations that mention it carry the same guard.
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64)
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64)
+#endif
+
+// Three times reinterpret
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) \
+{ \
+    return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));\
+} \
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) \
+{ \
+    return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));\
+}
+
+////////////// Extract //////////////
+
+// Generates v_extract<s>(a, b) and v_extract_n<i>(v) for an integer vector type.
+//   _Tpvec - full vector type name
+//   _Tp    - scalar type returned by v_extract_n
+//   suffix - intrinsic suffix (u8, i8, ...)
+//   vmv    - intrinsic used to move the first element of a vector into a scalar
+//   vl     - element count passed to the intrinsics
+// v_extract slides `a` down by s elements into a zero-filled vector and then
+// slides `b` up by (nlanes - s) on top of that result.
+// v_extract_n slides `v` down by i elements and converts the result with `vmv`.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
+} \
+template<int i> inline _Tp v_extract_n(_Tpvec v) \
+{ \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
+}
+
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)
+
+// Floating-point counterpart of the integer extract macro: same slide-down /
+// slide-up construction for v_extract<s>(a, b) and v_extract_n<i>(v), but the
+// zero-filled starting vector is created with vfmv_v_f_<suffix>m1 and `vmv` is a
+// float scalar-move intrinsic.
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
+} \
+template<int i> inline _Tp v_extract_n(_Tpvec v) \
+{ \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
+#endif
+
+////////////// Load/Store //////////////
+
+// Generates the load/store family for one vector type.
+//   _Tpvec  - full vector type name
+//   _nTpvec - underlying RVV register type (not referenced in the macro body)
+//   _Tp     - scalar lane type of the memory operand
+//   hvl     - half of the lane count, used by the *_low / *_high variants
+//   vl      - full lane count
+//   width   - lane width in bits, selects the vle<width>/vse<width> intrinsic
+//   suffix  - intrinsic suffix (u8, i8, f32, ...)
+//   vmv     - splat intrinsic used to create the zero vector in v_store_high
+// In this implementation v_load and v_load_aligned are the same unit-stride load,
+// and v_store, v_store_aligned, v_store_aligned_nocache and the StoreMode overload
+// are the same unit-stride store (the mode argument is ignored).
+// v_load_low loads only hvl elements; v_store_low stores the first hvl elements;
+// v_store_high slides the vector down by hvl and stores hvl elements.
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ \
+    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
+} \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ \
+    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
+} \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
+    return res; \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, a, hvl); \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
+}
+
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
+#endif
+
+// Builds a 16-lane vector: lanes 0..7 come from ptr0[0..7], lanes 8..15 from ptr1[0..7].
+inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
+{
+    schar joined[16];
+    for (int k = 0; k < 8; ++k)
+    {
+        joined[k] = ptr0[k];
+        joined[k + 8] = ptr1[k];
+    }
+    return v_int8x16(vle8_v_i8m1(joined, 16));
+}
+// Unsigned overload: runs the signed version on the same bytes and reinterprets the result.
+inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1)
+{
+    return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1));
+}
+
+// Builds an 8-lane vector: lanes 0..3 come from ptr0[0..3], lanes 4..7 from ptr1[0..3].
+inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
+{
+    short joined[8];
+    for (int k = 0; k < 4; ++k)
+    {
+        joined[k] = ptr0[k];
+        joined[k + 4] = ptr1[k];
+    }
+    return v_int16x8(vle16_v_i16m1(joined, 8));
+}
+// Unsigned overload: runs the signed version on the same data and reinterprets the result.
+inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1)
+{
+    return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1));
+}
+
+// Builds a 4-lane vector: lanes 0..1 come from ptr0[0..1], lanes 2..3 from ptr1[0..1].
+inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
+{
+    int joined[4];
+    for (int k = 0; k < 2; ++k)
+    {
+        joined[k] = ptr0[k];
+        joined[k + 2] = ptr1[k];
+    }
+    return v_int32x4(vle32_v_i32m1(joined, 4));
+}
+// Float variant with the same lane layout.
+inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
+{
+    float joined[4];
+    for (int k = 0; k < 2; ++k)
+    {
+        joined[k] = ptr0[k];
+        joined[k + 2] = ptr1[k];
+    }
+    return v_float32x4(vle32_v_f32m1(joined, 4));
+}
+// Unsigned overload: runs the signed version on the same data and reinterprets the result.
+inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1)
+{
+    return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1));
+}
+
+// Builds a 2-lane vector: lane 0 is ptr0[0], lane 1 is ptr1[0].
+inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
+{
+    int64 joined[2];
+    joined[0] = ptr0[0];
+    joined[1] = ptr1[0];
+    return v_int64x2(vle64_v_i64m1(joined, 2));
+}
+// Unsigned overload: runs the signed version on the same data and reinterprets the result.
+inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1)
+{
+    return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1));
+}
+
+#if CV_SIMD128_64F
+// Builds a 2-lane double vector: lane 0 is ptr0[0], lane 1 is ptr1[0].
+inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
+{
+    double joined[2];
+    joined[0] = ptr0[0];
+    joined[1] = ptr1[0];
+    return v_float64x2(vle64_v_f64m1(joined, 2));
+}
+#endif
+
+
+////////////// Lookup table access ////////////////////
+
+// Gathers 16 table entries: lane k receives tab[idx[k]].
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+    schar gathered[16];
+    for (int lane = 0; lane < 16; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+// Gathers 8 pairs of adjacent table entries: lanes 2k and 2k+1 receive
+// tab[idx[k]] and tab[idx[k] + 1].
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    schar gathered[16];
+    for (int k = 0; k < 8; ++k)
+    {
+        gathered[2 * k] = tab[idx[k]];
+        gathered[2 * k + 1] = tab[idx[k] + 1];
+    }
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+// Gathers 4 runs of four adjacent table entries: lanes 4q..4q+3 receive
+// tab[idx[q]] .. tab[idx[q] + 3].
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    schar gathered[16];
+    for (int q = 0; q < 4; ++q)
+    {
+        for (int j = 0; j < 4; ++j)
+            gathered[4 * q + j] = tab[idx[q] + j];
+    }
+    return v_int8x16(vle8_v_i8m1(gathered, 16));
+}
+// Unsigned 8-bit overloads: run the signed lookups on the same table bytes and
+// reinterpret the result.
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut((schar*)tab, idx));
+}
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx));
+}
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx));
+}
+
+// Gathers 8 table entries: lane k receives tab[idx[k]].
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    short gathered[8];
+    for (int lane = 0; lane < 8; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+// Gathers 4 pairs of adjacent table entries: lanes 2k and 2k+1 receive
+// tab[idx[k]] and tab[idx[k] + 1].
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    short gathered[8];
+    for (int k = 0; k < 4; ++k)
+    {
+        gathered[2 * k] = tab[idx[k]];
+        gathered[2 * k + 1] = tab[idx[k] + 1];
+    }
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+// Gathers 2 runs of four adjacent table entries: lanes 4q..4q+3 receive
+// tab[idx[q]] .. tab[idx[q] + 3].
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+    short gathered[8];
+    for (int q = 0; q < 2; ++q)
+    {
+        for (int j = 0; j < 4; ++j)
+            gathered[4 * q + j] = tab[idx[q] + j];
+    }
+    return v_int16x8(vle16_v_i16m1(gathered, 8));
+}
+// Unsigned 16-bit overloads: run the signed lookups on the same table data and
+// reinterpret the result.
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut((short*)tab, idx));
+}
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx));
+}
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx));
+}
+
+// Gathers 4 table entries: lane k receives tab[idx[k]].
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    int gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+// Gathers 2 pairs of adjacent table entries: lanes 2k and 2k+1 receive
+// tab[idx[k]] and tab[idx[k] + 1].
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{
+    int gathered[4];
+    for (int k = 0; k < 2; ++k)
+    {
+        gathered[2 * k] = tab[idx[k]];
+        gathered[2 * k + 1] = tab[idx[k] + 1];
+    }
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+// Loads the four consecutive table entries starting at tab[idx[0]] directly.
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{
+    const int* quad = tab + idx[0];
+    return v_int32x4(vle32_v_i32m1(quad, 4));
+}
+
+// Unsigned 32-bit overloads: run the signed lookups on the same table data and
+// reinterpret the result.
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut((int*)tab, idx));
+}
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx));
+}
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx));
+}
+
+// Gathers 2 table entries: lane k receives tab[idx[k]].
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+    int64_t gathered[2];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    return v_int64x2(vle64_v_i64m1(gathered, 2));
+}
+// Loads the two consecutive table entries starting at tab[idx[0]] directly.
+inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
+{
+    const int64* pair = tab + idx[0];
+    return v_int64x2(vle64_v_i64m1(pair, 2));
+}
+// Unsigned 64-bit overloads: run the signed lookups on the same table data and
+// reinterpret the result.
+inline v_uint64x2 v_lut(const uint64* tab, const int* idx)
+{
+    return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx));
+}
+inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx)
+{
+    return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx));
+}
+
+// Gathers 4 float table entries: lane k receives tab[idx[k]].
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+// Gathers 2 pairs of adjacent float table entries: lanes 2k and 2k+1 receive
+// tab[idx[k]] and tab[idx[k] + 1].
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    float gathered[4];
+    for (int k = 0; k < 2; ++k)
+    {
+        gathered[2 * k] = tab[idx[k]];
+        gathered[2 * k + 1] = tab[idx[k] + 1];
+    }
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+// Loads the four consecutive float table entries starting at tab[idx[0]] directly.
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+    const float* quad = tab + idx[0];
+    return v_float32x4(vle32_v_f32m1(quad, 4));
+}
+
+// Gathers 4 table entries using indices held in a vector: lane k receives
+// tab[<lane k of idxvec>].
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    const int i0 = v_extract_n<0>(idxvec);
+    const int i1 = v_extract_n<1>(idxvec);
+    const int i2 = v_extract_n<2>(idxvec);
+    const int i3 = v_extract_n<3>(idxvec);
+    int gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_int32x4(vle32_v_i32m1(gathered, 4));
+}
+
+// Unsigned variant of the vector-indexed lookup: lane k receives
+// tab[<lane k of idxvec>].
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    const int i0 = v_extract_n<0>(idxvec);
+    const int i1 = v_extract_n<1>(idxvec);
+    const int i2 = v_extract_n<2>(idxvec);
+    const int i3 = v_extract_n<3>(idxvec);
+    unsigned gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_uint32x4(vle32_v_u32m1(gathered, 4));
+}
+
+// Float variant of the vector-indexed lookup: lane k receives
+// tab[<lane k of idxvec>].
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    const int i0 = v_extract_n<0>(idxvec);
+    const int i1 = v_extract_n<1>(idxvec);
+    const int i2 = v_extract_n<2>(idxvec);
+    const int i3 = v_extract_n<3>(idxvec);
+    float gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_float32x4(vle32_v_f32m1(gathered, 4));
+}
+
+// For each of the 4 indices in idxvec, reads the two adjacent floats tab[i] and
+// tab[i + 1]; the first values go to x and the second values go to y.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int lane_idx[4];
+    v_store_aligned(lane_idx, idxvec);
+
+    const int i0 = lane_idx[0], i1 = lane_idx[1], i2 = lane_idx[2], i3 = lane_idx[3];
+    x = v_float32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+    y = v_float32x4(tab[i0 + 1], tab[i1 + 1], tab[i2 + 1], tab[i3 + 1]);
+}
+
+#if CV_SIMD128_64F
+// Gathers 2 double table entries: lane k receives tab[idx[k]].
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    double gathered[2];
+    gathered[0] = tab[idx[0]];
+    gathered[1] = tab[idx[1]];
+    return v_float64x2(vle64_v_f64m1(gathered, 2));
+}
+
+// Loads the two consecutive double table entries starting at tab[idx[0]] directly.
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    const double* pair = tab + idx[0];
+    return v_float64x2(vle64_v_f64m1(pair, 2));
+}
+
+// Vector-indexed double lookup: only the first two lanes of idxvec are used;
+// lane k of the result receives tab[<lane k of idxvec>].
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    const int i0 = v_extract_n<0>(idxvec);
+    const int i1 = v_extract_n<1>(idxvec);
+    double gathered[2] = { tab[i0], tab[i1] };
+    return v_float64x2(vle64_v_f64m1(gathered, 2));
+}
+
+// For the first two indices in idxvec, reads the two adjacent doubles tab[i] and
+// tab[i + 1]; the first values go to x and the second values go to y.
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int lane_idx[4] = {0};
+    v_store_aligned(lane_idx, idxvec);
+
+    const int i0 = lane_idx[0], i1 = lane_idx[1];
+    x = v_float64x2(tab[i0], tab[i1]);
+    y = v_float64x2(tab[i0 + 1], tab[i1 + 1]);
+}
+#endif
+
+////////////// Pack boolean ////////////////////
+
+// Packs two 8 x 16-bit vectors into one 16 x 8-bit vector.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Lay both inputs out back to back so they can be reloaded as one 16-lane group.
+    ushort wide[16] = {0};
+    v_store(wide, a);
+    v_store(wide + 8, b);
+    // Narrowing shift by 0 turns each 16-bit lane into an 8-bit lane.
+    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(wide, 16), 0, 16));
+}
+
+// Packs four 4 x 32-bit vectors into one 16 x 8-bit vector.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    // Lay the inputs out back to back (a, b, c, d) so they can be reloaded as one 16-lane group.
+    const v_uint32x4* parts[4] = { &a, &b, &c, &d };
+    unsigned wide[16] = {0};
+    for (int k = 0; k < 4; ++k)
+        v_store(wide + 4 * k, *parts[k]);
+    // Two narrowing shifts by 0: 32-bit lanes -> 16-bit lanes -> 8-bit lanes.
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(wide, 16), 0, 16), 0, 16));
+}
+
+// Packs eight 2 x 64-bit vectors into one 16 x 8-bit vector.
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // Lay the inputs out back to back (a..h) so they can be reloaded as one 16-lane group.
+    const v_uint64x2* parts[8] = { &a, &b, &c, &d, &e, &f, &g, &h };
+    uint64 wide[16] = {0};
+    for (int k = 0; k < 8; ++k)
+        v_store(wide + 2 * k, *parts[k]);
+    // Three narrowing shifts by 0: 64-bit -> 32-bit -> 16-bit -> 8-bit lanes.
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(wide, 16), 0, 16), 0, 16), 0, 16));
+}
+
+////////////// Arithmetics //////////////
+// Generates `operator bin_op` and `operator bin_op=` for one vector type by
+// forwarding both operands to the given RVV intrinsic.
+//   bin_op - operator token (+, -, *, /, &, ...)
+//   _Tpvec - full vector type name
+//   intrin - vector-vector intrinsic taking (a, b, vl)
+//   vl     - element count passed to the intrinsic
+// Note on the instantiations below: the 8- and 16-bit types map + and - to the
+// vsadd/vssub intrinsic family while the 32- and 64-bit types use vadd/vsub, and
+// no `*` operator is generated here for the 8- and 16-bit types.
+#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a, b, vl)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a = _Tpvec(intrin(a, b, vl)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
+#endif
+
+
+////////////// Bitwise logic //////////////
+
+// Generates the bitwise operators for an integer vector type: &, | and ^ (plus
+// their compound-assignment forms) through the binary-operator macro, and unary ~
+// through vnot_v_<suffix>m1.
+//   _Tpvec - full vector type name
+//   suffix - intrinsic suffix (u8, i8, ...)
+//   vl     - element count passed to the intrinsics
+#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)
+
+// Generates a bitwise operator (and its compound-assignment form) for
+// v_float32x4: both operands are reinterpreted as 32-bit integers, combined with
+// the given integer intrinsic over 4 elements, and the result is reinterpreted
+// back to float.
+#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+
+// Bitwise NOT of a float vector, performed on its 32-bit integer reinterpretation.
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
+}
+
+#if CV_SIMD128_64F
+// Generates a bitwise operator (and its compound-assignment form) for
+// v_float64x2: both operands are reinterpreted as 64-bit integers, combined with
+// the given integer intrinsic over 2 elements, and the result is reinterpreted
+// back to double.
+#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
+
+// Bitwise NOT of a double vector, performed on its 64-bit integer reinterpretation.
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
+}
+#endif
+
+////////////// Bitwise shifts //////////////
+
+// Generates the shift operators for an unsigned vector type: operator<< and
+// v_shl<n> use vsll, operator>> and v_shr<n> use vsrl. The shift amount is
+// narrowed to uint8_t before being passed to the intrinsic.
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ \
+    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
+}
+
+// Signed counterpart: identical left shifts, but the right shifts use vsra
+// instead of vsrl. The shift amount is likewise narrowed to uint8_t.
+#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ \
+    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
+} \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ \
+    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
+
+
+////////////// Comparison //////////////
+/* Defines one integer comparison operator for _Tpvec. `intrin` compares a and
+ * b and yields a per-lane mask; that mask drives a merge between a zero
+ * vector and the scalar `ones` (initialised from -1), so the operator returns
+ * a vector of the same type as its operands rather than a mask register. */
+#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
+inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    uint64_t ones = -1; \
+    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
+}
+/* Floating-point counterpart of the integer comparison macro. The scalar
+ * merged in for "true" lanes is a double whose 64-bit pattern is set to all
+ * ones through a union; "false" lanes come from a zero vector.
+ * NOTE(review): for the f32 instantiation the double is converted to float at
+ * the call - confirm the resulting lane bit pattern is the intended one. */
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
+inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    union { uint64 u; double d; } ones; ones.u = -1; \
+    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
+}
+/* Generates ==, !=, <, >, <=, >= for an unsigned integer vector type using the
+ * unsigned compare intrinsics (vmsltu, vmsgtu, vmsleu, vmsgeu). `width` selects
+ * the mask type suffix (b8, b16, ...). */
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)
+/* Same six operators for a signed integer vector type, using the signed
+ * compare intrinsics (vmslt, vmsgt, vmsle, vmsge). */
+#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)
+/* Same six operators for a floating-point vector type, using the vmf*
+ * compare intrinsics. */
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)
+
+/* Comparison operators for all vector types. Arguments are: vector type,
+ * intrinsic type suffix, element width in bits, lane count. */
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
+#endif
+
+// Per-lane "is not NaN" mask, obtained by comparing the vector with itself.
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    return (a == a);
+}
+
+#if CV_SIMD128_64F
+// Double-precision overload of v_not_nan; same self-comparison.
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    return (a == a);
+}
+#endif
+
+////////////// Min/Max //////////////
+/* Defines func(a, b) for _Tpvec as a thin wrapper that forwards both operands
+ * and the lane count vl to a single binary intrinsic. */
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a, b, vl)); \
+}
+/* v_min / v_max for every vector type (unsigned, signed and float variants). */
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
+#endif
+
+////////////// Arithmetics wrap //////////////
+/* v_add_wrap / v_sub_wrap / v_mul_wrap for 8- and 16-bit lanes, mapped onto
+ * the plain (non-saturating) vadd / vsub / vmul intrinsics. */
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
+
+////////////// Reduce //////////////
+/* Defines v_reduce_sum(a) for an integer vector type. The reduction intrinsic
+ * v<red>_vs accumulates the lanes of `a` into a register of native type
+ * _nwTpvec; both its destination operand and its scalar seed are zero-filled.
+ * Lane 0 of the result is read back through _wTpvec::get0() and cast to
+ * scalartype. */
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
+    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
+    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
+    return (scalartype)(_wTpvec(res).get0()); \
+}
+/* 8/16/32-bit lanes use the widening reductions (wredsum/wredsumu); 64-bit
+ * lanes use the same-width redsum. */
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 2, redsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 2, redsum)
+/* Floating-point variant of the v_reduce_sum generator: identical structure,
+ * but the zero-filled destination and seed vectors are built with vfmv_v_f.
+ * Lane 0 of the reduced register is returned as scalartype. */
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
+    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
+    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
+    return (scalartype)(_wTpvec(res).get0()); \
+}
+
+// The float reduction formerly named vfredsum has been renamed to vfredosum;
+// the GNU toolchain has been updated to the new name as well.
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 4, fredosum)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 2, fredosum)
+#endif
+
+/* Defines v_reduce_<func>(a) (used below for min and max). The input vector
+ * itself is passed as destination, source and scalar seed of the reduction
+ * intrinsic; lane 0 of the result is returned as scalartype. */
+#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
+inline scalartype v_reduce_##func(const _Tpvec& a)  \
+{ \
+    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
+    return scalartype(res.get0()); \
+}
+/* v_reduce_min / v_reduce_max for the 8-, 16- and 32-bit integer types and
+ * for v_float32x4. */
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
+
+
+// Reduces each of the four inputs to its lane sum and returns the four sums,
+// in argument order, as the lanes of one v_float32x4.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    float sums[4];
+    sums[0] = v_reduce_sum(a);
+    sums[1] = v_reduce_sum(b);
+    sums[2] = v_reduce_sum(c);
+    sums[3] = v_reduce_sum(d);
+    return v_float32x4(vle32_v_f32m1(sums, 4));
+}
+
+////////////// Square-Root //////////////
+
+// Lane-wise square root of a v_float32x4 (4 lanes).
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    vfloat32m1_t root = vfsqrt_v_f32m1(x, 4);
+    return v_float32x4(root);
+}
+
+// Lane-wise reciprocal square root, computed as 1 / v_sqrt(x).
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    return v_setall_f32(1.0f) / v_sqrt(x);
+}
+
+#if CV_SIMD128_64F
+// Lane-wise square root of a v_float64x2.
+// The vector holds two doubles, so the intrinsic is given vl = 2 like every
+// other v_float64x2 operation in this file (the previous value 4 asked for
+// more lanes than the type has).
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    return v_float64x2(vfsqrt_v_f64m1(x, 2));
+}
+
+// Lane-wise reciprocal square root for doubles, computed as 1 / v_sqrt(x).
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    return v_setall_f64(1.0) / v_sqrt(x);
+}
+#endif
+
+// Lane-wise sqrt(a*a + b*b): a*a is formed first, then b*b is accumulated
+// onto it with a multiply-accumulate.
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t a_sq = vfmul_vv_f32m1(a, a, 4);
+    vfloat32m1_t sum_sq = vfmacc_vv_f32m1(a_sq, b, b, 4);
+    return v_sqrt(v_float32x4(sum_sq));
+}
+
+// Lane-wise a*a + b*b (squared magnitude, no square root).
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t a_sq = vfmul_vv_f32m1(a, a, 4);
+    vfloat32m1_t sum_sq = vfmacc_vv_f32m1(a_sq, b, b, 4);
+    return v_float32x4(sum_sq);
+}
+
+#if CV_SIMD128_64F
+// Lane-wise sqrt(a*a + b*b) for doubles.
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m1_t a_sq = vfmul_vv_f64m1(a, a, 2);
+    vfloat64m1_t sum_sq = vfmacc_vv_f64m1(a_sq, b, b, 2);
+    return v_sqrt(v_float64x2(sum_sq));
+}
+
+// Lane-wise a*a + b*b for doubles (squared magnitude, no square root).
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m1_t a_sq = vfmul_vv_f64m1(a, a, 2);
+    vfloat64m1_t sum_sq = vfmacc_vv_f64m1(a_sq, b, b, 2);
+    return v_float64x2(sum_sq);
+}
+#endif
+
+////////////// Multiply-Add //////////////
+
+// Lane-wise a * b + c, with c used as the accumulator operand of vfmacc.
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    vfloat32m1_t acc = vfmacc_vv_f32m1(c, a, b, 4);
+    return v_float32x4(acc);
+}
+// Integer overload: lane-wise a * b + c through vmacc.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    vint32m1_t acc = vmacc_vv_i32m1(c, a, b, 4);
+    return v_int32x4(acc);
+}
+
+// v_muladd computes the same a * b + c as v_fma (float lanes).
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
+}
+
+// v_muladd computes the same a * b + c as v_fma (32-bit integer lanes).
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
+}
+
+#if CV_SIMD128_64F
+// Lane-wise a * b + c for doubles, with c as the accumulator of vfmacc.
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    vfloat64m1_t acc = vfmacc_vv_f64m1(c, a, b, 2);
+    return v_float64x2(acc);
+}
+
+// v_muladd computes the same a * b + c as v_fma (double lanes).
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
+}
+#endif
+
+////////////// Check all/any //////////////
+
+/* v_check_all / v_check_any look at the most significant bit of each lane.
+ * clang can use the overloaded vcpop, so no casting like (vuint64m1_t) is
+ * needed there. This branch (other compilers) shifts every lane right by
+ * `shift` so only its top bit remains, casts the vector to two 64-bit words
+ * and ORs them together. v_check_all inverts the input first, so an all-zero
+ * result means every lane had its top bit set. */
+#ifndef __clang__
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl)); \
+    return (v.val[0] | v.val[1]) == 0; \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift, vl)); \
+    return (v.val[0] | v.val[1]) != 0; \
+}
+/* Unsigned instantiations; `shift` is the lane width in bits minus one. */
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2)
+
+
+// Signed and floating-point overloads: each forwards to the unsigned
+// implementation of the same lane width via v_reinterpret_as_u*.
+inline bool v_check_all(const v_int8x16& a)
+{
+    return v_check_all(v_reinterpret_as_u8(a));
+}
+inline bool v_check_any(const v_int8x16& a)
+{
+    return v_check_any(v_reinterpret_as_u8(a));
+}
+
+inline bool v_check_all(const v_int16x8& a)
+{
+    return v_check_all(v_reinterpret_as_u16(a));
+}
+inline bool v_check_any(const v_int16x8& a)
+{
+    return v_check_any(v_reinterpret_as_u16(a));
+}
+
+inline bool v_check_all(const v_int32x4& a)
+{
+    return v_check_all(v_reinterpret_as_u32(a));
+}
+inline bool v_check_any(const v_int32x4& a)
+{
+    return v_check_any(v_reinterpret_as_u32(a));
+}
+
+inline bool v_check_all(const v_float32x4& a)
+{
+    return v_check_all(v_reinterpret_as_u32(a));
+}
+inline bool v_check_any(const v_float32x4& a)
+{
+    return v_check_any(v_reinterpret_as_u32(a));
+}
+
+inline bool v_check_all(const v_int64x2& a)
+{
+    return v_check_all(v_reinterpret_as_u64(a));
+}
+inline bool v_check_any(const v_int64x2& a)
+{
+    return v_check_any(v_reinterpret_as_u64(a));
+}
+
+#if CV_SIMD128_64F
+inline bool v_check_all(const v_float64x2& a)
+{
+    return v_check_all(v_reinterpret_as_u64(a));
+}
+inline bool v_check_any(const v_float64x2& a)
+{
+    return v_check_any(v_reinterpret_as_u64(a));
+}
+#endif
+#else
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    return vcpop(vmslt(a, 0, vl), vl) == vl; /* count of negative lanes equals vl */ \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    return vcpop(vmslt(a, 0, vl), vl) != 0; /* at least one lane is negative */ \
+}
+/* clang branch: instantiate for the signed integer vector types. */
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2)
+
+
+// Unsigned and floating-point overloads: each forwards to the signed
+// implementation of the same lane width via v_reinterpret_as_s*.
+inline bool v_check_all(const v_uint8x16& a)
+{
+    return v_check_all(v_reinterpret_as_s8(a));
+}
+inline bool v_check_any(const v_uint8x16& a)
+{
+    return v_check_any(v_reinterpret_as_s8(a));
+}
+
+inline bool v_check_all(const v_uint16x8& a)
+{
+    return v_check_all(v_reinterpret_as_s16(a));
+}
+inline bool v_check_any(const v_uint16x8& a)
+{
+    return v_check_any(v_reinterpret_as_s16(a));
+}
+
+inline bool v_check_all(const v_uint32x4& a)
+{
+    return v_check_all(v_reinterpret_as_s32(a));
+}
+inline bool v_check_any(const v_uint32x4& a)
+{
+    return v_check_any(v_reinterpret_as_s32(a));
+}
+
+inline bool v_check_all(const v_float32x4& a)
+{
+    return v_check_all(v_reinterpret_as_s32(a));
+}
+inline bool v_check_any(const v_float32x4& a)
+{
+    return v_check_any(v_reinterpret_as_s32(a));
+}
+
+inline bool v_check_all(const v_uint64x2& a)
+{
+    return v_check_all(v_reinterpret_as_s64(a));
+}
+inline bool v_check_any(const v_uint64x2& a)
+{
+    return v_check_any(v_reinterpret_as_s64(a));
+}
+
+#if CV_SIMD128_64F
+inline bool v_check_all(const v_float64x2& a)
+{
+    return v_check_all(v_reinterpret_as_s64(a));
+}
+inline bool v_check_any(const v_float64x2& a)
+{
+    return v_check_any(v_reinterpret_as_s64(a));
+}
+#endif
+#endif
+////////////// abs //////////////
+/* Defines v_<abs>(a, b) as v_max(a, b) - v_min(a, b), returning the same
+ * vector type as the inputs. The subtraction uses the type's operator-, which
+ * is defined outside this section. */
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
+inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_max(a, b) - v_min(a, b); \
+}
+/* v_absdiff for unsigned and floating types; v_absdiffs for int8 / int16. */
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
+#endif
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
+OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
+
+/* v_absdiff for signed inputs, returning the unsigned vector type _rTpvec.
+ * max(a, b) - min(a, b) is formed with a widening subtract (`sub`), the wide
+ * result is viewed as unsigned and then narrowed by `rshr` with a shift of 0.
+ * Non-clang compilers do the signed-to-unsigned view with a C-style cast to
+ * _nwTpvec; clang uses vreinterpret instead of C-style casting, which is why
+ * its macro takes the extra `width` argument. */
+#ifndef __clang__
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, vl) \
+inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b), vl), 0, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 4)
+#else
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \
+inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _rTpvec(rshr(vreinterpret_u##width##m2(sub(v_max(a, b), v_min(a, b), vl)), 0, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16, 16)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 32, 8)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 64, 4)
+#endif
+#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
+inline _Tprvec v_abs(const _Tpvec& a) /* |a| expressed as v_absdiff(a, 0) */ \
+{ \
+    return v_absdiff(a, v_setzero_##suffix()); \
+}
+/* Signed integer inputs return the unsigned type; float inputs return float. */
+OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
+OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
+OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
+#endif
+
+/* Defines v_reduce_sad(a, b): the sum over all lanes of v_absdiff(a, b),
+ * returned as scalartype. */
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
+inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_reduce_sum(v_absdiff(a, b)); \
+}
+/* Integer types return unsigned; v_float32x4 returns float. */
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
+
+////////////// Select //////////////
+/* Defines v_select(mask, a, b). `ne` compares every lane of `mask` against 0
+ * to build a mask register, which then drives `merge` between b and a: lanes
+ * whose mask value is non-zero take a, the others take b. */
+#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
+}
+/* Integer types test the mask with vmsne; float types with vmfne. */
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
+#endif
+
+////////////// Rotate shift //////////////
+/* Lane rotation helpers for integer vector types, built from vslidedown and
+ * vslideup with a zero vector as the initial destination operand.
+ *  - v_rotate_right<n>(a) / v_rotate_left<n>(a): slide a down / up by n lanes.
+ *  - v_rotate_right<n>(a, b): slide a down by n, then slide b up by
+ *    nlanes - n on top of that result.
+ *  - v_rotate_left<n>(a, b): slide b down by nlanes - n, then slide a up by n
+ *    on top of that result.
+ *  - the <0> specialisations of v_rotate_left return a unchanged. */
+#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
+/* Floating-point version of the lane rotation helpers. The slide sequence is
+ * the same as in the integer macro; only the zero destination vector is built
+ * with vfmv_v_f instead of vmv_v_x. */
+#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
+
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
+#endif
+
+////////////// Convert to float //////////////
+
+// Converts four signed 32-bit integer lanes to float lanes.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    vfloat32m1_t converted = vfcvt_f_x_v_f32m1(a, 4);
+    return v_float32x4(converted);
+}
+
+#if CV_SIMD128_64F
+#ifndef __clang__
+// Narrows two doubles to floats; the upper two result lanes come from zeros.
+// The doubles are staged in memory so they can be loaded as one f64m2 group.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    double staged[4] = {a.val[0], a.val[1], 0, 0};
+    return v_float32x4(vfncvt_f_f_w_f32m1(vle64_v_f64m2(staged, 4), 4));
+}
+
+// Narrows a (lanes 0-1) and b (lanes 2-3) into a single v_float32x4.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    double staged[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
+    return v_float32x4(vfncvt_f_f_w_f32m1(vle64_v_f64m2(staged, 4), 4));
+}
+#else
+// Narrows two doubles to floats: a is placed in part 0 of a zero-filled
+// f64m2 group, which is then narrowed to f32m1.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    vfloat64m2_t wide = vset_v_f64m1_f64m2(vfmv_v_f_f64m2(0, 4), 0, a);
+    return v_float32x4(vfncvt_f_f_w_f32m1(wide, 4));
+}
+// Narrows a and b together: a is extended to an f64m2 group, b is placed in
+// part 1, and the group is narrowed to f32m1.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m2_t wide = vset_v_f64m1_f64m2(vlmul_ext_v_f64m1_f64m2(a), 1, b);
+    return v_float32x4(vfncvt_f_f_w_f32m1(wide, 4));
+}
+#endif
+
+// Widens all four int32 lanes to double in memory and returns lanes 0-1.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    return v_float64x2(vle64_v_f64m1(widened, 2));
+}
+
+// Widens all four int32 lanes to double in memory and returns lanes 2-3.
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    return v_float64x2(vle64_v_f64m1(widened + 2, 2));
+}
+
+// Widens all four float lanes to double in memory and returns lanes 0-1.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    return v_float64x2(vle64_v_f64m1(widened, 2));
+}
+
+// Widens all four float lanes to double in memory and returns lanes 2-3.
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    double widened[4] = {0};
+    vse64_v_f64m2(widened, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    return v_float64x2(vle64_v_f64m1(widened + 2, 2));
+}
+
+// Converts two signed 64-bit integer lanes to double lanes.
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+    vfloat64m1_t converted = vfcvt_f_x_v_f64m1(a, 2);
+    return v_float64x2(converted);
+}
+#endif
+
+////////////// Broadcast //////////////
+/* Defines v_broadcast_element<i>(v): extracts lane i with v_extract_n<i> and
+ * replicates it into every lane through v_setall_<suffix>. */
+#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
+{ \
+    return v_setall_##suffix(v_extract_n<i>(v)); \
+}
+/* `suffix` here is the v_setall_* name suffix (s8, u16, f32, ...). */
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
+#endif
+
+////////////// Transpose4x4 //////////////
+/* Defines v_transpose4x4 for a 4-lane type: output row k (bk) is assembled
+ * from lane k of a0, a1, a2 and a3, extracted one scalar at a time into a
+ * temporary array and reloaded with v_load. The `suffix` parameter is not
+ * referenced by the macro body. */
+#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
+inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
+                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
+                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
+                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
+{ \
+    _Tp elems0[4] = \
+    { \
+        v_extract_n<0>(a0), \
+        v_extract_n<0>(a1), \
+        v_extract_n<0>(a2), \
+        v_extract_n<0>(a3) \
+    }; \
+    b0 = v_load(elems0); \
+    _Tp elems1[4] = \
+    { \
+        v_extract_n<1>(a0), \
+        v_extract_n<1>(a1), \
+        v_extract_n<1>(a2), \
+        v_extract_n<1>(a3) \
+    }; \
+    b1 = v_load(elems1); \
+    _Tp elems2[4] = \
+    { \
+        v_extract_n<2>(a0), \
+        v_extract_n<2>(a1), \
+        v_extract_n<2>(a2), \
+        v_extract_n<2>(a3) \
+    }; \
+    b2 = v_load(elems2); \
+    _Tp elems3[4] = \
+    { \
+        v_extract_n<3>(a0), \
+        v_extract_n<3>(a1), \
+        v_extract_n<3>(a2), \
+        v_extract_n<3>(a3) \
+    }; \
+    b3 = v_load(elems3); \
+}
+
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
+
+////////////// Reverse //////////////
+
+// Defines v_reverse(a): the lanes are spilled to a scalar array, copied into
+// a second array in opposite order, and reloaded as a vector. The `suffix`
+// parameter is not referenced by the macro body.
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_reverse(const _Tpvec& a)  \
+{ \
+    _Tp backward[_Tpvec::nlanes] = {0}; \
+    _Tp forward[_Tpvec::nlanes] = {0}; \
+    v_store(forward, a); \
+    for (int src = 0, dst = _Tpvec::nlanes - 1; dst >= 0; ++src, --dst) \
+    { \
+        backward[dst] = forward[src]; \
+    } \
+    return v_load(backward); \
+}
+
+// v_reverse for every vector type.
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
+#endif
+
+//////////// Value reordering ////////////
+/* Widening helpers from _Tpvec to the double-width type _Tpwvec.
+ *  - v_expand(a, b0, b1): stores the low and high halves of a to scalar
+ *    arrays, reloads each as a half-register (mf2) vector and widens it with
+ *    `wcvt`; b0 receives the low half, b1 the high half.
+ *  - v_expand_low / v_expand_high: the same for a single half.
+ *  - v_load_expand(ptr): loads vl narrow elements straight from memory and
+ *    widens them.
+ * `vl` is the lane count of the wide result. */
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
+{ \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); \
+    v_store_high(hptr, a); \
+    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
+    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
+} \
+inline _Tpwvec v_expand_low(const _Tpvec& a) \
+{ \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
+} \
+inline _Tpwvec v_expand_high(const _Tpvec& a) \
+{ \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
+    v_store_high(hptr, a); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
+} \
+inline _Tpwvec v_load_expand(const _Tp* ptr) \
+{ \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2)
+
+// Loads four unsigned bytes from ptr and widens them in two steps
+// (8 -> 16 -> 32 bits) into a v_uint32x4.
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    vuint8mf4_t bytes = vle8_v_u8mf4(ptr, 4);
+    vuint16mf2_t halves = vwcvtu_x_x_v_u16mf2(bytes, 4);
+    return v_uint32x4(vwcvtu_x_x_v_u32m1(halves, 4));
+}
+
+// Loads four signed bytes from ptr and widens them in two steps
+// (8 -> 16 -> 32 bits) into a v_int32x4.
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    vint8mf4_t bytes = vle8_v_i8mf4(ptr, 4);
+    vint16mf2_t halves = vwcvt_x_x_v_i16mf2(bytes, 4);
+    return v_int32x4(vwcvt_x_x_v_i32m1(halves, 4));
+}
+
+/* Narrowing pack helpers from the wide type _wTpvec to _Tpvec. Inputs are
+ * staged in a scalar array, reloaded as one m2 register group and narrowed.
+ *  - v_pack(a, b): narrows a and b with `shr` (shift 0); a fills the low half
+ *    of the result, b the high half.
+ *  - v_pack_store(ptr, a): narrows a (padded with zeros) and stores hvl
+ *    elements to ptr.
+ *  - v_rshr_pack<n> / v_rshr_pack_store<n>: same, using `rshr` with shift n.
+ * vl is the narrow lane count, hvl the wide lane count. */
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
+inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
+} \
+inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
+} \
+template<int n> inline \
+void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
+OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
+
+
+// Pack helpers from a signed wide type (_wTpvec) to an unsigned narrow type
+// (_Tpvec). The inputs are staged in a scalar array, reloaded as one m2
+// register group, clamped below at 0 with vmax, reinterpreted as unsigned by
+// `cast` and narrowed by `rshr`.
+//  - v_pack_u(a, b): a fills the low half of the result, b the high half.
+//  - v_pack_u_store(ptr, a): narrows a and stores hvl elements to ptr.
+//  - v_rshr_pack_u<n> / v_rshr_pack_u_store<n>: same, with right shift n.
+// vl is the narrow lane count, hvl the wide lane count.
+// v_rshr_pack_u_store writes exactly hvl elements, like v_pack_u_store and
+// v_rshr_pack_store; a full-vector store here would write past the hvl
+// elements the caller provides room for.
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
+inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
+} \
+inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
+} \
+template<int n> inline \
+_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, b); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
+{ \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
+    v_store(arr, a); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl), hvl); \
+}
+
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
+
+
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ \
+    /* b0 interleaves the low halves of a0 and a1, b1 interleaves their high halves */ \
+    const int half = v_##_Tpvec::nlanes/2; \
+    _Tp src0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp src1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp lo[v_##_Tpvec::nlanes] = {0}; \
+    _Tp hi[v_##_Tpvec::nlanes] = {0}; \
+    v_store(src0, a0); \
+    v_store(src1, a1); \
+    /* both inputs are spilled before any output is written */ \
+    for( int k = 0; k < half; k++ ) \
+    { \
+        lo[2*k] = src0[k]; \
+        lo[2*k+1] = src1[k]; \
+        hi[2*k] = src0[half+k]; \
+        hi[2*k+1] = src1[half+k]; \
+    } \
+    /* outputs are assigned last, b0 first */ \
+    b0 = v_load(lo); \
+    b1 = v_load(hi); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    _Tp lowA[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp lowB[v_##_Tpvec::nlanes/2] = {0}; \
+    v_store_low(lowA, a); \
+    v_store_low(lowB, b); \
+    return v_load_halves(lowA, lowB); /* low half of a followed by low half of b */ \
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ \
+    _Tp highA[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp highB[v_##_Tpvec::nlanes/2] = {0}; \
+    v_store_high(highA, a); \
+    v_store_high(highB, b); \
+    return v_load_halves(highA, highB); /* high half of a followed by high half of b */ \
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ \
+    c = v_combine_low(a, b); /* c is written before d is computed */ \
+    d = v_combine_high(a, b); \
+}
+
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
+#endif
+
+
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) /* scalar (through-memory) interleaved loads/stores and in-register lane shuffles for v_##_Tpvec */ \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) /* reads 2*nlanes elements: a = even positions, b = odd positions */ \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    int i, i2; \
+    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
+    { \
+        ptra[i] = ptr[i2]; \
+        ptrb[i] = ptr[i2+1]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) /* reads 3*nlanes elements, channel k goes to the k-th output */ \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    int i, i3; \
+    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
+    { \
+        ptra[i] = ptr[i3]; \
+        ptrb[i] = ptr[i3+1]; \
+        ptrc[i] = ptr[i3+2]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+    c = v_load(ptrc); \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
+                                v_##_Tpvec& c, v_##_Tpvec& d) /* reads 4*nlanes elements, channel k goes to the k-th output */ \
+{ \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
+    int i, i4; \
+    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
+    { \
+        ptra[i] = ptr[i4]; \
+        ptrb[i] = ptr[i4+1]; \
+        ptrc[i] = ptr[i4+2]; \
+        ptrd[i] = ptr[i4+3]; \
+    } \
+    a = v_load(ptra); \
+    b = v_load(ptrb); \
+    c = v_load(ptrc); \
+    d = v_load(ptrd); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* writes 2*nlanes elements as a0 b0 a1 b1 ...; the store mode is ignored */ \
+{ \
+    int i, i2; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
+    { \
+        ptr[i2] = ptra[i]; \
+        ptr[i2+1] = ptrb[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) /* writes 3*nlanes elements as a0 b0 c0 a1 ...; the store mode is ignored */ \
+{ \
+    int i, i3; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
+    { \
+        ptr[i3] = ptra[i]; \
+        ptr[i3+1] = ptrb[i]; \
+        ptr[i3+2] = ptrc[i]; \
+    } \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
+                                const v_##_Tpvec& c, const v_##_Tpvec& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) /* writes 4*nlanes elements as a0 b0 c0 d0 a1 ...; the store mode is ignored */ \
+{ \
+    int i, i4; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptra, a); \
+    v_store(ptrb, b); \
+    v_store(ptrc, c); \
+    v_store(ptrd, d); \
+    for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
+    { \
+        ptr[i4] = ptra[i]; \
+        ptr[i4+1] = ptrb[i]; \
+        ptr[i4+2] = ptrc[i]; \
+        ptr[i4+3] = ptrd[i]; \
+    } \
+} \
+inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) /* within each group of 4 lanes: order 0,2,1,3 */ \
+{ \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
+    { \
+        ptr[4*i  ] = ptrvec[4*i  ]; \
+        ptr[4*i+1] = ptrvec[4*i+2]; \
+        ptr[4*i+2] = ptrvec[4*i+1]; \
+        ptr[4*i+3] = ptrvec[4*i+3]; \
+    } \
+    return v_load(ptr); \
+} \
+inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) /* within each group of 8 lanes: order 0,4,1,5,2,6,3,7; types with fewer than 8 lanes get an all-zero result because the loop body never runs */ \
+{ \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    v_store(ptrvec, vec); \
+    for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
+    { \
+        /* source and destination share the same 8-lane group base (8*i) */ \
+        ptr[8*i  ] = ptrvec[8*i  ]; \
+        ptr[8*i+1] = ptrvec[8*i+4]; \
+        ptr[8*i+2] = ptrvec[8*i+1]; \
+        ptr[8*i+3] = ptrvec[8*i+5]; \
+        ptr[8*i+4] = ptrvec[8*i+2]; \
+        ptr[8*i+5] = ptrvec[8*i+6]; \
+        ptr[8*i+6] = ptrvec[8*i+3]; ptr[8*i+7] = ptrvec[8*i+7]; \
+    } \
+    return v_load(ptr); \
+}
+
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
+#endif
+
+//////////// PopCount ////////////
+
+static const unsigned char popCountTable[] = // 256 entries: popCountTable[b] is the number of set bits in byte value b
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
+inline _rTpvec v_popcount(const _Tpvec& a) \
+{ /* per-lane bit count: add up the table value of every byte that belongs to the lane */ \
+    uchar bytes[16] = {0}; \
+    _rTp counts[_Tpvec::nlanes] = {0}; \
+    v_store(bytes, v_reinterpret_as_u8(a)); \
+    for (int lane = 0, k = 0; lane < _Tpvec::nlanes; lane++) \
+        for (int b = 0; b < (int)sizeof(_Tp); b++, k++) \
+            counts[lane] += popCountTable[bytes[k]]; \
+    return v_load(counts); \
+}
+
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
+OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
+
+//////////// SignMask ////////////
+
+#ifndef __clang__ // non-clang path: isolate each lane's top bit with a logical right shift
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) /* shift is lane width - 1 in the instantiations below */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    int mask = 0; \
+    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
+    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
+        mask |= (int)(tmp.val[i]) << i; /* lane i contributes bit i of the result */ \
+    return mask; \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
+
+inline int v_signmask(const v_int8x16& a) // signed and float overloads reinterpret as unsigned of the same lane width
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+inline int v_signmask(const v_int16x8& a)
+{ return v_signmask(v_reinterpret_as_u16(a)); }
+inline int v_signmask(const v_int32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_u32(a)); }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#endif
+
+#else // clang path: compare signed lanes against 0 and store the resulting mask register
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) /* width is not used in the body */ \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    uint8_t ans[16] = {0};\
+    vsm(ans, vmslt(a, 0, vl), vl); /* mask of lanes that are < 0, written into the zeroed buffer */ \
+    return reinterpret_cast<int*>(ans)[0]; /* NOTE(review): reads the byte buffer through an int pointer - confirm alignment/aliasing is acceptable */ \
+}
+
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2)
+
+inline int v_signmask(const v_uint8x16& a) // unsigned and float overloads reinterpret as signed of the same lane width
+{ return v_signmask(v_reinterpret_as_s8(a)); }
+inline int v_signmask(const v_uint16x8& a)
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+inline int v_signmask(const v_uint32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32x4& a)
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_uint64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#if CV_SIMD128_64F
+inline int v_signmask(const v_float64x2& a)
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+#endif
+
+#endif
+
+//////////// Scan forward ////////////
+
+#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) /* v_scan_forward: index of the first lane whose high bit is set, 0 when there is none; _Tp and suffix are kept for the existing instantiations */ \
+inline int v_scan_forward(const _Tpvec& a) \
+{ \
+    /* test the lane's high bit itself: converting the lane value to int misses uchar/ushort lanes, truncates 64-bit lanes and value-converts floats */ \
+    const int signbits = v_signmask(a); \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        if((signbits >> i) & 1) \
+            return i; \
+    return 0; /* no lane has its high bit set */ \
+}
+
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
+#if CV_SIMD128_64F
+OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
+#endif
+
+//////////// Pack triplets ////////////
+
+// use reinterpret instead of c-style casting.
+#ifndef __clang__
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec) // gathers lanes 0-2, 4-6, 8-10, 12-14 into the first 12 lanes
+{
+    uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; // gather indices, one byte each, lowest byte first; 0xFF is out of range for a 16-byte source
+    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) // unsigned variant: reinterpret, reuse the signed overload, reinterpret back
+{
+    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec) // gathers 16-bit lanes 0-2 and 4-6 into the first six lanes
+{
+    uint64 ptr[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; // byte indices of 16-bit lanes 0,1,2,4,5,6 (two bytes per lane, lowest byte first); the 8-bit table would mix bytes of different lanes; 0xFF is out of range
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) // unsigned variant: reinterpret, reuse the signed overload, reinterpret back
+{
+    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
+}
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4-lane vectors are returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // returned unchanged
+
+#else
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec) // clang path: gathers lanes 0-2, 4-6, 8-10, 12-14 into the first 12 lanes
+{
+    uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; // gather indices, one byte each, lowest byte first; 0xFF is out of range for a 16-byte source
+    return v_int8x16(vreinterpret_i8m1(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16)));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) // unsigned variant: reinterpret, reuse the signed overload, reinterpret back
+{
+    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec) // clang path: gathers 16-bit lanes 0-2 and 4-6 into the first six lanes
+{
+    uint64 ptr[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; // byte indices of 16-bit lanes 0,1,2,4,5,6 (two bytes per lane, lowest byte first); the 8-bit table would mix bytes of different lanes; 0xFF is out of range
+    return v_int16x8(v_reinterpret_as_s16(v_uint8x16(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16))));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) // unsigned variant: reinterpret, reuse the signed overload, reinterpret back
+{
+    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
+}
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 4-lane vectors are returned unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } // returned unchanged
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // returned unchanged
+
+#endif
+
+////// FP16 support ///////
+
+#if CV_FP16
+inline v_float32x4 v_load_expand(const float16_t* ptr) // loads 4 half-precision values and widens them to float32 lanes
+{
+    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v) // narrows the 4 float32 lanes to half precision and stores them at ptr
+{
+    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
+}
+#else
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    // Scalar fallback: widen the four half-precision inputs one lane at a time.
+    float widened[4];
+    for( int lane = 0; lane < 4; ++lane ) widened[lane] = (float)ptr[lane];
+    return v_load(widened);
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    // Scalar fallback: spill the four floats, then narrow each one to half precision.
+    float lanes[4];
+    v_store(lanes, v);
+    for( int k = 0; k < 4; ++k ) ptr[k] = float16_t(lanes[k]);
+}
+#endif
+
+////////////// Rounding //////////////
+
+inline v_int32x4 v_round(const v_float32x4& a) // float32 lanes -> int32 lanes via vfcvt_x_f (presumably the current dynamic rounding mode - confirm)
+{
+    return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    // Shift every lane down by one half, then convert with the same intrinsic v_round uses.
+    // NOTE(review): an already-integral input becomes an exact .5 tie here, so its result depends on the conversion's rounding mode - confirm.
+    const v_float32x4 lowered = a - v_setall_f32(0.5f);
+    return v_int32x4(vfcvt_x_f_v_i32m1(lowered, 4)); }
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    // Shift every lane up by one half, then convert with the same intrinsic v_round uses.
+    // NOTE(review): an already-integral input becomes an exact .5 tie here, so its result depends on the conversion's rounding mode - confirm.
+    const v_float32x4 raised = a + v_setall_f32(0.5f);
+    return v_int32x4(vfcvt_x_f_v_i32m1(raised, 4)); }
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // float32 lanes -> int32 lanes with the rtz (round-toward-zero) conversion
+{
+    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
+}
+#if CV_SIMD128_64F
+#ifndef __clang__ // non-clang path: build the 4-double m2 operand through a scalar array
+inline v_int32x4 v_round(const v_float64x2& a) // lanes 0-1 from a, lanes 2-3 from zero padding
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // lanes 0-1 from a, lanes 2-3 from b
+{
+    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a) // subtracts 0.5 before the narrowing conversion; NOTE(review): exact-integer inputs become ties - confirm rounding mode
+{
+    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) // adds 0.5 before the narrowing conversion; NOTE(review): exact-integer inputs become ties - confirm rounding mode
+{
+    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // uses the rtz (round-toward-zero) narrowing conversion
+{
+    double arr[4] = {a.val[0], a.val[1], 0, 0};
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
+}
+
+#else // clang path: build the m2 operand with vset / vlmul_ext instead of a scalar array
+inline v_int32x4 v_round(const v_float64x2& a) // a is placed in part 0 of a zero-filled m2 value
+{
+    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
+}
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // a extended to m2, b placed in part 1
+{
+    vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a);
+    return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a) // subtracts 0.5 with vl = 2, then converts 4 lanes
+{
+    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
+    dst = vset_v_f64m1_f64m2(dst, 0, a);
+    dst = vfsub_vf_f64m2(dst, 0.5, 2);
+    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) // adds 0.5 with vl = 2, then converts 4 lanes
+{
+    vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4);
+    dst = vset_v_f64m1_f64m2(dst, 0, a);
+    dst = vfadd_vf_f64m2(dst, 0.5, 2);
+    return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // uses the rtz (round-toward-zero) narrowing conversion
+{
+    vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4);
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4));
+}
+#endif
+#endif
+
+
+//////// Dot Product ////////
+
+// 16 >> 32: widening multiply of all 8 lanes, then result lane i = p[2i] + p[2i+1]
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    int ptr[8] = {0}; // the 8 widened products, in lane order
+    v_int32x4 t1, t2;
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_load_deinterleave(ptr, t1, t2); // t1 = even-indexed products, t2 = odd-indexed products
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // same, plus accumulator c
+{
+    int ptr[8] = {0};
+    v_int32x4 t1, t2;
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c;
+}
+
+// 32 >> 64: widening multiply of all 4 lanes, then result lane i = p[2i] + p[2i+1]
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 ptr[4] = {0}; // the 4 widened products, in lane order
+    v_int64x2 t1, t2;
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_load_deinterleave(ptr, t1, t2); // t1 = even-indexed products, t2 = odd-indexed products
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // same, plus accumulator c
+{
+    int64 ptr[4] = {0};
+    v_int64x2 t1, t2;
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_load_deinterleave(ptr, t1, t2);
+    return t1 + t2 + c;
+}
+
+// 8 >> 32: products are widened twice (8 -> 16 -> 32 bit), then result lane i = p[4i] + p[4i+1] + p[4i+2] + p[4i+3]
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned ptr[16] = {0}; // the 16 widened products, in lane order
+    v_uint32x4 t1, t2, t3, t4;
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4); // tk holds every 4th product starting at k-1
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c) // same, plus accumulator c
+{
+    unsigned ptr[16] = {0};
+    v_uint32x4 t1, t2, t3, t4;
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) // signed 8 >> 32 variant
+{
+    int ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                  const v_int32x4& c) // same, plus accumulator c
+{
+    int ptr[16] = {0};
+    v_int32x4 t1, t2, t3, t4;
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 16 >> 64: products are widened twice (16 -> 32 -> 64 bit), then result lane i = p[4i] + p[4i+1] + p[4i+2] + p[4i+3]
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint64 ptr[8] = {0}; // the 8 widened products, in lane order
+    v_uint64x2 t1, t2, t3, t4;
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // same, plus accumulator c
+{
+    uint64 ptr[8] = {0};
+    v_uint64x2 t1, t2, t3, t4;
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) // signed 16 >> 64 variant
+{
+    int64 ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                  const v_int64x2& c) // same, plus accumulator c
+{
+    int64 ptr[8] = {0};
+    v_int64x2 t1, t2, t3, t4;
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_load_deinterleave(ptr, t1, t2, t3, t4);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 32 >> 64f: the integer v_dotprod result converted to double lanes
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
+                                    const v_float64x2& c) // same, plus accumulator c added in double precision
+{ return v_dotprod_expand(a, b) + c; }
+#endif
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32: no deinterleave; result lane i = p[i] + p[i+4], so lane grouping differs from v_dotprod
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    int ptr[8] = {0}; // the 8 widened products, in lane order
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_int32x4 t1 = v_load(ptr); // products 0-3
+    v_int32x4 t2 = v_load(ptr+4); // products 4-7
+    return t1 + t2;
+}
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // same, plus accumulator c
+{
+    int ptr[8] = {0};
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    return t1 + t2 + c;
+}
+
+// 32 >> 64: no deinterleave; result lane i = p[i] + p[i+2]
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    int64 ptr[4] = {0}; // the 4 widened products, in lane order
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_int64x2 t1 = v_load(ptr); // products 0-1
+    v_int64x2 t2 = v_load(ptr+2); // products 2-3
+    return t1 + t2;
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // same, plus accumulator c
+{
+    int64 ptr[4] = {0};
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    return t1 + t2 + c;
+}
+
+
+// 8 >> 32: no deinterleave; result lane i = p[i] + p[i+4] + p[i+8] + p[i+12]
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    unsigned ptr[16] = {0}; // the 16 widened products, in lane order
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_uint32x4 t1 = v_load(ptr);
+    v_uint32x4 t2 = v_load(ptr+4);
+    v_uint32x4 t3 = v_load(ptr+8);
+    v_uint32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) // same, plus accumulator c
+{
+    unsigned ptr[16] = {0};
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
+    v_uint32x4 t1 = v_load(ptr);
+    v_uint32x4 t2 = v_load(ptr+4);
+    v_uint32x4 t3 = v_load(ptr+8);
+    v_uint32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4 + c;
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) // signed 8 >> 32 variant
+{
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    v_int32x4 t3 = v_load(ptr+8);
+    v_int32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) // same, plus accumulator c
+{
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
+    v_int32x4 t1 = v_load(ptr);
+    v_int32x4 t2 = v_load(ptr+4);
+    v_int32x4 t3 = v_load(ptr+8);
+    v_int32x4 t4 = v_load(ptr+12);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 16 >> 64: no deinterleave; result lane i = p[i] + p[i+2] + p[i+4] + p[i+6]
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    uint64 ptr[8] = {0}; // the 8 widened products, in lane order
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_uint64x2 t1 = v_load(ptr);
+    v_uint64x2 t2 = v_load(ptr+2);
+    v_uint64x2 t3 = v_load(ptr+4);
+    v_uint64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4;
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // same, plus accumulator c
+{
+    uint64 ptr[8] = {0};
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
+    v_uint64x2 t1 = v_load(ptr);
+    v_uint64x2 t2 = v_load(ptr+2);
+    v_uint64x2 t3 = v_load(ptr+4);
+    v_uint64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4 + c;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) // signed 16 >> 64 variant
+{
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    v_int64x2 t3 = v_load(ptr+4);
+    v_int64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) // same, plus accumulator c
+{
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
+    v_int64x2 t1 = v_load(ptr);
+    v_int64x2 t2 = v_load(ptr+2);
+    v_int64x2 t3 = v_load(ptr+4);
+    v_int64x2 t4 = v_load(ptr+6);
+    return t1 + t2 + t3 + t4 + c;
+}
+
+// 32 >> 64f: the integer v_dotprod_fast result converted to double lanes
+#if CV_SIMD128_64F
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod_fast(a, b)); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) // same, plus accumulator c added in double precision
+{ return v_dotprod_expand_fast(a, b) + c; }
+#endif
+
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3) // result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4); // start with the v[0]*m0 term
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4); // accumulate scalar * vector terms
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
+    return v_float32x4(res);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a) // result = v[0]*m0 + v[1]*m1 + v[2]*m2 + a; v[3] is not used
+{
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
+    return v_float32x4(res) + a;
+}
+
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) /* v_mul_expand: full-width products of a and b, split into two wide vectors */ \
+inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
+{ \
+    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; /* all vl widened products, in lane order */ \
+    vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
+    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); /* products of the first hvl lanes */ \
+    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); /* products of the remaining lanes */ \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
+
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) // widening multiply to 32 bit, then narrowing arithmetic shift right by 16
+{
+    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) // unsigned variant: narrowing logical shift right by 16
+{
+    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
+}
+
+
+//////// Saturating Multiply ////////
+
+#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _wTpvec wideFirst, wideSecond; /* the two wide product vectors filled by v_mul_expand */ \
+    v_mul_expand(a, b, wideFirst, wideSecond); \
+    return v_pack(wideFirst, wideSecond); \
+} \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    /* delegate to the binary operator defined just above */ \
+    return a = a * b; \
+}
+
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
+
+
+inline void v_cleanup() {} // intentionally empty in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+
+}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv071.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv071.hpp
new file mode 100644
index 0000000..2bdc622
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_rvv071.hpp
@@ -0,0 +1,2545 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.
+
+#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
+#define OPENCV_HAL_INTRIN_RISCVV_HPP
+
+#include <float.h>
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+//////////// Types ////////////
+struct v_uint8x16
+{   // 16 lanes of uchar held in one vuint8m1_t value.
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() {} // val is left uninitialized
+    explicit v_uint8x16(vuint8m1_t v) : val(v) {} // wrap an existing register value
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {   // stage the lane values in a local array, then vector-load all 16
+        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
+    }
+    uchar get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_u8m1_u8(val, 16);
+    }
+
+    vuint8m1_t val; // underlying vector value
+};
+
+struct v_int8x16
+{   // 16 lanes of schar held in one vint8m1_t value.
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() {} // val is left uninitialized
+    explicit v_int8x16(vint8m1_t v) : val(v) {} // wrap an existing register value
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {   // stage the lane values in a local array, then vector-load all 16
+        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
+        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
+    }
+    schar get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_i8m1_i8(val, 16);
+    }
+
+    vint8m1_t val; // underlying vector value
+};
+
+struct v_uint16x8
+{   // 8 lanes of ushort held in one vuint16m1_t value.
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() {} // val is left uninitialized
+    explicit v_uint16x8(vuint16m1_t v) : val(v) {} // wrap an existing register value
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {   // stage the lane values in a local array, then vector-load all 8
+        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
+    }
+    ushort get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_u16m1_u16(val, 8);
+    }
+
+    vuint16m1_t val; // underlying vector value
+};
+
+struct v_int16x8
+{   // 8 lanes of short held in one vint16m1_t value.
+    typedef short lane_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() {} // val is left uninitialized
+    explicit v_int16x8(vint16m1_t v) : val(v) {} // wrap an existing register value
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {   // stage the lane values in a local array, then vector-load all 8
+        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
+    }
+    short get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_i16m1_i16(val, 8);
+    }
+
+    vint16m1_t val; // underlying vector value
+};
+
+struct v_uint32x4
+{   // 4 lanes of unsigned held in one vuint32m1_t value.
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() {} // val is left uninitialized
+    explicit v_uint32x4(vuint32m1_t v) : val(v) {} // wrap an existing register value
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {   // stage the lane values in a local array, then vector-load all 4
+        unsigned v[] = {v0, v1, v2, v3};
+        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
+    }
+    unsigned get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_u32m1_u32(val, 4);
+    }
+
+    vuint32m1_t val; // underlying vector value
+};
+
+struct v_int32x4
+{   // 4 lanes of int held in one vint32m1_t value.
+    typedef int lane_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() {} // val is left uninitialized
+    explicit v_int32x4(vint32m1_t v) : val(v) {} // wrap an existing register value
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {   // stage the lane values in a local array, then vector-load all 4
+        int v[] = {v0, v1, v2, v3};
+        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
+    }
+    int get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_i32m1_i32(val, 4);
+    }
+    vint32m1_t val; // underlying vector value
+};
+
+struct v_float32x4
+{   // 4 lanes of float held in one vfloat32m1_t value.
+    typedef float lane_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() {} // val is left uninitialized
+    explicit v_float32x4(vfloat32m1_t v) : val(v) {} // wrap an existing register value
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {   // stage the lane values in a local array, then vector-load all 4
+        float v[] = {v0, v1, v2, v3};
+        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
+    }
+    float get0() const
+    {   // scalar copy of the first lane
+        return vfmv_f_s_f32m1_f32(val, 4);
+    }
+    vfloat32m1_t val; // underlying vector value
+};
+
+struct v_uint64x2
+{   // 2 lanes of uint64 held in one vuint64m1_t value.
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() {} // val is left uninitialized
+    explicit v_uint64x2(vuint64m1_t v) : val(v) {} // wrap an existing register value
+    v_uint64x2(uint64 v0, uint64 v1)
+    {   // stage the lane values in a local array, then vector-load both
+        uint64 v[] = {v0, v1};
+        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
+    }
+    uint64 get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_u64m1_u64(val, 2);
+    }
+    vuint64m1_t val; // underlying vector value
+};
+
+struct v_int64x2
+{   // 2 lanes of int64 held in one vint64m1_t value.
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() {} // val is left uninitialized
+    explicit v_int64x2(vint64m1_t v) : val(v) {} // wrap an existing register value
+    v_int64x2(int64 v0, int64 v1)
+    {   // stage the lane values in a local array, then vector-load both
+        int64 v[] = {v0, v1};
+        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
+    }
+    int64 get0() const
+    {   // scalar copy of the first lane
+        return vmv_x_s_i64m1_i64(val, 2);
+    }
+    vint64m1_t val; // underlying vector value
+};
+
+struct v_float64x2
+{   // 2 lanes of double held in one vfloat64m1_t value.
+    typedef double lane_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() {} // val is left uninitialized
+    explicit v_float64x2(vfloat64m1_t v) : val(v) {} // wrap an existing register value
+    v_float64x2(double v0, double v1)
+    {   // stage the lane values in a local array, then vector-load both
+        double v[] = {v0, v1};
+        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
+    }
+    double get0() const
+    {   // scalar copy of the first lane
+        return vfmv_f_s_f64m1_f64(val, 2);
+    }
+    vfloat64m1_t val; // underlying vector value
+};
+
+#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
+inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
+inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
+inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
+inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
+inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
+inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
+inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
+inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
+inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
+inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
+inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }
+
+
+OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
+OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
+OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
+OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
+OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
+OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
+OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
+#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
+inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); }     \
+inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
+OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
+inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } // built from a {0} vector literal
+inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } // broadcast v to 4 lanes
+
+inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } // broadcast 0 to 2 lanes
+inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); } // broadcast v to 2 lanes
+
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) /* emits the binary operator and its compound-assignment form; both call intrin(a.val, b.val, num) */ \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val, num); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4) // NOTE(review): saturating vsadd here, while v_uint32x4 below uses plain vadd - confirm intended
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4) // NOTE(review): same question, vssub vs. vsub
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) // NOTE(review): saturating, unlike v_uint64x2 below - confirm intended
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2) // NOTE(review): saturating, unlike v_uint64x2 below - confirm intended
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
+inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
+{   // Lane-wise a / b over the 4 float lanes.
+    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
+}
+inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
+{   // In-place lane-wise division; returns a.
+    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
+    return a;
+}
+
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
+inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
+{   // Lane-wise a / b over the 2 double lanes.
+    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
+}
+inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
+{   // In-place lane-wise division; returns a.
+    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
+    return a;
+}
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) /* emits func(a, b) returning _Tpvec(intrin(a.val, b.val, num)) */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val, num)); \
+}
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{   // Lane-wise square root of the 4 float lanes.
+    return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
+}
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{   // 1 / sqrt(x) per lane: take the root, then reverse-divide the scalar 1 by it.
+    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
+}
+
+inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
+{
+    // sqrt(a*a + b*b) per lane: square a, multiply-accumulate b*b onto it, then take the root.
+    return v_sqrt(v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4)));
+}
+
+inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
+{   // a*a + b*b per lane (no square root).
+    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{   // a * b + c per lane; c is passed as the accumulator operand of vfmacc.
+    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
+}
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{   // a * b + c per lane; c is passed as the accumulator operand of vmacc.
+    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
+}
+
+inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{   // Forwards to v_fma with the same arguments.
+    return v_fma(a, b, c);
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{   // Forwards to v_fma with the same arguments.
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{   // Linear combination v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, built from scalar-times-vector ops.
+    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
+    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += v[1] * m1
+    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += v[2] * m2
+    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);        // res += v[3] * m3
+    return v_float32x4(res);
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{   // v[0]*m0 + v[1]*m1 + v[2]*m2 + a; v[3] is not used.
+    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
+    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += v[1] * m1
+    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += v[2] * m2
+    res = vfadd_vv_f32m1(res, a.val, 4);                    // res += a
+    return v_float32x4(res);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{   // Lane-wise square root of the 2 double lanes.
+    return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
+}
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{   // 1 / sqrt(x) per lane: take the root, then reverse-divide the scalar 1 by it.
+    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
+}
+
+inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
+{
+    // sqrt(a*a + b*b) per lane: square a, multiply-accumulate b*b onto it, then take the root.
+    return v_sqrt(v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2)));
+}
+
+inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
+{   // a*a + b*b per lane (no square root).
+    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{   // a * b + c per lane; c is passed as the accumulator operand of vfmacc.
+    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
+}
+
+inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{   // Forwards to v_fma with the same arguments.
+    return v_fma(a, b, c);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
+    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
+    inline _Tpvec operator ~ (const _Tpvec & a) \
+    { \
+        return _Tpvec(vnot_v_##suffix(a.val, num)); \
+    }
+
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16,  i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8,  i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4,  i32m1, 4)
+OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2,  i64m1, 2)
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
+inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
+{ \
+    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
+} \
+inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
+{ \
+    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)
+
+inline v_float32x4 operator ~ (const v_float32x4& a)
+{   // Casts the lanes to int32, applies vnot, and casts the result back to float lanes.
+    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
+inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
+{ \
+    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
+} \
+inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
+{ \
+    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
+    return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
+OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)
+
+inline v_float64x2 operator ~ (const v_float64x2& a)
+{   // Casts the lanes to int64, applies vnot, and casts the result back to double lanes.
+    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
+}
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{   // Signed multiply-high over the 8 int16 lanes.
+    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
+}
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{   // Unsigned multiply-high over the 8 uint16 lanes.
+    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
+}
+
+//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
+//inline _Tpuvec v_abs(const _Tpsvec& a) {    \
+//    E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);
+
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
+//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)
+
+inline v_uint32x4 v_abs(v_int32x4 x)
+{   // Absolute value, returned as unsigned lanes.
+    vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); // lanes where x < 0
+    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); // masked 0 - x; x.val is the masked-off operand
+}
+
+inline v_uint16x8 v_abs(v_int16x8 x)
+{   // Absolute value, returned as unsigned lanes.
+    vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); // lanes where x < 0
+    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); // masked 0 - x; x.val is the masked-off operand
+}
+
+inline v_uint8x16 v_abs(v_int8x16 x)
+{   // Absolute value, returned as unsigned lanes.
+    vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); // lanes where x < 0
+    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); // masked 0 - x; x.val is the masked-off operand
+}
+
+inline v_float32x4 v_abs(v_float32x4 x)
+{   // Sign-injection XOR of x with itself: the resulting sign bit is always clear.
+    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
+}
+
+inline v_float64x2 v_abs(v_float64x2 x)
+{   // Sign-injection XOR of x with itself: the resulting sign bit is always clear.
+    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t diff = vfsub_vv_f32m1(a.val, b.val, 4);  // a - b per lane
+    return (v_float32x4)vfsgnjx_vv_f32m1(diff, diff, 4);  // sign XOR with itself clears the sign bit
+}
+
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    vfloat64m1_t diff = vfsub_vv_f64m1(a.val, b.val, 2);  // a - b per lane
+    return (v_float64x2)vfsgnjx_vv_f64m1(diff, diff, 2);  // sign XOR with itself clears the sign bit
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) /* unsigned v_absdiff computed as max(a, b) - min(a, b) */ \
+inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){    \
+    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num);    \
+    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num);    \
+    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){ // signed result: max(a, b) - min(a, b)
+    vint8m1_t hi = vmax_vv_i8m1(a.val, b.val, 16);
+    vint8m1_t lo = vmin_vv_i8m1(a.val, b.val, 16);
+    return v_int8x16(vssub_vv_i8m1(hi, lo, 16)); // saturating subtract
+}
+inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ // signed result: max(a, b) - min(a, b)
+    vint16m1_t hi = vmax_vv_i16m1(a.val, b.val, 8);
+    vint16m1_t lo = vmin_vv_i16m1(a.val, b.val, 8);
+    return v_int16x8(vssub_vv_i16m1(hi, lo, 8)); // saturating subtract
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) /* signed inputs, unsigned result: max(a, b) - min(a, b) with plain vsub, cast to the unsigned type */ \
+inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){    \
+     vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
+     vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
+    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num));    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    // Widening signed multiply of all 16 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vint16m2_t wide = vwmul_vv_i16m2(a.val, b.val, 16);
+    c.val = vget_i16m2_i16m1(wide, 0);
+    d.val = vget_i16m2_i16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    // Widening unsigned multiply of all 16 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vuint16m2_t wide = vwmulu_vv_u16m2(a.val, b.val, 16);
+    c.val = vget_u16m2_u16m1(wide, 0);
+    d.val = vget_u16m2_u16m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    // Widening signed multiply of all 8 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vint32m2_t wide = vwmul_vv_i32m2(a.val, b.val, 8);
+    c.val = vget_i32m2_i32m1(wide, 0);
+    d.val = vget_i32m2_i32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    // Widening unsigned multiply of all 8 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vuint32m2_t wide = vwmulu_vv_u32m2(a.val, b.val, 8);
+    c.val = vget_u32m2_u32m1(wide, 0);
+    d.val = vget_u32m2_u32m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
+                         v_int64x2& c, v_int64x2& d)
+{
+    // Widening signed multiply of all 4 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vint64m2_t wide = vwmul_vv_i64m2(a.val, b.val, 4);
+    c.val = vget_i64m2_i64m1(wide, 0);
+    d.val = vget_i64m2_i64m1(wide, 1);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    // Widening unsigned multiply of all 4 lanes; c takes part 0 of the m2 result, d takes part 1.
+    vuint64m2_t wide = vwmulu_vv_u64m2(a.val, b.val, 4);
+    c.val = vget_u64m2_u64m1(wide, 0);
+    d.val = vget_u64m2_u64m1(wide, 1);
+}
+
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
+//////// Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // Widen-multiply, gather even-indexed products first and odd-indexed second, then add the two parts.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint32m2_t split = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(split, 0), vget_i32m2_i32m1(split, 1), 4));
+}
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // Widen-multiply, gather even/odd products apart, add the two parts, then add c lane-wise.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint32m2_t split = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(split, 0),vget_i32m2_i32m1(split, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // Widen-multiply, gather even-indexed products first and odd-indexed second, then add the two parts.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    vint64m2_t split = vrgather_vv_i64m2(prod, (vuint64m2_t){0, 2, 1, 3}, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(split, 0), vget_i64m2_i64m1(split, 1), 2));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    // Widen-multiply, gather even/odd products apart, add the two parts, then add c lane-wise.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    vint64m2_t split = vrgather_vv_i64m2(prod, (vuint64m2_t){0, 2, 1, 3}, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(split, 0), vget_i64m2_i64m1(split, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // The gather orders the 16 products by (index mod 4); a widening add then folds the two parts,
+    // and a final add folds the two parts of that result.
+    vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    vuint16m2_t grouped = vrgather_vv_u16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    vuint32m2_t pairs = vwaddu_vv_u32m2(vget_u16m2_u16m1(grouped, 0), vget_u16m2_u16m1(grouped, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(pairs, 0), vget_u32m2_u32m1(pairs, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
+                                   const v_uint32x4& c)
+{
+    // The gather orders the 16 products by (index mod 4); a widening add then folds the two parts,
+    // a second add folds the two parts of that result, and c is added lane-wise last.
+    vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    vuint16m2_t grouped = vrgather_vv_u16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    vuint32m2_t pairs = vwaddu_vv_u32m2(vget_u16m2_u16m1(grouped, 0), vget_u16m2_u16m1(grouped, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(pairs, 0), vget_u32m2_u32m1(pairs, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    // The gather orders the 16 products by (index mod 4); a widening add then folds the two parts,
+    // and a final add folds the two parts of that result.
+    vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    vint16m2_t grouped = vrgather_vv_i16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    vint32m2_t pairs = vwadd_vv_i32m2(vget_i16m2_i16m1(grouped, 0), vget_i16m2_i16m1(grouped, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(pairs, 0), vget_i32m2_i32m1(pairs, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
+                                   const v_int32x4& c)
+{
+    // The gather orders the 16 products by (index mod 4); a widening add then folds the two parts,
+    // a second add folds the two parts of that result, and c is added lane-wise last.
+    vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    vint16m2_t grouped = vrgather_vv_i16m2(prod, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
+    vint32m2_t pairs = vwadd_vv_i32m2(vget_i16m2_i16m1(grouped, 0), vget_i16m2_i16m1(grouped, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(pairs, 0), vget_i32m2_i32m1(pairs, 1), 4), c.val, 4));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // The gather reorders the 8 products as {0, 4, 1, 5 | 2, 6, 3, 7}; a widening add folds the two parts,
+    // and a final add folds the two parts of that result.
+    vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    vuint32m2_t grouped = vrgather_vv_u32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    vuint64m2_t pairs = vwaddu_vv_u64m2(vget_u32m2_u32m1(grouped, 0), vget_u32m2_u32m1(grouped, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(pairs, 0), vget_u64m2_u64m1(pairs, 1), 2));
+}
+
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
+                                   const v_uint64x2& c)
+{
+    // The gather reorders the 8 products as {0, 4, 1, 5 | 2, 6, 3, 7}; a widening add folds the two parts,
+    // a second add folds the two parts of that result, and c is added lane-wise last.
+    vuint32m2_t prod = vwmulu_vv_u32m2(a.val, b.val, 8);
+    vuint32m2_t grouped = vrgather_vv_u32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    vuint64m2_t pairs = vwaddu_vv_u64m2(vget_u32m2_u32m1(grouped, 0), vget_u32m2_u32m1(grouped, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(pairs, 0), vget_u64m2_u64m1(pairs, 1), 2), c.val, 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    // The gather reorders the 8 products as {0, 4, 1, 5 | 2, 6, 3, 7}; a widening add folds the two parts,
+    // and a final add folds the two parts of that result.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint32m2_t grouped = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    vint64m2_t pairs = vwadd_vv_i64m2(vget_i32m2_i32m1(grouped, 0), vget_i32m2_i32m1(grouped, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(pairs, 0), vget_i64m2_i64m1(pairs, 1), 2));
+}
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
+                                   const v_int64x2& c)
+{
+    // The gather reorders the 8 products as {0, 4, 1, 5 | 2, 6, 3, 7}; a widening add folds the two parts,
+    // a second add folds the two parts of that result, and c is added lane-wise last.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint32m2_t grouped = vrgather_vv_i32m2(prod, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
+    vint64m2_t pairs = vwadd_vv_i64m2(vget_i32m2_i32m1(grouped, 0), vget_i32m2_i32m1(grouped, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(pairs, 0), vget_i64m2_i64m1(pairs, 1), 2), c.val, 2));
+}
+
+//////// Fast Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    // No gather step: the two parts of the widened product group are added to each other directly.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4));
+}
+
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    // No gather step: the two parts of the widened product group are added directly, then c is added.
+    vint32m2_t prod = vwmul_vv_i32m2(a.val, b.val, 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(prod, 0), vget_i32m2_i32m1(prod, 1), 4), c.val, 4));
+}
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    // No gather step: the two parts of the widened product group are added to each other directly.
+    vint64m2_t prod = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(prod, 0), vget_i64m2_i64m1(prod, 1), 2));
+}
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{   // Widening products of the 4 int32 lanes; the two parts of the product group are added, then c is added.
+    // vl now matches the operand lane counts: 4 for the multiply, 2 for the 64-bit adds (previously 8 and 4).
+    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
+}
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // No gather step: the two parts of the 16-bit product group are folded with a widening add,
+    // then the two parts of that 32-bit result are added.
+    vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    vuint32m2_t folded = vwaddu_vv_u32m2(vget_u16m2_u16m1(prod, 0), vget_u16m2_u16m1(prod, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(folded, 0), vget_u32m2_u32m1(folded, 1), 4));
+}
+
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    // No gather step: the two parts of the 16-bit product group are folded with a widening add,
+    // the two parts of that 32-bit result are added, and c is added lane-wise last.
+    vuint16m2_t prod = vwmulu_vv_u16m2(a.val, b.val, 16);
+    vuint32m2_t folded = vwaddu_vv_u32m2(vget_u16m2_u16m1(prod, 0), vget_u16m2_u16m1(prod, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(folded, 0), vget_u32m2_u32m1(folded, 1), 4), c.val, 4));
+}
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    // No gather step: the two parts of the 16-bit product group are folded with a widening add,
+    // then the two parts of that 32-bit result are added.
+    vint16m2_t prod = vwmul_vv_i16m2(a.val, b.val, 16);
+    vint32m2_t folded = vwadd_vv_i32m2(vget_i16m2_i16m1(prod, 0), vget_i16m2_i16m1(prod, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(folded, 0), vget_i32m2_i32m1(folded, 1), 4));
+}
+// Fast 8 >> 32 signed dot product with accumulator c added to the folded sums.
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    vint16m2_t prod16 = vwmul_vv_i16m2(a.val, b.val, 16);
+    vint32m2_t sum32 = vwadd_vv_i32m2(vget_i16m2_i16m1(prod16, 0), vget_i16m2_i16m1(prod16, 1), 8);
+    vint32m1_t folded = vadd_vv_i32m1(vget_i32m2_i32m1(sum32, 0), vget_i32m2_i32m1(sum32, 1), 4);
+    return v_int32x4(vadd_vv_i32m1(folded, c.val, 4));
+}
+
+// 16 >> 64
+// Fast 16 >> 64 unsigned dot product: widening multiply to 32 bits, widening add
+// of the two 32-bit halves to 64 bits, then a plain add of the two 64-bit halves.
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vuint32m2_t prod32 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    vuint64m2_t sum64 = vwaddu_vv_u64m2(vget_u32m2_u32m1(prod32, 0), vget_u32m2_u32m1(prod32, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(sum64, 0), vget_u64m2_u64m1(sum64, 1), 2));
+}
+// Fast 16 >> 64 unsigned dot product with accumulator c added to the folded sums.
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    vuint32m2_t prod32 = vwmulu_vv_u32m2(a.val, b.val, 8);
+    vuint64m2_t sum64 = vwaddu_vv_u64m2(vget_u32m2_u32m1(prod32, 0), vget_u32m2_u32m1(prod32, 1), 4);
+    vuint64m1_t folded = vadd_vv_u64m1(vget_u64m2_u64m1(sum64, 0), vget_u64m2_u64m1(sum64, 1), 2);
+    return v_uint64x2(vadd_vv_u64m1(folded, c.val, 2));
+}
+
+// Fast 16 >> 64 signed dot product: widening multiply to 32 bits, widening add
+// of the two 32-bit halves to 64 bits, then a plain add of the two 64-bit halves.
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    vint32m2_t prod32 = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint64m2_t sum64 = vwadd_vv_i64m2(vget_i32m2_i32m1(prod32, 0), vget_i32m2_i32m1(prod32, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(sum64, 0), vget_i64m2_i64m1(sum64, 1), 2));
+}
+// Fast 16 >> 64 signed dot product with accumulator c added to the folded sums.
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    vint32m2_t prod32 = vwmul_vv_i32m2(a.val, b.val, 8);
+    vint64m2_t sum64 = vwadd_vv_i64m2(vget_i32m2_i32m1(prod32, 0), vget_i32m2_i32m1(prod32, 1), 4);
+    vint64m1_t folded = vadd_vv_i64m1(vget_i64m2_i64m1(sum64, 0), vget_i64m2_i64m1(sum64, 1), 2);
+    return v_int64x2(vadd_vv_i64m1(folded, c.val, 2));
+}
+
+
+// Generates v_reduce_<func> for v_<_Tpvec>x<num> using a widening reduction.
+//   _Tpvec2 / len : vector type stem and intrinsic suffix of the wider accumulator
+//   scalartype    : return type of the generated function
+//   intrin        : reduction intrinsic applied to all `num` lanes of a.val
+// The accumulator starts as a zero splat, `intrin` folds a.val into it, and the
+// scalar result is read back with vmv_x_s.
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
+    val = intrin(val, a.val, val, num);    \
+    return vmv_x_s_##len##m1_##len(val, num);    \
+}
+
+
+// Generates v_reduce_<func> for v_<_Tpvec>x<num> using the same-width reduction
+// intrinsic v<funcu>_vs_<_Tpvec2>m1_<_Tpvec2>m1.
+// a.val is passed both as the vector operand and as the scalar-seed operand, so the
+// result depends only on the lanes of a; the zero splat cast from i8m1 only supplies
+// an initial destination value (NOTE(review): presumably the merge operand - confirm
+// against the toolchain's intrinsic signature).
+// The result is read as element 0 of the returned vector and converted to scalartype.
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
+inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
+{\
+    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
+    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num);    \
+    return val[0];    \
+}
+// v_reduce_sum for 8/16/32-bit integer vectors: each sum is accumulated in the
+// next wider element type and returned as int (signed) or unsigned (unsigned).
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
+// Sum of the 4 float lanes of a, accumulated into a zero-initialised vector.
+inline float v_reduce_sum(const v_float32x4& a)
+{
+    vfloat32m1_t acc = vfmv_v_f_f32m1(0.0, 4);
+    acc = vfredsum_vs_f32m1_f32m1(acc, a.val, acc, 4);
+    return vfmv_f_s_f32m1_f32(acc, 4);
+}
+// Sum of the 2 double lanes of a, accumulated into a zero-initialised vector.
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    vfloat64m1_t acc = vfmv_v_f_f64m1(0.0, 2);
+    acc = vfredsum_vs_f64m1_f64m1(acc, a.val, acc, 2);
+    return vfmv_f_s_f64m1_f64(acc, 2);
+}
+// Sum of the two unsigned 64-bit lanes, extracted individually and added as scalars.
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)
+         + vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2);
+}
+
+// Sum of the two signed 64-bit lanes, extracted individually and added as scalars.
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)
+         + vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2);
+}
+
+// Instantiates v_reduce_<func> for the signed, unsigned and float32 vector types
+// listed below. The intrinsic stem is red<func> for signed types, red<func>u for
+// unsigned types and fred<func> for float32. Used here for max and min.
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4)    \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)
+
+// Reduces each of the four inputs to its lane sum and returns the four sums as
+// lanes 0..3 (sum(a), sum(b), sum(c), sum(d)).
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    vfloat32m1_t sumA = vfmv_v_f_f32m1(0.0, 4);
+    sumA = vfredsum_vs_f32m1_f32m1(sumA, a.val, sumA, 4);
+
+    vfloat32m1_t sumB = vfmv_v_f_f32m1(0.0, 4);
+    sumB = vfredsum_vs_f32m1_f32m1(sumB, b.val, sumB, 4);
+
+    vfloat32m1_t sumC = vfmv_v_f_f32m1(0.0, 4);
+    sumC = vfredsum_vs_f32m1_f32m1(sumC, c.val, sumC, 4);
+
+    vfloat32m1_t sumD = vfmv_v_f_f32m1(0.0, 4);
+    sumD = vfredsum_vs_f32m1_f32m1(sumD, d.val, sumD, 4);
+
+    // Element 0 of each reduction result carries the scalar sum.
+    return v_float32x4(sumA[0], sumB[0], sumC[0], sumD[0]);
+}
+
+// Sum of absolute differences over the 4 float lanes of a and b.
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    vfloat32m1_t diff = vfsub_vv_f32m1(a.val, b.val, 4);
+    vbool32_t negative = vmflt_vf_f32m1_b32(diff, 0, 4);
+    // Masked reverse-subtract (0 - diff) is applied only to the negative lanes.
+    vfloat32m1_t absDiff = vfrsub_vf_f32m1_m(negative, diff, diff, 0, 4);
+    vfloat32m1_t acc = vfmv_v_f_f32m1(0.0, 4);
+    acc = vfredsum_vs_f32m1_f32m1(acc, absDiff, acc, 4);
+    return acc[0];
+}
+
+// Generates an integer v_reduce_sad(a, b): v_absdiff(a, b) is stored in the
+// vector type _Tpvec2 and then summed with v_reduce_sum; the result is returned as unsigned.
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
+inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){    \
+    _Tpvec2 x = v_absdiff(a, b);    \
+    return v_reduce_sum(x);    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
+
+// Generates the six lane-wise comparison operators (==, !=, <, >, <=, >=) for an
+// integer vector type.
+//   _Tp : intrinsic type suffix (e.g. i8m1), _T : mask width used in vbool<_T>_t,
+//   num : lane count, uv : infix completing vmslt/vmsle (`_vv_` signed, `u_vv_` unsigned).
+// Each operator builds a mask with the compare intrinsic, then merges -1 into a zero
+// splat under that mask. `>` and `>=` reuse the less-than / less-or-equal compares with
+// the operands swapped.
+#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num);    \
+    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
+} \
+
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1,  8, 16, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
+OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)
+
+//TODO: ==
+// Lane-wise a == b for float32x4; -1 is merged into a zero splat under the compare
+// mask and the integer result is reinterpreted as float lanes.
+inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Lane-wise a != b for float32x4; -1 is merged into a zero splat under the compare mask.
+inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmfne_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Lane-wise a < b for float32x4; -1 is merged into a zero splat under the compare mask.
+inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmflt_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Lane-wise a <= b for float32x4; -1 is merged into a zero splat under the compare mask.
+inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmfle_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Lane-wise a > b for float32x4; -1 is merged into a zero splat under the compare mask.
+inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Lane-wise a >= b for float32x4; -1 is merged into a zero splat under the compare mask.
+inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
+{
+    vbool32_t cmp = vmfge_vv_f32m1_b32(a.val, b.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(cmp, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+// Per-lane "ordered" test of a against itself (vmford); -1 is merged into a zero
+// splat for lanes where the mask is set.
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    vbool32_t ordered = vmford_vv_f32m1_b32(a.val, a.val, 4);
+    vint32m1_t zeros = vmv_v_x_i32m1(0, 4);
+    vint32m1_t bits = vmerge_vxm_i32m1(ordered, zeros, -1, 4);
+    return v_float32x4((vfloat32m1_t)bits);
+}
+
+//TODO: ==
+// Lane-wise a == b for float64x2; -1 is merged into a zero splat under the compare
+// mask and the integer result is reinterpreted as double lanes.
+inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Lane-wise a != b for float64x2; -1 is merged into a zero splat under the compare mask.
+inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmfne_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Lane-wise a < b for float64x2; -1 is merged into a zero splat under the compare mask.
+inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmflt_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Lane-wise a <= b for float64x2; -1 is merged into a zero splat under the compare mask.
+inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmfle_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Lane-wise a > b for float64x2; -1 is merged into a zero splat under the compare mask.
+inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Lane-wise a >= b for float64x2; -1 is merged into a zero splat under the compare mask.
+inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
+{
+    vbool64_t cmp = vmfge_vv_f64m1_b64(a.val, b.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(cmp, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Per-lane "ordered" test of a against itself (vmford); -1 is merged into a zero
+// splat for lanes where the mask is set.
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    vbool64_t ordered = vmford_vv_f64m1_b64(a.val, a.val, 2);
+    vint64m1_t zeros = vmv_v_x_i64m1(0, 2);
+    vint64m1_t bits = vmerge_vxm_i64m1(ordered, zeros, -1, 2);
+    return v_float64x2((vfloat64m1_t)bits);
+}
+// Generates v_transpose4x4 for 32-bit x 4 vectors (uint, int, float).
+// The four input rows a0..a3 are placed into one m4 register group, permuted with a
+// single vrgather using the index table {0,4,8,12, 1,5,9,13, ...}, and the group is
+// split back into b0..b3. With that table, output row k collects element k of each
+// input row.
+#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
+inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
+                         const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
+                         v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
+                         v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
+{ \
+    v##_Tp##32m4_t val = vundefined_##_T##m4();    \
+    val = vset_##_T##m4(val, 0, a0.val);    \
+    val = vset_##_T##m4(val, 1, a1.val);    \
+    val = vset_##_T##m4(val, 2, a2.val);    \
+    val = vset_##_T##m4(val, 3, a3.val);   \
+    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);    \
+    b0.val = vget_##_T##m4_##_T##m1(val, 0);   \
+    b1.val = vget_##_T##m4_##_T##m1(val, 1);   \
+    b2.val = vget_##_T##m4_##_T##m1(val, 2);   \
+    b3.val = vget_##_T##m4_##_T##m1(val, 3);   \
+}
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
+OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)
+
+
+// Generates left-shift operations for one vector type: operator << with a run-time
+// count and v_shl<n> with a compile-time count. Both call vsll_vx on all `num` lanes.
+// The `suffix` parameter is not used in the expansion.
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
+inline _Tpvec operator << (const _Tpvec& a, int n) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
+{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }
+
+// Generates right-shift operations for one vector type:
+//   operator >> (a, n) : shift every lane right by the run-time count n
+//   v_shr<n>(a)        : same, with a compile-time count
+//   v_rshr<n>(a)       : rounding shift - adds 2^(n-1) to every lane, then shifts by n
+// `intric` is the shift instruction stem supplied by the instantiation (srl or sra).
+// The rounding constant is formed in 64 bits so that v_rshr<n> with n >= 32 (reachable
+// for 64-bit lanes) does not overflow a plain int shift; it is then converted to the
+// scalar operand type of vadd_vx exactly as the int constant was.
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
+inline _Tpvec operator >> (const _Tpvec& a, int n) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
+template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
+template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
+{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, (uint64)1 << (n - 1), num), n, num))); }
+
+// trade efficiency for convenience
+// Instantiates both the left-shift and right-shift families for v_<suffix>x<num>.
+// `intrin` is the right-shift stem: srl for unsigned types, sra for signed types.
+#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
+OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)
+
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
+OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
+
+// Disabled (never compiled): identity index-list initializers for 2/4/8/16 lanes.
+#if 0
+#define VUP4(n) {0, 1, 2, 3}
+#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
+#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+#define VUP2(n) {0, 1}
+#endif
+// Generates the lane-rotation helpers for one vector type.
+//   suffix : vector type stem (e.g. vuint8), _T : intrinsic suffix (e.g. u8)
+//   num    : lanes in one register, num2 : lanes in the m2 pair (2 * num)
+//   vmv    : splat intrinsic stem used to build a zero vector, len : mask type suffix
+// Single-operand forms:
+//   v_rotate_left<n>(a)  slides a up by n lanes into a zero vector
+//   v_rotate_right<n>(a) slides a down by n lanes
+// Two-operand forms place both inputs in an m2 group, slide the group by n lanes and
+// return one half: {a,b} slid down (half 0) for rotate_right, {b,a} slid up (half 1)
+// for rotate_left.
+// The <0> specialisations of v_rotate_left return a unchanged.
+#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{    \
+    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
+        tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
+        return _Tpvec(tmp);\
+} \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{     \
+        return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    tmp = vset_##_T##m2(tmp, 0, a.val);          \
+    tmp = vset_##_T##m2(tmp, 1, b.val);          \
+        tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
+        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    suffix##m2_t tmp = vundefined_##_T##m2();    \
+    tmp = vset_##_T##m2(tmp, 0, b.val);    \
+    tmp = vset_##_T##m2(tmp, 1, a.val);    \
+        tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
+        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    CV_UNUSED(b); return a; \
+}
+
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
+OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
+
+// Generates the load/store family for one vector type.
+//   _Tp : scalar element type, _Tp2 : register type stem, len : intrinsic suffix,
+//   hnum : lanes in half a vector, num : lanes in a full vector.
+// v_load_halves reads one 64-bit word from each pointer through an alignment-1
+// typedef and reinterprets the pair as the target vector.
+// v_load_low / v_store_low transfer only the first hnum lanes; v_store_high slides the
+// upper half down before storing hnum lanes.
+// The aligned, unaligned, nocache and StoreMode variants all expand to the same
+// vle/vse call with the full lane count.
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+  typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
+  vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
+    return _Tpvec(_Tp2##_t(tmp)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vle_v_##len(ptr, num)); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse_v_##len(ptr, a.val, hnum);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+  _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num);    \
+  vse_v_##len(ptr, a0, hnum);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse_v_##len(ptr, a.val, num); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse_v_##len(ptr, a.val, num); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
+
+
+////////////// Lookup table access ////////////////////
+
+// Gathers 16 signed bytes: lane i is tab[idx[i]].
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{
+#if 1
+    // Scalar gather into an aligned staging buffer, then a single vector load.
+    schar CV_DECL_ALIGNED(32) gathered[16];
+    for (int lane = 0; lane < 16; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int8x16(vle_v_i8m1(gathered, 16));
+#else
+    int32xm4_t index32 = vlev_int32xm4(idx, 16);
+    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
+    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
+    return v_int8x16(vlxbv_i8m1(tab, index, 16));
+#endif
+}
+
+// Gathers 8 pairs of consecutive signed bytes: lanes 2p and 2p+1 are
+// tab[idx[p]] and tab[idx[p] + 1].
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{
+    schar CV_DECL_ALIGNED(32) gathered[16];
+    for (int p = 0; p < 8; ++p)
+    {
+        gathered[2 * p]     = tab[idx[p]];
+        gathered[2 * p + 1] = tab[idx[p] + 1];
+    }
+    return v_int8x16(vle_v_i8m1(gathered, 16));
+}
+// Gathers 4 runs of four consecutive signed bytes: lanes 4q..4q+3 are
+// tab[idx[q]] .. tab[idx[q] + 3].
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{
+    schar CV_DECL_ALIGNED(32) gathered[16];
+    for (int q = 0; q < 4; ++q)
+    {
+        for (int k = 0; k < 4; ++k)
+            gathered[4 * q + k] = tab[idx[q] + k];
+    }
+    return v_int8x16(vle_v_i8m1(gathered, 16));
+}
+
+// Unsigned 8-bit lookup: forwards to the schar overload and reinterprets the lanes.
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut((schar*)tab, idx));
+}
+// Unsigned 8-bit pair lookup: forwards to the schar overload and reinterprets the lanes.
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx));
+}
+// Unsigned 8-bit quad lookup: forwards to the schar overload and reinterprets the lanes.
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx)
+{
+    return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx));
+}
+
+// Gathers 8 signed 16-bit values: lane i is tab[idx[i]].
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{
+    short CV_DECL_ALIGNED(32) gathered[8];
+    for (int lane = 0; lane < 8; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int16x8(vle_v_i16m1(gathered, 8));
+}
+// Gathers 4 pairs of consecutive 16-bit values: lanes 2p and 2p+1 are
+// tab[idx[p]] and tab[idx[p] + 1].
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{
+    short CV_DECL_ALIGNED(32) gathered[8];
+    for (int p = 0; p < 4; ++p)
+    {
+        gathered[2 * p]     = tab[idx[p]];
+        gathered[2 * p + 1] = tab[idx[p] + 1];
+    }
+    return v_int16x8(vle_v_i16m1(gathered, 8));
+}
+// Gathers 2 runs of four consecutive 16-bit values: lanes 4q..4q+3 are
+// tab[idx[q]] .. tab[idx[q] + 3].
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{
+    short CV_DECL_ALIGNED(32) gathered[8];
+    for (int q = 0; q < 2; ++q)
+    {
+        for (int k = 0; k < 4; ++k)
+            gathered[4 * q + k] = tab[idx[q] + k];
+    }
+    return v_int16x8(vle_v_i16m1(gathered, 8));
+}
+// Unsigned 16-bit lookup: forwards to the short overload and reinterprets the lanes.
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut((short*)tab, idx));
+}
+// Unsigned 16-bit pair lookup: forwards to the short overload and reinterprets the lanes.
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx));
+}
+// Unsigned 16-bit quad lookup: forwards to the short overload and reinterprets the lanes.
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx)
+{
+    return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx));
+}
+
+// Gathers 4 signed 32-bit values: lane i is tab[idx[i]].
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    int CV_DECL_ALIGNED(32) gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_int32x4(vle_v_i32m1(gathered, 4));
+}
+// Gathers 2 pairs of consecutive 32-bit values: lanes 2p and 2p+1 are
+// tab[idx[p]] and tab[idx[p] + 1].
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{
+    int CV_DECL_ALIGNED(32) gathered[4];
+    for (int p = 0; p < 2; ++p)
+    {
+        gathered[2 * p]     = tab[idx[p]];
+        gathered[2 * p + 1] = tab[idx[p] + 1];
+    }
+    return v_int32x4(vle_v_i32m1(gathered, 4));
+}
+// Loads four consecutive 32-bit values starting at tab[idx[0]].
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{
+    const int* first = tab + idx[0];
+    return v_int32x4(vle_v_i32m1(first, 4));
+}
+// Unsigned 32-bit lookup: forwards to the int overload and reinterprets the lanes.
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut((int*)tab, idx));
+}
+// Unsigned 32-bit pair lookup: forwards to the int overload and reinterprets the lanes.
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx));
+}
+// Unsigned 32-bit quad lookup: forwards to the int overload and reinterprets the lanes.
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx)
+{
+    return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx));
+}
+
+// Gathers 2 signed 64-bit values: lanes are tab[idx[0]] and tab[idx[1]].
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+    vint64m1_t lanes = {tab[idx[0]], tab[idx[1]]};
+    return v_int64x2(lanes);
+}
+// Loads two consecutive signed 64-bit values starting at tab[idx[0]].
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{
+    const int64_t* first = tab + idx[0];
+    return v_int64x2(vle_v_i64m1(first, 2));
+}
+
+// Gathers 2 unsigned 64-bit values: lanes are tab[idx[0]] and tab[idx[1]].
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
+{
+    vuint64m1_t lanes = {tab[idx[0]], tab[idx[1]]};
+    return v_uint64x2(lanes);
+}
+// Loads two consecutive unsigned 64-bit values starting at tab[idx[0]].
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
+{
+    const uint64_t* first = tab + idx[0];
+    return v_uint64x2(vle_v_u64m1(first, 2));
+}
+
+// Gathers 4 floats: lane i is tab[idx[i]].
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) gathered[4];
+    for (int lane = 0; lane < 4; ++lane)
+        gathered[lane] = tab[idx[lane]];
+    return v_float32x4(vle_v_f32m1(gathered, 4));
+}
+// Gathers 2 pairs of consecutive floats: lanes 2p and 2p+1 are
+// tab[idx[p]] and tab[idx[p] + 1].
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
+{
+    float CV_DECL_ALIGNED(32) gathered[4];
+    for (int p = 0; p < 2; ++p)
+    {
+        gathered[2 * p]     = tab[idx[p]];
+        gathered[2 * p + 1] = tab[idx[p]+1];
+    }
+    return v_float32x4(vle_v_f32m1(gathered, 4));
+}
+// Loads four consecutive floats starting at tab[idx[0]].
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
+{
+    const float* first = tab + idx[0];
+    return v_float32x4(vle_v_f32m1(first, 4));
+}
+// Gathers 2 doubles: lanes are tab[idx[0]] and tab[idx[1]].
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{
+    vfloat64m1_t lanes = {tab[idx[0]], tab[idx[1]]};
+    return v_float64x2(lanes);
+}
+// Loads two consecutive doubles starting at tab[idx[0]].
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{
+    const double* first = tab + idx[0];
+    return v_float64x2(vle_v_f64m1(first, 2));
+}
+
+// Gathers 4 ints using indices held in a vector: lane i is tab[idxvec lane i].
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    const int i0 = idxvec.val[0];
+    const int i1 = idxvec.val[1];
+    const int i2 = idxvec.val[2];
+    const int i3 = idxvec.val[3];
+    int CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_int32x4(vle_v_i32m1(gathered, 4));
+}
+
+// Gathers 4 unsigned ints using indices held in a vector: lane i is tab[idxvec lane i].
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    const int i0 = idxvec.val[0];
+    const int i1 = idxvec.val[1];
+    const int i2 = idxvec.val[2];
+    const int i3 = idxvec.val[3];
+    unsigned CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_uint32x4(vle_v_u32m1(gathered, 4));
+}
+
+// Gathers 4 floats using indices held in a vector: lane i is tab[idxvec lane i].
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    const int i0 = idxvec.val[0];
+    const int i1 = idxvec.val[1];
+    const int i2 = idxvec.val[2];
+    const int i3 = idxvec.val[3];
+    float CV_DECL_ALIGNED(32) gathered[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
+    return v_float32x4(vle_v_f32m1(gathered, 4));
+}
+// Gathers 2 doubles using the first two index lanes of idxvec.
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    vfloat64m1_t lanes = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
+    return v_float64x2(lanes);
+}
+// Indexed load of float pairs: x receives tab[idx] and y receives tab[idx + 1] for each
+// index lane. The element indices are scaled by 4 and the second offset is 4 larger
+// (presumably byte offsets for 4-byte floats, as expected by vlxe).
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    vint32m1_t offsetX = vmul_vx_i32m1(idxvec.val, 4, 4);
+    vint32m1_t offsetY = vadd_vx_i32m1(offsetX, 4, 4);
+
+    x.val = vlxe_v_f32m1(tab, offsetX, 4);
+    y.val = vlxe_v_f32m1(tab, offsetY, 4);
+}
+
+// Scalar gather of double pairs using the first two index lanes:
+// x = {tab[i0], tab[i1]}, y = {tab[i0 + 1], tab[i1 + 1]}.
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+
+    const int i0 = lanes[0];
+    const int i1 = lanes[1];
+    x = v_float64x2(tab[i0], tab[i1]);
+    y = v_float64x2(tab[i0+1], tab[i1+1]);
+}
+
+// Generates the narrowing pack family from the wide type v_<_Tp2>x<num2> to the
+// narrow type v_<_Tp>x<num>.
+//   shr    : narrowing intrinsic stem used with shift 0 by v_pack / v_pack_store
+//   intrin : narrowing intrinsic stem used with shift n by v_rshr_pack / v_rshr_pack_store
+//   _Type  : scalar pointer type for the *_store variants
+// v_pack and v_rshr_pack join a and b into an m2 group and narrow all `num` lanes.
+// The *_store variants put a in the low half and a zero splat in the high half, narrow,
+// and store only the first num2 lanes.
+// v_pack_store contains an empty asm statement with a "memory" clobber before the
+// store; v_rshr_pack_store does not.
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
+inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
+}\
+template<int n> inline \
+v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
+{ \
+    v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, b.val);    \
+    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
+}\
+inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    asm("" ::: "memory");                                       \
+    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
+}\
+template<int n> inline \
+void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
+{ \
+    v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
+    tmp = vset_##_T2##m2(tmp, 0, a.val);    \
+    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
+    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
+}
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
+
+// pack boolean
+// Packs two 16-bit mask vectors into one 8-bit vector with a narrowing shift by 0
+// (vnsrl), with a in the low half and b in the high half of the source group.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vuint16m2_t wide = vundefined_u16m2();
+    wide = vset_u16m2(wide, 0, a.val);
+    wide = vset_u16m2(wide, 1, b.val);
+    return v_uint8x16(vnsrl_vx_u8m1(wide, 0, 16));
+}
+
+// Packs four 32-bit mask vectors (a, b, c, d in that order) into one 8-bit vector
+// through two narrowing shifts by 0: 32 -> 16 -> 8 bits.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    vuint32m4_t wide32 = vundefined_u32m4();
+    wide32 = vset_u32m4(wide32, 0, a.val);
+    wide32 = vset_u32m4(wide32, 1, b.val);
+    wide32 = vset_u32m4(wide32, 2, c.val);
+    wide32 = vset_u32m4(wide32, 3, d.val);
+
+    vuint16m2_t wide16 = vnsrl_vx_u16m2(wide32, 0, 16);
+    return v_uint8x16(vnsrl_vx_u8m1(wide16, 0, 16));
+}
+
+// Packs eight 64-bit mask vectors (a..h in that order) into one 8-bit vector
+// through three narrowing shifts by 0: 64 -> 32 -> 16 -> 8 bits.
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    vuint64m8_t wide64 = vundefined_u64m8();
+    wide64 = vset_u64m8(wide64, 0, a.val);
+    wide64 = vset_u64m8(wide64, 1, b.val);
+    wide64 = vset_u64m8(wide64, 2, c.val);
+    wide64 = vset_u64m8(wide64, 3, d.val);
+    wide64 = vset_u64m8(wide64, 4, e.val);
+    wide64 = vset_u64m8(wide64, 5, f.val);
+    wide64 = vset_u64m8(wide64, 6, g.val);
+    wide64 = vset_u64m8(wide64, 7, h.val);
+
+    vuint32m4_t wide32 = vnsrl_vx_u32m4(wide64, 0, 16);
+    vuint16m2_t wide16 = vnsrl_vx_u16m2(wide32, 0, 16);
+    return v_uint8x16(vnsrl_vx_u8m1(wide16, 0, 16));
+}
+
+//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
+//{ \
+//    int16xm2_u tmp;    \
+//    tmp.m1[0] = (vint16m1_t)a.val;    \
+//    tmp.m1[1] = (vint16m1_t)b.val;    \
+//    e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
+//    return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
+//}
+
+// Generates the signed-to-unsigned pack family from v_int<tp2>x<num2> to
+// v_uint<tp1>x<num1>.
+// Every variant first clamps negative lanes to 0 with vmax_vx(…, 0), reinterprets the
+// group as unsigned, and narrows with vnclipu (shift 0 for v_pack_u*, shift n for
+// v_rshr_pack_u*).
+// The *_store variants fill only the low half of the m2 group from a (the high half is
+// left as vundefined) and store just the first num2 narrowed lanes to ptr.
+#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
+inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1));    \
+} \
+inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2);    \
+} \
+template<int n> inline \
+v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
+    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1));    \
+} \
+template<int n> inline \
+void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
+{ \
+    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
+    tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
+    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
+    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1);    \
+    return vse_v_u##tp1##m1(ptr, val, num2);\
+}
+OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
+OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+// saturating multiply 8-bit, 16-bit
+#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) /* multiply operators: v_mul_expand, then v_pack */ \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        _Tpwvec wideLo, wideHi; /* both are outputs of v_mul_expand */ \
+        v_mul_expand(a, b, wideLo, wideHi); \
+        return v_pack(wideLo, wideHi); \
+    } \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
+    { return a = a * b; }
+
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4)
+OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+static const signed char popCountTable[256] = // popCountTable[i] = number of set bits in the byte value i
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+
+inline vuint8m1_t vcnt_u8(vuint8m1_t val) { // per-byte bit count: indexed load at val >> 1 plus bit 0
+    vuint8m1_t upperBits = vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16);
+    return upperBits + (val & 1);
+}
+
+// Per-lane bit count of an unsigned 8-bit vector.
+inline v_uint8x16 v_popcount(const v_uint8x16& a) {
+    vuint8m1_t counts = vcnt_u8(a.val);
+    return v_uint8x16(counts);
+}
+
+// Per-lane bit count of a signed 8-bit vector; the lanes are viewed as unsigned bytes first.
+inline v_uint8x16 v_popcount(const v_int8x16& a) {
+    vuint8m1_t counts = vcnt_u8((vuint8m1_t)a.val);
+    return v_uint8x16(counts);
+}
+
+// Per-lane bit count of a 16-bit vector: count bits per byte, then add each lane's two byte counts.
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices: even bytes to the front of register 0, odd bytes to the front of register 1.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint16m2_t laneSums = vwaddu_vv_u16m2(vget_u8m2_u8m1(byteCounts, 0), vget_u8m2_u8m1(byteCounts, 1), 8);
+    return v_uint16x8(vget_u16m2_u16m1(laneSums, 0));
+}
+
+// Per-lane bit count of a signed 16-bit vector: count bits per byte, then add each lane's two byte counts.
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices: even bytes to the front of register 0, odd bytes to the front of register 1.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint16m2_t laneSums = vwaddu_vv_u16m2(vget_u8m2_u8m1(byteCounts, 0), vget_u8m2_u8m1(byteCounts, 1), 8);
+    return v_uint16x8(vget_u16m2_u16m1(laneSums, 0));
+}
+
+// Per-lane bit count of a 32-bit vector: count bits per byte, then add the four byte counts of each lane.
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices group byte k of every lane together; 0xFF entries are presumably out-of-range indices that gather 0.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
+                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint16m2_t pairSums = vwaddu_vv_u16m2(vget_u8m2_u8m1(byteCounts, 0), vget_u8m2_u8m1(byteCounts, 1), 16);
+    vuint32m2_t laneSums = vwaddu_vv_u32m2(vget_u16m2_u16m1(pairSums, 0), vget_u16m2_u16m1(pairSums, 1), 8);
+    return v_uint32x4(vget_u32m2_u32m1(laneSums, 0));
+}
+
+// Per-lane bit count of a signed 32-bit vector: count bits per byte, then add the four byte counts of each lane.
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices group byte k of every lane together; 0xFF entries are presumably out-of-range indices that gather 0.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
+                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint16m2_t pairSums = vwaddu_vv_u16m2(vget_u8m2_u8m1(byteCounts, 0), vget_u8m2_u8m1(byteCounts, 1), 16);
+    vuint32m2_t laneSums = vwaddu_vv_u32m2(vget_u16m2_u16m1(pairSums, 0), vget_u16m2_u16m1(pairSums, 1), 8);
+    return v_uint32x4(vget_u32m2_u32m1(laneSums, 0));
+}
+
+// Per-lane bit count of a 64-bit vector: count bits per byte, then sum the eight byte counts of each lane.
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices: bytes 0..7 to the front of register 0, bytes 8..15 to the front of register 1.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
+                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
+    vuint8m1_t lane0Sum = zero;
+    vuint8m1_t lane1Sum = zero;
+    lane0Sum = vredsum_vs_u8m1_u8m1(lane0Sum, vget_u8m2_u8m1(byteCounts, 0), zero, 8);
+    lane1Sum = vredsum_vs_u8m1_u8m1(lane1Sum, vget_u8m2_u8m1(byteCounts, 1), zero, 8);
+    // The scalar read back from each reduction result becomes that lane's count.
+    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(lane0Sum, 8), (unsigned long)vmv_x_s_u8m1_u8(lane1Sum, 8));
+}
+
+// Per-lane bit count of a signed 64-bit vector: count bits per byte, then sum the eight byte counts of each lane.
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{
+    vuint8m2_t byteCounts = vset_u8m2(vundefined_u8m2(), 0, vcnt_u8((vuint8m1_t)a.val));
+    // Gather indices: bytes 0..7 to the front of register 0, bytes 8..15 to the front of register 1.
+    vuint64m2_t gatherIdx = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
+                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
+    byteCounts = vrgather_vv_u8m2(byteCounts, (vuint8m2_t)gatherIdx, 32);
+    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
+    vuint8m1_t lane0Sum = zero;
+    vuint8m1_t lane1Sum = zero;
+    lane0Sum = vredsum_vs_u8m1_u8m1(lane0Sum, vget_u8m2_u8m1(byteCounts, 0), zero, 8);
+    lane1Sum = vredsum_vs_u8m1_u8m1(lane1Sum, vget_u8m2_u8m1(byteCounts, 1), zero, 8);
+    // The scalar read back from each reduction result becomes that lane's count.
+    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(lane0Sum, 8), (unsigned long)vmv_x_s_u8m1_u8(lane1Sum, 8));
+}
+
+#define SMASK 1, 2, 4, 8, 16, 32, 64, 128 // bit weights 2^0..2^7 for eight consecutive lanes
+inline int v_signmask(const v_uint8x16& a) // bit i of the result = top bit of lane i
+{
+    vuint8m1_t msb     = vsrl_vx_u8m1(a.val, 7, 16);
+    vuint8m1_t weights = (vuint8m1_t){SMASK, SMASK};
+    vuint16m2_t weighted = vwmulu_vv_u16m2(msb, weights, 16);
+    vuint32m2_t highByte = vwmulu_vx_u32m2(vget_u16m2_u16m1(weighted, 1), 256, 8); // lanes 8..15 land in bits 8..15
+    vuint32m1_t acc = vmv_v_x_u32m1(0, 4);
+    acc = vredsum_vs_u32m2_u32m1(acc, highByte, acc, 8);
+    acc = vwredsumu_vs_u16m1_u32m1(acc, vget_u16m2_u16m1(weighted, 0), acc, 8);
+    return vmv_x_s_u32m1_u32(acc, 8);
+}
+inline int v_signmask(const v_int8x16& a) // bit i of the result = sign bit of lane i
+{
+    vuint8m1_t msb     = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
+    vuint8m1_t weights = (vuint8m1_t){SMASK, SMASK};
+    vint16m2_t weighted = (vint16m2_t)vwmulu_vv_u16m2(msb, weights, 16);
+    vint32m2_t highByte = vwmul_vx_i32m2(vget_i16m2_i16m1(weighted, 1), 256, 8); // lanes 8..15 land in bits 8..15
+    vint32m1_t acc = vmv_v_x_i32m1(0, 4);
+    acc = vredsum_vs_i32m2_i32m1(acc, highByte, acc, 8);
+    acc = vwredsum_vs_i16m1_i32m1(acc, vget_i16m2_i16m1(weighted, 0), acc, 8);
+    return vmv_x_s_i32m1_i32(acc, 8);
+}
+
+inline int v_signmask(const v_int16x8& a) // bit i of the result = sign bit of lane i
+{
+    vint16m1_t weights  = (vint16m1_t){SMASK};
+    vint16m1_t msb      = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
+    vint16m1_t acc      = vmv_v_x_i16m1(0, 8);
+    vint16m1_t weighted = vmul_vv_i16m1(msb, weights, 8);
+    acc = vredsum_vs_i16m1_i16m1(acc, weighted, acc, 8);
+    return vmv_x_s_i16m1_i16(acc, 8);
+}
+inline int v_signmask(const v_uint16x8& a) // bit i of the result = top bit of lane i
+{
+    vint16m1_t weights  = (vint16m1_t){SMASK};
+    vint16m1_t msb      = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
+    vint16m1_t acc      = vmv_v_x_i16m1(0, 8);
+    vint16m1_t weighted = vmul_vv_i16m1(msb, weights, 8);
+    acc = vredsum_vs_i16m1_i16m1(acc, weighted, acc, 8);
+    return vmv_x_s_i16m1_i16(acc, 8);
+}
+inline int v_signmask(const v_int32x4& a) // bit i of the result = sign bit of lane i
+{
+    vint32m1_t weights  = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t msb      = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    vint32m1_t acc      = vmv_v_x_i32m1(0, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+inline int v_signmask(const v_uint32x4& a) // bit i of the result = top bit of lane i
+{
+    vint32m1_t weights  = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t msb      = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    vint32m1_t acc      = vmv_v_x_i32m1(0, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+inline int v_signmask(const v_uint64x2& a) // bit 0 / bit 1 = top bit of lane 0 / lane 1
+{
+    vuint64m1_t msb = vsrl_vx_u64m1(a.val, 63, 2);
+    int bit0 = (int)vext_x_v_u64m1_u64(msb, 0, 2);
+    return bit0 + ((int)vext_x_v_u64m1_u64(msb, 1, 2) << 1);
+}
+inline int v_signmask(const v_int64x2& a) // reinterpret as u64 lanes and reuse that overload
+{ v_uint64x2 bits = v_reinterpret_as_u64(a); return v_signmask(bits); }
+inline int v_signmask(const v_float64x2& a) // reinterpret as u64 lanes and reuse that overload
+{ v_uint64x2 bits = v_reinterpret_as_u64(a); return v_signmask(bits); }
+inline int v_signmask(const v_float32x4& a) // bit i of the result = sign bit of lane i
+{
+    vint32m1_t weights  = (vint32m1_t){1, 2, 4, 8};
+    vint32m1_t msb      = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
+    vint32m1_t weighted = vmul_vv_i32m1(msb, weights, 4);
+    vint32m1_t acc      = vmv_v_x_i32m1(0, 4);
+    acc = vredsum_vs_i32m1_i32m1(acc, weighted, acc, 4);
+    return vmv_x_s_i32m1_i32(acc, 4);
+}
+
+inline int v_scan_forward(const v_int8x16& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_uint8x16& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_int16x8& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_uint16x8& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_int32x4& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_uint32x4& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_float32x4& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_int64x2& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+inline int v_scan_forward(const v_uint64x2& a) { // trailing-zero count of v_signmask(a); 0 when the mask is empty
+    const int signBits = v_signmask(a);
+    return signBits != 0 ? trailingZeros32(signBits) : 0;
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
+inline bool v_check_all(const v_##_Tpvec& a) /* true when no lane has its top bit clear */ \
+{ \
+    suffix##m1_t clearedMsb = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
+    vuint64m1_t halves = vuint64m1_t(clearedMsb); \
+    return !(halves[0] | halves[1]); \
+} \
+inline bool v_check_any(const v_##_Tpvec& a) /* true when at least one lane has its top bit set */ \
+{ \
+    suffix##m1_t setMsb = vsrl_vx_##_T(a.val, shift, num); \
+    vuint64m1_t halves = vuint64m1_t(setMsb); \
+    return 0 != (halves[0] | halves[1]); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
+OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
+
+inline bool v_check_all(const v_int8x16& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int16x8& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int32x4& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+inline bool v_check_all(const v_float32x4& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_all(bits); }
+inline bool v_check_all(const v_int64x2& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_all(bits); }
+inline bool v_check_all(const v_float64x2& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_all(bits); }
+
+inline bool v_check_any(const v_int8x16& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_check_any(bits); }
+inline bool v_check_any(const v_int16x8& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_check_any(bits); }
+inline bool v_check_any(const v_int32x4& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_any(bits); }
+inline bool v_check_any(const v_float32x4& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_check_any(bits); }
+inline bool v_check_any(const v_int64x2& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_any(bits); }
+inline bool v_check_any(const v_float64x2& a) // reinterpret as unsigned lanes of the same width
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_check_any(bits); }
+
+#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) /* _Tpvec2 = mask type the lane vector is converted to; num = lane count */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ /* NOTE(review): with operands (mask, b, a) vmerge presumably yields a where the mask is set and b elsewhere - confirm */ \
+    return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
+OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
+inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) {
+    vuint32m1_t bits = vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4); // merge on the raw 32-bit patterns
+    return v_float32x4((vfloat32m1_t)bits);
+}
+inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) {
+    vuint64m1_t bits = vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2); // merge on the raw 64-bit patterns
+    return v_float64x2((vfloat64m1_t)bits);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) /* widening is done as a widening add with a zero vector; num1/num2 = narrow/wide lane counts */ \
+inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
+{ /* b0 receives register 0 of the widened group, b1 register 1 */ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
+    b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0);  \
+    b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1);  \
+} \
+inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
+{ /* widens only the first num2 lanes and returns register 0 */ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2);    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+} \
+inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
+{ /* widens all num1 lanes and returns register 1 */ \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
+} \
+inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
+{ /* loads num2 narrow elements from ptr and widens them */ \
+    _T2##_t val = vle##_v_##_Tp1(ptr, num2);    \
+    _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2);    \
+    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
+}
+
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1) // u8 -> u16
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1) // u16 -> u32
+OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1) // u32 -> u64
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1) // i8 -> i16
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1) // i16 -> i32
+OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1) // i32 -> i64
+
+// Load 4 unsigned bytes from ptr and widen them twice (8 -> 16 -> 32 bit) into a v_uint32x4.
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    vuint8m1_t bytes = vle_v_u8m1(ptr, 4);
+    // Each widening step is a widening add with a zero vector.
+    vuint16m2_t halves = vwaddu_vv_u16m2(bytes, vmv_v_x_u8m1(0, 4), 4);
+    vuint32m2_t words = vwaddu_vv_u32m2(vget_u16m2_u16m1(halves, 0), vmv_v_x_u16m1(0, 4), 4);
+    return v_uint32x4(vget_u32m2_u32m1(words, 0));
+}
+
+// Load 4 signed bytes from ptr and widen them twice (8 -> 16 -> 32 bit) into a v_int32x4.
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    vint8m1_t bytes = vle_v_i8m1(ptr, 4);
+    // Each widening step is a signed widening add with a zero vector.
+    vint16m2_t halves = vwadd_vv_i16m2(bytes, vmv_v_x_i8m1(0, 4), 4);
+    vint32m2_t words = vwadd_vv_i32m2(vget_i16m2_i16m1(halves, 0), vmv_v_x_i16m1(0, 4), 4);
+    return v_int32x4(vget_i32m2_i32m1(words, 0));
+}
+#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C} // gather indices for v_zip of 16-lane vectors, packed as bytes
+#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006} // gather indices for v_zip of 8-lane vectors, packed as 16-bit values
+#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003} // gather indices for v_zip of 4-lane vectors, packed as 32-bit values
+#define VITL_2 (vuint64m2_t){0, 2, 1, 3} // gather indices for v_zip of 2-lane vectors
+#define LOW_4  0x0000000100000000, 0x0000000500000004 // LOW_* / HIGH_* index lists are not referenced in the code visible here
+#define LOW_8  0x0003000200010000, 0x000B000A00090008
+#define LOW_16 0x0706050403020100, 0x1716151413121110
+#define HIGH_4  0x0000000300000002, 0x0000000700000006
+#define HIGH_8  0x0007000600050004, 0x000F000E000D000C
+#define HIGH_16 0x0F0E0D0C0B0A0908,  0x1F1E1D1C1B1A1918
+#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) /* num = lanes per vector, num2 = lanes per 2-register group, numh = num / 2 in every instantiation below */ \
+inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
+{ /* a0 and a1 are placed in one register group, permuted with VITL_<num>, and split into b0 / b1 */ \
+    v##_Tp##m2_t tmp = vundefined_##_T##m2();\
+    tmp = vset_##_T##m2(tmp, 0, a0.val); \
+    tmp = vset_##_T##m2(tmp, 1, a1.val); \
+    vuint64m2_t mask = VITL_##num;    \
+    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2);    \
+    b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
+    b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
+} \
+inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ /* b.val is slid up by numh lanes with a.val as the destination operand */ \
+    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
+    return v_##_Tpvec(b0);\
+} \
+inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
+{ /* both inputs are slid down by numh lanes first, then combined as in v_combine_low */ \
+    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
+    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
+    v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+    return v_##_Tpvec(b1);\
+} \
+inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
+{ /* c is computed like v_combine_low(a, b), d like v_combine_high(a, b) */ \
+    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
+    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
+    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
+    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
+}
+
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
+OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a) { // gather with the byte indices 15, 14, ..., 0
+    vuint64m1_t reversedIdx = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
+    vuint8m1_t flipped = vrgather_vv_u8m1(a.val, (vuint8m1_t)reversedIdx, 16);
+    return v_uint8x16(flipped);
+}
+inline v_int8x16 v_reverse(const v_int8x16 &a) { // gather with the byte indices 15, 14, ..., 0
+    vint64m1_t reversedIdx = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
+    vint8m1_t flipped = vrgather_vv_i8m1(a.val, (vuint8m1_t)reversedIdx, 16);
+    return v_int8x16(flipped);
+}
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a) { // gather with the 16-bit indices 7, 6, ..., 0
+    vuint64m1_t reversedIdx = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
+    vuint16m1_t flipped = vrgather_vv_u16m1(a.val, (vuint16m1_t)reversedIdx, 8);
+    return v_uint16x8(flipped);
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a) { // gather with the 16-bit indices 7, 6, ..., 0
+    vint64m1_t reversedIdx = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
+    vint16m1_t flipped = vrgather_vv_i16m1(a.val, (vuint16m1_t)reversedIdx, 8);
+    return v_int16x8(flipped);
+}
+inline v_uint32x4 v_reverse(const v_uint32x4 &a) { // gather with the lane indices 3, 2, 1, 0
+    vuint32m1_t reversedIdx = (vuint32m1_t){3, 2, 1, 0};
+    return v_uint32x4(vrgather_vv_u32m1(a.val, reversedIdx, 4));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a) { // gather with the lane indices 3, 2, 1, 0
+    vuint32m1_t reversedIdx = (vuint32m1_t){3, 2, 1, 0};
+    return v_int32x4(vrgather_vv_i32m1(a.val, reversedIdx, 4));
+}
+
+inline v_float32x4 v_reverse(const v_float32x4 &a) // reverse through the 32-bit unsigned overload
+{ v_uint32x4 flipped = v_reverse(v_reinterpret_as_u32(a)); return v_reinterpret_as_f32(flipped); }
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a) { // swap the two 64-bit lanes
+    return v_uint64x2(a.val[1],
+                      a.val[0]);
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a) { // swap the two 64-bit lanes
+    return v_int64x2(a.val[1],
+                     a.val[0]);
+}
+
+inline v_float64x2 v_reverse(const v_float64x2 &a) { // swap the two double lanes
+    return v_float64x2(a.val[1],
+                       a.val[0]);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) /* suffix and size are not used by the body below */ \
+template <int n> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ return v_rotate_right<n>(a, b);} /* v_extract<n> simply forwards to v_rotate_right<n> */
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
+
+
+#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) /* suffix is not used by the body below */ \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; } /* returns lane i by subscripting the vector value */
+
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
+
+#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) /* num = lane count; every output lane is gathered from index i */ \
+template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
+
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
+OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) // NOTE(review): no v_float64x2 instantiation appears in this list
+inline v_int32x4 v_round(const v_float32x4& a) // float -> int32 with the rounding mode set to 0; Inf/NaN lanes give 0
+{
+    __builtin_riscv_fsrm(0);
+    vint32m1_t expBits = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vbool32_t finite = vmsne_vx_i32m1_b32(expBits, 0x7f800000, 4); // exponent field not all ones
+    vint32m1_t rounded = vfcvt_x_f_v_i32m1_m(finite, vmv_v_x_i32m1(0, 4), a.val, 4);
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(rounded);
+}
+inline v_int32x4 v_floor(const v_float32x4& a) // float -> int32 with the rounding mode set to 2; Inf/NaN lanes give 0
+{
+    __builtin_riscv_fsrm(2);
+    vint32m1_t expBits = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vbool32_t finite = vmsne_vx_i32m1_b32(expBits, 0x7f800000, 4); // exponent field not all ones
+    vint32m1_t floored = vfcvt_x_f_v_i32m1_m(finite, vmv_v_x_i32m1(0, 4), a.val, 4);
+    __builtin_riscv_fsrm(0); // back to mode 0 for later code
+    return v_int32x4(floored);
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a) // float -> int32 with the rounding mode set to 3; Inf/NaN lanes give 0
+{
+    __builtin_riscv_fsrm(3);
+    vint32m1_t expBits = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vbool32_t finite = vmsne_vx_i32m1_b32(expBits, 0x7f800000, 4); // exponent field not all ones
+    vint32m1_t ceiled = vfcvt_x_f_v_i32m1_m(finite, vmv_v_x_i32m1(0, 4), a.val, 4);
+    __builtin_riscv_fsrm(0); // back to mode 0 for later code
+    return v_int32x4(ceiled);
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // float -> int32 with the rounding mode set to 1; Inf/NaN lanes give 0
+{
+    __builtin_riscv_fsrm(1);
+    vint32m1_t expBits = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
+    vbool32_t finite = vmsne_vx_i32m1_b32(expBits, 0x7f800000, 4); // exponent field not all ones
+    vint32m1_t truncated = vfcvt_x_f_v_i32m1_m(finite, vmv_v_x_i32m1(0, 4), a.val, 4);
+    __builtin_riscv_fsrm(0); // back to mode 0 for later code
+    return v_int32x4(truncated);
+}
+
+inline v_int32x4 v_round(const v_float64x2& a) // lanes 0..1 converted from a, lanes 2..3 converted from zeros
+{
+    __builtin_riscv_fsrm(0);
+    vfloat64m2_t wide = vundefined_f64m2();
+    wide = vset_f64m2(wide, 0, a.val);
+    // The upper register of the group is zero-filled so all four narrowed lanes are defined.
+    wide = vset_f64m2(wide, 1, vfmv_v_f_f64m1(0, 2));
+    vint32m1_t rounded = vfncvt_x_f_v_i32m1(wide, 4);
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(rounded);
+}
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // lanes 0..1 from a, lanes 2..3 from b
+{
+    __builtin_riscv_fsrm(0);
+    vfloat64m2_t wide = vundefined_f64m2();
+    wide = vset_f64m2(wide, 0, a.val);
+    wide = vset_f64m2(wide, 1, b.val);
+    vint32m1_t rounded = vfncvt_x_f_v_i32m1(wide, 4); // narrowing double -> int32 conversion over the whole group
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(rounded);
+}
+inline v_int32x4 v_floor(const v_float64x2& a) // lanes 0..1 = floor of a, lanes 2..3 = 0
+{
+    __builtin_riscv_fsrm(2); // rounding mode 2 for the narrowing and the integer conversion
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val);
+    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); // zero-fill the upper half (as v_round does) so lanes 2..3 are defined
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);  // narrow all four lanes; NOTE(review): float32 is exact only up to 2^24
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // lanes whose exponent is all ones (Inf/NaN) keep the 0 fill
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a) // lanes 0..1 = ceil of a, lanes 2..3 = 0
+{
+    __builtin_riscv_fsrm(3); // rounding mode 3 for the narrowing and the integer conversion
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val);
+    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); // zero-fill the upper half (as v_round does) so lanes 2..3 are defined
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);  // narrow all four lanes; NOTE(review): float32 is exact only up to 2^24
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // lanes whose exponent is all ones (Inf/NaN) keep the 0 fill
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a) // lanes 0..1 = a truncated toward zero, lanes 2..3 = 0
+{
+    __builtin_riscv_fsrm(1); // rounding mode 1 for the narrowing and the integer conversion
+    vfloat64m2_t _val = vundefined_f64m2();
+    _val = vset_f64m2(_val, 0, a.val);
+    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); // zero-fill the upper half (as v_round does) so lanes 2..3 are defined
+    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);  // narrow all four lanes; NOTE(review): float32 is exact only up to 2^24
+    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
+    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); // lanes whose exponent is all ones (Inf/NaN) keep the 0 fill
+    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
+    __builtin_riscv_fsrm(0);
+    return v_int32x4(val);
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) /* intrin = segment-load prefix, num = lanes per output vector */    \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
+{ /* 2-field segment load: field 0 -> a, field 1 -> b */ \
+    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
+    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
+{ /* 3-field segment load: fields 0..2 -> a, b, c */ \
+    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
+    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
+{ /* 4-field segment load: fields 0..3 -> a, b, c, d */ \
+    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
+    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
+    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
+} \
+
+#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) /* intrin = segment-store prefix; the StoreMode argument is accepted but unused */    \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ /* a and b become fields 0 and 1 of a 2-field segment store */ \
+    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();      \
+    ret = vset_##_T##m1x2(ret, 0, a.val);  \
+    ret = vset_##_T##m1x2(ret, 1, b.val);  \
+    intrin##2e_v_##_T##m1x2(ptr, ret, num); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ /* a, b, c become fields 0..2 of a 3-field segment store */ \
+    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();       \
+    ret = vset_##_T##m1x3(ret, 0, a.val);  \
+    ret = vset_##_T##m1x3(ret, 1, b.val);  \
+    ret = vset_##_T##m1x3(ret, 2, c.val);  \
+    intrin##3e_v_##_T##m1x3(ptr, ret, num); \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ /* a, b, c, d become fields 0..3 of a 4-field segment store */ \
+    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();             \
+    ret = vset_##_T##m1x4(ret, 0, a.val);  \
+    ret = vset_##_T##m1x4(ret, 1, b.val);  \
+    ret = vset_##_T##m1x4(ret, 2, c.val);  \
+    ret = vset_##_T##m1x4(ret, 3, d.val);  \
+    intrin##4e_v_##_T##m1x4(ptr, ret, num); \
+}
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) /* emits both the load and the store family for one lane type */ \
+OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T)    \
+OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
+
+//OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8) // signed 8/16/32-bit lanes
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8) // unsigned 8/16/32-bit lanes
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
+
+#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
+{ \
+    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
+    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
+} \
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
+{ \
+    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num);    \
+    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
+}\
+inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
+                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
+{ \
+    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num);    \
+    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
+    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
+    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
+    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();    \
+    ret = vset_##_T##m1x2(ret, 0, a.val);  \
+    ret = vset_##_T##m1x2(ret, 1, b.val);  \
+    vsseg2e_v_##_T##m1x2(ptr, ret, num);    \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ \
+    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();    \
+    ret = vset_##_T##m1x3(ret, 0, a.val);  \
+    ret = vset_##_T##m1x3(ret, 1, b.val);  \
+    ret = vset_##_T##m1x3(ret, 2, c.val);  \
+    vsseg3e_v_##_T##m1x3(ptr, ret, num);    \
+} \
+inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
+                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
+                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
+{ \
+    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();    \
+    ret = vset_##_T##m1x4(ret, 0, a.val);  \
+    ret = vset_##_T##m1x4(ret, 1, b.val);  \
+    ret = vset_##_T##m1x4(ret, 2, c.val);  \
+    ret = vset_##_T##m1x4(ret, 3, d.val);  \
+    vsseg4e_v_##_T##m1x4(ptr, ret, num);    \
+}
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)   // v_float32x4: 2-, 3- and 4-way v_load_deinterleave / v_store_interleave
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)  // v_float64x2: same set of overloads, 2 elements per vector
+
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)  // v_uint64x2; NOTE(review): relies on unsigned long being 64-bit - confirm for the target ABI
+OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)            // v_int64x2; same assumption for long
+
+// Converts the four signed 32-bit lanes of a to single-precision lanes.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{ vfloat32m1_t asFloat = vfcvt_f_x_v_f32m1(a.val, 4);
+  return v_float32x4(asFloat); }
+
+#if CV_SIMD128_64F
+// Narrows the double lanes of a to float; the conversion is issued for 2 elements only.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    // a goes to index 0 of an otherwise undefined m2 group, which is the source of the narrowing convert
+    vfloat64m2_t wide = vset_f64m2(vundefined_f64m2(), 0, a.val);
+    return v_float32x4(vfncvt_f_f_v_f32m1(wide, 2));
+}
+
+// Narrows the double lanes of a and b into one vector of four floats.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    // build an m2 group with a at index 0 and b at index 1, then narrow 4 elements
+    vfloat64m2_t wide = vset_f64m2(vundefined_f64m2(), 0, a.val);
+    wide = vset_f64m2(wide, 1, b.val);
+    return v_float32x4(vfncvt_f_f_v_f32m1(wide, 4));
+}
+
+// Converts a to double by way of a float vector and returns index 0 of the widened m2 group.
+// NOTE(review): the int32 -> float32 step rounds values that float cannot represent exactly.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    vfloat64m2_t widened = vfwcvt_f_f_v_f64m2(vfcvt_f_x_v_f32m1(a.val, 4), 4);
+    return v_float64x2(vget_f64m2_f64m1(widened, 0)); }
+
+// Converts a to double by way of a float vector and returns index 1 of the widened m2 group.
+// NOTE(review): the int32 -> float32 step rounds values that float cannot represent exactly.
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    vfloat64m2_t widened = vfwcvt_f_f_v_f64m2(vfcvt_f_x_v_f32m1(a.val, 4), 4);
+    return v_float64x2(vget_f64m2_f64m1(widened, 1)); }
+
+// Widens the four float lanes of a to double and returns index 0 of the resulting m2 group.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    return v_float64x2(vget_f64m2_f64m1(vfwcvt_f_f_v_f64m2(a.val, 4), 0));
+}
+
+// Widens the four float lanes of a to double and returns index 1 of the resulting m2 group.
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    return v_float64x2(vget_f64m2_f64m1(vfwcvt_f_f_v_f64m2(a.val, 4), 1));
+}
+
+// Converts the two signed 64-bit lanes of a to double lanes.
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{ vfloat64m1_t asDouble = vfcvt_f_x_v_f64m1(a.val, 2);
+  return v_float64x2(asDouble); }
+
+#endif
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{ // Byte gather in the order 0,2,1,3, 4,6,5,7, 8,10,9,11, 12,14,13,15.
+    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; // gather indices, one per byte; read from the least significant byte up (assumes little-endian lane layout)
+    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
+{ // Unsigned overload: reinterpret as signed, reuse the v_int8x16 overload, reinterpret back.
+    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{ // Byte gather in the order 0,4,1,5,2,6,3,7, 8,12,9,13,10,14,11,15.
+    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; // gather indices, one per byte; read from the least significant byte up (assumes little-endian lane layout)
+    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
+{ // Unsigned overload: reinterpret as signed, reuse the v_int8x16 overload, reinterpret back.
+    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{ // Reorders the 16-bit elements as 0,2,1,3, 4,6,5,7 using a byte-level gather.
+    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; // byte indices; each 16-bit element is moved as a pair of consecutive bytes
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } // unsigned overload, delegates to the signed one
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{ // Reorders the 16-bit elements as 0,4,1,5, 2,6,3,7 using a byte-level gather.
+    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; // byte indices; each 16-bit element is moved as a pair of consecutive bytes
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } // unsigned overload, delegates to the signed one
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{ // Reorders the 32-bit elements as 0,2,1,3 using a byte-level gather.
+    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; // byte indices; each 32-bit element is moved as four consecutive bytes
+    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // unsigned overload, delegates to the signed one
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } // float overload: pure bit movement through the s32 path
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{ // Keeps bytes 0,1,2, 4,5,6, 8,9,10, 12,13,14 (every fourth byte dropped) in the first 12 lanes.
+    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; // last four indices are 0xFF, i.e. out of range for 16 elements (presumably gathered as 0 - see the RVV vrgather rules)
+    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } // unsigned overload, delegates to the signed one
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{ // Keeps 16-bit elements 0,1,2, 4,5,6 (elements 3 and 7 dropped) in the first 6 lanes.
+    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; // byte indices; the last four are 0xFF, i.e. out of range for 16 bytes (presumably gathered as 0 - see the RVV vrgather rules)
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } // unsigned overload, delegates to the signed one
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }     // 32-bit overloads return the input unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }   // (a 4-lane vector holds one triplet plus one spare lane)
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } // same for float
+
+#if CV_SIMD128_64F
+// Result of v_dotprod(a, b) converted to double lanes; the three-argument form adds c to it.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{ return v_cvt_f64(v_dotprod(a, b)); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ v_float64x2 dot = v_dotprod_expand(a, b); return dot + c; }
+// Widening multiply of the four 32-bit lanes, then index 0 and index 1 of the 64-bit product
+// group are added element-wise and the two sums are converted to double.
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{
+    vint64m2_t products = vwmul_vv_i64m2(a.val, b.val, 4);
+    vint64m1_t sums = vadd_vv_i64m1(vget_i64m2_i64m1(products, 0), vget_i64m2_i64m1(products, 1), 2);
+    return v_float64x2(vfcvt_f_x_v_f64m1(sums, 2)); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand_fast(a, b) + c; } // accumulating form: adds c to the result above
+#endif
+////// FP16 support ///////
+// Loads four half-precision values from ptr and widens them to a float vector.
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    vfloat16m1_t halves = vle_v_f16m1((__fp16*)ptr, 4);
+    return v_float32x4(vget_f32m2_f32m1(vfwcvt_f_f_v_f32m2(halves, 4), 0));
+}
+
+// Narrows the four float lanes of v to half precision and stores them at ptr.
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    // only index 0 of the m2 source group is populated; the convert and the store both use 4 elements
+    vfloat32m2_t wide = vset_f32m2(vundefined_f32m2(), 0, v.val);
+    vse_v_f16m1((__fp16*)ptr, vfncvt_f_f_v_f16m1(wide, 4), 4);
+}
+
+
+inline void v_cleanup() {} // no-op in this backend: the body is empty
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse.hpp
new file mode 100644
index 0000000..443ee16
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse.hpp
@@ -0,0 +1,3467 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_SSE_HPP
+#define OPENCV_HAL_SSE_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD128_FP16 0  // no native operations with FP16 type.
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+//
+// Compilation troubleshooting:
+// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
+//   Replace parameter declaration to const reference:
+//   -v_int32x4 a
+//   +const v_int32x4& a
+//
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+// 128-bit SSE register viewed as 16 unsigned 8-bit lanes.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint8x16() {}  // val is left uninitialized
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15)) {}
+
+    // Lane 0: the low 32 bits of the register truncated to the lane type.
+    uchar get0() const
+    { return (uchar)_mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 16 signed 8-bit lanes.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int8x16() {}  // val is left uninitialized
+    explicit v_int8x16(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(_mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15)) {}
+
+    // Lane 0: the low 32 bits of the register truncated to the lane type.
+    schar get0() const
+    { return (schar)_mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 8 unsigned 16-bit lanes.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint16x8() {}  // val is left uninitialized
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7)) {}
+
+    // Lane 0: the low 32 bits of the register truncated to the lane type.
+    ushort get0() const
+    { return (ushort)_mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 8 signed 16-bit lanes.
+struct v_int16x8
+{
+    typedef short lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int16x8() {}  // val is left uninitialized
+    explicit v_int16x8(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(_mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7)) {}
+
+    // Lane 0: the low 32 bits of the register truncated to the lane type.
+    short get0() const
+    { return (short)_mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 4 unsigned 32-bit lanes.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint32x4() {}  // val is left uninitialized
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+        : val(_mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3)) {}
+
+    // Lane 0: the low 32 bits of the register.
+    unsigned get0() const
+    { return (unsigned)_mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 4 signed 32-bit lanes.
+struct v_int32x4
+{
+    typedef int lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int32x4() {}  // val is left uninitialized
+    explicit v_int32x4(__m128i v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_int32x4(int v0, int v1, int v2, int v3)
+        : val(_mm_setr_epi32(v0, v1, v2, v3)) {}
+
+    // Lane 0: the low 32 bits of the register.
+    int get0() const
+    { return _mm_cvtsi128_si32(val); }
+
+    // underlying register
+    __m128i val;
+};
+
+// 128-bit SSE register viewed as 4 single-precision lanes.
+struct v_float32x4
+{
+    typedef float lane_type;
+    typedef __m128 vector_type;
+    enum { nlanes = 4 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float32x4() {}  // val is left uninitialized
+    explicit v_float32x4(__m128 v) : val(v) {}
+    // Per-lane constructor; arguments are given lowest lane first (v0 becomes lane 0).
+    v_float32x4(float v0, float v1, float v2, float v3)
+        : val(_mm_setr_ps(v0, v1, v2, v3)) {}
+
+    // Lane 0 as a scalar.
+    float get0() const
+    { return _mm_cvtss_f32(val); }
+
+    // underlying register
+    __m128 val;
+};
+
+struct v_uint64x2 // 128-bit SSE register viewed as 2 unsigned 64-bit lanes
+{
+    typedef uint64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_uint64x2() {} // val is left uninitialized
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1) // v0 becomes lane 0, v1 lane 1; the intrinsic used depends on compiler/target
+    {
+#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
+        val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
+#elif defined(__GNUC__)
+        val = _mm_setr_epi64((__m64)v0, (__m64)v1); // each 64-bit value is cast to __m64 first
+#else
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); // generic path: low dword, then high dword of each value
+#endif
+    }
+
+    uint64 get0() const // returns lane 0
+    {
+    #if !defined(__x86_64__) && !defined(_M_X64)
+        int a = _mm_cvtsi128_si32(val);                     // low 32 bits of lane 0
+        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); // high 32 bits of lane 0, shifted down first
+        return (unsigned)a | ((uint64)(unsigned)b << 32);   // casts through unsigned avoid sign extension when recombining
+    #else
+        return (uint64)_mm_cvtsi128_si64(val);
+    #endif
+    }
+
+    __m128i val; // underlying register
+};
+
+struct v_int64x2 // 128-bit SSE register viewed as 2 signed 64-bit lanes
+{
+    typedef int64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_int64x2() {} // val is left uninitialized
+    explicit v_int64x2(__m128i v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1) // v0 becomes lane 0, v1 lane 1; the intrinsic used depends on compiler/target
+    {
+#if defined(_MSC_VER) && _MSC_VER >= 1920/*MSVS 2019*/ && defined(_M_X64) && !defined(__clang__)
+        val = _mm_setr_epi64x((int64_t)v0, (int64_t)v1);
+#elif defined(__GNUC__)
+        val = _mm_setr_epi64((__m64)v0, (__m64)v1); // each 64-bit value is cast to __m64 first
+#else
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32)); // generic path: low dword, then high dword of each value
+#endif
+    }
+
+    int64 get0() const // returns lane 0
+    {
+    #if !defined(__x86_64__) && !defined(_M_X64)
+        int a = _mm_cvtsi128_si32(val);                     // low 32 bits of lane 0
+        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32)); // high 32 bits of lane 0, shifted down first
+        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); // recombined as unsigned, then cast to the signed lane type
+    #else
+        return _mm_cvtsi128_si64(val);
+    #endif
+    }
+
+    __m128i val; // underlying register
+};
+
+// 128-bit SSE register viewed as 2 double-precision lanes.
+struct v_float64x2
+{
+    typedef double lane_type;
+    typedef __m128d vector_type;
+    enum { nlanes = 2 };
+
+    /* coverity[uninit_ctor]: suppress warning */
+    v_float64x2() {}  // val is left uninitialized
+    explicit v_float64x2(__m128d v) : val(v) {}
+    // Per-lane constructor; v0 becomes lane 0, v1 lane 1.
+    v_float64x2(double v0, double v1)
+        : val(_mm_setr_pd(v0, v1)) {}
+
+    // Lane 0 as a scalar.
+    double get0() const
+    { return _mm_cvtsd_f64(val); }
+
+    // underlying register
+    __m128d val;
+};
+
+namespace hal_sse_internal // raw-register reinterpret helper between __m128i, __m128 and __m128d
+{
+    template <typename to_sse_type, typename from_sse_type>
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& val); // declared only; every supported pair is an explicit specialization below
+
+#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) /* one specialization that forwards to sse_cast_intrin */ \
+    template<> inline \
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
+    { return sse_cast_intrin(a); }
+
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP) // same type: OPENCV_HAL_NOP (defined elsewhere; presumably passes its argument through)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
+}
+
+#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) /* emits v_setzero_<suffix>, v_setall_<suffix> and v_reinterpret_as_<suffix> for _Tpvec */ \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
+{ return _Tpvec(cast(a.val)); } // cast is applied to the source register before wrapping it
+
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP) // scalar is cast to schar before the broadcast
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps) // the reinterpret template here takes an integer register as source
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd) // likewise for double
+
+inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); } // all bits zero
+inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }   // all bits zero
+inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }   // both lanes = val, built with the two-value constructor
+inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }      // both lanes = val, built with the two-value constructor
+
+template<typename _Tpvec> inline
+v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } // wraps a.val directly, so a.val must be an __m128i
+template<typename _Tpvec> inline
+v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }   // wraps a.val directly, so a.val must be an __m128i
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a) // 64-bit integer vectors -> float: bit cast of the register
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a) // 64-bit integer vectors -> double: bit cast of the register
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+
+#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) /* non-template v_reinterpret_as_<suffix> overloads taking float / double vectors */ \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
+{ return _Tpvec(_mm_castps_si128(a.val)); } \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
+{ return _Tpvec(_mm_castpd_si128(a.val)); } // bit cast of the register to the integer register type
+
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8) // one pair of overloads per integer vector type
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
+
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; } // same type: returned unchanged
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; } // same type: returned unchanged
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); } // double -> float register bit cast
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); } // float -> double register bit cast
+
+//////////////// PACK ///////////////
+// Saturating pack of two u16 vectors into one u8 vector: a fills lanes 0..7, b lanes 8..15.
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const __m128i limit = _mm_set1_epi16(255); // x - subs(x, 255) == min(x, 255), which keeps the input of the signed-source pack in range
+    __m128i lo = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, limit)), hi = _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, limit));
+    return v_uint8x16(_mm_packus_epi16(lo, hi)); }
+
+// Clamps the eight u16 lanes of a to 255 and stores them as 8 bytes at ptr.
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    const __m128i excess = _mm_subs_epu16(a.val, _mm_set1_epi16(255)); // max(a - 255, 0)
+    const __m128i clamped = _mm_subs_epu16(a.val, excess);             // min(a, 255)
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(clamped, clamped)); }
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) // signed 16-bit -> unsigned 8-bit with saturation
+{ const __m128i packed = _mm_packus_epi16(a.val, b.val); return v_uint8x16(packed); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) // saturates the 8 lanes of a to uchar and writes 8 bytes
+{ const __m128i packed = _mm_packus_epi16(a.val, a.val); _mm_storel_epi64((__m128i*)ptr, packed); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Rounding right shift by n, then saturating pack to u8. n > 0 is assumed, so the shifted
+    // 16-bit values can be treated as signed numbers by the pack instruction.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    __m128i lo = _mm_srli_epi16(_mm_adds_epu16(a.val, half), n), hi = _mm_srli_epi16(_mm_adds_epu16(b.val, half), n);
+    return v_uint8x16(_mm_packus_epi16(lo, hi)); }
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    // (a + 2^(n-1)) >> n using a saturating add, packed to u8 and written as 8 bytes
+    const __m128i biased = _mm_adds_epu16(a.val, _mm_set1_epi16((short)(1 << (n-1))));
+    const __m128i rounded = _mm_srli_epi16(biased, n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(rounded, rounded)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    // signed rounding shift (saturating add of 2^(n-1), arithmetic shift), then unsigned-saturating pack
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    __m128i lo = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n), hi = _mm_srai_epi16(_mm_adds_epi16(b.val, half), n);
+    return v_uint8x16(_mm_packus_epi16(lo, hi)); }
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    // signed rounding shift by n, then unsigned-saturating pack; writes 8 bytes
+    const __m128i biased = _mm_adds_epi16(a.val, _mm_set1_epi16((short)(1 << (n-1))));
+    const __m128i rounded = _mm_srai_epi16(biased, n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(rounded, rounded)); }
+
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) // signed-saturating pack: a -> lanes 0..7, b -> lanes 8..15
+{ const __m128i packed = _mm_packs_epi16(a.val, b.val); return v_int8x16(packed); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a) // saturates the 8 lanes of a to schar and writes 8 bytes
+{ const __m128i packed = _mm_packs_epi16(a.val, a.val); _mm_storel_epi64((__m128i*)ptr, packed); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    // Rounding arithmetic right shift by n (n > 0 is needed for 1 << (n-1)), then signed-saturating pack.
+    const __m128i half = _mm_set1_epi16((short)(1 << (n-1)));
+    const __m128i lo = _mm_srai_epi16(_mm_adds_epi16(a.val, half), n);
+    const __m128i hi = _mm_srai_epi16(_mm_adds_epi16(b.val, half), n);
+    return v_int8x16(_mm_packs_epi16(lo, hi)); }
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    // Rounding arithmetic right shift by n (n > 0 is needed for 1 << (n-1)); the eight results
+    // are packed to schar with signed saturation and written to ptr.
+    const __m128i biased = _mm_adds_epi16(a.val, _mm_set1_epi16((short)(1 << (n-1))));
+    const __m128i rounded = _mm_srai_epi16(biased, n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(rounded, rounded)); }
+
+
+// byte-wise "mask ? a : b"
+inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
+{ // NOTE(review): the two branches agree only when every mask byte is 0x00 or 0xFF.
+#if CV_SSE4_1
+    return _mm_blendv_epi8(b, a, mask); // picks a where the top bit of the mask byte is set, otherwise b
+#else
+    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask)); // bitwise b ^ ((a ^ b) & mask): a where mask bits are 1, b elsewhere
+#endif
+}
+
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) // u32 -> u16 pack, delegated to _v128_packs_epu32 (defined outside this section)
+{ return v_uint16x8(_v128_packs_epu32(a.val, b.val)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    // lanes with the top bit set (negative in the signed compare) are replaced by 65535 up front
+    const __m128i topBitSet = _mm_cmpgt_epi32(_mm_setzero_si128(), a.val);
+    const __m128i biased = _mm_sub_epi32(v_select_si128(topBitSet, _mm_set1_epi32(65535), a.val), _mm_set1_epi32(32768));
+    // signed-saturating pack of the biased values, then the bias is removed in 16-bit arithmetic; writes 4 ushorts
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768))); }
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    // rounding logical shift by n, bias by -32768 so the signed pack saturates at the u16 limits, un-bias after packing
+    const __m128i half = _mm_set1_epi32(1 << (n-1)), bias = _mm_set1_epi32(32768);
+    const __m128i lo = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, half), n), bias);
+    const __m128i hi = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, half), n), bias);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(lo, hi), _mm_set1_epi16(-32768))); }
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    // rounding logical shift by n, bias by -32768, signed-saturating pack, un-bias; writes 4 ushorts
+    const __m128i rounded = _mm_srli_epi32(_mm_add_epi32(a.val, _mm_set1_epi32(1 << (n-1))), n);
+    const __m128i biased = _mm_sub_epi32(rounded, _mm_set1_epi32(32768));
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768)));
+}
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    // Packs signed 32-bit lanes to u16 with unsigned saturation (a -> lanes 0..3, b -> lanes 4..7).
+#if CV_SSE4_1
+    return v_uint16x8(_mm_packus_epi32(a.val, b.val));
+#else
+    // Emulation without SSE4.1. Negative lanes are zeroed first so the bias subtraction cannot wrap.
+    const __m128i zero = _mm_setzero_si128(), bias = _mm_set1_epi32(32768);
+    const __m128i aPos = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, zero));
+    const __m128i bPos = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, zero));
+    // bias into the signed range, signed-saturating pack, then remove the bias in 16-bit arithmetic
+    const __m128i packed = _mm_packs_epi32(_mm_sub_epi32(aPos, bias), _mm_sub_epi32(bPos, bias));
+    return v_uint16x8(_mm_sub_epi16(packed, _mm_set1_epi16(-32768)));
+#endif
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{   // Stores the four lanes of a as ushort with unsigned saturation (negative -> 0, above 65535 -> 65535).
+#if CV_SSE4_1
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
+#else
+    // Zero negative lanes first (as v_pack_u does): otherwise a - 32768 wraps for a < INT_MIN + 32768 and is stored as 65535.
+    __m128i a1 = _mm_sub_epi32(_mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_setzero_si128())), _mm_set1_epi32(32768));
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+#endif
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    // Rounding arithmetic right shift by n, then pack to u16 with unsigned saturation.
+    const __m128i half = _mm_set1_epi32(1 << (n - 1));
+    const __m128i aShifted = _mm_srai_epi32(_mm_add_epi32(a.val, half), n);
+    const __m128i bShifted = _mm_srai_epi32(_mm_add_epi32(b.val, half), n);
+#if CV_SSE4_1
+    return v_uint16x8(_mm_packus_epi32(aShifted, bShifted));
+#else
+    // emulate the unsigned pack: bias by -32768, signed-saturating pack, un-bias in 16-bit lanes
+    const __m128i bias = _mm_set1_epi32(32768);
+    const __m128i packed = _mm_packs_epi32(_mm_sub_epi32(aShifted, bias), _mm_sub_epi32(bShifted, bias));
+    return v_uint16x8(_mm_sub_epi16(packed, _mm_set1_epi16(-32768)));
+#endif
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    // Rounding arithmetic right shift by n; the four results are stored as ushort with unsigned saturation.
+    const __m128i shifted = _mm_srai_epi32(_mm_add_epi32(a.val, _mm_set1_epi32(1 << (n - 1))), n);
+#if CV_SSE4_1
+    const __m128i packed = _mm_packus_epi32(shifted, shifted);
+#else
+    // emulate the unsigned pack: bias by -32768, signed-saturating pack, un-bias in 16-bit lanes
+    const __m128i biased = _mm_sub_epi32(shifted, _mm_set1_epi32(32768));
+    const __m128i packed = _mm_sub_epi16(_mm_packs_epi32(biased, biased), _mm_set1_epi16(-32768));
+#endif
+    _mm_storel_epi64((__m128i*)ptr, packed);
+}
+
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) // signed-saturating pack: a -> lanes 0..3, b -> lanes 4..7
+{ const __m128i packed = _mm_packs_epi32(a.val, b.val); return v_int16x8(packed); }
+
+// Packs the four 32-bit lanes of a to short with signed saturation and stores them at ptr.
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{ const __m128i packed = _mm_packs_epi32(a.val, a.val);
+  _mm_storel_epi64((__m128i*)ptr, packed); }
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    // (x + 2^(n-1)) >> n with an arithmetic shift, then signed-saturating pack: a -> lanes 0..3, b -> lanes 4..7
+    const __m128i half = _mm_set1_epi32(1 << (n-1));
+    const __m128i lo = _mm_srai_epi32(_mm_add_epi32(a.val, half), n), hi = _mm_srai_epi32(_mm_add_epi32(b.val, half), n);
+    return v_int16x8(_mm_packs_epi32(lo, hi)); }
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    // arithmetic rounding shift by n; the four results are saturated to short and written to ptr
+    const __m128i biased = _mm_add_epi32(a.val, _mm_set1_epi32(1 << (n-1)));
+    const __m128i rounded = _mm_srai_epi32(biased, n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(rounded, rounded)); }
+
+
+// Truncating pack: keeps the low 32 bits of every 64-bit lane, result = [a0 a1 b0 b1].
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    // viewed as dwords: a = [a0.lo a0.hi a1.lo a1.hi], b likewise
+    const __m128i lane0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_uint32x4(_mm_unpacklo_epi32(lane0, lane1)); }  // a0.lo a1.lo b0.lo b1.lo
+
+// Stores the low 32 bits of both 64-bit lanes of a as two unsigned values at ptr.
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    // _MM_SHUFFLE(0, 2, 2, 0) moves dwords 0 and 2 into the low 64 bits, the only part that is stored
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0))); }
+
+// Truncating pack: keeps the low 32 bits of every 64-bit lane, result = [a0 a1 b0 b1].
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    // viewed as dwords: a = [a0.lo a0.hi a1.lo a1.hi], b likewise
+    const __m128i lane0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
+    return v_int32x4(_mm_unpacklo_epi32(lane0, lane1)); }   // a0.lo a1.lo b0.lo b1.lo
+
+// Stores the low 32 bits of both 64-bit lanes of a as two int values at ptr.
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    // _MM_SHUFFLE(0, 2, 2, 0) moves dwords 0 and 2 into the low 64 bits, the only part that is stored
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0))); }
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    // (x + 2^(n-1)) >> n per 64-bit lane (logical shift), then keep the low 32 bits: [a0 a1 b0 b1]
+    const v_uint64x2 half((uint64)1 << (n-1), (uint64)1 << (n-1));
+    const __m128i aShifted = _mm_srli_epi64(_mm_add_epi64(a.val, half.val), n);
+    const __m128i bShifted = _mm_srli_epi64(_mm_add_epi64(b.val, half.val), n);
+    const __m128i lane0 = _mm_unpacklo_epi32(aShifted, bShifted); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(aShifted, bShifted); // a1.lo b1.lo a1.hi b1.hi
+    return v_uint32x4(_mm_unpacklo_epi32(lane0, lane1));          // a0.lo a1.lo b0.lo b1.lo
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    // rounding logical shift per 64-bit lane; the low dword of each lane is stored
+    const v_uint64x2 half((uint64)1 << (n-1), (uint64)1 << (n-1));
+    const __m128i shifted = _mm_srli_epi64(_mm_add_epi64(a.val, half.val), n);
+    // gather dwords 0 and 2 into the low 64 bits before the 8-byte store
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+// For each 64-bit lane of a: all ones if the lane is negative, all zeros otherwise.
+inline __m128i v_sign_epi64(__m128i a)
+{ const __m128i dwordSigns = _mm_srai_epi32(a, 31);                  // sign mask of every dword
+  return _mm_shuffle_epi32(dwordSigns, _MM_SHUFFLE(3, 3, 1, 1)); }  // spread each high-dword mask over its 64-bit lane
+
+// 64-bit arithmetic right shift emulated as ((a ^ s) >> imm) ^ s, where s is the per-lane sign mask.
+inline __m128i v_srai_epi64(__m128i a, int imm)
+{
+    const __m128i sign = v_sign_epi64(a);
+    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, sign), imm), sign); }
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    // (x + 2^(n-1)) >> n per 64-bit lane using the emulated arithmetic shift, then keep the low 32 bits
+    const v_int64x2 half((int64)1 << (n-1), (int64)1 << (n-1));
+    const __m128i aShifted = v_srai_epi64(_mm_add_epi64(a.val, half.val), n);
+    const __m128i bShifted = v_srai_epi64(_mm_add_epi64(b.val, half.val), n);
+    const __m128i lane0 = _mm_unpacklo_epi32(aShifted, bShifted); // a0.lo b0.lo a0.hi b0.hi
+    const __m128i lane1 = _mm_unpackhi_epi32(aShifted, bShifted); // a1.lo b1.lo a1.hi b1.hi
+    return v_int32x4(_mm_unpacklo_epi32(lane0, lane1));           // a0.lo a1.lo b0.lo b1.lo
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    // rounding arithmetic shift per 64-bit lane (emulated); the low dword of each lane is stored
+    const v_int64x2 half((int64)1 << (n-1), (int64)1 << (n-1));
+    const __m128i shifted = v_srai_epi64(_mm_add_epi64(a.val, half.val), n);
+    // gather dwords 0 and 2 into the low 64 bits before the 8-byte store
+    _mm_storel_epi64((__m128i*)ptr, _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 2, 2, 0)));
+}
+
+// pack boolean: narrows two 16-bit vectors into one 8-bit vector with a signed-saturating pack
+// (an all-zero lane stays 0x00 and an all-ones lane becomes 0xFF)
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint8x16(_mm_packs_epi16(a.val, b.val));
+}
+
+// Narrows four 32-bit vectors to one 8-bit vector with two signed-saturating packs (32 -> 16 -> 8).
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    return v_uint8x16(_mm_packs_epi16(_mm_packs_epi32(a.val, b.val),
+                                      _mm_packs_epi32(c.val, d.val)));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h) // narrows eight 64-bit mask vectors into one 8-bit vector, in order a..h
+{
+    __m128i ab = _mm_packs_epi32(a.val, b.val); // each 64-bit lane is treated as two 32-bit halves -> two 16-bit values
+    __m128i cd = _mm_packs_epi32(c.val, d.val);
+    __m128i ef = _mm_packs_epi32(e.val, f.val);
+    __m128i gh = _mm_packs_epi32(g.val, h.val);
+
+    __m128i abcd = _mm_packs_epi32(ab, cd); // second 32->16 pass: one 16-bit value per original 64-bit lane (assumes lanes are all-zeros/all-ones masks)
+    __m128i efgh = _mm_packs_epi32(ef, gh);
+    return v_uint8x16(_mm_packs_epi16(abcd, efgh)); // final 16->8 narrowing
+}
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3) // returns v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
+{
+    const __m128 x = v.val; // each term below broadcasts lane i of v and scales m_i by it
+    const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
+    const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    const __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
+    return v_float32x4(_mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))); // pairwise sum: (t0 + t1) + (t2 + t3)
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a) // returns v[0]*m0 + v[1]*m1 + v[2]*m2 + a
+{
+    const __m128 x = v.val; // only lanes 0..2 of v are read; `a` takes the place of a fourth product
+    const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
+    const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    return v_float32x4(_mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, a.val))); // pairwise sum: (t0 + t1) + (t2 + a)
+}
+
+#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) /* defines `a bin_op b` and `a bin_op= b` for _Tpvec by forwarding both .val registers to intrin */ \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return _Tpvec(intrin(a.val, b.val)); \
+    } \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+    { \
+        a.val = intrin(a.val, b.val); \
+        return a; \
+    }
+
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) // 8- and 16-bit + / - map to the saturating adds/subs intrinsics
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) // 32-bit integer + / - use the plain (wrapping) add/sub intrinsics
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) // _v128_mullo_epi32 is a helper not shown in this section
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) // floating-point vectors get all four arithmetic operators
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) // 64-bit integer + / - wrap; no operator* is generated here for 64-bit lanes
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
+
+// saturating multiply 8-bit, 16-bit: operands are widened and multiplied by v_mul_expand, then the products are narrowed back with v_pack
+#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec)             \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d);                                     \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) // arguments: (operand vector type, widened product vector type)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int16x8,  v_int32x4)
+
+//  Multiply and expand
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d) // widening multiply: c and d receive the 16-bit products of the two halves produced by v_expand
+{
+    v_uint16x8 a0, a1, b0, b1;
+    v_expand(a, a0, a1); // split a into two 16-bit vectors (v_expand is defined outside this section)
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0); // 255 * 255 = 65025 fits in 16 bits, so the wrapping multiply loses nothing
+    d = v_mul_wrap(a1, b1);
+}
+
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d) // widening multiply: c and d receive the 16-bit products of the two halves produced by v_expand
+{
+    v_int16x8 a0, a1, b0, b1;
+    v_expand(a, a0, a1); // split a into two 16-bit vectors (v_expand is defined outside this section)
+    v_expand(b, b0, b1);
+    c = v_mul_wrap(a0, b0); // |product| <= 128 * 128 = 16384 fits in 16 bits, so the wrapping multiply loses nothing
+    d = v_mul_wrap(a1, b1);
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d) // signed widening multiply: c = products of lanes 0..3, d = products of lanes 4..7
+{
+    const __m128i lo = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    const __m128i hi = _mm_mulhi_epi16(a.val, b.val); // high 16 bits of every signed product
+    c.val = _mm_unpacklo_epi16(lo, hi); // interleave low/high halves into 32-bit results
+    d.val = _mm_unpackhi_epi16(lo, hi);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d) // unsigned widening multiply: c = products of lanes 0..3, d = products of lanes 4..7
+{
+    const __m128i lo = _mm_mullo_epi16(a.val, b.val); // low 16 bits of every product
+    const __m128i hi = _mm_mulhi_epu16(a.val, b.val); // high 16 bits of every unsigned product
+    c.val = _mm_unpacklo_epi16(lo, hi); // interleave low/high halves into 32-bit results
+    d.val = _mm_unpackhi_epi16(lo, hi);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d) // unsigned widening multiply: c = {a0*b0, a1*b1}, d = {a2*b2, a3*b3}
+{
+    const __m128i evens = _mm_mul_epu32(a.val, b.val); // 64-bit products of lanes 0 and 2
+    const __m128i odds = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // lanes 1 and 3, shifted into the even slots first
+    c.val = _mm_unpacklo_epi64(evens, odds);
+    d.val = _mm_unpackhi_epi64(evens, odds);
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); } // high 16 bits of each signed 16x16 product
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); } // high 16 bits of each unsigned 16x16 product
+
+//////// Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) // per 32-bit lane: a[2k]*b[2k] + a[2k+1]*b[2k+1]
+{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // accumulating form: v_dotprod(a, b) + c
+{ return v_dotprod(a, b) + c; }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) // per 64-bit lane: a[2k]*b[2k] + a[2k+1]*b[2k+1], using signed 32x32->64 products
+{
+#if CV_SSE4_1
+    __m128i even = _mm_mul_epi32(a.val, b.val); // signed products of dwords 0 and 2
+    __m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32)); // dwords 1 and 3, shifted into the even slots first
+    return v_int64x2(_mm_add_epi64(even, odd));
+#else
+    __m128i even_u = _mm_mul_epu32(a.val, b.val); // unsigned products; corrected to signed below
+    __m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    // convert unsigned to signed high multiplication (from: Agner Fog(veclib) and H S Warren: Hacker's delight, 2003, p. 132)
+    __m128i a_sign = _mm_srai_epi32(a.val, 31);
+    __m128i b_sign = _mm_srai_epi32(b.val, 31);
+    // keep a where b is negative, and b where a is negative
+    __m128i axb  = _mm_and_si128(a.val, b_sign);
+    __m128i bxa  = _mm_and_si128(b.val, a_sign);
+    // sum of sign corrections
+    __m128i ssum = _mm_add_epi32(bxa, axb);
+    __m128i even_ssum = _mm_slli_epi64(ssum, 32); // even-lane correction moved into the high dword, i.e. scaled by 2^32
+    __m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0)); // odd-lane correction already sits in the high dword; clear the low dword
+    // convert to signed and prod
+    return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
+#endif
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // accumulating form: v_dotprod(a, b) + c
+{ return v_dotprod(a, b) + c; }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) // per 32-bit lane: sum of the four byte products a[4k+i]*b[4k+i], i = 0..3
+{
+    __m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
+    __m128i a1 = _mm_srli_epi16(a.val, 8); // odd
+    __m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8); // byte shift moves even bytes to odd slots; the word shift then zero-extends them
+    __m128i b1 = _mm_srli_epi16(b.val, 8);
+    __m128i p0 = _mm_madd_epi16(a0, b0); // bytes 4k and 4k+2; operands are <= 255 so the signed madd cannot overflow
+    __m128i p1 = _mm_madd_epi16(a1, b1); // bytes 4k+1 and 4k+3
+    return v_uint32x4(_mm_add_epi32(p0, p1));
+}
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) // accumulating form: v_dotprod_expand(a, b) + c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) // per 32-bit lane: sum of the four signed byte products a[4k+i]*b[4k+i], i = 0..3
+{
+    __m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
+    __m128i a1 = _mm_srai_epi16(a.val, 8); // odd
+    __m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8); // byte shift moves even bytes to odd slots; the arithmetic word shift then sign-extends them
+    __m128i b1 = _mm_srai_epi16(b.val, 8);
+    __m128i p0 = _mm_madd_epi16(a0, b0); // bytes 4k and 4k+2
+    __m128i p1 = _mm_madd_epi16(a1, b1); // bytes 4k+1 and 4k+3
+    return v_int32x4(_mm_add_epi32(p0, p1));
+}
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) // accumulating form: v_dotprod_expand(a, b) + c
+{ return v_dotprod_expand(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) // 16-bit products accumulated into two 64-bit sums
+{
+    v_uint32x4 c, d;
+    v_mul_expand(a, b, c, d); // c = 32-bit products of lanes 0..3, d = lanes 4..7
+
+    v_uint64x2 c0, c1, d0, d1;
+    v_expand(c, c0, c1); // widen to 64 bits (v_expand is defined outside this section; presumably low half / high half)
+    v_expand(d, d0, d1);
+
+    c0 += c1; d0 += d1; // two partial sums per input half
+    return v_uint64x2(_mm_add_epi64(
+        _mm_unpacklo_epi64(c0.val, d0.val),
+        _mm_unpackhi_epi64(c0.val, d0.val)
+    )); // horizontal add: lane 0 = both c0 lanes, lane 1 = both d0 lanes
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // accumulating form: v_dotprod_expand(a, b) + c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) // signed 16-bit products accumulated into two 64-bit sums
+{
+    v_int32x4 prod = v_dotprod(a, b); // four 32-bit pair sums
+    v_int64x2 c, d;
+    v_expand(prod, c, d); // widen to 64 bits (v_expand is defined outside this section; presumably low half / high half)
+    return v_int64x2(_mm_add_epi64(
+        _mm_unpacklo_epi64(c.val, d.val),
+        _mm_unpackhi_epi64(c.val, d.val)
+    )); // horizontal add: lane 0 = both lanes of c, lane 1 = both lanes of d
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) // accumulating form: v_dotprod_expand(a, b) + c
+{ return v_dotprod_expand(a, b) + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) // 32-bit integer dot product returned as two doubles
+{
+#if CV_SSE4_1
+    return v_cvt_f64(v_dotprod(a, b)); // exact 64-bit integer pair sums, then converted to double
+#else
+    v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); // v_cvt_f64 / v_cvt_f64_high are defined outside this section; presumably low / high pair of lanes
+    v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
+
+    return v_float64x2(_mm_add_pd(
+        _mm_unpacklo_pd(c.val, d.val),
+        _mm_unpackhi_pd(c.val, d.val)
+    )); // horizontal add: lane 0 = both lanes of c, lane 1 = both lanes of d
+#endif
+}
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) // accumulating form: v_dotprod_expand(a, b) + c
+{ return v_dotprod_expand(a, b) + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) // on SSE the "fast" form simply forwards to v_dotprod
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // accumulating form: v_dotprod(a, b) + c
+{ return v_dotprod(a, b) + c; }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) // forwards to v_dotprod
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // accumulating form: v_dotprod_fast(a, b) + c
+{ return v_dotprod_fast(a, b) + c; }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) // byte dot product into four 32-bit partial sums; which bytes land in which lane follows v_expand_low/high, not v_dotprod_expand
+{
+    __m128i a0 = v_expand_low(a).val; // v_expand_low / v_expand_high are defined outside this section; presumably the low / high 8 bytes widened to 16 bits
+    __m128i a1 = v_expand_high(a).val;
+    __m128i b0 = v_expand_low(b).val;
+    __m128i b1 = v_expand_high(b).val;
+    __m128i p0 = _mm_madd_epi16(a0, b0); // adjacent pair sums from the first halves
+    __m128i p1 = _mm_madd_epi16(a1, b1); // adjacent pair sums from the second halves
+    return v_uint32x4(_mm_add_epi32(p0, p1));
+}
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) // accumulating form: v_dotprod_expand_fast(a, b) + c
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) // signed byte dot product into four 32-bit partial sums
+{
+#if CV_SSE4_1
+    __m128i a0 = _mm_cvtepi8_epi16(a.val); // sign-extend the low 8 bytes to 16 bits
+    __m128i a1 = v_expand_high(a).val; // v_expand_high is defined outside this section; presumably the high 8 bytes widened
+    __m128i b0 = _mm_cvtepi8_epi16(b.val);
+    __m128i b1 = v_expand_high(b).val;
+    __m128i p0 = _mm_madd_epi16(a0, b0);
+    __m128i p1 = _mm_madd_epi16(a1, b1);
+    return v_int32x4(_mm_add_epi32(p0, p1));
+#else
+    return v_dotprod_expand(a, b); // without SSE4.1 the regular implementation is reused
+#endif
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) // accumulating form: v_dotprod_expand_fast(a, b) + c
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) // 16-bit products accumulated into two 64-bit partial sums
+{
+    v_uint32x4 c, d;
+    v_mul_expand(a, b, c, d); // c = 32-bit products of lanes 0..3, d = lanes 4..7
+
+    v_uint64x2 c0, c1, d0, d1;
+    v_expand(c, c0, c1); // widen to 64 bits (v_expand is defined outside this section)
+    v_expand(d, d0, d1);
+
+    c0 += c1; d0 += d1;
+    return c0 + d0; // lane-wise add only: skips the horizontal shuffle that v_dotprod_expand performs on c0 and d0
+}
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // accumulating form: v_dotprod_expand_fast(a, b) + c
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) // signed 16-bit products accumulated into two 64-bit partial sums
+{
+    v_int32x4 prod = v_dotprod(a, b); // four 32-bit pair sums
+    v_int64x2 c, d;
+    v_expand(prod, c, d); // widen to 64 bits (v_expand is defined outside this section)
+    return c + d; // lane-wise add only: skips the horizontal shuffle that v_dotprod_expand performs on c and d
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) // accumulating form: v_dotprod_expand_fast(a, b) + c
+{ return v_dotprod_expand_fast(a, b) + c; }
+
+// 32 >> 64f
+v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); // forward declaration; the definition appears later in the file
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) // lane-wise cvt(a)*cvt(b) + cvt_high(a)*cvt_high(b) in double precision, no horizontal add
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a,   const v_int32x4& b, const v_float64x2& c) // accumulating form: both products are folded into c through two chained v_fma calls
+{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
+
+#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) /* defines &, |, ^ (plus their compound-assignment forms) and unary ~ for _Tpvec; ~ is XOR with not_const */ \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
+    }
+
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1)) // not_const is an all-ones register for every type
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1))) // float types operate on the raw bit pattern via the ps/pd logic intrinsics
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+inline v_float32x4 v_sqrt(const v_float32x4& x) // lane-wise square root
+{ return v_float32x4(_mm_sqrt_ps(x.val)); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x) // lane-wise 1/sqrt(x): hardware estimate refined by one Newton-Raphson step
+{
+    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
+    __m128 t = x.val;
+    __m128 h = _mm_mul_ps(t, _0_5); // h = x / 2
+    t = _mm_rsqrt_ps(t); // approximate reciprocal square root
+    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h))); // t = t * (1.5 - t*t*h)
+    return v_float32x4(t);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x) // lane-wise square root
+{ return v_float64x2(_mm_sqrt_pd(x.val)); }
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x) // lane-wise 1/sqrt(x), computed with a full-precision sqrt and divide
+{
+    const __m128d v_1 = _mm_set1_pd(1.);
+    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) /* v_abs(x) = func(x, 0 - x), returned as the unsigned type; the subtraction is the wrapping _mm_sub_ep##subWidth */ \
+inline _Tpuvec v_abs(const _Tpsvec& x) \
+{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
+
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8) // unsigned min of x and -x picks the non-negative one
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16) // signed max of x and -x
+inline v_uint32x4 v_abs(const v_int32x4& x) // lane-wise absolute value, returned as unsigned
+{
+    const __m128i fill = _mm_srai_epi32(x.val, 31); // -1 in negative lanes, 0 otherwise
+    const __m128i one = _mm_srli_epi32(x.val, 31); // 1 in negative lanes, 0 otherwise
+    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, fill), one)); // (~x + 1) where negative, x unchanged elsewhere
+}
+inline v_float32x4 v_abs(const v_float32x4& x) // clears the sign bit of every float lane
+{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
+inline v_float64x2 v_abs(const v_float64x2& x) // clears the sign bit of every double lane
+{
+    return v_float64x2(_mm_and_pd(x.val,
+        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1)))); // all-ones shifted right by one = 0x7FFF...F per 64-bit lane
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) /* defines func(a, b) for _Tpvec as a direct call to intrin on the two .val registers */ \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8) // only the type/intrinsic pairs listed here go through the macro; other min/max overloads are written out below
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
+
+inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b) // lane-wise signed 8-bit minimum
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_min_epi8(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi8((char)-128); // XOR with 0x80 maps signed order onto unsigned order
+    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta)))); // unsigned min on the biased values, then undo the bias
+#endif
+}
+inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b) // lane-wise signed 8-bit maximum
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_max_epi8(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi8((char)-128); // XOR with 0x80 maps signed order onto unsigned order
+    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta)))); // unsigned max on the biased values, then undo the bias
+#endif
+}
+inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b) // lane-wise unsigned 16-bit minimum
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_min_epu16(a.val, b.val));
+#else
+    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val))); // a - sat(a - b): yields b when a > b, otherwise a
+#endif
+}
+inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b) // lane-wise unsigned 16-bit maximum
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_max_epu16(a.val, b.val));
+#else
+    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val)); // sat(a - b) + b: yields a when a > b, otherwise b
+#endif
+}
+inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b) // lane-wise unsigned 32-bit minimum
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_min_epu32(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi32((int)0x80000000); // flipping the top bit lets the signed compare order unsigned values
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); // all ones where a > b (unsigned)
+    return v_uint32x4(v_select_si128(mask, b.val, a.val)); // v_select_si128 is defined outside this section; presumably picks b where mask is set, else a
+#endif
+}
+inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b) // lane-wise unsigned 32-bit maximum
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_max_epu32(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi32((int)0x80000000); // flipping the top bit lets the signed compare order unsigned values
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta)); // all ones where a > b (unsigned)
+    return v_uint32x4(v_select_si128(mask, a.val, b.val)); // presumably picks a where mask is set, else b
+#endif
+}
+inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b) // lane-wise signed 32-bit minimum
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_min_epi32(a.val, b.val));
+#else
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val)); // mask is set where a > b; v_select_si128 (defined outside this section) presumably picks b there
+#endif
+}
+inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) // lane-wise signed 32-bit maximum
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_max_epi32(a.val, b.val));
+#else
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val)); // mask is set where a > b; presumably picks a there
+#endif
+}
+
+#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) /* defines ==, !=, <, >, <=, >= for an unsigned/signed vector pair; results are lane masks. Unsigned ordering XORs both operands with sbit so the signed cmpgt can be used; <=, >= and != invert a mask by XOR with all ones */ \
+inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
+} \
+inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
+} \
+inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
+} \
+inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
+} \
+inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
+}
+
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128) // sbit is the sign bit of the lane width: 0x80
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) // 0x8000
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) /* defines the six comparison operators for a float vector type; each maps to the matching _mm_cmpXX intrinsic and returns a lane mask */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) // single precision
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) // double precision
+
+#if CV_SSE4_1
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) /* == and != for 64-bit integer lanes using the native 64-bit equality compare */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return ~(a == b); }
+#else
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) /* fallback: compare 32-bit halves, then AND each half's result with its neighbour (shuffle swaps the dwords of every qword) so a lane is set only if both halves match */ \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
+  return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return ~(a == b); }
+#endif
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) // only equality operators are generated for 64-bit lanes here
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
+
+inline v_float32x4 v_not_nan(const v_float32x4& a) // lane mask: all ones where a is not NaN (ordered compare of a with itself)
+{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a) // lane mask: all ones where a is not NaN (ordered compare of a with itself)
+{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8) // wrapping (modular) add, in contrast to the saturating operator+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8) // wrapping (modular) subtract
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_mul_wrap, _mm_mullo_epi16) // wrapping multiply: keeps the low 16 bits of each product
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_mul_wrap, _mm_mullo_epi16)
+
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) // wrapping 8-bit multiply assembled from two 16-bit multiplies
+{
+    const __m128i oddMask = _mm_set1_epi32(0xFF00FF00); // 0xFF in every odd byte position
+    const __m128i aOdd = _mm_srai_epi16(a.val, 8); // odd bytes moved down to the low byte of each word
+    const __m128i bOdd = _mm_srai_epi16(b.val, 8);
+    const __m128i evenProd = _mm_mullo_epi16(a.val, b.val); // low byte of each word = wrapped product of the even bytes
+    const __m128i oddProd = _mm_slli_epi16(_mm_mullo_epi16(aOdd, bOdd), 8); // odd-byte products moved back up to the high byte
+    return v_uint8x16(_v128_blendv_epi8(evenProd, oddProd, oddMask));
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) // signed wrapping multiply: reuses the unsigned version through reinterpret casts
+{
+    return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b)));
+}
+
+/** Absolute difference **/
+
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) // |a - b|: the saturating operator- makes one of the two differences zero, so their sum is the absolute difference
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) // |a - b|, same saturating-difference trick as the 8-bit overload
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) // |a - b| as max - min (32-bit operator- is not saturating)
+{ return v_max(a, b) - v_min(a, b); }
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) // |a - b| of signed bytes, returned in the unsigned type
+{
+    const v_int8x16 neg = a < b; // all ones in lanes where the difference is negative
+    const v_int8x16 diff = v_sub_wrap(a, b); // wrapping a - b
+    return v_reinterpret_as_u8(v_sub_wrap(diff ^ neg, neg)); // conditional two's-complement negate: (diff ^ -1) - (-1) = -diff
+}
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) // |a - b| of signed 16-bit lanes, returned in the unsigned type
+{
+    return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); // larger minus smaller, wrapping subtraction
+}
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) // |a - b| of signed 32-bit lanes, returned in the unsigned type
+{
+    const v_int32x4 neg = a < b; // all ones in lanes where the difference is negative
+    const v_int32x4 diff = a - b;
+    return v_reinterpret_as_u32((diff ^ neg) - neg); // conditional negate, same identity as the 8-bit overload
+}
+
+/** Saturating absolute difference **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) // saturating |a - b|, result stays in the signed type
+{
+    const v_int8x16 neg = a < b; // all ones in lanes where a < b
+    const v_int8x16 diff = a - b; // operator- on v_int8x16 is the saturating subtract
+    return (diff ^ neg) - neg; // conditional negate, again through the saturating subtract
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) // saturating |a - b| as max - min with the saturating operator-
+{ return v_max(a, b) - v_min(a, b); }
+
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) // integer multiply-add built from the vector operators: a * b + c
+{
+    return a * b + c;
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) // alias of v_fma for 32-bit integers
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) // a * b + c per float lane
+{
+#if CV_FMA3
+    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val)); // fused: single rounding
+#else
+    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val)); // separate multiply and add: two roundings
+#endif
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) // a * b + c per double lane
+{
+#if CV_FMA3
+    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val)); // fused: single rounding
+#else
+    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val)); // separate multiply and add: two roundings
+#endif
+}
+
+#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) /* float helpers: v_absdiff = (a - b) with the sign bit masked off, v_magnitude = sqrt(a*a + b*b), v_sqr_magnitude = a*a + b*b, v_muladd = v_fma. The _Tp parameter is not referenced in the body. */ \
+inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
+    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
+} \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec res = v_fma(a, a, b*b); \
+    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_fma(a, a, b*b); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return v_fma(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff)) // mask clears the sign bit of each 32-bit lane
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) // all-ones >> 1 clears the sign bit of each 64-bit lane
+
+#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) /* defines << and >> (runtime count) plus v_shl<imm> / v_shr<imm> (compile-time count) for an unsigned/signed pair; unsigned >> is logical, signed >> uses the supplied arithmetic-shift function srai */ \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64) // 64-bit lanes use the emulated arithmetic shift v_srai_epi64
+
+namespace hal_sse_internal // helpers for byte-wise extraction from the 32-byte concatenation of two registers
+{
+    template <int imm, // byte offset; the defaulted bool parameters classify it so that exactly one specialization below matches
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_half = (imm == 8),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+    class v_sse_palignr_u8_class;
+
+    template <int imm> // invalid offset: declared but never defined, so instantiating it fails to compile
+    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
+
+    template <int imm> // imm == 0: result is a unchanged
+    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i&) const
+        {
+            return a;
+        }
+    };
+
+    template <int imm> // imm == 8: high half of a followed by low half of b
+    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
+        }
+    };
+
+    template <int imm> // imm == 16: result is b unchanged
+    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i&, const __m128i& b) const
+        {
+            return b;
+        }
+    };
+
+    template <int imm> // any other offset in 1..15
+    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
+    {
+#if CV_SSSE3
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_alignr_epi8(b, a, imm); // note the swapped operand order: b supplies the high bytes
+        }
+#else
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            enum { imm2 = (sizeof(__m128i) - imm) }; // number of bytes taken from b
+            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2)); // (a >> imm bytes) | (b << (16 - imm) bytes)
+        }
+#endif
+    };
+
+    template <int imm> // entry point: bytes imm..imm+15 of the concatenation with a as the low half and b as the high half
+    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
+        return v_sse_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_srli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_slli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{   // Like the one-argument form, but the vacated top lanes are filled from the low lanes of b.
+    using namespace hal_sse_internal;
+    typedef typename _Tpvec::vector_type native_t;
+    enum { nbytes = (imm * sizeof(typename _Tpvec::lane_type)) }; // lane count -> byte count
+    const __m128i first  = v_sse_reinterpret_as<__m128i>(a.val);
+    const __m128i second = v_sse_reinterpret_as<__m128i>(b.val);
+    return _Tpvec(v_sse_reinterpret_as<native_t>(v_sse_palignr_u8<nbytes>(first, second)));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{   // Moves a's lanes imm positions up; the vacated low lanes are filled from the top lanes of b.
+    using namespace hal_sse_internal;
+    typedef typename _Tpvec::vector_type native_t;
+    enum { nbytes = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) }; // complementary right shift
+    const __m128i low_src  = v_sse_reinterpret_as<__m128i>(b.val);
+    const __m128i high_src = v_sse_reinterpret_as<__m128i>(a.val);
+    return _Tpvec(v_sse_reinterpret_as<native_t>(v_sse_palignr_u8<nbytes>(low_src, high_src)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) /* load/store family for one 128-bit integer vector type */ \
+inline _Tpvec v_load(const _Tp* ptr) /* unaligned 16-byte load */ \
+{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* aligned load: ptr must be 16-byte aligned */ \
+{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* reads 8 bytes into the low half; the upper half is zeroed */ \
+{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* low half from ptr0, high half from ptr1 (8 bytes each) */ \
+{ \
+    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned 16-byte store */ \
+{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* aligned store: ptr must be 16-byte aligned */ \
+{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* aligned non-temporal (streaming) store */ \
+{ _mm_stream_si128((__m128i*)ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* run-time choice between the three stores above */ \
+{ \
+    if( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_si128((__m128i*)ptr, a.val); \
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_si128((__m128i*)ptr, a.val); \
+    else /* every other mode value falls back to the plain aligned store */ \
+        _mm_store_si128((__m128i*)ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes only the low 8 bytes of a */ \
+{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* writes only the high 8 bytes of a (moved down first) */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
+// Generate the load/store functions for every integer vector type and its scalar lane type.
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) /* load/store family for a float vector; suffix is pasted into the intrinsic names */ \
+inline _Tpvec v_load(const _Tp* ptr) /* unaligned full-width load */ \
+{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) /* aligned load: ptr must be 16-byte aligned */ \
+{ return _Tpvec(_mm_load_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) /* 8 bytes into the low half, upper half zeroed; loaded as integer bits, then bit-cast */ \
+{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) /* low half from ptr0, high half from ptr1 (8 bytes each) */ \
+{ \
+    return _Tpvec(_mm_castsi128_##suffix( \
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) /* unaligned full-width store */ \
+{ _mm_storeu_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) /* aligned store: ptr must be 16-byte aligned */ \
+{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) /* aligned non-temporal (streaming) store */ \
+{ _mm_stream_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) /* run-time choice between the three stores above */ \
+{ \
+    if( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_##suffix(ptr, a.val); \
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_##suffix(ptr, a.val); \
+    else /* every other mode value falls back to the plain aligned store */ \
+        _mm_store_##suffix(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) /* writes only the low 8 bytes of a */ \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) /* writes only the high 8 bytes of a */ \
+{ \
+    __m128i a1 = _mm_cast##suffix##_si128(a.val); /* bit-cast so the 64-bit integer unpack/store can be reused */ \
+    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
+}
+// Generate the load/store functions for the two floating-point vector types.
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+
+inline unsigned v_reduce_sum(const v_uint8x16& a)
+{   // psadbw against zero yields one byte-sum per 64-bit half; the two halves are then added
+    const __m128i partial = _mm_sad_epu8(a.val, _mm_setzero_si128());
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(_mm_unpackhi_epi64(partial, partial), partial));
+}
+inline int v_reduce_sum(const v_int8x16& a)
+{   // Bias every lane by +128 (sign-bit flip) so the unsigned psadbw applies, then remove 16 * 128.
+    const __m128i sign_bits = _mm_set1_epi8((schar)-128);
+    const __m128i partial = _mm_sad_epu8(_mm_xor_si128(a.val, sign_bits), _mm_setzero_si128());
+    return _mm_cvtsi128_si32(_mm_add_epi32(_mm_unpackhi_epi64(partial, partial), partial)) - 2048;
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) /* horizontal min/max over 16 byte lanes; func is pasted into _mm_<func>_epu8 */ \
+inline schar v_reduce_##func(const v_int8x16& a) \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi8((schar)-128); \
+    val = _mm_xor_si128(val, smask); /* flip sign bits so signed order matches the unsigned compare */ \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); /* fold 16 candidates into 8 */ \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); /* 8 -> 4 */ \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); /* 4 -> 2 */ \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); /* 2 -> 1, result ends up in byte 0 */ \
+    return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; /* take byte 0 and flip its sign bit back */ \
+} \
+inline uchar v_reduce_##func(const v_uint8x16& a) /* same folding, no bias needed for unsigned lanes */ \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
+    val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
+    return (uchar)_mm_cvtsi128_si32(val); /* byte 0 holds the reduced value */ \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max) // defines v_reduce_max for v_int8x16 and v_uint8x16
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min) // defines v_reduce_min for v_int8x16 and v_uint8x16
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) /* horizontal min/max over 8 lanes, signed and unsigned variants */ \
+inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); /* fold 8 candidates into 4 */ \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); /* 4 -> 2 */ \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); /* 2 -> 1, result ends up in lane 0 */ \
+    return (scalartype)_mm_cvtsi128_si32(val); /* cast keeps only lane 0 */ \
+} \
+inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) /* unsigned variant reuses the signed intrinsic */ \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi16(sbit); \
+    val = _mm_xor_si128(val, smask); /* flip sign bits so unsigned order matches the signed compare */ \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
+    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^  sbit); /* undo the bias; the cast drops the upper bits */ \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768) // v_reduce_max for v_int16x8 / v_uint16x8
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768) // v_reduce_min for v_int16x8 / v_uint16x8
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) /* horizontal sum of four 32-bit lanes */ \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); /* lane0 += lane2, lane1 += lane3 */ \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); /* lane0 += lane1: lane 0 now holds the total */ \
+    return (scalartype)_mm_cvt##extract(val); /* extract lane 0 */ \
+}
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) /* min/max over four lanes, computed in scalar code */ \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; /* aligned because v_store_aligned is used below */ \
+    v_store_aligned(buf, a); \
+    scalartype s0 = scalar_func(buf[0], buf[1]); \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); /* combine the two pairwise results */ \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) // integer: no bit-casts passed
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32) // integer: no bit-casts passed
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) // float: bit-cast around the byte shift
+
+inline int v_reduce_sum(const v_int16x8& a)
+{ const v_int32x4 widened = v_expand_low(a) + v_expand_high(a); return v_reduce_sum(widened); } // widen to 32 bits before summing
+inline unsigned v_reduce_sum(const v_uint16x8& a)
+{ const v_uint32x4 widened = v_expand_low(a) + v_expand_high(a); return v_reduce_sum(widened); } // widen to 32 bits before summing
+
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{   // spill both lanes to an aligned scratch array and add them as scalars
+    uint64 CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a);
+    const uint64 lo = lanes[0], hi = lanes[1]; return lo + hi;
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{   // spill both lanes to an aligned scratch array and add them as scalars
+    int64 CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a);
+    const int64 lo = lanes[0], hi = lanes[1]; return lo + hi;
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{   // spill both lanes to an aligned scratch array and add them as scalars
+    double CV_DECL_ALIGNED(32) lanes[2];
+    v_store_aligned(lanes, a);
+    const double lo = lanes[0], hi = lanes[1]; return lo + hi;
+}
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{   // Returns (sum of a's lanes, sum of b's lanes, sum of c's lanes, sum of d's lanes).
+#if CV_SSE3
+    __m128 ab = _mm_hadd_ps(a.val, b.val); // a0+a1, a2+a3, b0+b1, b2+b3
+    __m128 cd = _mm_hadd_ps(c.val, d.val); // c0+c1, c2+c3, d0+d1, d2+d3
+    return v_float32x4(_mm_hadd_ps(ab, cd)); // second horizontal add completes each of the four sums
+#else
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val)); // a0+a2, c0+c2, a1+a3, c1+c3
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val)); // b0+b2, d0+d2, b1+b3, d1+d3
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd))); // NOTE: lanes are paired differently than in the SSE3 path, so rounding may differ
+#endif
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) // v_reduce_max(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) // v_reduce_min(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)       // v_reduce_max(v_int32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)       // v_reduce_min(v_int32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)   // v_reduce_max(v_float32x4)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)   // v_reduce_min(v_float32x4)
+
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{   // psadbw produces sum(|a[i] - b[i]|) separately for each 64-bit half; add the two halves
+    const __m128i partial = _mm_sad_epu8(a.val, b.val);
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(_mm_unpackhi_epi64(partial, partial), partial));
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{   // Flip the sign bit to map [-128,127] monotonically onto [0,255] for psadbw; a +0x7f bias would wrap -128 to 255.
+    const __m128i bias = _mm_set1_epi8((schar)-128);
+    __m128i half = _mm_sad_epu8(_mm_xor_si128(a.val, bias), _mm_xor_si128(b.val, bias)); // one partial sum per 64-bit half
+    return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
+}
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{   // widen the per-lane absolute differences to 32 bits before summing them
+    const v_uint16x8 diff = v_absdiff(a, b);
+    const v_uint32x4 widened = v_expand_low(diff) + v_expand_high(diff);
+    return v_reduce_sum(widened);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{   // widen the per-lane absolute differences to 32 bits before summing them
+    const v_uint16x8 diff = v_absdiff(a, b);
+    const v_uint32x4 widened = v_expand_low(diff) + v_expand_high(diff);
+    return v_reduce_sum(widened);
+}
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{   // horizontal sum of the lanes of v_absdiff(a, b); v_absdiff is defined outside this section
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{   // same composition for signed input; the result is returned as unsigned
+    return v_reduce_sum(v_absdiff(a, b));
+}
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{   // same composition for float lanes; the result is returned as float
+    return v_reduce_sum(v_absdiff(a, b));
+}
+
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{   // Parallel bit count: sum neighbouring 1-bit, then 2-bit, then 4-bit fields, leaving a count per byte.
+    const __m128i every_bit    = _mm_set1_epi32(0x55555555);
+    const __m128i every_pair   = _mm_set1_epi32(0x33333333);
+    const __m128i every_nibble = _mm_set1_epi32(0x0f0f0f0f);
+    __m128i cnt = a.val;
+    cnt = _mm_add_epi32(_mm_and_si128(cnt, every_bit), _mm_and_si128(_mm_srli_epi32(cnt, 1), every_bit));
+    cnt = _mm_add_epi32(_mm_and_si128(cnt, every_pair), _mm_and_si128(_mm_srli_epi32(cnt, 2), every_pair));
+    cnt = _mm_add_epi32(_mm_and_si128(cnt, every_nibble), _mm_and_si128(_mm_srli_epi32(cnt, 4), every_nibble));
+    return v_uint8x16(cnt);
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{   // per-byte counts first; then each lane's high-byte count is added onto its low byte
+    v_uint8x16 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts);
+    return v_setall_u16(0x00ff) & v_reinterpret_as_u16(counts); // keep only the low byte of every 16-bit lane
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{   // per-byte counts first; two shifted additions gather all four byte counts in byte 0 of each lane
+    v_uint8x16 counts = v_popcount(v_reinterpret_as_u8(a));
+    counts += v_rotate_right<1>(counts);
+    counts += v_rotate_right<2>(counts);
+    return v_setall_u32(0x000000ff) & v_reinterpret_as_u32(counts); // keep only the low byte of every 32-bit lane
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{   const v_uint8x16 per_byte = v_popcount(v_reinterpret_as_u8(a)); // bit count of every byte
+    return v_uint64x2(_mm_sad_epu8(per_byte.val, _mm_setzero_si128())); // psadbw vs zero adds up each group of 8 bytes
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a) // signed overloads count bits of the same-width unsigned view
+{ const v_uint8x16 bits = v_reinterpret_as_u8(a); return v_popcount(bits); }
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{ const v_uint16x8 bits = v_reinterpret_as_u16(a); return v_popcount(bits); }
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{ const v_uint32x4 bits = v_reinterpret_as_u32(a); return v_popcount(bits); }
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{ const v_uint64x2 bits = v_reinterpret_as_u64(a); return v_popcount(bits); }
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) /* sign-bit queries built on movemask; allmask = mask with every lane set */ \
+inline int v_signmask(const _Tpvec& a)   { return _mm_movemask_##suffix(cast_op(a.val)); } /* bit i = most significant bit of lane i */ \
+inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } /* true if every lane has its top bit set */ \
+inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; } /* true if at least one lane has its top bit set */
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535) // 16 lanes -> 16 mask bits
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15) // 32-bit integers reuse the float movemask via a bit-cast
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3) // 64-bit integers reuse the double movemask via a bit-cast
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) /* 16-bit lanes have no movemask of their own, so the byte movemask is adapted */ \
+inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } /* saturating pack keeps each lane's sign; low 8 bits = one per lane */ \
+inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } /* 0xaaaa selects the high byte of every 16-bit lane */ \
+inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; } /* any lane with its top bit set */
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
+
+inline int v_scan_forward(const v_int8x16& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs); } // index of the first byte whose top bit is set
+inline int v_scan_forward(const v_uint8x16& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs); }
+inline int v_scan_forward(const v_int16x8& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 2; } // byte index -> 16-bit lane index
+inline int v_scan_forward(const v_uint16x8& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 2; }
+inline int v_scan_forward(const v_int32x4& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 4; } // byte index -> 32-bit lane index
+inline int v_scan_forward(const v_uint32x4& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 4; }
+inline int v_scan_forward(const v_float32x4& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 4; }
+inline int v_scan_forward(const v_int64x2& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 8; } // byte index -> 64-bit lane index
+inline int v_scan_forward(const v_uint64x2& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 8; }
+inline int v_scan_forward(const v_float64x2& a) { const int signs = v_signmask(v_reinterpret_as_s8(a)); return trailingZeros32(signs) / 8; }
+
+#if CV_SSE4_1
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) /* v_select via the SSE4.1 variable blend */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); /* element from a where the mask's top bit is set, else from b */ \
+}
+// 8- and 16-bit lanes blend per byte; 32-bit integer lanes go through the float blend with bit-casts.
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
+
+#else // CV_SSE4_1
+// Without SSE4.1 the selection is done bit by bit, so mask lanes are expected to be all-ones or all-zeros.
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) /* v_select via xor/and: b ^ ((b ^ a) & mask) */ \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); /* bit from a where mask bit is 1, else from b */ \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+#endif
+
+/* Expand */
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    /* widening conversions to lanes twice as wide */ \
+    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) /* b0 <- intrin(a), b1 <- the _high variant */ \
+    {                                                               \
+        b0.val = intrin(a.val);                                     \
+        b1.val = __CV_CAT(intrin, _high)(a.val);                    /* presumably pastes the suffix onto the intrinsic name -- TODO confirm */ \
+    }                                                               \
+    inline _Tpwvec v_expand_low(const _Tpvec& a)                    /* same value v_expand stores in b0 */ \
+    { return _Tpwvec(intrin(a.val)); }                              \
+    inline _Tpwvec v_expand_high(const _Tpvec& a)                   /* same value v_expand stores in b1 */ \
+    { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }             \
+    inline _Tpwvec v_load_expand(const _Tp* ptr)                    /* loads only 8 bytes, then widens them */ \
+    {                                                               \
+        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);           \
+        return _Tpwvec(intrin(a));                                  \
+    }
+// One instantiation per (narrow vector, wide vector) pair; the intrinsic fixes signed vs. unsigned extension.
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8,  uchar,    _v128_cvtepu8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16,  v_int16x8,   schar,    _v128_cvtepi8_epi16)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4,  ushort,   _v128_cvtepu16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8,  v_int32x4,   short,    _v128_cvtepi16_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2,  unsigned, _v128_cvtepu32_epi64)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4,  v_int64x2,   int,      _v128_cvtepi32_epi64)
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin)  /* load 4 bytes and widen each to a 32-bit lane */ \
+    inline _Tpvec v_load_expand_q(const _Tp* ptr)          \
+    {                                                      \
+        __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);   /* reads exactly 4 bytes through an int pointer */ \
+        return _Tpvec(intrin(a));                          \
+    }
+// Unsigned bytes are widened with the cvtepu8 helper, signed bytes with the cvtepi8 helper.
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
+OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4,  schar, _v128_cvtepi8_epi32)
+
+#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) /* interleave and half-combine helpers; cast_* bit-cast to/from __m128i */ \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) /* lane-wise interleave of a0 and a1 */ \
+{ \
+    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); /* interleaved low halves */ \
+    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); /* interleaved high halves */ \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) /* (low 64 bits of a, low 64 bits of b) */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) /* (high 64 bits of a, high 64 bits of b) */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* both combinations in one call */ \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); /* c gets the two low halves */ \
+    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); /* d gets the two high halves */ \
+}
+// Integer types pass OPENCV_HAL_NOP as the cast arguments; float types pass real bit-casts to and from __m128i.
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
+
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{   // Returns the 16 byte lanes of a in opposite order (lane 15 first).
+#if CV_SSSE3
+    static const __m128i descending = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return v_uint8x16(_mm_shuffle_epi8(a.val, descending));
+#else
+    uchar CV_DECL_ALIGNED(32) lanes[16];
+    v_store_aligned(lanes, a);
+    return v_uint8x16(lanes[15], lanes[14], lanes[13], lanes[12], lanes[11], lanes[10], lanes[9], lanes[8], lanes[7], lanes[6], lanes[5], lanes[4], lanes[3], lanes[2], lanes[1], lanes[0]);
+#endif
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ const v_uint8x16 flipped = v_reverse(v_reinterpret_as_u8(a)); return v_reinterpret_as_s8(flipped); } // reuse the unsigned implementation
+
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{   // Returns the eight 16-bit lanes of a in opposite order.
+#if CV_SSSE3
+    static const __m128i word_order = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    return v_uint16x8(_mm_shuffle_epi8(a.val, word_order));
+#else
+    // SSE2: reverse the four dwords, then swap the two words inside every dword.
+    const __m128i dwords = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
+    const __m128i low_fixed = _mm_shufflelo_epi16(dwords, _MM_SHUFFLE(2, 3, 0, 1));
+    return v_uint16x8(_mm_shufflehi_epi16(low_fixed, _MM_SHUFFLE(2, 3, 0, 1)));
+#endif
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ const v_uint16x8 flipped = v_reverse(v_reinterpret_as_u16(a)); return v_reinterpret_as_s16(flipped); } // reuse the unsigned implementation
+
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{   // selector (0, 1, 2, 3) places source lanes 3, 2, 1, 0 into result lanes 0..3
+    const __m128i flipped = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)); return v_uint32x4(flipped);
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ const v_uint32x4 flipped = v_reverse(v_reinterpret_as_u32(a)); return v_reinterpret_as_s32(flipped); } // reuse the unsigned implementation
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ const v_uint32x4 flipped = v_reverse(v_reinterpret_as_u32(a)); return v_reinterpret_as_f32(flipped); } // lanes are moved as raw 32-bit patterns
+
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{   // selector (1, 0, 3, 2) exchanges the two 64-bit halves, expressed as pairs of dwords
+    const __m128i swapped = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)); return v_uint64x2(swapped);
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ const v_uint64x2 swapped = v_reverse(v_reinterpret_as_u64(a)); return v_reinterpret_as_s64(swapped); } // reuse the unsigned implementation
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ const v_uint64x2 swapped = v_reverse(v_reinterpret_as_u64(a)); return v_reinterpret_as_f64(swapped); } // lanes are moved as raw 64-bit patterns
+
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{   // Identical to the two-operand right rotation by s lanes.
+    return v_rotate_right<s, _Tpvec>(a, b);
+}
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{ const __m128i rounded = _mm_cvtps_epi32(a.val); return v_int32x4(rounded); } // conversion follows the current MXCSR rounding mode
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{   // Round first, then subtract 1 in lanes where the rounded value ended up above a.
+    const __m128i nearest = _mm_cvtps_epi32(a.val);
+    const __m128 overshoot = _mm_cmpgt_ps(_mm_cvtepi32_ps(nearest), a.val); // all-ones (== -1 as an int) where rounded > a
+    return v_int32x4(_mm_add_epi32(nearest, _mm_castps_si128(overshoot)));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{   // Round first, then add 1 in lanes where the rounded value ended up below a.
+    const __m128i nearest = _mm_cvtps_epi32(a.val);
+    const __m128 undershoot = _mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(nearest)); // all-ones (== -1 as an int) where a > rounded
+    return v_int32x4(_mm_sub_epi32(nearest, _mm_castps_si128(undershoot))); // subtracting -1 adds 1
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ const __m128i chopped = _mm_cvttps_epi32(a.val); return v_int32x4(chopped); } // truncating conversion (toward zero)
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{ const __m128i rounded = _mm_cvtpd_epi32(a.val); return v_int32x4(rounded); } // two results in the low lanes; the upper two lanes are zero
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{   // each conversion leaves two ints in its low 64 bits; a's pair goes below b's pair
+    const __m128i from_a = _mm_cvtpd_epi32(a.val);
+    return v_int32x4(_mm_unpacklo_epi64(from_a, _mm_cvtpd_epi32(b.val)));
+}
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{   // Round first, then subtract 1 where the rounded value ended up above a (two low lanes only).
+    const __m128i nearest = _mm_cvtpd_epi32(a.val);
+    const __m128i wide = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(nearest), a.val)); // 64-bit masks: m0 m0 m1 m1
+    const __m128i packed = _mm_srli_si128(_mm_slli_si128(wide, 4), 8);                    // compacted:    m0 m1 0 0
+    return v_int32x4(_mm_add_epi32(nearest, packed));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{   // Round first, then add 1 where the rounded value ended up below a (two low lanes only).
+    const __m128i nearest = _mm_cvtpd_epi32(a.val);
+    const __m128i wide = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(nearest))); // 64-bit masks: m0 m0 m1 m1
+    const __m128i packed = _mm_srli_si128(_mm_slli_si128(wide, 4), 8);                    // compacted:    m0 m1 0 0
+    return v_int32x4(_mm_sub_epi32(nearest, packed)); // subtracting -1 adds 1
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{ const __m128i chopped = _mm_cvttpd_epi32(a.val); return v_int32x4(chopped); } // truncating conversion; results land in the two low lanes
+
+#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) /* 4x4 transpose: rows a0..a3 in, columns b0..b3 out */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); /* a0[0] a1[0] a0[1] a1[1] */ \
+    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); /* a2[0] a3[0] a2[1] a3[1] */ \
+    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); /* a0[2] a1[2] a0[3] a1[3] */ \
+    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); /* a2[2] a3[2] a2[3] a3[3] */ \
+\
+    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); /* column 0: a0[0] a1[0] a2[0] a3[0] */ \
+    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); /* column 1 */ \
+    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); /* column 2 */ \
+    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); /* column 3 */ \
+}
+// Integer lanes pass OPENCV_HAL_NOP as casts; float lanes are bit-cast for the 64-bit unpack step.
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
+
+// load deinterleave
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{   // Reads 32 bytes x0..x31 (unaligned) and returns a = even-indexed bytes, b = odd-indexed bytes.
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);        // x0 .. x15
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // x16 .. x31
+// Four rounds of byte interleaving; each round halves the index stride between neighbours.
+    __m128i t10 = _mm_unpacklo_epi8(t00, t01); // x0 x16 x1 x17 ...
+    __m128i t11 = _mm_unpackhi_epi8(t00, t01); // x8 x24 x9 x25 ...
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, t11); // x0 x8 x16 x24 ...
+    __m128i t21 = _mm_unpackhi_epi8(t10, t11); // x4 x12 x20 x28 ...
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, t21); // x0 x4 x8 x12 ...
+    __m128i t31 = _mm_unpackhi_epi8(t20, t21); // x2 x6 x10 x14 ...
+
+    a.val = _mm_unpacklo_epi8(t30, t31); // x0 x2 x4 ... x30
+    b.val = _mm_unpackhi_epi8(t30, t31); // x1 x3 x5 ... x31
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{   // Reads 48 bytes of 3-channel interleaved data (unaligned loads) and writes one channel to each of a, b, c.
+#if CV_SSE4_1
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // selects byte positions 2, 5, 8, 11, 14
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // selects byte positions 1, 4, 7, 10, 13
+    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); // all 16 bytes of the first channel, not yet in order
+    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1); // all 16 bytes of the second channel, not yet in order
+    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1); // all 16 bytes of the third channel, not yet in order
+    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_b); // put each channel's bytes into element order
+    b0 = _mm_shuffle_epi8(b0, sh_g);
+    c0 = _mm_shuffle_epi8(c0, sh_r);
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#elif CV_SSSE3
+    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
+    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); // m0 rotated by 11 bytes
+    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);  // m0 rotated by 6 bytes
+
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+
+    __m128i s0 = _mm_shuffle_epi8(t0, m0); // NOTE(review): appears to group each register's bytes by channel -- confirm
+    __m128i s1 = _mm_shuffle_epi8(t1, m1);
+    __m128i s2 = _mm_shuffle_epi8(t2, m2);
+
+    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5); // the shift/align steps below splice the per-register runs together
+    a.val = _mm_alignr_epi8(s2, t0, 5);
+
+    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
+    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
+
+    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
+    c.val = _mm_alignr_epi8(t2, s0, 11);
+#else
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+// SSE2 only: four rounds of byte unpacks over 8-byte halves; statement order matters, do not reorder.
+    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
+    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
+    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
+
+    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
+    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
+    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
+#endif
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{   // Reads 64 bytes of 4-channel interleaved data (unaligned loads) and writes one channel to each of a, b, c, d.
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
+// Round 1: pair up registers that are 8 elements apart.
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
+// Round 2: element stride inside each channel run drops to 4.
+    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
+    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
+    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
+    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
+// Round 3: stride 2; the low registers hold channels a/b, the high ones c/d.
+    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
+    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
+// Final round: merge even and odd elements into complete channels.
+    a.val = _mm_unpacklo_epi8(v0, v1);
+    b.val = _mm_unpackhi_epi8(v0, v1);
+    c.val = _mm_unpacklo_epi8(v2, v3);
+    d.val = _mm_unpackhi_epi8(v2, v3);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{   // Splits 16 interleaved values (a0 b0 a1 b1 ...) into a = a0..a7 and b = b0..b7; loads are unaligned and const-correct.
+    __m128i v0 = _mm_loadu_si128((const __m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i v1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{   // Reads 24 values of 3-channel interleaved data (unaligned loads) and writes one channel to each of a, b, c.
+#if CV_SSE4_1
+    __m128i v0 = _mm_loadu_si128((const __m128i*)(ptr));      // loads keep the pointer const, like the fallback branch
+    __m128i v1 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+    __m128i v2 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24); // 0x92 picks words 1,4,7; 0x24 picks words 2,5
+    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24); // each blend gathers the 8 words of one channel
+    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
+
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_a); // put each channel's words into element order
+    b0 = _mm_shuffle_epi8(b0, sh_b);
+    c0 = _mm_shuffle_epi8(c0, sh_c);
+
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#else
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+// SSE2 only: three rounds of 16-bit unpacks over 8-byte halves; statement order matters, do not reorder.
+    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
+
+    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
+    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
+    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
+#endif
+}
+
+// Splits 32 interleaved 16-bit values (a0 b0 c0 d0 a1 b1 c1 d1 ...) read from ptr into channel vectors a, b, c and d.
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i q0 = _mm_loadu_si128(src);     // a0 b0 c0 d0 a1 b1 c1 d1
+    const __m128i q1 = _mm_loadu_si128(src + 1); // a2 b2 c2 d2 a3 b3 c3 d3
+    const __m128i q2 = _mm_loadu_si128(src + 2); // a4 b4 c4 d4 a5 b5 c5 d5
+    const __m128i q3 = _mm_loadu_si128(src + 3); // a6 b6 c6 d6 a7 b7 c7 d7
+
+    // Round 1: pair elements that are four pixels apart.
+    const __m128i p0 = _mm_unpacklo_epi16(q0, q2); // a0 a4 b0 b4 c0 c4 d0 d4
+    const __m128i p1 = _mm_unpackhi_epi16(q0, q2); // a1 a5 b1 b5 c1 c5 d1 d5
+    const __m128i p2 = _mm_unpacklo_epi16(q1, q3); // a2 a6 b2 b6 c2 c6 d2 d6
+    const __m128i p3 = _mm_unpackhi_epi16(q1, q3); // a3 a7 b3 b7 c3 c7 d3 d7
+
+    // Round 2: collect even/odd indices of the a|b and c|d channel pairs.
+    const __m128i abEven = _mm_unpacklo_epi16(p0, p2); // a0 a2 a4 a6 b0 b2 b4 b6
+    const __m128i abOdd  = _mm_unpacklo_epi16(p1, p3); // a1 a3 a5 a7 b1 b3 b5 b7
+    const __m128i cdEven = _mm_unpackhi_epi16(p0, p2); // c0 c2 c4 c6 d0 d2 d4 d6
+    const __m128i cdOdd  = _mm_unpackhi_epi16(p1, p3); // c1 c3 c5 c7 d1 d3 d5 d7
+
+    // Round 3: merge even and odd halves into fully ordered channels.
+    a.val = _mm_unpacklo_epi16(abEven, abOdd);
+    b.val = _mm_unpackhi_epi16(abEven, abOdd);
+    c.val = _mm_unpacklo_epi16(cdEven, cdOdd);
+    d.val = _mm_unpackhi_epi16(cdEven, cdOdd);
+}
+
+// Splits 8 interleaved 32-bit values (a0 b0 a1 b1 ...) read from ptr into the channel vectors a and b.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i first  = _mm_loadu_si128(src);     // a0 b0 a1 b1
+    const __m128i second = _mm_loadu_si128(src + 1); // a2 b2 a3 b3
+
+    const __m128i even = _mm_unpacklo_epi32(first, second); // a0 a2 b0 b2
+    const __m128i odd  = _mm_unpackhi_epi32(first, second); // a1 a3 b1 b3
+
+    a.val = _mm_unpacklo_epi32(even, odd); // a0 a1 a2 a3
+    b.val = _mm_unpackhi_epi32(even, odd); // b0 b1 b2 b3
+}
+
+// Splits 12 interleaved 32-bit values (a0 b0 c0 a1 b1 c1 ...) read from ptr into channel vectors a, b and c.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i r0 = _mm_loadu_si128(src);     // a0 b0 c0 a1
+    const __m128i r1 = _mm_loadu_si128(src + 1); // b1 c1 a2 b2
+    const __m128i r2 = _mm_loadu_si128(src + 2); // c2 a3 b3 c3
+
+    // Upper 64-bit halves moved down so that unpacklo can reach them.
+    const __m128i r0hi = _mm_unpackhi_epi64(r0, r0); // c0 a1 . .
+    const __m128i r1hi = _mm_unpackhi_epi64(r1, r1); // a2 b2 . .
+    const __m128i r2hi = _mm_unpackhi_epi64(r2, r2); // b3 c3 . .
+
+    const __m128i m0 = _mm_unpacklo_epi32(r0, r1hi); // a0 a2 b0 b2
+    const __m128i m1 = _mm_unpacklo_epi32(r0hi, r2); // c0 c2 a1 a3
+    const __m128i m2 = _mm_unpacklo_epi32(r1, r2hi); // b1 b3 c1 c3
+
+    a.val = _mm_unpacklo_epi32(m0, _mm_unpackhi_epi64(m1, m1)); // a0 a1 a2 a3
+    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(m0, m0), m2); // b0 b1 b2 b3
+    c.val = _mm_unpacklo_epi32(m1, _mm_unpackhi_epi64(m2, m2)); // c0 c1 c2 c3
+}
+
+// Reads 16 interleaved 32-bit values (one a/b/c/d quad per 128-bit load) and hands the four
+// rows to v_transpose4x4, which writes the results into a, b, c and d.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    v_uint32x4 quad0(_mm_loadu_si128(src));     // a0 b0 c0 d0
+    v_uint32x4 quad1(_mm_loadu_si128(src + 1)); // a1 b1 c1 d1
+    v_uint32x4 quad2(_mm_loadu_si128(src + 2)); // a2 b2 c2 d2
+    v_uint32x4 quad3(_mm_loadu_si128(src + 3)); // a3 b3 c3 d3
+
+    v_transpose4x4(quad0, quad1, quad2, quad3, a, b, c, d);
+}
+
+// Splits 8 interleaved floats (a0 b0 a1 b1 ...) read from ptr into the channel vectors a and b.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{
+    const __m128 first  = _mm_loadu_ps(ptr);     // a0 b0 a1 b1
+    const __m128 second = _mm_loadu_ps(ptr + 4); // a2 b2 a3 b3
+
+    // Even lanes of both registers hold channel a, odd lanes hold channel b.
+    a.val = _mm_shuffle_ps(first, second, _MM_SHUFFLE(2, 0, 2, 0)); // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(first, second, _MM_SHUFFLE(3, 1, 3, 1)); // b0 b1 b2 b3
+}
+
+// Splits 12 interleaved floats (a0 b0 c0 a1 b1 c1 ...) read from ptr into channel vectors a, b and c.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{
+    const __m128 q0 = _mm_loadu_ps(ptr);     // a0 b0 c0 a1
+    const __m128 q1 = _mm_loadu_ps(ptr + 4); // b1 c1 a2 b2
+    const __m128 q2 = _mm_loadu_ps(ptr + 8); // c2 a3 b3 c3
+
+    // Partial gathers; only the lanes picked by the final shuffles matter.
+    const __m128 aTail = _mm_shuffle_ps(q1, q2, _MM_SHUFFLE(0, 1, 0, 2)); // a2 b1 a3 c2
+    const __m128 bHead = _mm_shuffle_ps(q0, q1, _MM_SHUFFLE(0, 0, 0, 1)); // b0 a0 b1 b1
+    const __m128 bTail = _mm_shuffle_ps(q1, q2, _MM_SHUFFLE(0, 2, 0, 3)); // b2 b1 b3 c2
+    const __m128 cHead = _mm_shuffle_ps(q0, q1, _MM_SHUFFLE(0, 1, 0, 2)); // c0 a0 c1 b1
+
+    a.val = _mm_shuffle_ps(q0, aTail, _MM_SHUFFLE(2, 0, 3, 0));    // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(bHead, bTail, _MM_SHUFFLE(2, 0, 2, 0)); // b0 b1 b2 b3
+    c.val = _mm_shuffle_ps(cHead, q2, _MM_SHUFFLE(3, 0, 2, 0));    // c0 c1 c2 c3
+}
+
+// Splits 16 interleaved floats (a0 b0 c0 d0 a1 ...) read from ptr into channel vectors a, b, c and d
+// by transposing the 4x4 block with two rounds of unpacks.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{
+    const __m128 px0 = _mm_loadu_ps(ptr);      // a0 b0 c0 d0
+    const __m128 px1 = _mm_loadu_ps(ptr + 4);  // a1 b1 c1 d1
+    const __m128 px2 = _mm_loadu_ps(ptr + 8);  // a2 b2 c2 d2
+    const __m128 px3 = _mm_loadu_ps(ptr + 12); // a3 b3 c3 d3
+
+    const __m128 abEven = _mm_unpacklo_ps(px0, px2); // a0 a2 b0 b2
+    const __m128 abOdd  = _mm_unpacklo_ps(px1, px3); // a1 a3 b1 b3
+    const __m128 cdEven = _mm_unpackhi_ps(px0, px2); // c0 c2 d0 d2
+    const __m128 cdOdd  = _mm_unpackhi_ps(px1, px3); // c1 c3 d1 d3
+
+    a.val = _mm_unpacklo_ps(abEven, abOdd); // a0 a1 a2 a3
+    b.val = _mm_unpackhi_ps(abEven, abOdd); // b0 b1 b2 b3
+    c.val = _mm_unpacklo_ps(cdEven, cdOdd); // c0 c1 c2 c3
+    d.val = _mm_unpackhi_ps(cdEven, cdOdd); // d0 d1 d2 d3
+}
+
+// Splits 4 interleaved 64-bit values (a0 b0 a1 b1) read from ptr into the channel vectors a and b.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i pair0 = _mm_loadu_si128(src);     // a0 b0
+    const __m128i pair1 = _mm_loadu_si128(src + 1); // a1 b1
+
+    a = v_uint64x2(_mm_unpacklo_epi64(pair0, pair1)); // a0 a1
+    b = v_uint64x2(_mm_unpackhi_epi64(pair0, pair1)); // b0 b1
+}
+
+// Splits 6 interleaved 64-bit values (a0 b0 c0 a1 b1 c1) read from ptr into channel vectors a, b and c.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i head = _mm_loadu_si128(src);     // a0 b0
+    const __m128i mid  = _mm_loadu_si128(src + 1); // c0 a1
+    const __m128i tail = _mm_loadu_si128(src + 2); // b1 c1
+
+    // 0x4e swaps the two 64-bit halves.
+    const __m128i midSwapped = _mm_shuffle_epi32(mid, 0x4e); // a1 c0
+    const __m128i headHi = _mm_unpackhi_epi64(head, head);   // b0 b0
+
+    a = v_uint64x2(_mm_unpacklo_epi64(head, midSwapped)); // a0 a1
+    b = v_uint64x2(_mm_unpacklo_epi64(headHi, tail));     // b0 b1
+    c = v_uint64x2(_mm_unpackhi_epi64(midSwapped, tail)); // c0 c1
+}
+
+// Splits 8 interleaved 64-bit values (a0 b0 c0 d0 a1 b1 c1 d1) read from ptr into channel vectors a, b, c and d.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
+                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    const __m128i* src = (const __m128i*)ptr;
+    const __m128i ab0 = _mm_loadu_si128(src);     // a0 b0
+    const __m128i cd0 = _mm_loadu_si128(src + 1); // c0 d0
+    const __m128i ab1 = _mm_loadu_si128(src + 2); // a1 b1
+    const __m128i cd1 = _mm_loadu_si128(src + 3); // c1 d1
+
+    a = v_uint64x2(_mm_unpacklo_epi64(ab0, ab1)); // a0 a1
+    b = v_uint64x2(_mm_unpackhi_epi64(ab0, ab1)); // b0 b1
+    c = v_uint64x2(_mm_unpacklo_epi64(cd0, cd1)); // c0 c1
+    d = v_uint64x2(_mm_unpackhi_epi64(cd0, cd1)); // d0 d1
+}
+
+// store interleave
+
+// Interleaves a and b byte-wise (a0 b0 a1 b1 ...) and writes the resulting 32 bytes to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i lower = _mm_unpacklo_epi8(a.val, b.val); // a0 b0 ... a7 b7
+    const __m128i upper = _mm_unpackhi_epi8(a.val, b.val); // a8 b8 ... a15 b15
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, lower);
+        _mm_store_si128(dst + 1, upper);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, lower);
+        _mm_stream_si128(dst + 1, upper);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, lower);
+        _mm_storeu_si128(dst + 1, upper);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Interleaves a, b, c byte-wise (a0 b0 c0 a1 b1 c1 ...) into v0..v2 and writes the 48 bytes to ptr per mode.
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); // pre-permute each channel so the blends below can pick bytes by position
+    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); // selects byte positions 2, 5, 8, 11, 14
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); // selects byte positions 1, 4, 7, 10, 13
+    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); // positions 0,3,.. from a0; 1,4,.. from b0; 2,5,.. from c0
+    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); // positions 0,3,.. from b0; 1,4,.. from c0; 2,5,.. from a0
+    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); // positions 0,3,.. from c0; 1,4,.. from a0; 2,5,.. from b0
+#elif CV_SSSE3 // SSSE3 path: concatenate byte runs of a, b, c with alignr, then shuffle each run into a/b/c order
+    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
+    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
+    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
+
+    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
+    t0 = _mm_alignr_epi8(c.val, t0, 5);
+    __m128i v0 = _mm_shuffle_epi8(t0, m0);
+
+    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
+    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
+    __m128i v1 = _mm_shuffle_epi8(t1, m1);
+
+    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
+    t2 = _mm_alignr_epi8(t2, a.val, 11);
+    __m128i v2 = _mm_shuffle_epi8(t2, m2);
+#else // plain SSE2 path: build zero-padded (a b c 0) quads, then shift/OR the pieces into three packed registers
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val); // a0 b0 a1 b1 ... a7 b7
+    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val); // a8 b8 ... a15 b15
+    __m128i c0 = _mm_unpacklo_epi8(c.val, z); // c0 0 c1 0 ... c7 0
+    __m128i c1 = _mm_unpackhi_epi8(c.val, z); // c8 0 ... c15 0
+
+    __m128i p00 = _mm_unpacklo_epi16(ab0, c0); // a0 b0 c0 0 a1 b1 c1 0 ...
+    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
+    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
+    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
+
+    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
+    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
+    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
+    __m128i p13 = _mm_unpackhi_epi32(p02, p03);
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 1); // NOTE(review): the shifts from here on appear to squeeze the zero padding out - not traced lane by lane
+    p22 = _mm_slli_si128(p22, 1);
+
+    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
+    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
+    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
+    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
+
+    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
+    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
+    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
+    __m128i p43 = _mm_unpackhi_epi64(p32, p33);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10)); // stitch neighbouring pieces into the three output registers
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
+#endif
+
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // streaming (non-temporal) stores
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+        _mm_stream_si128((__m128i*)(ptr + 32), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular aligned stores
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+        _mm_store_si128((__m128i*)(ptr + 32), v2);
+    }
+    else // any other mode (including the default STORE_UNALIGNED): unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+    }
+}
+
+// Interleaves a, b, c, d byte-wise (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the resulting 64 bytes to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+
+    // Round 1: pair a with c and b with d.
+    const __m128i acLo = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
+    const __m128i acHi = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
+    const __m128i bdLo = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
+    const __m128i bdHi = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
+
+    // Round 2: merging the pairs yields complete a/b/c/d quads.
+    const __m128i out0 = _mm_unpacklo_epi8(acLo, bdLo); // a0 b0 c0 d0 ...
+    const __m128i out1 = _mm_unpackhi_epi8(acLo, bdLo); // a4 b4 c4 d4 ...
+    const __m128i out2 = _mm_unpacklo_epi8(acHi, bdHi); // a8 b8 c8 d8 ...
+    const __m128i out3 = _mm_unpackhi_epi8(acHi, bdHi); // a12 b12 c12 d12 ...
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, out0);
+        _mm_store_si128(dst + 1, out1);
+        _mm_store_si128(dst + 2, out2);
+        _mm_store_si128(dst + 3, out3);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, out0);
+        _mm_stream_si128(dst + 1, out1);
+        _mm_stream_si128(dst + 2, out2);
+        _mm_stream_si128(dst + 3, out3);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, out0);
+        _mm_storeu_si128(dst + 1, out1);
+        _mm_storeu_si128(dst + 2, out2);
+        _mm_storeu_si128(dst + 3, out3);
+    }
+}
+
+// Interleaves a and b as 16-bit words (a0 b0 a1 b1 ...) and writes the resulting 16 words to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i lower = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+    const __m128i upper = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, lower);
+        _mm_store_si128(dst + 1, upper);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, lower);
+        _mm_stream_si128(dst + 1, upper);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, lower);
+        _mm_storeu_si128(dst + 1, upper);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b, const v_uint16x8& c,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{ // Interleaves a, b, c as 16-bit words (a0 b0 c0 a1 b1 c1 ...) into v0..v2 and writes the 24 words to ptr per mode.
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); // word order 0,3,6,1,4,7,2,5
+    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); // word order 5,0,3,6,1,4,7,2
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); // word order 2,5,0,3,6,1,4,7
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); // a0 a3 a6 a1 a4 a7 a2 a5
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); // b5 b0 b3 b6 b1 b4 b7 b2
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); // c2 c5 c0 c3 c6 c1 c4 c7
+
+    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); // a0 b0 c0 a1 b1 c1 a2 b2 (0x92 = words 1,4,7; 0x24 = words 2,5)
+    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); // c2 a3 b3 c3 a4 b4 c4 a5
+    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); // b5 c5 a6 b6 c6 a7 b7 c7
+#else // fallback without SSE4.1: build zero-padded (a b c 0) quads, then shift/OR the pieces into three packed registers
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); // a4 b4 a5 b5 a6 b6 a7 b7
+    __m128i c0 = _mm_unpacklo_epi16(c.val, z); // c0 0 c1 0 c2 0 c3 0
+    __m128i c1 = _mm_unpackhi_epi16(c.val, z); // c4 0 c5 0 c6 0 c7 0
+
+    __m128i p10 = _mm_unpacklo_epi32(ab0, c0); // a0 b0 c0 0 a1 b1 c1 0
+    __m128i p11 = _mm_unpackhi_epi32(ab0, c0); // a2 b2 c2 0 a3 b3 c3 0
+    __m128i p12 = _mm_unpacklo_epi32(ab1, c1); // a4 b4 c4 0 a5 b5 c5 0
+    __m128i p13 = _mm_unpackhi_epi32(ab1, c1); // a6 b6 c6 0 a7 b7 c7 0
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 2); // NOTE(review): the shifts from here on appear to squeeze the zero padding out - not traced lane by lane
+    p22 = _mm_slli_si128(p22, 2);
+
+    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
+    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
+    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
+    __m128i p33 = _mm_unpackhi_epi64(p22, p23);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10)); // stitch neighbouring pieces into the three output registers
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
+#endif
+    if( mode == hal::STORE_ALIGNED_NOCACHE ) // streaming (non-temporal) stores
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+        _mm_stream_si128((__m128i*)(ptr + 16), v2);
+    }
+    else if( mode == hal::STORE_ALIGNED ) // regular aligned stores
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+        _mm_store_si128((__m128i*)(ptr + 16), v2);
+    }
+    else // any other mode (including the default STORE_UNALIGNED): unaligned stores
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+    }
+}
+
+// Interleaves a, b, c, d as 16-bit words (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the resulting 32 words to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+
+    // Round 1: pair a with c and b with d.
+    const __m128i acLo = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
+    const __m128i acHi = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
+    const __m128i bdLo = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
+    const __m128i bdHi = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
+
+    // Round 2: merging the pairs yields complete a/b/c/d quads.
+    const __m128i out0 = _mm_unpacklo_epi16(acLo, bdLo); // a0 b0 c0 d0 ...
+    const __m128i out1 = _mm_unpackhi_epi16(acLo, bdLo); // a2 b2 c2 d2 ...
+    const __m128i out2 = _mm_unpacklo_epi16(acHi, bdHi); // a4 b4 c4 d4 ...
+    const __m128i out3 = _mm_unpackhi_epi16(acHi, bdHi); // a6 b6 c6 d6 ...
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, out0);
+        _mm_store_si128(dst + 1, out1);
+        _mm_store_si128(dst + 2, out2);
+        _mm_store_si128(dst + 3, out3);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, out0);
+        _mm_stream_si128(dst + 1, out1);
+        _mm_stream_si128(dst + 2, out2);
+        _mm_stream_si128(dst + 3, out3);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, out0);
+        _mm_storeu_si128(dst + 1, out1);
+        _mm_storeu_si128(dst + 2, out2);
+        _mm_storeu_si128(dst + 3, out3);
+    }
+}
+
+// Interleaves a and b as 32-bit lanes (a0 b0 a1 b1 ...) and writes the resulting 8 values to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i lower = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 a1 b1
+    const __m128i upper = _mm_unpackhi_epi32(a.val, b.val); // a2 b2 a3 b3
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, lower);
+        _mm_store_si128(dst + 1, upper);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, lower);
+        _mm_stream_si128(dst + 1, upper);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, lower);
+        _mm_storeu_si128(dst + 1, upper);
+    }
+}
+
+// Interleaves a, b, c as 32-bit lanes (a0 b0 c0 a1 b1 c1 ...) and writes the resulting 12 values to ptr.
+// A zero vector is passed to v_transpose4x4 as the fourth row; the padding lane is then removed
+// with byte shifts and ORs (assumes the transpose yields rows of the form a_i b_i c_i 0).
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    v_uint32x4 padding = v_setzero_u32();
+    v_uint32x4 row0, row1, row2, row3;
+    v_transpose4x4(a, b, c, padding, row0, row1, row2, row3);
+
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i out0 = _mm_or_si128(row0.val, _mm_slli_si128(row1.val, 12));                   // a0 b0 c0 a1
+    const __m128i out1 = _mm_or_si128(_mm_srli_si128(row1.val, 4), _mm_slli_si128(row2.val, 8)); // b1 c1 a2 b2
+    const __m128i out2 = _mm_or_si128(_mm_srli_si128(row2.val, 8), _mm_slli_si128(row3.val, 4)); // c2 a3 b3 c3
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, out0);
+        _mm_store_si128(dst + 1, out1);
+        _mm_store_si128(dst + 2, out2);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, out0);
+        _mm_stream_si128(dst + 1, out1);
+        _mm_stream_si128(dst + 2, out2);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, out0);
+        _mm_storeu_si128(dst + 1, out1);
+        _mm_storeu_si128(dst + 2, out2);
+    }
+}
+
+// Interleaves a, b, c, d as 32-bit lanes by running them through v_transpose4x4 and writes the
+// four resulting vectors (16 values) to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    v_uint32x4 row0, row1, row2, row3;
+    v_transpose4x4(a, b, c, d, row0, row1, row2, row3);
+
+    __m128i* const dst = (__m128i*)ptr;
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, row0.val);
+        _mm_store_si128(dst + 1, row1.val);
+        _mm_store_si128(dst + 2, row2.val);
+        _mm_store_si128(dst + 3, row3.val);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, row0.val);
+        _mm_stream_si128(dst + 1, row1.val);
+        _mm_stream_si128(dst + 2, row2.val);
+        _mm_stream_si128(dst + 3, row3.val);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, row0.val);
+        _mm_storeu_si128(dst + 1, row1.val);
+        _mm_storeu_si128(dst + 2, row2.val);
+        _mm_storeu_si128(dst + 3, row3.val);
+    }
+}
+
+// 2-channel, float only
+// Interleaves a and b (a0 b0 a1 b1 a2 b2 a3 b3) and writes the resulting 8 floats to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    const __m128 lower = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
+    const __m128 upper = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, lower);
+        _mm_store_ps(ptr + 4, upper);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, lower);
+        _mm_stream_ps(ptr + 4, upper);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, lower);
+        _mm_storeu_ps(ptr + 4, upper);
+    }
+}
+
+// Interleaves a, b, c (a0 b0 c0 a1 b1 c1 ...) and writes the resulting 12 floats to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    // Each output register is assembled from two helper shuffles holding the needed lanes twice.
+    const __m128 a0b0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0)); // a0 a0 b0 b0
+    const __m128 c0a1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0)); // c0 c0 a1 a1
+    const __m128 b1c1 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1)); // b1 b1 c1 c1
+    const __m128 a2b2 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 b2 b2
+    const __m128 c2a3 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2)); // c2 c2 a3 a3
+    const __m128 b3c3 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3)); // b3 b3 c3 c3
+
+    const __m128 out0 = _mm_shuffle_ps(a0b0, c0a1, _MM_SHUFFLE(2, 0, 2, 0)); // a0 b0 c0 a1
+    const __m128 out1 = _mm_shuffle_ps(b1c1, a2b2, _MM_SHUFFLE(2, 0, 2, 0)); // b1 c1 a2 b2
+    const __m128 out2 = _mm_shuffle_ps(c2a3, b3c3, _MM_SHUFFLE(2, 0, 2, 0)); // c2 a3 b3 c3
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, out0);
+        _mm_store_ps(ptr + 4, out1);
+        _mm_store_ps(ptr + 8, out2);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, out0);
+        _mm_stream_ps(ptr + 4, out1);
+        _mm_stream_ps(ptr + 8, out2);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, out0);
+        _mm_storeu_ps(ptr + 4, out1);
+        _mm_storeu_ps(ptr + 8, out2);
+    }
+}
+
+// Interleaves a, b, c, d (a0 b0 c0 d0 a1 b1 c1 d1 ...) and writes the resulting 16 floats to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    // Round 1: pair a with c and b with d.
+    const __m128 acLo = _mm_unpacklo_ps(a.val, c.val); // a0 c0 a1 c1
+    const __m128 bdLo = _mm_unpacklo_ps(b.val, d.val); // b0 d0 b1 d1
+    const __m128 acHi = _mm_unpackhi_ps(a.val, c.val); // a2 c2 a3 c3
+    const __m128 bdHi = _mm_unpackhi_ps(b.val, d.val); // b2 d2 b3 d3
+
+    // Round 2: merging the pairs yields complete a/b/c/d quads.
+    const __m128 out0 = _mm_unpacklo_ps(acLo, bdLo); // a0 b0 c0 d0
+    const __m128 out1 = _mm_unpackhi_ps(acLo, bdLo); // a1 b1 c1 d1
+    const __m128 out2 = _mm_unpacklo_ps(acHi, bdHi); // a2 b2 c2 d2
+    const __m128 out3 = _mm_unpackhi_ps(acHi, bdHi); // a3 b3 c3 d3
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, out0);
+        _mm_store_ps(ptr + 4, out1);
+        _mm_store_ps(ptr + 8, out2);
+        _mm_store_ps(ptr + 12, out3);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, out0);
+        _mm_stream_ps(ptr + 4, out1);
+        _mm_stream_ps(ptr + 8, out2);
+        _mm_stream_ps(ptr + 12, out3);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, out0);
+        _mm_storeu_ps(ptr + 4, out1);
+        _mm_storeu_ps(ptr + 8, out2);
+        _mm_storeu_ps(ptr + 12, out3);
+    }
+}
+
+// Interleaves a and b as 64-bit lanes (a0 b0 a1 b1) and writes the resulting 4 values to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i pair0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+    const __m128i pair1 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, pair0);
+        _mm_store_si128(dst + 1, pair1);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, pair0);
+        _mm_stream_si128(dst + 1, pair1);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, pair0);
+        _mm_storeu_si128(dst + 1, pair1);
+    }
+}
+
+// Interleaves a, b, c as 64-bit lanes (a0 b0 c0 a1 b1 c1) and writes the resulting 6 values to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i aHigh = _mm_unpackhi_epi64(a.val, a.val); // a1 a1
+    const __m128i out0 = _mm_unpacklo_epi64(a.val, b.val);  // a0 b0
+    const __m128i out1 = _mm_unpacklo_epi64(c.val, aHigh);  // c0 a1
+    const __m128i out2 = _mm_unpackhi_epi64(b.val, c.val);  // b1 c1
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, out0);
+        _mm_store_si128(dst + 1, out1);
+        _mm_store_si128(dst + 2, out2);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, out0);
+        _mm_stream_si128(dst + 1, out1);
+        _mm_stream_si128(dst + 2, out2);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, out0);
+        _mm_storeu_si128(dst + 1, out1);
+        _mm_storeu_si128(dst + 2, out2);
+    }
+}
+
+// Interleaves a, b, c, d as 64-bit lanes (a0 b0 c0 d0 a1 b1 c1 d1) and writes the resulting 8 values to ptr.
+// mode picks the store flavour: aligned, aligned non-temporal (stream), or unaligned otherwise.
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i* const dst = (__m128i*)ptr;
+    const __m128i ab0 = _mm_unpacklo_epi64(a.val, b.val); // a0 b0
+    const __m128i cd0 = _mm_unpacklo_epi64(c.val, d.val); // c0 d0
+    const __m128i ab1 = _mm_unpackhi_epi64(a.val, b.val); // a1 b1
+    const __m128i cd1 = _mm_unpackhi_epi64(c.val, d.val); // c1 d1
+
+    if( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128(dst, ab0);
+        _mm_store_si128(dst + 1, cd0);
+        _mm_store_si128(dst + 2, ab1);
+        _mm_store_si128(dst + 3, cd1);
+    }
+    else if( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128(dst, ab0);
+        _mm_stream_si128(dst + 1, cd0);
+        _mm_stream_si128(dst + 2, ab1);
+        _mm_stream_si128(dst + 3, cd1);
+    }
+    else
+    {
+        _mm_storeu_si128(dst, ab0);
+        _mm_storeu_si128(dst + 1, cd0);
+        _mm_storeu_si128(dst + 2, ab1);
+        _mm_storeu_si128(dst + 3, cd1);
+    }
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2-channel load: forward to the (_Tpvec1, _Tp1) overload, then reinterpret the results as _Tpvec0 */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4-channel load, same forwarding scheme */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 2-channel store: reinterpret the inputs as _Tpvec1 and forward to the (_Tpvec1, _Tp1) overload */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 3-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 4-channel store, same forwarding scheme */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) // schar overloads forward to the uchar implementations
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) // short overloads forward to the ushort implementations
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) // int overloads forward to the unsigned implementations
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) // int64 overloads forward to the uint64 implementations
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) // double overloads also forward to the uint64 implementations (bit reinterpretation only)
+
+// Converts the four signed 32-bit integer lanes of a to single-precision floats.
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    const __m128 asFloat = _mm_cvtepi32_ps(a.val);
+    return v_float32x4(asFloat);
+}
+
+// Narrows the two doubles of a to floats via _mm_cvtpd_ps (results land in the two low lanes).
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    const __m128 narrowed = _mm_cvtpd_ps(a.val);
+    return v_float32x4(narrowed);
+}
+
+// Narrows a and b to floats and packs them into one vector: lanes 0-1 come from a, lanes 2-3 from b.
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    const __m128 lowPair  = _mm_cvtpd_ps(a.val);
+    const __m128 highPair = _mm_cvtpd_ps(b.val);
+    return v_float32x4(_mm_movelh_ps(lowPair, highPair));
+}
+
+// Converts the two low signed 32-bit integer lanes of a to doubles.
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    const __m128d widened = _mm_cvtepi32_pd(a.val);
+    return v_float64x2(widened);
+}
+
+// Converts the two high signed 32-bit integer lanes of a to doubles.
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    // Shift by 8 bytes so the upper two lanes occupy the low half read by the conversion.
+    const __m128i upperLanes = _mm_srli_si128(a.val,8);
+    return v_float64x2(_mm_cvtepi32_pd(upperLanes));
+}
+
+// Widens the two low float lanes of a to doubles.
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    const __m128d widened = _mm_cvtps_pd(a.val);
+    return v_float64x2(widened);
+}
+
+// Widens the two high float lanes of a to doubles.
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    // movehl copies the upper two lanes into the low half read by the conversion.
+    const __m128 upperLanes = _mm_movehl_ps(a.val, a.val);
+    return v_float64x2(_mm_cvtps_pd(upperLanes));
+}
+
+// from (Mysticial and wim) https://stackoverflow.com/q/41144668
+inline v_float64x2 v_cvt_f64(const v_int64x2& v)
+{ // Converts both signed 64-bit lanes of v to double by splicing the 32-bit halves into double bit patterns.
+    // constants encoded as floating-point
+    __m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
+    __m128i magic_i_all  = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
+    __m128d magic_d_all  = _mm_castsi128_pd(magic_i_all);
+    // Blend the 32 least significant bits of each lane of v with magic_i_lo
+#if CV_SSE4_1
+    __m128i magic_i_lo   = _mm_set1_epi64x(0x4330000000000000); // 2^52
+    __m128i v_lo         = _mm_blend_epi16(v.val, magic_i_lo, 0xcc); // mask 0xcc takes 16-bit words 2,3,6,7 (upper 32 bits of each lane) from magic_i_lo
+#else
+    __m128i magic_i_lo   = _mm_set1_epi32(0x43300000); // 2^52
+    __m128i v_lo         = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo); // same result without SSE4.1: low dwords of both lanes interleaved with 0x43300000
+#endif
+    // Extract the 32 most significant bits of v
+    __m128i v_hi         = _mm_srli_epi64(v.val, 32);
+    // Flip the msb of v_hi and blend with 0x45300000
+            v_hi         = _mm_xor_si128(v_hi, magic_i_hi32);
+    // Compute in double precision
+    __m128d v_hi_dbl     = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
+    // (v_hi - magic_d_all) + v_lo  Do not assume associativity of floating point addition
+    __m128d result       = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
+    return v_float64x2(result);
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{   // Gathers 16 bytes: lane k = tab[idx[k]] for k = 0..15.
+#if defined(_MSC_VER)
+    return v_int8x16(_mm_setr_epi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                                   tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
+#else   // other compilers assemble the vector from two 64-bit __m64 halves
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi8(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]]),
+                        _mm_setr_pi8(tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]])
+                    ));
+#endif
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{   // Gathers 8 byte pairs: 16-bit lane k is read as one short at tab + idx[k].
+#if defined(_MSC_VER)
+    return v_int8x16(_mm_setr_epi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]),
+                                    *(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7])));
+#else
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi16(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3])),
+                        _mm_setr_pi16(*(const short*)(tab + idx[4]), *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))
+                    ));
+#endif
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{   // Gathers 4 byte quads: 32-bit lane k is read as one int at tab + idx[k].
+#if defined(_MSC_VER)
+    return v_int8x16(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+#else
+    return v_int8x16(_mm_setr_epi64(
+                        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
+                        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
+                    ));
+#endif
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } // unsigned overloads reuse the signed gathers
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{   // Gathers 8 shorts: lane k = tab[idx[k]] for k = 0..7.
+#if defined(_MSC_VER)
+    return v_int16x8(_mm_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                                    tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
+#else   // other compilers assemble the vector from two 64-bit __m64 halves
+    return v_int16x8(_mm_setr_epi64(
+                        _mm_setr_pi16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]),
+                        _mm_setr_pi16(tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])
+                    ));
+#endif
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{   // Gathers 4 pairs of shorts: 32-bit lane k is read as one int at tab + idx[k].
+#if defined(_MSC_VER)
+    return v_int16x8(_mm_setr_epi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]),
+                                    *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+#else
+    return v_int16x8(_mm_setr_epi64(
+                        _mm_setr_pi32(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1])),
+                        _mm_setr_pi32(*(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))
+                    ));
+#endif
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{   // Gathers 2 quads of shorts: lanes 0-3 are one 64-bit read at tab + idx[0], lanes 4-7 at tab + idx[1].
+    return v_int16x8(_mm_set_epi64x(*(const int64_t*)(tab + idx[1]), *(const int64_t*)(tab + idx[0])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } // unsigned overloads reuse the signed gathers
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
+
+// Gathers four int32 lanes: lane k = tab[idx[k]].
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{
+    const int e0 = tab[idx[0]], e1 = tab[idx[1]], e2 = tab[idx[2]], e3 = tab[idx[3]];
+#if defined(_MSC_VER)
+    return v_int32x4(_mm_setr_epi32(e0, e1, e2, e3));
+#else
+    return v_int32x4(_mm_setr_epi64(_mm_setr_pi32(e0, e1), _mm_setr_pi32(e2, e3)));
+#endif
+}
+// Gathers two int32 pairs: lanes 0-1 are one 64-bit read at tab + idx[0], lanes 2-3 at tab + idx[1].
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{
+    const int64_t lo = *(const int64_t*)(tab + idx[0]), hi = *(const int64_t*)(tab + idx[1]);
+    return v_int32x4(_mm_set_epi64x(hi, lo));
+}
+// Loads four consecutive ints starting at tab + idx[0] with one unaligned 128-bit load.
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{ return v_int32x4(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); }
+// Unsigned overloads reuse the signed gathers and reinterpret the result.
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
+
+// Gathers two int64 lanes: lane 0 = tab[idx[0]], lane 1 = tab[idx[1]].
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{
+    const int64_t lo = tab[idx[0]], hi = tab[idx[1]];
+    return v_int64x2(_mm_set_epi64x(hi, lo));
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{ return v_int64x2(_mm_loadu_si128((const __m128i*)(tab + idx[0]))); } // tab[idx[0]] and tab[idx[0] + 1] in one unaligned load
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+// Gathers four floats: lane k = tab[idx[k]].
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{ return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); }
+// Pair/quad float gathers go through the int overloads and reinterpret the bits.
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
+
+// Gathers two doubles: lane k = tab[idx[k]].
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{ return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]])); }
+// Loads tab[idx[0]] and tab[idx[0] + 1] with one unaligned 128-bit load.
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_float64x2(_mm_castsi128_pd(_mm_loadu_si128((const __m128i*)(tab + idx[0])))); }
+
+// Gathers four ints with indices taken from a vector: lane k = tab[idxvec[k]].
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];   // aligned scratch copy of the indices
+    v_store_aligned(lanes, idxvec);
+    return v_int32x4(_mm_setr_epi32(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]));
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{ return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); }
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+    return v_float32x4(_mm_setr_ps(tab[lanes[0]], tab[lanes[1]], tab[lanes[2]], tab[lanes[3]]));
+}
+
+// Gathers two doubles; only the two low indices of idxvec are stored and used.
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    int lanes[2];
+    v_store_low(lanes, idxvec);
+    return v_float64x2(_mm_setr_pd(tab[lanes[0]], tab[lanes[1]]));
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are float indices, not float-pair indices.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec holds the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int CV_DECL_ALIGNED(32) lanes[4];
+    v_store_aligned(lanes, idxvec);
+    const __m128 zero = _mm_setzero_ps();
+    // each 64-bit load fetches one (x, y) pair; pairs 0 and 1 share a register, as do pairs 2 and 3
+    const __m128 p01 = _mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(tab + lanes[0])), (__m64*)(tab + lanes[1]));
+    const __m128 p23 = _mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(tab + lanes[2])), (__m64*)(tab + lanes[3]));
+    // two rounds of unpack turn (x0 y0 x1 y1), (x2 y2 x3 y3) into the x and y vectors
+    const __m128 even = _mm_unpacklo_ps(p01, p23);   // x0 x2 y0 y2
+    const __m128 odd  = _mm_unpackhi_ps(p01, p23);   // x1 x3 y1 y3
+    x = v_float32x4(_mm_unpacklo_ps(even, odd));
+    y = v_float32x4(_mm_unpackhi_ps(even, odd));
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int lanes[2];
+    v_store_low(lanes, idxvec);                           // only the two low indices are used
+    const __m128d pair0 = _mm_loadu_pd(tab + lanes[0]);   // x0 y0
+    const __m128d pair1 = _mm_loadu_pd(tab + lanes[1]);   // x1 y1
+    x = v_float64x2(_mm_unpacklo_pd(pair0, pair1));
+    y = v_float64x2(_mm_unpackhi_pd(pair0, pair1));
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{   // SSSE3 path: within every 4 bytes the output takes input bytes 0, 2, 1, 3.
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200)));
+#else   // pre-SSSE3 path: 16-bit and 32-bit shuffles followed by a byte unpack
+    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
+    a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0));
+    a = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
+    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
+#endif
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{   // SSSE3 path: within every 8 bytes the output takes input bytes 0, 4, 1, 5, 2, 6, 3, 7.
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400)));
+#else   // pre-SSSE3 path: one 32-bit shuffle followed by a byte unpack
+    __m128i a = _mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
+    return v_int8x16(_mm_unpacklo_epi8(a, _mm_unpackhi_epi64(a, a)));
+#endif
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{   // Within every four 16-bit lanes the output takes input lanes 0, 2, 1, 3.
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100)));
+#else
+    __m128i a = _mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(3, 1, 2, 0));
+    return v_int16x8(_mm_shufflehi_epi16(a, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{   // Output takes input 16-bit lanes 0, 4, 1, 5, 2, 6, 3, 7 (low half interleaved with high half).
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100)));
+#else
+    return v_int16x8(_mm_unpacklo_epi16(vec.val, _mm_unpackhi_epi64(vec.val, vec.val)));
+#endif
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+// Reorders the four 32-bit lanes to 0, 2, 1, 3.
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{ return v_int32x4(_mm_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0))); }
+// The uint32 and float32 overloads route through the int32 shuffle.
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{   // SSSE3 path: packs bytes 0-2, 4-6, 8-10, 12-14 together (bytes 3, 7, 11 are dropped); tail lanes are filler.
+#if CV_SSSE3
+    return v_int8x16(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100)));
+#else   // SSE2 path - NOTE(review): tail-lane contents may differ from the SSSE3 path; confirm callers ignore them
+    __m128i mask = _mm_set1_epi64x(0x00000000FFFFFFFF);
+    __m128i a = _mm_srli_si128(_mm_or_si128(_mm_andnot_si128(mask, vec.val), _mm_and_si128(mask, _mm_sll_epi32(vec.val, _mm_set_epi64x(0, 8)))), 1);
+    return v_int8x16(_mm_srli_si128(_mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 1, 0, 3)), 2));
+#endif
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{   // SSSE3 path: packs 16-bit lanes 0-2 and 4-6 together (lane 3 is dropped); tail lanes are filler.
+#if CV_SSSE3
+    return v_int16x8(_mm_shuffle_epi8(vec.val, _mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100)));
+#else
+    return v_int16x8(_mm_srli_si128(_mm_shufflelo_epi16(vec.val, _MM_SHUFFLE(2, 1, 0, 3)), 2));
+#endif
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 32-bit overloads return the input unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+// Returns lane i of v; without SSE4.1 it reads lane 0 of v_rotate_right<i>(v).
+template<int i>
+inline uchar v_extract_n(const v_uint8x16& v)
+{
+#if CV_SSE4_1
+    return (uchar)_mm_extract_epi8(v.val, i);
+#else
+    return v_rotate_right<i>(v).get0();
+#endif
+}
+
+// Signed bytes reuse the unsigned overload and cast the result.
+template<int i>
+inline schar v_extract_n(const v_int8x16& v)
+{ return (schar)v_extract_n<i>(v_reinterpret_as_u8(v)); }
+
+// 16-bit lanes use _mm_extract_epi16 unconditionally (no fallback branch).
+template<int i>
+inline ushort v_extract_n(const v_uint16x8& v)
+{
+    return (ushort)_mm_extract_epi16(v.val, i);
+}
+
+// Signed shorts reuse the unsigned overload and cast the result.
+template<int i>
+inline short v_extract_n(const v_int16x8& v)
+{ return (short)v_extract_n<i>(v_reinterpret_as_u16(v)); }
+
+// 32-bit lanes: native extract with SSE4.1, otherwise lane 0 of v_rotate_right<i>(v).
+template<int i>
+inline uint v_extract_n(const v_uint32x4& v)
+{
+#if CV_SSE4_1
+    return (uint)_mm_extract_epi32(v.val, i);
+#else
+    return v_rotate_right<i>(v).get0();
+#endif
+}
+
+// Signed ints reuse the unsigned overload and cast the result.
+template<int i>
+inline int v_extract_n(const v_int32x4& v)
+{ return (int)v_extract_n<i>(v_reinterpret_as_u32(v)); }
+
+template<int i>
+inline uint64 v_extract_n(const v_uint64x2& v)
+{
+#ifdef CV__SIMD_NATIVE_mm_extract_epi64   // defined by intrin_sse_em.hpp when _mm_extract_epi64 is usable
+    return (uint64)_v128_extract_epi64<i>(v.val);
+#else
+    return v_rotate_right<i>(v).get0();
+#endif
+}
+
+template<int i>
+inline int64 v_extract_n(const v_int64x2& v)
+{ return (int64)v_extract_n<i>(v_reinterpret_as_u64(v)); }
+
+// Float lanes are read as raw 32-bit patterns and reinterpreted through a union.
+template<int i>
+inline float v_extract_n(const v_float32x4& v)
+{
+    union { uint iv; float fv; } bits;
+    bits.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
+    return bits.fv;
+}
+
+// Same union approach for double lanes, via the uint64 overload.
+template<int i>
+inline double v_extract_n(const v_float64x2& v)
+{
+    union { uint64 iv; double dv; } bits;
+    bits.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
+    return bits.dv;
+}
+
+// Replicates lane i of v into all four lanes.
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& v)
+{
+    return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
+{ return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i))); }
+
+// Float variant shuffles with v as both sources; the (char) casts on i are kept exactly as written.
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& v)
+{
+    return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
+}
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{   // Loads four float16 values (8 bytes) from ptr and widens them to float32.
+#if CV_FP16
+    return v_float32x4(_mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)ptr))); // 64-bit load: only four halves are converted
+#else   // software conversion on the raw bit patterns
+    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
+    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
+    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
+    __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
+    __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
+    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
+    __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
+
+    t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e))); // extra exponent bias where the half exponent is all ones
+    __m128i zmask = _mm_cmpeq_epi32(e, z); // lanes whose half exponent field is zero
+    __m128i ft = v_select_si128(zmask, zt, t);
+    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
+#endif
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{   // Narrows the four float32 lanes of v to float16 and stores 8 bytes at ptr.
+#if CV_FP16
+    __m128i fp16_value = _mm_cvtps_ph(v.val, 0);
+    _mm_storel_epi64((__m128i*)ptr, fp16_value);
+#else   // software conversion on the raw bit patterns
+    const __m128i signmask = _mm_set1_epi32(0x80000000);
+    const __m128i rval = _mm_set1_epi32(0x3f000000);
+
+    __m128i t = _mm_castps_si128(v.val);
+    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
+    t = _mm_andnot_si128(signmask, t); // clear the sign bit
+
+    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
+    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000)); // magnitude bits above 0x7f800000
+    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
+    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
+    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
+    tt = _mm_sub_epi32(tt, rval);
+    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
+    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
+    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
+    t = v_select_si128(tinymask, tt, nt);
+    t = v_select_si128(finitemask, t, naninf);
+    t = _mm_or_si128(t, sign);
+    t = _mm_packs_epi32(t, t); // 32-bit lanes -> 16-bit lanes
+    _mm_storel_epi64((__m128i*)ptr, t);
+#endif
+}
+
+inline void v_cleanup() {} // no-op in this header: the body is empty
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse_em.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse_em.hpp
new file mode 100644
index 0000000..6fb0881
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_sse_em.hpp
@@ -0,0 +1,180 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
+#define OPENCV_HAL_INTRIN_SSE_EM_HPP
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+// OPENCV_HAL_SSE_WRAP_N(fun, tp): defines inline _v128_<fun> forwarding its N arguments of type tp to _mm_<fun>.
+#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
+    inline tp _v128_##fun(const tp& a) \
+    { return _mm_##fun(a); }
+
+#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
+    inline tp _v128_##fun(const tp& a, const tp& b) \
+    { return _mm_##fun(a, b); }
+
+#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
+    inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
+    { return _mm_##fun(a, b, c); }
+
+///////////////////////////// XOP /////////////////////////////
+
+// [todo] define CV_XOP; until then the emulated version below is always compiled
+#if 1 // CV_XOP
+inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
+{   // unsigned a > b per 32-bit lane: flipping both sign bits lets the signed compare be used
+    const __m128i delta = _mm_set1_epi32((int)0x80000000);
+    return _mm_cmpgt_epi32(_mm_xor_si128(a, delta), _mm_xor_si128(b, delta));
+}
+// wrapping XOP: the macro adds the _v128_/_mm_ prefixes itself, so it takes the bare name
+#else
+OPENCV_HAL_SSE_WRAP_2(comgt_epu32, __m128i)
+#endif // !CV_XOP
+
+///////////////////////////// SSE4.1 /////////////////////////////
+
+#if !CV_SSE4_1
+
+/** Swizzle **/
+// Bitwise select: each result bit comes from b where mask is 1 and from a where mask is 0.
+inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
+{ const __m128i diff = _mm_xor_si128(b, a); return _mm_xor_si128(a, _mm_and_si128(diff, mask)); }
+
+/** Convert **/
+// 8 >> 16
+inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
+{ return _mm_unpacklo_epi8(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
+{
+    const __m128i doubled = _mm_unpacklo_epi8(a, a);   // source byte lands in the high half of each lane
+    return _mm_srai_epi16(doubled, 8);                 // arithmetic shift sign-extends it
+}
+// 8 >> 32
+inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
+{
+    const __m128i zero = _mm_setzero_si128(), as16 = _mm_unpacklo_epi8(a, zero);
+    return _mm_unpacklo_epi16(as16, zero);
+}
+inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
+{
+    const __m128i twice = _mm_unpacklo_epi8(a, a);
+    return _mm_srai_epi32(_mm_unpacklo_epi8(twice, twice), 24);   // each byte replicated 4x, then sign-extended
+}
+// 16 >> 32
+inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
+{ return _mm_unpacklo_epi16(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
+{
+    const __m128i doubled = _mm_unpacklo_epi16(a, a);
+    return _mm_srai_epi32(doubled, 16);
+}
+// 32 >> 64
+inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
+{ return _mm_unpacklo_epi32(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
+{
+    const __m128i sign = _mm_srai_epi32(a, 31);   // all ones for negative lanes, zero otherwise
+    return _mm_unpacklo_epi32(a, sign);
+}
+
+/** Arithmetic **/
+inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
+{
+    const __m128i even = _mm_mul_epu32(a, b);                                         // 64-bit products of lanes 0 and 2
+    const __m128i odd  = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)); // 64-bit products of lanes 1 and 3
+    const __m128i p01  = _mm_unpacklo_epi32(even, odd);
+    const __m128i p23  = _mm_unpackhi_epi32(even, odd);
+    return _mm_unpacklo_epi64(p01, p23);   // low 32 bits of each product, back in lane order
+}
+
+/** Math **/
+inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
+{ const __m128i a_gt_b = _v128_comgt_epu32(a, b); return _v128_blendv_epi8(a, b, a_gt_b); }
+
+// wrapping SSE4.1: with native support each _v128_ helper below just forwards to its _mm_ intrinsic
+#else
+OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi16, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi16, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu8_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi8_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu16_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi16_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepu32_epi64, __m128i)
+OPENCV_HAL_SSE_WRAP_1(cvtepi32_epi64, __m128i)
+OPENCV_HAL_SSE_WRAP_2(min_epu32, __m128i)
+OPENCV_HAL_SSE_WRAP_2(mullo_epi32, __m128i)
+OPENCV_HAL_SSE_WRAP_3(blendv_epi8, __m128i)
+#endif // !CV_SSE4_1
+
+///////////////////////////// Revolutionary /////////////////////////////
+
+/** Convert **/
+// 16 << 8
+inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
+{ return _mm_unpackhi_epi8(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
+{
+    const __m128i doubled = _mm_unpackhi_epi8(a, a);   // high-half bytes, each duplicated
+    return _mm_srai_epi16(doubled, 8);                 // arithmetic shift sign-extends
+}
+// 32 << 16
+inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
+{ return _mm_unpackhi_epi16(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
+{
+    const __m128i doubled = _mm_unpackhi_epi16(a, a);
+    return _mm_srai_epi32(doubled, 16);
+}
+// 64 << 32
+inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
+{ return _mm_unpackhi_epi32(a, _mm_setzero_si128()); }
+inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
+{
+    const __m128i sign = _mm_srai_epi32(a, 31);   // all ones for negative lanes, zero otherwise
+    return _mm_unpackhi_epi32(a, sign);
+}
+
+/** Miscellaneous **/
+inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
+{   // packs unsigned 32-bit lanes of a and b into 16-bit lanes, clamping to 65535 first
+    const __m128i limit = _mm_set1_epi32(65535);
+    __m128i lo = _v128_min_epu32(a, limit);
+    __m128i hi = _v128_min_epu32(b, limit);
+#if CV_SSE4_1
+    return _mm_packus_epi32(lo, hi);
+#else
+    // bias into the signed range, use the signed pack, then undo the bias in 16-bit lanes
+    const __m128i bias32 = _mm_set1_epi32(32768), bias16 = _mm_set1_epi16(-32768);
+    lo = _mm_sub_epi32(lo, bias32);
+    hi = _mm_sub_epi32(hi, bias32);
+    return _mm_sub_epi16(_mm_packs_epi32(lo, hi), bias16);
+#endif
+}
+
+template<int i>
+inline int64 _v128_extract_epi64(const __m128i& a)
+{   // Returns 64-bit lane i of a.
+#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
+#define CV__SIMD_NATIVE_mm_extract_epi64 1 // preprocessor flag, visible to all later code (tested with #ifdef)
+    return _mm_extract_epi64(a, i);
+#else   // no native extract: spill to an aligned array and index it
+    CV_DECL_ALIGNED(16) int64 tmp[2];
+    _mm_store_si128((__m128i*)tmp, a);
+    return tmp[i];
+#endif
+}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+} // cv::
+
+#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_vsx.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_vsx.hpp
new file mode 100644
index 0000000..b198643
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_vsx.hpp
@@ -0,0 +1,1608 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_VSX_HPP
+#define OPENCV_HAL_VSX_HPP
+
+#include <algorithm>
+#include "opencv2/core/utility.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+///////// Types ////////////
+
+// 16 uchar lanes held in a vec_uchar16.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    enum { nlanes = 16 };
+    vec_uchar16 val;
+
+    // the default constructor does not initialize val
+    v_uint8x16() {}
+    explicit v_uint8x16(const vec_uchar16& v) : val(v) {}
+    // implicit conversion from vec_bchar16 through vec_uchar16_c
+    v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v)) {}
+    // builds the vector from sixteen lane values via vec_uchar16_set
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+        : val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
+
+    // returns lane 0
+    uchar get0() const { return vec_extract(val, 0); }
+};
+
+// 16 schar lanes held in a vec_char16.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    enum { nlanes = 16 };
+    vec_char16 val;
+
+    // the default constructor does not initialize val
+    v_int8x16() {}
+    explicit v_int8x16(const vec_char16& v) : val(v) {}
+    // implicit conversion from vec_bchar16 through vec_char16_c
+    v_int8x16(vec_bchar16 v) : val(vec_char16_c(v)) {}
+    // builds the vector from sixteen lane values via vec_char16_set
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+        : val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)) {}
+
+    static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
+
+    // returns lane 0
+    schar get0() const { return vec_extract(val, 0); }
+};
+
+// 8 ushort lanes held in a vec_ushort8.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    enum { nlanes = 8 };
+    vec_ushort8 val;
+
+    // the default constructor does not initialize val
+    v_uint16x8() {}
+    explicit v_uint16x8(const vec_ushort8& v) : val(v) {}
+    // implicit conversion from vec_bshort8 through vec_ushort8_c
+    v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v)) {}
+    // builds the vector from eight lane values via vec_ushort8_set
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+        : val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
+
+    // returns lane 0
+    ushort get0() const { return vec_extract(val, 0); }
+};
+
+// 8 short lanes held in a vec_short8.
+struct v_int16x8
+{
+    typedef short lane_type;
+    enum { nlanes = 8 };
+    vec_short8 val;
+
+    // the default constructor does not initialize val
+    v_int16x8() {}
+    explicit v_int16x8(const vec_short8& v) : val(v) {}
+    // implicit conversion from vec_bshort8 through vec_short8_c
+    v_int16x8(vec_bshort8 v) : val(vec_short8_c(v)) {}
+    // builds the vector from eight lane values via vec_short8_set
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+        : val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7)) {}
+
+    static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
+
+    // returns lane 0
+    short get0() const { return vec_extract(val, 0); }
+};
+
+// 4 unsigned lanes held in a vec_uint4.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    enum { nlanes = 4 };
+    vec_uint4 val;
+
+    // the default constructor does not initialize val
+    v_uint32x4() {}
+    explicit v_uint32x4(const vec_uint4& v) : val(v) {}
+    // implicit conversion from vec_bint4 through vec_uint4_c
+    v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v)) {}
+    // builds the vector from four lane values via vec_uint4_set
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3)) {}
+
+    static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
+
+    // returns lane 0
+    uint get0() const { return vec_extract(val, 0); }
+};
+
+// 4 int lanes held in a vec_int4.
+struct v_int32x4
+{
+    typedef int lane_type;
+    enum { nlanes = 4 };
+    vec_int4 val;
+
+    // the default constructor does not initialize val
+    v_int32x4() {}
+    explicit v_int32x4(const vec_int4& v) : val(v) {}
+    // implicit conversion from vec_bint4 through vec_int4_c
+    v_int32x4(vec_bint4 v) : val(vec_int4_c(v)) {}
+    // builds the vector from four lane values via vec_int4_set
+    v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3)) {}
+
+    static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
+
+    // returns lane 0
+    int get0() const { return vec_extract(val, 0); }
+};
+
+// 4 float lanes held in a vec_float4.
+struct v_float32x4
+{
+    typedef float lane_type;
+    enum { nlanes = 4 };
+    vec_float4 val;
+
+    // the default constructor does not initialize val
+    v_float32x4() {}
+    explicit v_float32x4(const vec_float4& v) : val(v) {}
+    // implicit conversion from vec_bint4 through vec_float4_c
+    v_float32x4(vec_bint4 v) : val(vec_float4_c(v)) {}
+    // builds the vector from four lane values via vec_float4_set
+    v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3)) {}
+
+    static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
+
+    // returns lane 0
+    float get0() const { return vec_extract(val, 0); }
+};
+
+// 2 uint64 lanes held in a vec_udword2.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    enum { nlanes = 2 };
+    vec_udword2 val;
+
+    // the default constructor does not initialize val
+    v_uint64x2() {}
+    explicit v_uint64x2(const vec_udword2& v) : val(v) {}
+    // implicit conversion from vec_bdword2 through vec_udword2_c
+    v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v)) {}
+    // builds the vector from two lane values via vec_udword2_set
+    v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1)) {}
+
+    static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
+
+    // returns lane 0
+    uint64 get0() const { return vec_extract(val, 0); }
+};
+
+// 2 int64 lanes held in a vec_dword2.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    enum { nlanes = 2 };
+    vec_dword2 val;
+
+    // the default constructor does not initialize val
+    v_int64x2() {}
+    explicit v_int64x2(const vec_dword2& v) : val(v) {}
+    // implicit conversion from vec_bdword2 through vec_dword2_c
+    v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v)) {}
+    // builds the vector from two lane values via vec_dword2_set
+    v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1)) {}
+
+    static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
+
+    // returns lane 0
+    int64 get0() const { return vec_extract(val, 0); }
+};
+
+// 2 double lanes held in a vec_double2.
+struct v_float64x2
+{
+    typedef double lane_type;
+    enum { nlanes = 2 };
+    vec_double2 val;
+
+    // the default constructor does not initialize val
+    v_float64x2() {}
+    explicit v_float64x2(const vec_double2& v) : val(v) {}
+    // implicit conversion from vec_bdword2 through vec_double2_c
+    v_float64x2(vec_bdword2 v) : val(vec_double2_c(v)) {}
+    // builds the vector from two lane values via vec_double2_set
+    v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1)) {}
+
+    static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
+
+    // returns lane 0
+    double get0() const { return vec_extract(val, 0); }
+};
+// v_extract_n<i>(v) returns lane i of v via vec_extract; one overload is generated per vector type below.
+#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
+
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
+OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
+
+//////////////// Load and store operations ///////////////
+
+/*
+ * Defines v_setzero_<suffix>(), v_setall_<suffix>(v) and v_reinterpret_as_<suffix>(a).
+ * Note: clang-5 aborted while parsing "vec_xxx_c" only when it is inside a
+ * function template that is defined by a preprocessor macro;
+ * if vec_xxx_c is defined as a C++ cast, clang-5 accepts it.
+*/
+#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast)                        \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); }             \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));}          \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a)  \
+{ return _Tpvec((cast)a.val); }
+
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint8x16, uchar, u8, vec_uchar16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int8x16, schar, s8, vec_char16)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint16x8, ushort, u16, vec_ushort8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int16x8, short, s16, vec_short8)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint32x4, uint, u32, vec_uint4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int32x4, int, s32, vec_int4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_uint64x2, uint64, u64, vec_udword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_int64x2, int64, s64, vec_dword2)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float32x4, float, f32, vec_float4)
+OPENCV_HAL_IMPL_VSX_INITVEC(v_float64x2, double, f64, vec_double2)
+// Generates the v_load*/v_store* family for one vector type: ld/st serve the unaligned entry points, ld_a/st_a the aligned ones.
+#define OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, ld, ld_a, st, st_a)    \
+inline _Tpvec v_load(const _Tp* ptr)                                        \
+{ return _Tpvec(ld(0, ptr)); }                                              \
+inline _Tpvec v_load_aligned(VSX_UNUSED(const _Tp* ptr))                    \
+{ return _Tpvec(ld_a(0, ptr)); }                                            \
+inline _Tpvec v_load_low(const _Tp* ptr)                                    \
+{ return _Tpvec(vec_ld_l8(ptr)); }                                          \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1)               \
+{ return _Tpvec(vec_mergesqh(vec_ld_l8(ptr0), vec_ld_l8(ptr1))); }          \
+inline void v_store(_Tp* ptr, const _Tpvec& a)                              \
+{ st(a.val, 0, ptr); }                                                      \
+inline void v_store_aligned(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)          \
+{ st_a(a.val, 0, ptr); }                                                    \
+inline void v_store_aligned_nocache(VSX_UNUSED(_Tp* ptr), const _Tpvec& a)  \
+{ st_a(a.val, 0, ptr); }                                                    \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)         \
+{ if(mode == hal::STORE_UNALIGNED) st(a.val, 0, ptr); else st_a(a.val, 0, ptr); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a)                          \
+{ vec_st_l8(a.val, ptr); }                                                  \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a)                         \
+{ vec_st_h8(a.val, ptr); }
+
+// Work around a gcc bug affecting aligned ld/st:
+// if the runtime check for vec_ld/st fails we fall back to unaligned ld/st.
+// https://github.com/opencv/opencv/issues/13211
+//
+// With CV_COMPILER_VSX_BROKEN_ALIGNED defined, the "aligned" slots of
+// OPENCV_HAL_IMPL_VSX_LOADSTORE_C receive vsx_ld / vsx_st, i.e. the same primitives
+// as the unaligned slots; otherwise they receive vec_ld / vec_st.
+#ifdef CV_COMPILER_VSX_BROKEN_ALIGNED
+    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
+    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vsx_ld, vsx_st, vsx_st)
+#else
+    #define OPENCV_HAL_IMPL_VSX_LOADSTORE(_Tpvec, _Tp) \
+    OPENCV_HAL_IMPL_VSX_LOADSTORE_C(_Tpvec, _Tp, vsx_ld, vec_ld, vsx_st, vec_st)
+#endif
+
+// 8/16/32-bit integer and float32 types go through the switchable macro above.
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint8x16,  uchar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int8x16,   schar)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint16x8,  ushort)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int16x8,   short)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_uint32x4,  uint)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_int32x4,   int)
+OPENCV_HAL_IMPL_VSX_LOADSTORE(v_float32x4, float)
+
+// 64-bit lane types are instantiated directly and always use the same primitive for the
+// aligned and unaligned slots (vsx_ld/vsx_st for double, vsx_ld2/vsx_st2 for 64-bit integers).
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_float64x2, double, vsx_ld,  vsx_ld,  vsx_st,  vsx_st)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_uint64x2,  uint64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
+OPENCV_HAL_IMPL_VSX_LOADSTORE_C(v_int64x2,    int64, vsx_ld2, vsx_ld2, vsx_st2, vsx_st2)
+
+//////////////// Value reordering ///////////////
+
+/* Deinterleaving loads and interleaving stores */
+// OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec) generates 2-, 3- and 4-channel overloads of
+//   v_load_deinterleave(ptr, a, b[, c[, d]]) - forwards to vec_ld_deinterleave
+//   v_store_interleave(ptr, a, b[, c[, d]])  - forwards to vec_st_interleave
+// The trailing hal::StoreMode parameter of the store overloads is accepted for interface
+// compatibility but is unnamed and never read: every mode takes the same code path.
+#define OPENCV_HAL_IMPL_VSX_INTERLEAVE(_Tp, _Tpvec)                          \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b)        \
+{ vec_ld_deinterleave(ptr, a.val, b.val);}                                   \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a,                   \
+                                _Tpvec& b, _Tpvec& c)                        \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val); }                           \
+inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b,        \
+                                                _Tpvec& c, _Tpvec& d)        \
+{ vec_ld_deinterleave(ptr, a.val, b.val, c.val, d.val); }                    \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, ptr); }                                    \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a,                    \
+                               const _Tpvec& b, const _Tpvec& c,             \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, c.val, ptr); }                             \
+inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b,   \
+                                         const _Tpvec& c, const _Tpvec& d,   \
+                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
+{ vec_st_interleave(a.val, b.val, c.val, d.val, ptr); }
+
+// Instantiated for every lane type, including the 64-bit ones.
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uchar, v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(schar, v_int8x16)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(ushort, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(short, v_int16x8)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2)
+OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2)
+
+/* Expand */
+// OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) generates the widening helpers
+// from the narrow wrapper _Tpvec to the wide wrapper _Tpwvec:
+//   v_expand(a, b0, b1) - b0 = fh(a), b1 = fl(a)
+//   v_expand_low(a)     - fh(a)
+//   v_expand_high(a)    - fl(a)
+//   v_load_expand(ptr)  - fh applied to a vec_ld_l8 load from ptr
+// Note the pairing: every "low" result is produced by `fh` and every "high" result by `fl`.
+// NOTE(review): presumably this reflects how the unpack helpers number their lanes -
+// confirm against the definitions of vec_unpackh/vec_unpackl before changing it.
+#define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh)  \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)   \
+{                                                                 \
+    b0.val = fh(a.val);                                           \
+    b1.val = fl(a.val);                                           \
+}                                                                 \
+inline _Tpwvec v_expand_low(const _Tpvec& a)                      \
+{ return _Tpwvec(fh(a.val)); }                                    \
+inline _Tpwvec v_expand_high(const _Tpvec& a)                     \
+{ return _Tpwvec(fl(a.val)); }                                    \
+inline _Tpwvec v_load_expand(const _Tp* ptr)                      \
+{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
+
+// Unsigned types use the *u unpack helpers, signed types the plain ones.
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint8x16, v_uint16x8, uchar, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int8x16, v_int16x8, schar, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint16x8, v_uint32x4, ushort, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
+OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
+
+/* Load a 4-byte value and zero-extend it into the second dword; the first dword is don't-care.
+ *
+ * _LXSIWZX(out, ptr, T):
+ *   out - vector variable that receives the result
+ *   ptr - address of the 4 bytes to load
+ *   T   - vector type of `out`; only used by the fallback branch for its cast
+ * Both expansions end with a semicolon, so a call site that adds its own ';'
+ * produces an extra empty statement.
+ */
+#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    #define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
+#else
+    /* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
+    #define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
+#endif
+
+// Loads four unsigned bytes from ptr and widens each one into its own 32-bit lane.
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    // Instead of unpacking twice, shuffle bytes: every 4-byte group picks one of the
+    // bytes 8..11 followed by three copies of byte 12. Usually faster in small kernels.
+    // NOTE(review): relies on _LXSIWZX placing the loaded bytes at indices 8..11 and a
+    // zero byte at index 12 (the load zero-extends into the second dword) - confirm.
+    vec_uchar16 widen_idx = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
+    vec_uchar16 raw;
+
+    _LXSIWZX(raw, ptr, vec_uchar16);
+    raw = vec_perm(raw, raw, widen_idx);
+
+    return v_uint32x4((vec_uint4)raw);
+}
+
+// Loads four signed bytes from ptr and widens them to 32-bit lanes via two unpack steps
+// (8 -> 16 bits with vec_unpackl, then 16 -> 32 bits with vec_unpackh).
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    vec_char16 raw;
+    vec_short8 halfwords;
+    vec_int4 words;
+
+    _LXSIWZX(raw, ptr, vec_char16);
+    halfwords = vec_unpackl(raw);
+    words = vec_unpackh(halfwords);
+
+    return v_int32x4(words);
+}
+
+/* pack */
+// OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack)
+//   _Tpvec  - narrow result wrapper          _Tp    - element type of the store pointer
+//   _Tpwvec - wide input wrapper             _Tpvn  - lane type of the shift-count vector
+//   _Tpdel  - lane type of the rounding term sfnc   - right-shift intrinsic
+//   pkfnc   - narrowing pack intrinsic       addfnc - add intrinsic used for rounding
+//   pack    - name stem (`pack` or `pack_u`)
+// Generated functions:
+//   v_<pack>(a, b)               - pkfnc(a, b)
+//   v_<pack>_store(ptr, a)       - pkfnc(a, a) written with vec_st_l8
+//   v_rshr_<pack><n>(a, b)       - per lane (x + (1 << (n-1))) >> n, then pkfnc
+//   v_rshr_<pack>_store<n>(ptr,a)- same rounding shift on a, written with vec_st_l8
+// The rounding term is 1 << (n-1), so n is expected to be at least 1.
+// In the rshr store variant the second pack operand is `delta`, used only as a filler.
+// NOTE(review): this assumes vec_st_l8 writes only the half produced from the first
+// pack operand - confirm against its definition.
+#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack)    \
+inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b)                                          \
+{                                                                                                   \
+    return _Tpvec(pkfnc(a.val, b.val));                                                             \
+}                                                                                                   \
+inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                            \
+{                                                                                                   \
+    vec_st_l8(pkfnc(a.val, a.val), ptr);                                                            \
+}                                                                                                   \
+template<int n>                                                                                     \
+inline _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b)                                     \
+{                                                                                                   \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                                 \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                         \
+    return _Tpvec(pkfnc(sfnc(addfnc(a.val, delta), vn), sfnc(addfnc(b.val, delta), vn)));           \
+}                                                                                                   \
+template<int n>                                                                                     \
+inline void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a)                                       \
+{                                                                                                   \
+    const __vector _Tpvn vn = vec_splats((_Tpvn)n);                                                 \
+    const __vector _Tpdel delta = vec_splats((_Tpdel)((_Tpdel)1 << (n-1)));                         \
+    vec_st_l8(pkfnc(sfnc(addfnc(a.val, delta), vn), delta), ptr);                                   \
+}
+
+// 16 -> 8 bit: vec_packs with vec_adds for the rounding add.
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_uint16x8, unsigned short, unsigned short,
+                         vec_sr, vec_packs, vec_adds, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int8x16, schar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packs, vec_adds, pack)
+
+// 32 -> 16 bit: vec_packs with vec_add for the rounding add.
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_uint32x4, unsigned int, unsigned int,
+                         vec_sr, vec_packs, vec_add, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int16x8, short, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packs, vec_add, pack)
+
+// 64 -> 32 bit: uses vec_pack rather than vec_packs.
+OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_uint64x2, unsigned long long, unsigned long long,
+                         vec_sr, vec_pack, vec_add, pack)
+OPENCV_HAL_IMPL_VSX_PACK(v_int32x4, int, v_int64x2, unsigned long long, long long,
+                         vec_sra, vec_pack, vec_add, pack)
+
+// Signed input -> unsigned output (v_pack_u family): vec_packsu.
+OPENCV_HAL_IMPL_VSX_PACK(v_uint8x16, uchar, v_int16x8, unsigned short, short,
+                         vec_sra, vec_packsu, vec_adds, pack_u)
+OPENCV_HAL_IMPL_VSX_PACK(v_uint16x8, ushort, v_int32x4, unsigned int, int,
+                         vec_sra, vec_packsu, vec_add, pack_u)
+// Following variant is not implemented on other platforms:
+//OPENCV_HAL_IMPL_VSX_PACK(v_uint32x4, uint, v_int64x2, unsigned long long, long long,
+//                         vec_sra, vec_packsu, vec_add, pack_u)
+
+// Boolean (mask) packing: narrows comparison-style masks down to 8-bit lanes with vec_pack.
+
+// Two 16-bit masks -> one 8-bit mask.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    return v_uint8x16(vec_pack(a.val, b.val));
+}
+
+// Four 32-bit masks -> one 8-bit mask (two narrowing rounds).
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    vec_ushort8 lo16 = vec_pack(a.val, b.val);
+    vec_ushort8 hi16 = vec_pack(c.val, d.val);
+    vec_uchar16 packed = vec_pack(lo16, hi16);
+    return v_uint8x16(packed);
+}
+
+// Eight 64-bit masks -> one 8-bit mask (three narrowing rounds).
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // 64 -> 32 bits
+    vec_uint4 w0 = vec_pack(a.val, b.val);
+    vec_uint4 w1 = vec_pack(c.val, d.val);
+    vec_uint4 w2 = vec_pack(e.val, f.val);
+    vec_uint4 w3 = vec_pack(g.val, h.val);
+
+    // 32 -> 16 bits
+    vec_ushort8 h0 = vec_pack(w0, w1);
+    vec_ushort8 h1 = vec_pack(w2, w3);
+
+    // 16 -> 8 bits
+    vec_uchar16 packed = vec_pack(h0, h1);
+    return v_uint8x16(packed);
+}
+
+/* Recombine */
+// Interleaves the lanes of a0 and a1: b0 receives vec_mergeh(a0, a1) and
+// b1 receives vec_mergel(a0, a1).
+// b0 is assigned before b1 is computed from a0/a1, so passing the same object as both
+// an input and b0 makes the second merge read the already-updated value.
+template <typename _Tpvec>
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1)
+{
+    b0.val = vec_mergeh(a0.val, a1.val);
+    b1.val = vec_mergel(a0.val, a1.val);
+}
+
+// Builds a vector from a and b using vec_mergesql (the counterpart of v_combine_low,
+// which uses vec_mergesqh).
+template <typename _Tpvec>
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)
+{
+    return _Tpvec(vec_mergesql(a.val, b.val));
+}
+
+// Builds a vector from a and b using vec_mergesqh (the counterpart of v_combine_high,
+// which uses vec_mergesql).
+template <typename _Tpvec>
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)
+{
+    return _Tpvec(vec_mergesqh(a.val, b.val));
+}
+
+// Produces both recombinations at once: c = vec_mergesqh(a, b) (same as v_combine_low)
+// and d = vec_mergesql(a, b) (same as v_combine_high).
+// c is assigned before d is computed, so an output that aliases an input affects d.
+template <typename _Tpvec>
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d)
+{
+    c.val = vec_mergesqh(a.val, b.val);
+    d.val = vec_mergesql(a.val, b.val);
+}
+
+////////// Arithmetic, bitwise and comparison operations /////////
+
+/* Element-wise binary and unary operations */
+/** Arithmetics **/
+// OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) defines `operator bin_op` and the
+// matching compound assignment `operator bin_op=` for _Tpvec, both implemented with
+// intrin(a.val, b.val).
+#define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin)       \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(intrin(a.val, b.val)); }                         \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)   \
+{ a.val = intrin(a.val, b.val); return a; }
+
+// 8- and 16-bit integer +/- map to vec_adds / vec_subs.
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16,  vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs)
+// 32-bit integers, floats and 64-bit integers map to vec_add / vec_sub / vec_mul / vec_div.
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul)
+OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub)
+
+// saturating multiply
+// OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) defines operator* and operator*= for the
+// 8- and 16-bit integer wrappers: the operands are multiplied into two wide vectors with
+// v_mul_expand and the result is narrowed back with v_pack (instantiated with vec_packs
+// for these type pairs).
+// v_mul_expand is called here before its template definition, which appears further
+// down in this file.
+#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec)             \
+    inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+    {                                                            \
+        _Tpwvec c, d;                                            \
+        v_mul_expand(a, b, c, d);                                \
+        return v_pack(c, d);                                     \
+    }                                                            \
+    inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+    { a = a * b; return a; }
+
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8,  v_int32x4)
+OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
+
+// Widening multiply: multiplies a and b lane-wise into double-width products.
+// vec_mule and vec_mulo each yield one wide vector of products; v_zip then interleaves
+// the two so that c and d hold the products in lane order.
+template<typename Tvec, typename Twvec>
+inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
+{
+    const Twvec prod_e(vec_mule(a.val, b.val));
+    const Twvec prod_o(vec_mulo(a.val, b.val));
+    v_zip(prod_e, prod_o, c, d);
+}
+
+// Multiplies 16-bit lanes and keeps one 16-bit half of every 32-bit product.
+// The byte table picks bytes {2,3} of each 32-bit product, alternating between the
+// vec_mule results (indices 0..15) and the vec_mulo results (indices 16..31).
+// NOTE(review): bytes 2..3 being the upper half assumes little-endian lane layout - confirm.
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    static const vec_uchar16 pick_hi = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    vec_int4 prod_e = vec_mule(a.val, b.val);
+    vec_int4 prod_o = vec_mulo(a.val, b.val);
+    return v_int16x8(vec_perm(vec_short8_c(prod_e), vec_short8_c(prod_o), pick_hi));
+}
+// Unsigned counterpart of the function above; identical byte selection.
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    static const vec_uchar16 pick_hi = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+    vec_uint4 prod_e = vec_mule(a.val, b.val);
+    vec_uint4 prod_o = vec_mulo(a.val, b.val);
+    return v_uint16x8(vec_perm(vec_ushort8_c(prod_e), vec_ushort8_c(prod_o), pick_hi));
+}
+
+/** Non-saturating arithmetics **/
+// OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin) defines a function template `func(a, b)`
+// that works for any wrapper type by returning _Tpvec(intrin(a.val, b.val)).
+#define OPENCV_HAL_IMPL_VSX_BIN_FUNC(func, intrin)    \
+template<typename _Tpvec>                             \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)  \
+{ return _Tpvec(intrin(a.val, b.val)); }
+
+// Wrapping variants use vec_add / vec_sub / vec_mul for every type, unlike the 8/16-bit
+// operators + and -, which use vec_adds / vec_subs.
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul)
+
+/** Bitwise shifts **/
+// OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) defines, for one wrapper type:
+//   operator<<(a, imm), v_shl<imm>(a) - vec_sl(a, splfunc(imm))
+//   operator>>(a, imm), v_shr<imm>(a) - shr(a, splfunc(imm))
+// `shr` is the right-shift intrinsic and `splfunc` builds the shift-count vector from the
+// scalar count.
+#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc)   \
+inline _Tpvec operator << (const _Tpvec& a, int imm)         \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }              \
+inline _Tpvec operator >> (const _Tpvec& a, int imm)         \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }                 \
+template<int imm> inline _Tpvec v_shl(const _Tpvec& a)       \
+{ return _Tpvec(vec_sl(a.val, splfunc(imm))); }              \
+template<int imm> inline _Tpvec v_shr(const _Tpvec& a)       \
+{ return _Tpvec(shr(a.val, splfunc(imm))); }
+
+// unsigned types: right shift with vec_sr
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
+// signed types: arithmetic right shift with vec_sra (count vectors stay unsigned)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
+OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
+
+/** Bitwise logic **/
+// OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) defines operators &, |, ^ (with their compound
+// assignments, via OPENCV_HAL_IMPL_VSX_BIN_OP) and unary ~ (via vec_not) for _Tpvec.
+#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec)    \
+OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and)  \
+OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or)   \
+OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor)  \
+inline _Tpvec operator ~ (const _Tpvec& a)      \
+{ return _Tpvec(vec_not(a.val)); }
+
+// Instantiated for all ten wrapper types, floating-point ones included.
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_int64x2)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_float64x2)
+
+/** Bitwise select **/
+// OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast) defines v_select(mask, a, b) as
+// vec_sel(b, a, cast(mask)). Note the argument order: b is passed first and a second.
+// `cast` converts the mask's raw vector to the boolean vector type of matching lane width.
+#define OPENCV_HAL_IMPL_VSX_SELECT(_Tpvec, cast)                             \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(vec_sel(b.val, a.val, cast(mask.val))); }
+
+// No instantiation for the 64-bit integer wrappers in this list.
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int8x16, vec_bchar16_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int16x8, vec_bshort8_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_uint32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_int32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float32x4, vec_bint4_c)
+OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c)
+
+/** Comparison **/
+// OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) defines the six comparison operators for _Tpvec.
+// Each returns a _Tpvec constructed from the result of the matching vec_cmp* intrinsic
+// (==: vec_cmpeq, !=: vec_cmpne, <: vec_cmplt, >: vec_cmpgt, <=: vec_cmple, >=: vec_cmpge).
+// Despite the "INT" in its name, the macro is also instantiated for the float wrappers.
+#define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec)                 \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpeq(a.val, b.val)); }                    \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpne(a.val, b.val)); }                    \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b)    \
+{ return _Tpvec(vec_cmplt(a.val, b.val)); }                    \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b)    \
+{ return _Tpvec(vec_cmpgt(a.val, b.val)); }                    \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmple(a.val, b.val)); }                    \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)   \
+{ return _Tpvec(vec_cmpge(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int8x16)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int16x8)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float32x4)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
+OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
+
+// Mask of lanes that compare equal to themselves (a NaN lane never does).
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    return v_float32x4(vec_cmpeq(a.val, a.val));
+}
+// Double-precision overload of the self-comparison mask above.
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    return v_float64x2(vec_cmpeq(a.val, a.val));
+}
+
+/** min/max **/
+// Lane-wise minimum / maximum for any wrapper type, generated as function templates by
+// OPENCV_HAL_IMPL_VSX_BIN_FUNC on top of vec_min / vec_max.
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
+
+/** Rotate **/
+// OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) defines the single-operand
+// v_rotate_<suffix><imm>(a). The lane count imm is converted to a byte count
+// wd = imm * sizeof(lane_type):
+//   wd > 15  - the whole register is shifted out; _Tpvec::zero() is returned
+//   otherwise - the register, viewed as bytes, is shifted with `shf` by wd << 3
+//               (the byte count expressed in bits) and cast back to `cast`
+#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast)                       \
+template<int imm>                                                               \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a)                                \
+{                                                                               \
+    const int wd = imm * sizeof(typename _Tpvec::lane_type);                    \
+    if (wd > 15)                                                                \
+        return _Tpvec::zero();                                                  \
+    return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3)));    \
+}
+
+// Generates both directions: v_rotate_left uses vec_slo, v_rotate_right uses vec_sro.
+#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast)     \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
+OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
+
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16,  vec_char16)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_float32x4, vec_float4)
+OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
+OPENCV_IMPL_VSX_ROTATE_LR(v_float64x2, vec_double2)
+
+// Two-operand rotate right by imm lanes, implemented with vec_sld(b, a, bytes).
+// The byte count is 16 - imm * sizeof(lane); a count of 16 (imm == 0) is handled
+// separately by returning a unchanged, without calling vec_sld.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
+{
+    enum { kSldBytes = 16 - imm * (sizeof(typename _Tpvec::lane_type)) };
+    if (kSldBytes == 16)
+    {
+        return a;
+    }
+#ifdef __IBMCPP__
+    // The IBM compiler gets the count masked to 0..15.
+    return _Tpvec(vec_sld(b.val, a.val, kSldBytes & 15));
+#else
+    return _Tpvec(vec_sld(b.val, a.val, kSldBytes));
+#endif
+}
+
+// Two-operand rotate left by imm lanes, implemented with vec_sld(a, b, bytes) where
+// bytes = imm * sizeof(lane). A count of 16 (imm == lane count) is handled separately
+// by returning b unchanged, without calling vec_sld.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
+{
+    enum { kSldBytes = imm * (sizeof(typename _Tpvec::lane_type)) };
+    if (kSldBytes == 16)
+    {
+        return b;
+    }
+    return _Tpvec(vec_sld(a.val, b.val, kSldBytes));
+}
+
+// OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2) defines the two-operand
+// v_rotate_<suffix><imm>(a, b) overload for the 2-lane (64-bit) wrappers:
+//   imm == 1   - vec_permi(rg1, rg2, 2)
+//   imm == 0   - a
+//   otherwise  - b
+// rg1/rg2 name which of the parameters a and b feed vec_permi.
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, suffix, rg1, rg2)   \
+template<int imm>                                                 \
+inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
+{                                                                 \
+    if (imm == 1)                                                 \
+        return _Tpvec(vec_permi(rg1.val, rg2.val, 2));            \
+    return imm ? b : a;                                           \
+}
+
+// Left passes (b, a) to vec_permi, right passes (a, b).
+#define OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(_Tpvec)    \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, left,  b, a)  \
+OPENCV_IMPL_VSX_ROTATE_64_2RG(_Tpvec, right, a, b)
+
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
+OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
+
+/* Reverse */
+// Each unsigned overload reverses the lane order with a single vec_perm whose byte table
+// lists the lanes back to front while keeping the byte order inside every lane.
+// Signed and floating-point overloads reinterpret to the unsigned type of the same lane
+// width, reverse, and reinterpret back.
+
+// 16 x 8-bit lanes.
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{
+    static const vec_uchar16 rev_idx = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+    vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_uint8x16(vec_perm(bytes, bytes, rev_idx));
+}
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{
+    return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a)));
+}
+
+// 8 x 16-bit lanes.
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{
+    static const vec_uchar16 rev_idx = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+    vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u16(v_uint8x16(vec_perm(bytes, bytes, rev_idx)));
+}
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{
+    return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a)));
+}
+
+// 4 x 32-bit lanes.
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{
+    static const vec_uchar16 rev_idx = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+    vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u32(v_uint8x16(vec_perm(bytes, bytes, rev_idx)));
+}
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{
+    return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a)));
+}
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{
+    return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a)));
+}
+
+// 2 x 64-bit lanes.
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{
+    static const vec_uchar16 rev_idx = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+    vec_uchar16 bytes = (vec_uchar16)a.val;
+    return v_reinterpret_as_u64(v_uint8x16(vec_perm(bytes, bytes, rev_idx)));
+}
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{
+    return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a)));
+}
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{
+    return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a)));
+}
+
+/* Extract */
+// v_extract<s>(a, b) is an alias for the two-operand v_rotate_right<s>(a, b).
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    return v_rotate_right<s>(a, b);
+}
+
+////////// Reduce and mask /////////
+
+/** Reduce **/
+// Horizontal sums for the 8- and 16-bit integer wrappers. All overloads finish with
+// vec_sums and read the total from element 3 of its result.
+
+// Sum of sixteen unsigned bytes: vec_sum4s folds groups of four bytes into 32-bit
+// partial sums, vec_sums folds those four into the total.
+inline uint v_reduce_sum(const v_uint8x16& a)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(a.val, zero4);
+    return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+// Sum of sixteen signed bytes; same two-step reduction as the unsigned overload.
+inline int v_reduce_sum(const v_int8x16& a)
+{
+    const vec_int4 zero4 = vec_int4_z;
+    vec_int4 sum4 = vec_sum4s(a.val, zero4);
+    return (int)vec_extract(vec_sums(sum4, zero4), 3);
+}
+// Sum of eight signed 16-bit lanes.
+inline int v_reduce_sum(const v_int16x8& a)
+{
+    const vec_int4 zero = vec_int4_z;
+    return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
+}
+// Sum of eight unsigned 16-bit lanes.
+inline uint v_reduce_sum(const v_uint16x8& a)
+{
+    // Widen both halves to 32 bits *before* adding them. Adding the halves first with the
+    // saturating 16-bit vec_adds clamps every pair whose sum exceeds 65535 and therefore
+    // under-reports the total (e.g. eight lanes of 65535 must give 524280).
+    // The worst-case total fits comfortably in a signed 32-bit lane, so the final
+    // vec_sums cannot saturate.
+    const vec_int4 v4 = vec_int4_c(vec_add(vec_unpackhu(a.val), vec_unpacklu(a.val)));
+    return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
+}
+
+// OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) defines
+// v_reduce_<suffix>(a) for 4-lane wrappers. `func` is applied twice: first to the vector
+// and itself rotated by 8 bytes (vec_sld), then to that result and itself rotated by
+// 4 bytes; element 0 of the final vector is returned.
+// _Tpvec2 is the raw vector type used for the intermediate.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    const _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                      \
+    return vec_extract(func(rs, vec_sld(rs, rs, 4)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_uint32x4, vec_uint4, uint, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_int32x4, vec_int4, int, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
+
+// Horizontal sums for the 2-lane wrappers: the input is added to a vec_permi(a, a, 3)
+// shuffle of itself and element 0 of the result is returned.
+inline uint64 v_reduce_sum(const v_uint64x2& a)
+{
+    const vec_udword2 shuffled = vec_permi(a.val, a.val, 3);
+    return vec_extract(vec_add(a.val, shuffled), 0);
+}
+inline int64 v_reduce_sum(const v_int64x2& a)
+{
+    const vec_dword2 shuffled = vec_permi(a.val, a.val, 3);
+    return vec_extract(vec_add(a.val, shuffled), 0);
+}
+inline double v_reduce_sum(const v_float64x2& a)
+{
+    const vec_double2 shuffled = vec_permi(a.val, a.val, 3);
+    return vec_extract(vec_add(a.val, shuffled), 0);
+}
+
+// OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) defines
+// v_reduce_<suffix>(a) for 8-lane wrappers: `func` is folded over the vector and copies
+// of itself rotated by 8, 4 and then 2 bytes (vec_sld); element 0 is returned.
+// Only max/min are instantiated here.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                            \
+    rs = func(rs, vec_sld(rs, rs, 4));                                             \
+    return vec_extract(func(rs, vec_sld(rs, rs, 2)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
+
+// OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) defines
+// v_reduce_<suffix>(a) for 16-lane wrappers: `func` is folded over the vector and copies
+// of itself rotated by 8, 4, 2 and then 1 byte (vec_sld); element 0 is returned.
+// Only max/min are instantiated here.
+#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
+inline scalartype v_reduce_##suffix(const _Tpvec& a)                               \
+{                                                                                  \
+    _Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8));                            \
+    rs = func(rs, vec_sld(rs, rs, 4));                                             \
+    rs = func(rs, vec_sld(rs, rs, 2));                                             \
+    return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0);                           \
+}
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
+
+// Reduces four float vectors at once. a is paired with c and b with d: each pair is
+// merged (vec_mergel + vec_mergeh), added, and then added to itself rotated by 8 bytes.
+// The two partial results are finally interleaved with vec_mergeh.
+// NOTE(review): the intended result is presumably {sum(a), sum(b), sum(c), sum(d)} -
+// confirm against the merge semantics.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    const vec_float4 ac_pairs = vec_add(vec_mergel(a.val, c.val), vec_mergeh(a.val, c.val));
+    const vec_float4 ac_total = vec_add(ac_pairs, vec_sld(ac_pairs, ac_pairs, 8));
+
+    const vec_float4 bd_pairs = vec_add(vec_mergel(b.val, d.val), vec_mergeh(b.val, d.val));
+    const vec_float4 bd_total = vec_add(bd_pairs, vec_sld(bd_pairs, bd_pairs, 8));
+
+    return v_float32x4(vec_mergeh(ac_total, bd_total));
+}
+
+// Sum of absolute differences (SAD) over all lanes of a and b.
+//
+// Unsigned overloads take the lane-wise absolute difference with vec_absd. Signed
+// overloads compute it as max(a, b) - min(a, b) with wrapping subtraction and reinterpret
+// the result as unsigned: the true difference always fits in the unsigned type of the
+// same width, whereas a saturating signed subtract/abs (vec_subs + vec_abss) clamps it to
+// the signed maximum and yields a SAD that is too small for distant operands
+// (e.g. 127 vs -128 must contribute 255, not 127).
+
+// 16 x unsigned 8-bit.
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uint4 sum4 = vec_sum4s(vec_absd(a.val, b.val), zero4);
+    return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+// 16 x signed 8-bit; reduced exactly like the unsigned overload once the differences
+// have been formed.
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    const vec_uint4 zero4 = vec_uint4_z;
+    vec_uchar16 ad = vec_uchar16_c(vec_sub(vec_max(a.val, b.val), vec_min(a.val, b.val)));
+    vec_uint4 sum4 = vec_sum4s(ad, zero4);
+    return (unsigned)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
+}
+// 8 x unsigned 16-bit: differences are widened to 32 bits before being summed.
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    vec_ushort8 ad = vec_absd(a.val, b.val);
+    VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
+    return (unsigned)vec_extract(sum, 3);
+}
+// 8 x signed 16-bit; reduced exactly like the unsigned overload once the differences
+// have been formed.
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    vec_ushort8 ad = vec_ushort8_c(vec_sub(vec_max(a.val, b.val), vec_min(a.val, b.val)));
+    VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
+    return (unsigned)vec_extract(sum, 3);
+}
+// 4 x unsigned 32-bit: the total is accumulated with wrapping 32-bit adds.
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const vec_uint4 ad = vec_absd(a.val, b.val);
+    const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+// 4 x signed 32-bit; accumulated with wrapping unsigned adds like the unsigned overload,
+// instead of a signed saturating vec_sums.
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{
+    const vec_uint4 ad = (vec_uint4)vec_sub(vec_max(a.val, b.val), vec_min(a.val, b.val));
+    const vec_uint4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+// 4 x float: |a - b| per lane, then two rotate-and-add steps.
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{
+    const vec_float4 ad = vec_abs(vec_sub(a.val, b.val));
+    const vec_float4 rd = vec_add(ad, vec_sld(ad, ad, 8));
+    return vec_extract(vec_add(rd, vec_sld(rd, rd, 4)), 0);
+}
+
+/** Popcount **/
+// Per-lane population count via vec_popcntu. The result wrapper is always the unsigned
+// type of the same lane width, for signed and unsigned inputs alike.
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    return v_uint8x16(vec_popcntu(a.val));
+}
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{
+    return v_uint8x16(vec_popcntu(a.val));
+}
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    return v_uint16x8(vec_popcntu(a.val));
+}
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{
+    return v_uint16x8(vec_popcntu(a.val));
+}
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    return v_uint32x4(vec_popcntu(a.val));
+}
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{
+    return v_uint32x4(vec_popcntu(a.val));
+}
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    return v_uint64x2(vec_popcntu(a.val));
+}
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{
+    return v_uint64x2(vec_popcntu(a.val));
+}
+
+/** Mask **/
+inline int v_signmask(const v_uint8x16& a) // collects one bit per byte lane (16 lanes) into an int
+{
+    static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; // bit indices handed to vec_vbpermq: one per byte, stepping by 8
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2); // the gathered bits are read back from 32-bit element 2
+}
+inline int v_signmask(const v_int8x16& a) // signed bytes: same bits, so reuse the unsigned overload
+{ return v_signmask(v_reinterpret_as_u8(a)); }
+
+inline int v_signmask(const v_int16x8& a) // one bit per 16-bit lane (8 lanes)
+{
+    static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128}; // stepping by 16; 128 lies outside the 0..127 bit range (NOTE(review): presumably contributes a 0 bit - confirm against the vbpermq definition)
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2); // gathered bits read from 32-bit element 2
+}
+inline int v_signmask(const v_uint16x8& a) // forwards to the signed 16-bit overload
+{ return v_signmask(v_reinterpret_as_s16(a)); }
+
+inline int v_signmask(const v_int32x4& a) // one bit per 32-bit lane (4 lanes)
+{
+    static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; // stepping by 32; remaining entries are the out-of-range index 128
+    return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2); // gathered bits read from 32-bit element 2
+}
+inline int v_signmask(const v_uint32x4& a) // forwards to the signed 32-bit overload
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+inline int v_signmask(const v_float32x4& a) // floats are reinterpreted as 32-bit integers first
+{ return v_signmask(v_reinterpret_as_s32(a)); }
+
+inline int v_signmask(const v_int64x2& a) // two 64-bit lanes: no bit-permute, just shifts
+{
+    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63)); // shift each 64-bit lane right by 63
+    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1; // lane 0 supplies bit 0, lane 1 supplies bit 1
+}
+inline int v_signmask(const v_uint64x2& a) // forwards to the signed 64-bit overload
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+inline int v_signmask(const v_float64x2& a) // doubles are reinterpreted as 64-bit integers first
+{ return v_signmask(v_reinterpret_as_s64(a)); }
+
+inline int v_scan_forward(const v_int8x16& a) { const int mask = v_signmask(a); return trailingZeros32(mask); } // trailing-zero count of the lane mask
+inline int v_scan_forward(const v_uint8x16& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_int16x8& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_uint16x8& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_int32x4& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_uint32x4& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_float32x4& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_int64x2& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_uint64x2& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+inline int v_scan_forward(const v_float64x2& a) { const int mask = v_signmask(a); return trailingZeros32(mask); }
+
+template<typename _Tpvec>
+inline bool v_check_all(const _Tpvec& a) // true when every lane compares less than zero
+{ const _Tpvec zeros = _Tpvec::zero(); return vec_all_lt(a.val, zeros.val); }
+inline bool v_check_all(const v_uint8x16& a) // unsigned and float inputs are reinterpreted as signed lanes of the same width
+{ const v_int8x16 s = v_reinterpret_as_s8(a); return v_check_all(s); }
+inline bool v_check_all(const v_uint16x8& a)
+{ const v_int16x8 s = v_reinterpret_as_s16(a); return v_check_all(s); }
+inline bool v_check_all(const v_uint32x4& a)
+{ const v_int32x4 s = v_reinterpret_as_s32(a); return v_check_all(s); }
+inline bool v_check_all(const v_uint64x2& a)
+{ const v_int64x2 s = v_reinterpret_as_s64(a); return v_check_all(s); }
+inline bool v_check_all(const v_float32x4& a)
+{ const v_int32x4 s = v_reinterpret_as_s32(a); return v_check_all(s); }
+inline bool v_check_all(const v_float64x2& a)
+{ const v_int64x2 s = v_reinterpret_as_s64(a); return v_check_all(s); }
+
+template<typename _Tpvec>
+inline bool v_check_any(const _Tpvec& a) // true when at least one lane compares less than zero
+{ const _Tpvec zeros = _Tpvec::zero(); return vec_any_lt(a.val, zeros.val); }
+inline bool v_check_any(const v_uint8x16& a) // unsigned and float inputs are reinterpreted as signed lanes of the same width
+{ const v_int8x16 s = v_reinterpret_as_s8(a); return v_check_any(s); }
+inline bool v_check_any(const v_uint16x8& a)
+{ const v_int16x8 s = v_reinterpret_as_s16(a); return v_check_any(s); }
+inline bool v_check_any(const v_uint32x4& a)
+{ const v_int32x4 s = v_reinterpret_as_s32(a); return v_check_any(s); }
+inline bool v_check_any(const v_uint64x2& a)
+{ const v_int64x2 s = v_reinterpret_as_s64(a); return v_check_any(s); }
+inline bool v_check_any(const v_float32x4& a)
+{ const v_int32x4 s = v_reinterpret_as_s32(a); return v_check_any(s); }
+inline bool v_check_any(const v_float64x2& a)
+{ const v_int64x2 s = v_reinterpret_as_s64(a); return v_check_any(s); }
+
+////////// Other math /////////
+
+/** Some frequent operations **/
+inline v_float32x4 v_sqrt(const v_float32x4& x) // per-lane square root
+{ const vec_float4 root = vec_sqrt(x.val); return v_float32x4(root); }
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{ const vec_double2 root = vec_sqrt(x.val); return v_float64x2(root); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x) // per-lane result of vec_rsqrt
+{ const vec_float4 inv = vec_rsqrt(x.val); return v_float32x4(inv); }
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{ const vec_double2 inv = vec_rsqrt(x.val); return v_float64x2(inv); }
+
+#define OPENCV_HAL_IMPL_VSX_MULADD(_Tpvec) /* multiply-add helpers built on vec_madd */ \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) /* sqrt(a*a + b*b) */ \
+{ return _Tpvec(vec_sqrt(vec_madd(a.val, a.val, vec_mul(b.val, b.val)))); } \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) /* a*a + b*b */ \
+{ return _Tpvec(vec_madd(a.val, a.val, vec_mul(b.val, b.val))); }           \
+inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) /* a*b + c */ \
+{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }                           \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) /* same body as v_fma */ \
+{ return _Tpvec(vec_madd(a.val, b.val, c.val)); }
+
+OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) // instantiates the four helpers for v_float32x4
+OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) // instantiates the four helpers for v_float64x2
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) // integer a*b + c through the vector operators
+{ const v_int32x4 product = a * b; return product + c; }
+
+// TODO: exp, log, sin, cos
+
+/** Absolute values **/
+inline v_uint8x16 v_abs(const v_int8x16& x) // signed input, result cast to the unsigned vector of the same width
+{ const vec_uchar16 mag = vec_uchar16_c(vec_abs(x.val)); return v_uint8x16(mag); }
+
+inline v_uint16x8 v_abs(const v_int16x8& x)
+{ const vec_ushort8 mag = vec_ushort8_c(vec_abs(x.val)); return v_uint16x8(mag); }
+
+inline v_uint32x4 v_abs(const v_int32x4& x)
+{ const vec_uint4 mag = vec_uint4_c(vec_abs(x.val)); return v_uint32x4(mag); }
+
+inline v_float32x4 v_abs(const v_float32x4& x) // floating-point inputs keep their type
+{ const vec_float4 mag = vec_abs(x.val); return v_float32x4(mag); }
+
+inline v_float64x2 v_abs(const v_float64x2& x)
+{ const vec_double2 mag = vec_abs(x.val); return v_float64x2(mag); }
+
+/** Absolute difference **/
+// unsigned
+OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_absdiff, vec_absd)
+
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) // signed inputs: max - min, reinterpreted as unsigned
+{ const v_int8x16 hi = v_max(a, b), lo = v_min(a, b); return v_reinterpret_as_u8(v_sub_wrap(hi, lo)); }
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{ const v_int16x8 hi = v_max(a, b), lo = v_min(a, b); return v_reinterpret_as_u16(v_sub_wrap(hi, lo)); }
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{ const v_int32x4 hi = v_max(a, b), lo = v_min(a, b); return v_reinterpret_as_u32(hi - lo); }
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) // floating point: |a - b|
+{ const v_float32x4 diff = a - b; return v_abs(diff); }
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{ const v_float64x2 diff = a - b; return v_abs(diff); }
+
+/** Absolute difference for signed integers **/
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) // vec_abss(vec_subs(a, b)), result stays signed
+{ const vec_char16 ad = vec_abss(vec_subs(a.val, b.val)); return v_int8x16(ad); }
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ const vec_short8 ad = vec_abss(vec_subs(a.val, b.val)); return v_int16x8(ad); }
+
+////////// Conversions /////////
+
+/** Rounding **/
+inline v_int32x4 v_round(const v_float32x4& a) // vec_rint, then conversion to signed 32-bit lanes
+{ const vec_float4 r = vec_rint(a.val); return v_int32x4(vec_cts(r)); }
+
+inline v_int32x4 v_round(const v_float64x2& a) // double input: converted lanes are merged with a zero vector
+{ const vec_double2 r = vec_rint(a.val); return v_int32x4(vec_mergesqo(vec_ctso(r), vec_int4_z)); }
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) // two double vectors merged into one int vector
+{ const vec_double2 ra = vec_rint(a.val), rb = vec_rint(b.val); return v_int32x4(vec_mergesqo(vec_ctso(ra), vec_ctso(rb))); }
+
+inline v_int32x4 v_floor(const v_float32x4& a) // vec_floor, then conversion
+{ const vec_float4 f = vec_floor(a.val); return v_int32x4(vec_cts(f)); }
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{ const vec_double2 f = vec_floor(a.val); return v_int32x4(vec_mergesqo(vec_ctso(f), vec_int4_z)); }
+
+inline v_int32x4 v_ceil(const v_float32x4& a) // vec_ceil, then conversion
+{ const vec_float4 c = vec_ceil(a.val); return v_int32x4(vec_cts(c)); }
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{ const vec_double2 c = vec_ceil(a.val); return v_int32x4(vec_mergesqo(vec_ctso(c), vec_int4_z)); }
+
+inline v_int32x4 v_trunc(const v_float32x4& a) // conversion only, no rounding call beforehand
+{ const vec_int4 t = vec_cts(a.val); return v_int32x4(t); }
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{ const vec_int4 t = vec_ctso(a.val); return v_int32x4(vec_mergesqo(t, vec_int4_z)); }
+
+/** To float **/
+inline v_float32x4 v_cvt_f32(const v_int32x4& a) // int32 lanes -> float32 lanes
+{ return v_float32x4(vec_ctf(a.val)); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a) // two doubles -> floats; the other operand of the merge is a zero vector
+{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_float4_z)); }
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) // two double vectors narrowed and merged into one float vector
+{ return v_float32x4(vec_mergesqo(vec_cvfo(a.val), vec_cvfo(b.val))); }
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a) // converts the half of `a` selected by vec_mergeh to double
+{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) // same, for the half selected by vec_mergel
+{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a) // float -> double for the half selected by vec_mergeh
+{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) // float -> double for the half selected by vec_mergel
+{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a) // int64 lanes -> double lanes
+{ return v_float64x2(vec_ctd(a.val)); }
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..15
+{
+    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
+                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) // 8 indices; each fetches 2 adjacent bytes at tab + idx[i] as one `short` (type-punned read)
+{
+    return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
+                                       *(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) // 4 indices; each fetches 4 adjacent bytes at tab + idx[i] as one `int` (type-punned read)
+{
+    return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } // unsigned wrappers cast the table and reuse the signed versions
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..7
+{
+    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) // 4 indices; each fetches 2 adjacent shorts at tab + idx[i] as one `int` (type-punned read)
+{
+    return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx) // 2 indices; each fetches 4 adjacent shorts at tab + idx[i] as one `int64` (type-punned read)
+{
+    return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); } // unsigned wrappers cast the table and reuse the signed versions
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..3
+{
+    return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) // 2 indices; each fetches 2 adjacent ints at tab + idx[i] as one `int64` (type-punned read)
+{
+    return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx) // only idx[0] is used: one vector load starting at tab + idx[0]
+{
+    return v_int32x4(vsx_ld(0, tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); } // unsigned wrappers cast the table and reuse the signed versions
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..1
+{
+    return v_int64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) // only idx[0] is used: one vector load starting at tab + idx[0]
+{
+    return v_int64x2(vsx_ld2(0, tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } // unsigned wrappers cast the table and reuse the signed versions
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+inline v_float32x4 v_lut(const float* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..3
+{
+    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); } // reuses the int pair loader on the float table's bits
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); } // only the first index is used: one vector load
+
+inline v_float64x2 v_lut(const double* tab, const int* idx) // gather: lane i = tab[idx[i]] for i = 0..1
+{
+    return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); } // only the first index is used: one vector load
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    // Gather with vector indices: read the four lane indices out of idxvec,
+    // then fetch the matching table entries one by one.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    return v_int32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{
+    // Gather with vector indices: read the four lane indices out of idxvec,
+    // then fetch the matching table entries one by one.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    return v_uint32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    // Gather with vector indices: read the four lane indices out of idxvec,
+    // then fetch the matching table entries one by one.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    const int i2 = vec_extract(idxvec.val, 2);
+    const int i3 = vec_extract(idxvec.val, 3);
+    return v_float32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    // Only two doubles fit in the result, so only index lanes 0 and 1
+    // of idxvec are consulted.
+    const int i0 = vec_extract(idxvec.val, 0);
+    const int i1 = vec_extract(idxvec.val, 1);
+    return v_float64x2(tab[i0], tab[i1]);
+}
+
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) // for each of the 4 indices reads the pair starting at tab + idx; first members go to x, second members to y
+{
+    vec_float4 xy0 = vec_ld_l8(tab + vec_extract(idxvec.val, 0)); // 8-byte load: one (x, y) float pair
+    vec_float4 xy1 = vec_ld_l8(tab + vec_extract(idxvec.val, 1));
+    vec_float4 xy2 = vec_ld_l8(tab + vec_extract(idxvec.val, 2));
+    vec_float4 xy3 = vec_ld_l8(tab + vec_extract(idxvec.val, 3));
+    vec_float4 xy02 = vec_mergeh(xy0, xy2); // x0, x2, y0, y2
+    vec_float4 xy13 = vec_mergeh(xy1, xy3); // x1, x3, y1, y3
+    x.val = vec_mergeh(xy02, xy13); // x0, x1, x2, x3
+    y.val = vec_mergel(xy02, xy13); // y0, y1, y2, y3
+}
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) // double variant: two pairs, taken from index lanes 0 and 1
+{
+    vec_double2 xy0 = vsx_ld(vec_extract(idxvec.val, 0), tab); // pair addressed by index lane 0 (offset is passed to vsx_ld rather than added to tab)
+    vec_double2 xy1 = vsx_ld(vec_extract(idxvec.val, 1), tab); // pair addressed by index lane 1
+    x.val = vec_mergeh(xy0, xy1); // first member of each pair
+    y.val = vec_mergel(xy0, xy1); // second member of each pair
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) // byte shuffle: within every group of 4 bytes the two middle bytes swap places
+{
+    static const vec_uchar16 perm = {0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 14, 13, 15}; // output byte i is input byte perm[i]
+    return v_int8x16(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec) // byte shuffle: within every group of 8 bytes the two 4-byte halves are interleaved
+{
+    static const vec_uchar16 perm = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; // output byte i is input byte perm[i]
+    return v_int8x16(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) // 16-bit lanes: within every group of 4 lanes the two middle lanes swap places
+{
+    static const vec_uchar16 perm = {0,1, 4,5, 2,3, 6,7, 8,9, 12,13, 10,11, 14,15}; // byte indices, listed two per 16-bit lane
+    return v_int16x8(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec) // 16-bit lanes: lanes 0..3 interleaved with lanes 4..7
+{
+    static const vec_uchar16 perm = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; // byte indices, listed two per 16-bit lane
+    return v_int16x8(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) // 32-bit lanes: lanes 1 and 2 swap places
+{
+    static const vec_uchar16 perm = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}; // byte indices, listed four per 32-bit lane
+    return v_int32x4(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) // float wrapper: shuffles the raw 32-bit lanes
+{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec) // keeps the first 3 bytes of every group of 4 and packs them together at the front
+{
+    static const vec_uchar16 perm = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 15, 15, 15}; // bytes 3, 7 and 11 are skipped; the last four slots repeat byte 15
+    return v_int8x16(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec) // 16-bit lanes: lanes 0,1,2 then 4,5,6 are packed together; lane 3 is skipped
+{
+    static const vec_uchar16 perm = {0,1, 2,3, 4,5, 8,9, 10,11, 12,13, 14,15, 14,15}; // byte indices; the last two slots both hold lane 7
+    return v_int16x8(vec_perm(vec.val, vec.val, perm));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) // unsigned wrapper over the signed version
+{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) // 32-bit variants return the input unchanged
+{ return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec)
+{ return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
+{ return vec; }
+
+/////// FP16 support ////////
+
+inline v_float32x4 v_load_expand(const float16_t* ptr) // loads 8 bytes of 16-bit float data from ptr and widens it to four float32 lanes
+{
+    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr); // raw 16-bit patterns, fetched with an 8-byte load
+#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
+    return v_float32x4(vec_extract_fp_from_shorth(vf16)); // path 1: compiler-provided conversion
+#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_float4 vf32;
+    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16))); // path 2: xvcvhpsp instruction through inline asm
+    return v_float32x4(vf32);
+#else
+    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000); // path 3: software conversion on the integer bit patterns
+    const vec_int4 signmask = vec_int4_sp(0x80000000); // top bit of each 32-bit lane
+    const vec_int4 maxexp = vec_int4_sp(0x7c000000); // exponent field of a half placed in the upper 16 bits
+    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000)); // float constant subtracted on the zero-exponent path
+
+    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16))); // pair each 16-bit value with a zero half-word to form a 32-bit lane
+    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask); // isolate exponent field and sign bit
+    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
+    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf)); // alternative result used when the exponent field is zero
+
+    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e))); // lanes whose exponent field is all ones get delta added once more
+    vec_bint4 zmask = vec_cmpeq(e, z); // lanes whose exponent field is zero
+    vec_int4 ft = vec_sel(t, zt, zmask); // pick zt for those lanes, t otherwise
+    return v_float32x4(vec_float4_c(vec_or(ft, sign))); // put the sign bit back
+#endif
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v) // narrows four float32 lanes to 16-bit floats and stores 8 bytes at ptr
+{
+// FIXME: is there any builtin op or intrinsic that covers "xvcvsphp"?
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_ushort8 vf16;
+    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val)); // hardware path: xvcvsphp instruction through inline asm
+    vec_st_l8(vec_mergesqe(vf16, vf16), ptr); // pick the converted half-words and store 8 bytes
+#else
+    const vec_int4 signmask = vec_int4_sp(0x80000000); // software path: work on the float32 bit patterns
+    const vec_int4 rval = vec_int4_sp(0x3f000000); // bit pattern of 0.5f, added and removed again on the tiny-value path
+
+    vec_int4 t = vec_int4_c(v.val);
+    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16)); // sign bit shifted down by 16 (arithmetic shift)
+    t = vec_and(vec_nor(signmask, signmask), t); // clear the sign bit: t now holds the magnitude bits
+
+    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t); // lanes with magnitude bits below 0x47800000
+    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000)); // magnitude bits above the float32 infinity pattern
+    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan); // 0x7e00 for those lanes, 0x7c00 otherwise
+    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t); // lanes with magnitude bits below 0x38800000
+    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval))); // tiny path: float add of rval ...
+    tt = vec_sub(tt, rval); // ... followed by an integer subtract of the same bits
+    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1)); // bit 13 of the magnitude, folded into the rounding add below
+    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff)); // exponent re-bias plus rounding constant
+    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13)); // add the odd bit, then drop the 13 low bits
+    t = vec_sel(nt, tt, tinymask); // tiny lanes take tt, the rest take nt
+    t = vec_sel(naninf, t, finitemask); // lanes outside finitemask take the naninf pattern
+    t = vec_or(t, sign); // merge the sign back in
+    vec_st_l8(vec_packs(t, t), ptr); // pack to 16 bits and store 8 bytes
+#endif
+}
+
+inline void v_cleanup() {} // empty body: nothing is done in this backend
+
+
+/** Reinterpret **/
+/** it's up there with load and store operations **/
+
+////////// Matrix operations /////////
+
+//////// Dot Product ////////
+// 16 >> 32
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) // 16-bit multiply-sum into 32-bit lanes, zero accumulator
+{ const vec_int4 sums = vec_msum(a.val, b.val, vec_int4_z); return v_int32x4(sums); }
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // same, accumulating onto c
+{ const vec_int4 sums = vec_msum(a.val, b.val, c.val); return v_int32x4(sums); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // 64-bit products from vec_mule and vec_mulo, added lane by lane
+    const vec_dword2 lhs = vec_mule(a.val, b.val), rhs = vec_mulo(a.val, b.val);
+    return v_int64x2(vec_add(lhs, rhs));
+}
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // two-argument result plus c
+{ const v_int64x2 dot = v_dotprod(a, b); return dot + c; }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) // unsigned 8-bit multiply-sum into 32-bit lanes, accumulating onto c
+{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) // same with a zero accumulator
+{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) // signed 8-bit variant, carried out as two 16-bit multiply-sums
+{
+    const vec_ushort8 eight = vec_ushort8_sp(8); // shift count applied to every 16-bit lane
+    vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even: rotate by one byte, then arithmetic shift right by 8
+    vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd: arithmetic shift right by 8 without the rotate
+    vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); // same split for b
+    vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
+    return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); // inner multiply-sum feeds the outer one as accumulator
+}
+
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) // signed 8-bit variant with c as the initial accumulator
+{
+    const vec_ushort8 eight = vec_ushort8_sp(8); // shift count applied to every 16-bit lane
+    vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even: rotate by one byte, then arithmetic shift right by 8
+    vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd: arithmetic shift right by 8 without the rotate
+    vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight); // same split for b
+    vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
+    return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val))); // c enters through the inner multiply-sum
+}
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) // unsigned 16-bit products accumulated into two 64-bit lanes
+{
+    const vec_uint4 zero = vec_uint4_z; // zero words used to widen 32-bit products to 64 bits
+    vec_uint4 even = vec_mule(a.val, b.val); // 32-bit products from vec_mule
+    vec_uint4 odd  = vec_mulo(a.val, b.val); // 32-bit products from vec_mulo
+    vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero); // pair product words with zero words and view them as 64-bit lanes
+    vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
+    vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
+    vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
+    vec_udword2 s0 = vec_add(e0, o0); // 64-bit additions of the widened products
+    vec_udword2 s1 = vec_add(e1, o1);
+    return v_uint64x2(vec_add(s0, s1));
+}
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // two-argument result plus c
+{ return v_dotprod_expand(a, b) + c; }
+
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) // signed variant built on the 16 >> 32 v_dotprod
+{
+    v_int32x4 prod = v_dotprod(a, b); // four 32-bit pair sums
+    v_int64x2 c, d;
+    v_expand(prod, c, d); // widen them into two 64-bit vectors
+    return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); // regroup with mergeh/mergel, then add
+}
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) // two-argument result plus c
+{ return v_dotprod_expand(a, b) + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) // 64-bit integer dot product converted to double
+{ const v_int64x2 dot = v_dotprod(a, b); return v_cvt_f64(dot); }
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) // two-argument result plus c
+{ const v_float64x2 dot = v_dotprod_expand(a, b); return dot + c; }
+
+//////// Fast Dot Product ////////
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) // forwards to v_dotprod
+{ v_int32x4 dot = v_dotprod(a, b); return dot; }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) // multiply-sum with a zero accumulator, c added afterwards
+{ const v_int32x4 dot(vec_msum(a.val, b.val, vec_int4_z)); return dot + c; }
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) // forwards to v_dotprod
+{ v_int64x2 dot = v_dotprod(a, b); return dot; }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) // forwards to the accumulating v_dotprod
+{ v_int64x2 dot = v_dotprod(a, b, c); return dot; }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) // forwards to v_dotprod_expand
+{ v_uint32x4 dot = v_dotprod_expand(a, b); return dot; }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) // multiply-sum with a zero accumulator, c added afterwards
+{ const v_uint32x4 dot(vec_msum(a.val, b.val, vec_uint4_z)); return dot + c; }
+
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{
+    // widen each half of both inputs to 16 bits, then chain two multiply-sums
+    const vec_short8 a_hi = vec_unpackh(a.val), a_lo = vec_unpackl(a.val);
+    const vec_short8 b_hi = vec_unpackh(b.val), b_lo = vec_unpackl(b.val);
+    const vec_int4 lo_sum = vec_msum(a_lo, b_lo, vec_int4_z);
+    return v_int32x4(vec_msum(a_hi, b_hi, lo_sum));
+}
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) // two-argument result plus c
+{ const v_int32x4 dot = v_dotprod_expand_fast(a, b); return dot + c; }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) // forwards to v_dotprod_expand
+{ v_uint64x2 dot = v_dotprod_expand(a, b); return dot; }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) // forwards to the accumulating v_dotprod_expand
+{ v_uint64x2 dot = v_dotprod_expand(a, b, c); return dot; }
+
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{
+    v_int64x2 lo, hi;
+    // widen the 32-bit pair sums into two 64-bit vectors and add them
+    v_expand(v_dotprod(a, b), lo, hi);
+    return lo + hi;
+}
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) // two-argument result plus c
+{ const v_int64x2 dot = v_dotprod_expand_fast(a, b); return dot + c; }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) // forwards to v_dotprod_expand
+{ v_float64x2 dot = v_dotprod_expand(a, b); return dot; }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) // forwards to the accumulating v_dotprod_expand
+{ v_float64x2 dot = v_dotprod_expand(a, b, c); return dot; }
+
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, // returns v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const vec_float4 v0 = vec_splat(v.val, 0); // lane 0 of v copied to every lane
+    const vec_float4 v1 = vec_splat(v.val, 1);
+    const vec_float4 v2 = vec_splat(v.val, 2);
+    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3); // declared through VSX_UNUSED, unlike v0..v2
+    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val))))); // innermost product first, then three chained multiply-adds
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    // v[0]*m0 + v[1]*m1 + v[2]*m2 + a, accumulated innermost-first with multiply-adds
+    const vec_float4 acc2 = vec_madd(vec_splat(v.val, 2), m2.val, a.val);
+    const vec_float4 acc1 = vec_madd(vec_splat(v.val, 1), m1.val, acc2);
+    return v_float32x4(vec_madd(vec_splat(v.val, 0), m0.val, acc1));
+}
+
+#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) /* 4x4 transpose: rows a0..a3 in, b0..b3 out */ \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,                   \
+                           const _Tpvec& a2, const _Tpvec& a3,                   \
+                           _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)       \
+{                                                                                \
+    _Tpvec2 a02 = vec_mergeh(a0.val, a2.val); /* first halves of rows 0 and 2 */ \
+    _Tpvec2 a13 = vec_mergeh(a1.val, a3.val); /* first halves of rows 1 and 3 */ \
+    b0.val = vec_mergeh(a02, a13);                                               \
+    b1.val = vec_mergel(a02, a13);                                               \
+    a02 = vec_mergel(a0.val, a2.val); /* second halves of rows 0 and 2 */ \
+    a13 = vec_mergel(a1.val, a3.val); /* second halves of rows 1 and 3 */ \
+    b2.val  = vec_mergeh(a02, a13);                                              \
+    b3.val  = vec_mergel(a02, a13);                                              \
+}
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4) // instantiate for each 4-lane 32-bit vector type
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
+OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
+
+template<int i, typename Tvec>
+inline Tvec v_broadcast_element(const Tvec& v) // every lane of the result is lane i of v
+{ Tvec splatted(vec_splat(v.val, i)); return splatted; }
+
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif // OPENCV_HAL_VSX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_wasm.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_wasm.hpp
new file mode 100644
index 0000000..b4178af
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/intrin_wasm.hpp
@@ -0,0 +1,2782 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_HAL_INTRIN_WASM_HPP
+#define OPENCV_HAL_INTRIN_WASM_HPP
+
+#include <limits>
+#include <cstring>
+#include <algorithm>
+#include "opencv2/core/saturate.hpp"
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it.
+#define CV_SIMD128_FP16 0
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+// Compatibility shim for Emscripten toolchains older than 1.38.46: those
+// releases expose the truncate/convert SIMD intrinsics under their pre-rename
+// names, so alias the new-style names to the old ones.
+#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
+// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
+#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
+#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
+#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
+#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
+#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
+#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
+#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
+#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
+#endif // COMPATIBILITY: <1.38.46
+
+///////// Types ///////////
+
+// Wrapper around a raw v128_t viewed as sixteen unsigned 8-bit lanes.
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 16 };
+
+    // Leaves val uninitialized.
+    v_uint8x16() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_uint8x16(v128_t v) : val(v) {}
+
+    // Builds the vector from sixteen scalars, loaded in argument order.
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+            uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        const uchar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7,
+                               v8, v9, v10, v11, v12, v13, v14, v15};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    uchar get0() const
+    {
+        return static_cast<uchar>(wasm_i8x16_extract_lane(val, 0));
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as sixteen signed 8-bit lanes.
+struct v_int8x16
+{
+    typedef schar lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 16 };
+
+    // Leaves val uninitialized.
+    v_int8x16() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_int8x16(v128_t v) : val(v) {}
+
+    // Builds the vector from sixteen scalars, loaded in argument order.
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+            schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        const schar lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7,
+                               v8, v9, v10, v11, v12, v13, v14, v15};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    schar get0() const
+    {
+        return wasm_i8x16_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as eight unsigned 16-bit lanes.
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 8 };
+
+    // Leaves val uninitialized.
+    v_uint16x8() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_uint16x8(v128_t v) : val(v) {}
+
+    // Builds the vector from eight scalars, loaded in argument order.
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        const ushort lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0. The signed extract is used and cast back because
+    // wasm_u16x8_extract_lane() is unimplemented yet.
+    ushort get0() const
+    {
+        return static_cast<ushort>(wasm_i16x8_extract_lane(val, 0));
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as eight signed 16-bit lanes.
+struct v_int16x8
+{
+    typedef short lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 8 };
+
+    // Leaves val uninitialized.
+    v_int16x8() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_int16x8(v128_t v) : val(v) {}
+
+    // Builds the vector from eight scalars, loaded in argument order.
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        const short lanes[] = {v0, v1, v2, v3, v4, v5, v6, v7};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    short get0() const
+    {
+        return wasm_i16x8_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as four unsigned 32-bit lanes.
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 4 };
+
+    // Leaves val uninitialized.
+    v_uint32x4() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_uint32x4(v128_t v) : val(v) {}
+
+    // Builds the vector from four scalars, loaded in argument order.
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        const unsigned lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    unsigned get0() const
+    {
+        return static_cast<unsigned>(wasm_i32x4_extract_lane(val, 0));
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as four signed 32-bit lanes.
+struct v_int32x4
+{
+    typedef int lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 4 };
+
+    // Leaves val uninitialized.
+    v_int32x4() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_int32x4(v128_t v) : val(v) {}
+
+    // Builds the vector from four scalars, loaded in argument order.
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        const int lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    int get0() const
+    {
+        return wasm_i32x4_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as four single-precision float lanes.
+struct v_float32x4
+{
+    typedef float lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 4 };
+
+    // Leaves val uninitialized.
+    v_float32x4() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_float32x4(v128_t v) : val(v) {}
+
+    // Builds the vector from four scalars, loaded in argument order.
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        const float lanes[] = {v0, v1, v2, v3};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    float get0() const
+    {
+        return wasm_f32x4_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as two unsigned 64-bit lanes.
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 2 };
+
+    // Leaves val uninitialized.
+    v_uint64x2() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_uint64x2(v128_t v) : val(v) {}
+
+    // Builds the vector from two scalars, loaded in argument order.
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        const uint64 lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    uint64 get0() const
+    {
+        return static_cast<uint64>(wasm_i64x2_extract_lane(val, 0));
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as two signed 64-bit lanes.
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 2 };
+
+    // Leaves val uninitialized.
+    v_int64x2() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_int64x2(v128_t v) : val(v) {}
+
+    // Builds the vector from two scalars, loaded in argument order.
+    v_int64x2(int64 v0, int64 v1)
+    {
+        const int64 lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    int64 get0() const
+    {
+        return wasm_i64x2_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// Wrapper around a raw v128_t viewed as two double-precision float lanes.
+struct v_float64x2
+{
+    typedef double lane_type;
+    typedef v128_t vector_type;
+    enum { nlanes = 2 };
+
+    // Leaves val uninitialized.
+    v_float64x2() {}
+
+    // Wraps an existing raw vector as-is.
+    explicit v_float64x2(v128_t v) : val(v) {}
+
+    // Builds the vector from two scalars, loaded in argument order.
+    v_float64x2(double v0, double v1)
+    {
+        const double lanes[] = {v0, v1};
+        val = wasm_v128_load(lanes);
+    }
+
+    // Returns lane 0.
+    double get0() const
+    {
+        return wasm_f64x2_extract_lane(val, 0);
+    }
+
+    v128_t val;
+};
+
+// File-local helpers: scalar bit reinterpretation and a byte popcount table.
+namespace
+{
+// Defines reinterpret_int(ft x) -> tt: writes x into a union and reads the
+// same storage back as the integer type tt, i.e. returns x's bits unchanged
+// (no numeric conversion). Each instantiation pairs a source type with the
+// signed integer type of the same width.
+#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
+inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
+OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
+OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
+OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
+OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
+OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
+
+// 256-entry lookup table: popCountTable[b] is the number of set bits in the
+// byte value b (0 for 0x00 up to 8 for 0xFF).
+static const unsigned char popCountTable[] =
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
+}  // namespace
+
+// Interleaves the low eight bytes of a and b: a0,b0,a1,b1,...,a7,b7.
+static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
+    return interleaved;
+}
+
+// Interleaves the low four 16-bit lanes of a and b: a0,b0,a1,b1,a2,b2,a3,b3.
+static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
+    return interleaved;
+}
+
+// Interleaves the low two 32-bit lanes of a and b: a0,b0,a1,b1.
+static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
+    return interleaved;
+}
+
+// Combines the low 64-bit lane of a with the low 64-bit lane of b: a0,b0.
+static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
+    const v128_t combined =
+        wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    return combined;
+}
+
+// Interleaves the high eight bytes of a and b: a8,b8,a9,b9,...,a15,b15.
+static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
+    return interleaved;
+}
+
+// Interleaves the high four 16-bit lanes of a and b: a4,b4,a5,b5,a6,b6,a7,b7.
+static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
+    return interleaved;
+}
+
+// Interleaves the high two 32-bit lanes of a and b: a2,b2,a3,b3.
+static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
+    const v128_t interleaved =
+        wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
+    return interleaved;
+}
+
+// Combines the high 64-bit lane of a with the high 64-bit lane of b: a1,b1.
+static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
+    const v128_t combined =
+        wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    return combined;
+}
+
+/** Convert **/
+// 8 >> 16
+// Zero-extends the low eight bytes of a to 16-bit lanes by interleaving them
+// with zero bytes.
+inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
+{
+    return wasm_unpacklo_i8x16(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the low eight bytes of a to 16-bit lanes: each byte is
+// duplicated into both halves of a 16-bit lane, then shifted right by 8 with
+// the signed 16-bit shift.
+inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
+{
+    const v128_t doubled = wasm_unpacklo_i8x16(a, a);
+    return wasm_i16x8_shr(doubled, 8);
+}
+// 8 >> 32
+// Zero-extends the low four bytes of a to 32-bit lanes via two rounds of
+// interleaving with zeros (8 -> 16 bits, then 16 -> 32 bits).
+inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
+{
+    const v128_t zero = wasm_i8x16_splat(0);
+    const v128_t as16 = wasm_unpacklo_i8x16(a, zero);
+    return wasm_unpacklo_i16x8(as16, zero);
+}
+// Sign-extends the low four bytes of a to 32-bit lanes: two self-interleaves
+// replicate each byte four times, then the signed 32-bit shift by 24 keeps
+// only the top copy.
+inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
+{
+    const v128_t twice = wasm_unpacklo_i8x16(a, a);
+    const v128_t fourTimes = wasm_unpacklo_i8x16(twice, twice);
+    return wasm_i32x4_shr(fourTimes, 24);
+}
+// 16 >> 32
+// Zero-extends the low four 16-bit lanes of a to 32-bit lanes by interleaving
+// them with zero lanes.
+inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
+{
+    return wasm_unpacklo_i16x8(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the low four 16-bit lanes of a to 32-bit lanes: each lane is
+// duplicated into both halves of a 32-bit lane, then shifted right by 16 with
+// the signed 32-bit shift.
+inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
+{
+    const v128_t doubled = wasm_unpacklo_i16x8(a, a);
+    return wasm_i32x4_shr(doubled, 16);
+}
+// 32 >> 64
+// Zero-extends the low two 32-bit lanes of a to 64-bit lanes by interleaving
+// them with zero lanes.
+inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
+{
+    return wasm_unpacklo_i32x4(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the low two 32-bit lanes of a to 64-bit lanes: each lane is
+// paired with its own sign word (the lane shifted right by 31).
+inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
+{
+    const v128_t signWords = wasm_i32x4_shr(a, 31);
+    return wasm_unpacklo_i32x4(a, signWords);
+}
+
+// 16 << 8
+// Zero-extends the high eight bytes of a to 16-bit lanes by interleaving them
+// with zero bytes.
+inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
+{
+    return wasm_unpackhi_i8x16(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the high eight bytes of a to 16-bit lanes: each byte is
+// duplicated into both halves of a 16-bit lane, then shifted right by 8 with
+// the signed 16-bit shift.
+inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
+{
+    const v128_t doubled = wasm_unpackhi_i8x16(a, a);
+    return wasm_i16x8_shr(doubled, 8);
+}
+// 32 << 16
+// Zero-extends the high four 16-bit lanes of a to 32-bit lanes by
+// interleaving them with zero lanes.
+inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
+{
+    return wasm_unpackhi_i16x8(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the high four 16-bit lanes of a to 32-bit lanes: each lane is
+// duplicated into both halves of a 32-bit lane, then shifted right by 16 with
+// the signed 32-bit shift.
+inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
+{
+    const v128_t doubled = wasm_unpackhi_i16x8(a, a);
+    return wasm_i32x4_shr(doubled, 16);
+}
+// 64 << 32
+// Zero-extends the high two 32-bit lanes of a to 64-bit lanes by interleaving
+// them with zero lanes.
+inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
+{
+    return wasm_unpackhi_i32x4(a, wasm_i8x16_splat(0));
+}
+// Sign-extends the high two 32-bit lanes of a to 64-bit lanes: each lane is
+// paired with its own sign word (the lane shifted right by 31).
+inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
+{
+    const v128_t signWords = wasm_i32x4_shr(a, 31);
+    return wasm_unpackhi_i32x4(a, signWords);
+}
+
+// For one vector wrapper type, defines three initializers:
+//   v_setzero_<suffix>()          - every lane set to zero
+//   v_setall_<suffix>(v)          - every lane set to v
+//   v_reinterpret_as_<suffix>(a)  - rewraps a.val of any wrapper type as
+//                                   _Tpvec without changing the bits
+// Parameters:
+//   _Tpvec  - wrapper type being defined for
+//   _Tp     - scalar type accepted by v_setall_<suffix>
+//   suffix  - name suffix of the generated functions
+//   zsuffix - selects the wasm_<zsuffix>_splat intrinsic
+//   _Tps    - scalar type the value is cast to before splatting
+#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
+{ return _Tpvec(a.val); }
+
+// Unsigned types reuse the signed splat intrinsic of the same lane width.
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
+OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
+
+//////////////// PACK ///////////////
+// Narrows two uint16x8 vectors into one uint8x16 with unsigned saturation:
+// lanes above 255 become 255. a fills the low eight lanes, b the high eight.
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t lo = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    const v128_t hi = wasm_v128_bitselect(limit, b.val, wasm_u16x8_gt(b.val, limit));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_uint8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Narrows two int16x8 vectors into one int8x16 with signed saturation to
+// [-128, 127]. a fills the low eight lanes, b the high eight.
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    // Clamp each input against the upper bound first, then the lower bound.
+    v128_t lo = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    lo = wasm_v128_bitselect(lower, lo, wasm_i16x8_lt(lo, lower));
+    v128_t hi = wasm_v128_bitselect(upper, b.val, wasm_i16x8_gt(b.val, upper));
+    hi = wasm_v128_bitselect(lower, hi, wasm_i16x8_lt(hi, lower));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_int8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Narrows two uint32x4 vectors into one uint16x8 with unsigned saturation:
+// lanes above 65535 become 65535. a fills the low four lanes, b the high four.
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t lo = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    const v128_t hi = wasm_v128_bitselect(limit, b.val, wasm_u32x4_gt(b.val, limit));
+    // Keep the low 16 bits of every clamped 32-bit lane.
+    return v_uint16x8(wasm_v8x16_shuffle(lo, hi, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+// Narrows two int32x4 vectors into one int16x8 with signed saturation to
+// [-32768, 32767]. a fills the low four lanes, b the high four.
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    // Clamp each input against the upper bound first, then the lower bound.
+    v128_t lo = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    lo = wasm_v128_bitselect(lower, lo, wasm_i32x4_lt(lo, lower));
+    v128_t hi = wasm_v128_bitselect(upper, b.val, wasm_i32x4_gt(b.val, upper));
+    hi = wasm_v128_bitselect(lower, hi, wasm_i32x4_lt(hi, lower));
+    // Keep the low 16 bits of every clamped 32-bit lane.
+    return v_int16x8(wasm_v8x16_shuffle(lo, hi, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+// Narrows two uint64x2 vectors into one uint32x4 by keeping the low 32 bits
+// of each 64-bit lane (no saturation). a fills the low two lanes, b the high two.
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    const v128_t lowWords =
+        wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    return v_uint32x4(lowWords);
+}
+// Narrows two int64x2 vectors into one int32x4 by keeping the low 32 bits of
+// each 64-bit lane (no saturation). a fills the low two lanes, b the high two.
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    const v128_t lowWords =
+        wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
+    return v_int32x4(lowWords);
+}
+// Narrows two signed int16x8 vectors into one uint8x16, saturating each lane
+// to [0, 255]. a fills the low eight lanes, b the high eight.
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    // Clamp each input against the upper bound first, then the lower bound.
+    v128_t lo = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    lo = wasm_v128_bitselect(lower, lo, wasm_i16x8_lt(lo, lower));
+    v128_t hi = wasm_v128_bitselect(upper, b.val, wasm_i16x8_gt(b.val, upper));
+    hi = wasm_v128_bitselect(lower, hi, wasm_i16x8_lt(hi, lower));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_uint8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Narrows two signed int32x4 vectors into one uint16x8, saturating each lane
+// to [0, 65535]. a fills the low four lanes, b the high four.
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    // Clamp each input against the upper bound first, then the lower bound.
+    v128_t lo = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    lo = wasm_v128_bitselect(lower, lo, wasm_i32x4_lt(lo, lower));
+    v128_t hi = wasm_v128_bitselect(upper, b.val, wasm_i32x4_gt(b.val, upper));
+    hi = wasm_v128_bitselect(lower, hi, wasm_i32x4_lt(hi, lower));
+    // Keep the low 16 bits of every clamped 32-bit lane.
+    return v_uint16x8(wasm_v8x16_shuffle(lo, hi, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+// Rounding right shift by n, then unsigned-saturating narrow of two uint16x8
+// vectors into one uint8x16 (a -> low eight lanes, b -> high eight).
+// Rounding adds 1 << (n-1) before the logical shift.
+// NOTE(review): the rounding add is the plain wasm_i16x8_add, so inputs close
+// to the 16-bit maximum may wrap before the shift - confirm callers stay in range.
+template<int n>
+inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const v128_t rounding = wasm_i16x8_splat(((short)1 << (n-1)));
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t shiftedA = wasm_u16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_u16x8_shr(wasm_i16x8_add(b.val, rounding), n);
+    const v128_t lo = wasm_v128_bitselect(limit, shiftedA, wasm_u16x8_gt(shiftedA, limit));
+    const v128_t hi = wasm_v128_bitselect(limit, shiftedB, wasm_u16x8_gt(shiftedB, limit));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_uint8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Rounding right shift by n, then signed-saturating narrow of two int16x8
+// vectors into one int8x16 (a -> low eight lanes, b -> high eight).
+// Rounding adds 1 << (n-1) before the signed shift; results are clamped to
+// [-128, 127].
+template<int n>
+inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    const v128_t rounding = wasm_i16x8_splat(((short)1 << (n-1)));
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    const v128_t shiftedA = wasm_i16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_i16x8_shr(wasm_i16x8_add(b.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t cappedA = wasm_v128_bitselect(upper, shiftedA, wasm_i16x8_gt(shiftedA, upper));
+    const v128_t cappedB = wasm_v128_bitselect(upper, shiftedB, wasm_i16x8_gt(shiftedB, upper));
+    const v128_t lo = wasm_v128_bitselect(lower, cappedA, wasm_i16x8_lt(shiftedA, lower));
+    const v128_t hi = wasm_v128_bitselect(lower, cappedB, wasm_i16x8_lt(shiftedB, lower));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_int8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Rounding right shift by n, then unsigned-saturating narrow of two uint32x4
+// vectors into one uint16x8 (a -> low four lanes, b -> high four).
+// Rounding adds 1 << (n-1) before the logical shift; results above 65535
+// become 65535.
+template<int n>
+inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    const v128_t rounding = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t shiftedA = wasm_u32x4_shr(wasm_i32x4_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_u32x4_shr(wasm_i32x4_add(b.val, rounding), n);
+    const v128_t lo = wasm_v128_bitselect(limit, shiftedA, wasm_u32x4_gt(shiftedA, limit));
+    const v128_t hi = wasm_v128_bitselect(limit, shiftedB, wasm_u32x4_gt(shiftedB, limit));
+    // Keep the low 16 bits of every clamped 32-bit lane.
+    return v_uint16x8(wasm_v8x16_shuffle(lo, hi, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+// Rounding right shift by n, then signed-saturating narrow of two int32x4
+// vectors into one int16x8 (a -> low four lanes, b -> high four).
+// Rounding adds 1 << (n-1) before the signed shift; results are clamped to
+// [-32768, 32767] and the low 16 bits of each lane are kept.
+template<int n>
+inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    // The lower bound is compared against 32-bit lanes, so it must be splatted
+    // per 32-bit lane. A 16-bit splat would make each lane 0x80008000 and
+    // leave values between that and -32768 unclamped.
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    // The lower test uses the unclamped value; a lane below minval is never
+    // above maxval, so a2/b2 still equal a1/b1 there.
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+// Rounding right shift by n of two uint64x2 vectors, then narrow to one
+// uint32x4 by keeping the low 32 bits of each lane (no saturation).
+// a fills the low two lanes, b the high two.
+template<int n>
+inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    const v128_t rounding = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shiftedA = wasm_u64x2_shr(wasm_i64x2_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_u64x2_shr(wasm_i64x2_add(b.val, rounding), n);
+    return v_uint32x4(wasm_v8x16_shuffle(shiftedA, shiftedB, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+// Rounding right shift by n of two int64x2 vectors (signed shift), then
+// narrow to one int32x4 by keeping the low 32 bits of each lane (no
+// saturation). a fills the low two lanes, b the high two.
+template<int n>
+inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    const v128_t rounding = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shiftedA = wasm_i64x2_shr(wasm_i64x2_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_i64x2_shr(wasm_i64x2_add(b.val, rounding), n);
+    return v_int32x4(wasm_v8x16_shuffle(shiftedA, shiftedB, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+// Rounding right shift by n of two signed int16x8 vectors, then narrow to one
+// uint8x16 with saturation to [0, 255] (a -> low eight lanes, b -> high eight).
+template<int n>
+inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    const v128_t rounding = wasm_i16x8_splat(((short)1 << (n-1)));
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    const v128_t shiftedA = wasm_i16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    const v128_t shiftedB = wasm_i16x8_shr(wasm_i16x8_add(b.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t cappedA = wasm_v128_bitselect(upper, shiftedA, wasm_i16x8_gt(shiftedA, upper));
+    const v128_t cappedB = wasm_v128_bitselect(upper, shiftedB, wasm_i16x8_gt(shiftedB, upper));
+    const v128_t lo = wasm_v128_bitselect(lower, cappedA, wasm_i16x8_lt(shiftedA, lower));
+    const v128_t hi = wasm_v128_bitselect(lower, cappedB, wasm_i16x8_lt(shiftedB, lower));
+    // Keep the low byte of every clamped 16-bit lane.
+    return v_uint8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+// Rounding right shift by n of two signed int32x4 vectors, then narrow to one
+// uint16x8 with saturation to [0, 65535] (a -> low four lanes, b -> high four).
+// Rounding adds 1 << (n-1) before the signed shift.
+template<int n>
+inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    // Splat the bound with the same 32-bit lane width it is compared at,
+    // matching the other 32-bit clamps in this file (the value is zero either way).
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    // The lower test uses the unclamped value; a negative lane is never above
+    // maxval, so a2/b2 still equal a1/b1 there.
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+// Saturates the eight uint16 lanes of a to [0, 255] and writes them as eight
+// bytes to ptr. Only eight bytes are written; a full vector is first stored
+// to a local buffer.
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t clamped = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(uchar));
+}
+// Saturates the eight int16 lanes of a to [-128, 127] and writes them as
+// eight signed bytes to ptr. Only eight bytes are written.
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    v128_t clamped = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    clamped = wasm_v128_bitselect(lower, clamped, wasm_i16x8_lt(clamped, lower));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(schar));
+}
+// Saturates the four uint32 lanes of a to [0, 65535] and writes them as four
+// ushort values to ptr. Only four elements are written.
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t clamped = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(ushort));
+}
+// Saturates the four int32 lanes of a to [-32768, 32767] and writes them as
+// four short values to ptr. Only four elements are written.
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    v128_t clamped = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    clamped = wasm_v128_bitselect(lower, clamped, wasm_i32x4_lt(clamped, lower));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(short));
+}
+// Writes the low 32 bits of each of the two uint64 lanes of a to ptr (no
+// saturation). Only two elements are written.
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    // Low 32 bits of both lanes, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned packed[4];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 2 * sizeof(unsigned));
+}
+// Writes the low 32 bits of each of the two int64 lanes of a to ptr (no
+// saturation). Only two elements are written.
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    // Low 32 bits of both lanes, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int packed[4];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 2 * sizeof(int));
+}
+// Saturates the eight signed int16 lanes of a to [0, 255] and writes them as
+// eight unsigned bytes to ptr. Only eight bytes are written.
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    v128_t clamped = wasm_v128_bitselect(upper, a.val, wasm_i16x8_gt(a.val, upper));
+    clamped = wasm_v128_bitselect(lower, clamped, wasm_i16x8_lt(clamped, lower));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(uchar));
+}
+// Saturates the four signed int32 lanes of a to [0, 65535] and writes them as
+// four ushort values to ptr. Only four elements are written.
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    v128_t clamped = wasm_v128_bitselect(upper, a.val, wasm_i32x4_gt(a.val, upper));
+    clamped = wasm_v128_bitselect(lower, clamped, wasm_i32x4_lt(clamped, lower));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(ushort));
+}
+
+// Rounding right shift by n of the eight uint16 lanes of a, saturation to
+// [0, 255], then a store of the eight resulting bytes to ptr.
+// NOTE(review): the rounding add is the plain wasm_i16x8_add, so inputs close
+// to the 16-bit maximum may wrap before the shift - confirm callers stay in range.
+template<int n>
+inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    const v128_t rounding = wasm_i16x8_splat((short)(1 << (n-1)));
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t shifted = wasm_u16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    const v128_t clamped = wasm_v128_bitselect(limit, shifted, wasm_u16x8_gt(shifted, limit));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(uchar));
+}
+// Rounding right shift by n of the eight int16 lanes of a (signed shift),
+// saturation to [-128, 127], then a store of the eight resulting bytes to ptr.
+template<int n>
+inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    const v128_t rounding = wasm_i16x8_splat(((short)1 << (n-1)));
+    const v128_t upper = wasm_i16x8_splat(127);
+    const v128_t lower = wasm_i16x8_splat(-128);
+    const v128_t shifted = wasm_i16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t capped = wasm_v128_bitselect(upper, shifted, wasm_i16x8_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, capped, wasm_i16x8_lt(shifted, lower));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(schar));
+}
+// Rounding right shift by n of the four uint32 lanes of a, saturation to
+// [0, 65535], then a store of the four resulting ushort values to ptr.
+template<int n>
+inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    const v128_t rounding = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t limit = wasm_i32x4_splat(65535);
+    const v128_t shifted = wasm_u32x4_shr(wasm_i32x4_add(a.val, rounding), n);
+    const v128_t clamped = wasm_v128_bitselect(limit, shifted, wasm_u32x4_gt(shifted, limit));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(ushort));
+}
+// Rounding right shift by n of the four int32 lanes of a (signed shift),
+// saturation to [-32768, 32767], then a store of the four resulting short
+// values to ptr.
+template<int n>
+inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    const v128_t rounding = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t upper = wasm_i32x4_splat(32767);
+    const v128_t lower = wasm_i32x4_splat(-32768);
+    const v128_t shifted = wasm_i32x4_shr(wasm_i32x4_add(a.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t capped = wasm_v128_bitselect(upper, shifted, wasm_i32x4_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, capped, wasm_i32x4_lt(shifted, lower));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(short));
+}
+// Rounding right shift by n of the two uint64 lanes of a, then a store of the
+// low 32 bits of each lane to ptr (no saturation). Two elements are written.
+template<int n>
+inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    const v128_t rounding = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shifted = wasm_u64x2_shr(wasm_i64x2_add(a.val, rounding), n);
+    // Low 32 bits of both lanes, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(shifted, shifted, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned packed[4];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 2 * sizeof(unsigned));
+}
+// Rounding right shift by n of the two int64 lanes of a (signed shift), then
+// a store of the low 32 bits of each lane to ptr (no saturation). Two
+// elements are written.
+template<int n>
+inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    const v128_t rounding = wasm_i64x2_splat(((int64)1 << (n-1)));
+    const v128_t shifted = wasm_i64x2_shr(wasm_i64x2_add(a.val, rounding), n);
+    // Low 32 bits of both lanes, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(shifted, shifted, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int packed[4];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 2 * sizeof(int));
+}
+// Rounding right shift by n of the eight signed int16 lanes of a, saturation
+// to [0, 255], then a store of the eight resulting unsigned bytes to ptr.
+template<int n>
+inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    const v128_t rounding = wasm_i16x8_splat(((short)1 << (n-1)));
+    const v128_t upper = wasm_i16x8_splat(255);
+    const v128_t lower = wasm_i16x8_splat(0);
+    const v128_t shifted = wasm_i16x8_shr(wasm_i16x8_add(a.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t capped = wasm_v128_bitselect(upper, shifted, wasm_i16x8_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, capped, wasm_i16x8_lt(shifted, lower));
+    // Low byte of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar packed[16];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 8 * sizeof(uchar));
+}
+// Rounding right shift by n of the four signed int32 lanes of a, saturation
+// to [0, 65535], then a store of the four resulting ushort values to ptr.
+template<int n>
+inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    const v128_t rounding = wasm_i32x4_splat(((int)1 << (n-1)));
+    const v128_t upper = wasm_i32x4_splat(65535);
+    const v128_t lower = wasm_i32x4_splat(0);
+    const v128_t shifted = wasm_i32x4_shr(wasm_i32x4_add(a.val, rounding), n);
+    // Upper clamp, then lower clamp (the lower test uses the unclamped value).
+    const v128_t capped = wasm_v128_bitselect(upper, shifted, wasm_i32x4_gt(shifted, upper));
+    const v128_t clamped = wasm_v128_bitselect(lower, capped, wasm_i32x4_lt(shifted, lower));
+    // Low 16 bits of every lane, repeated to fill the vector.
+    const v128_t narrowed = wasm_v8x16_shuffle(clamped, clamped, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort packed[8];
+    wasm_v128_store(packed, narrowed);
+    std::memcpy(ptr, packed, 4 * sizeof(ushort));
+}
+
+// Packs two uint16x8 vectors into one uint8x16: each lane is clamped to at
+// most 255 and its low byte kept. a fills the low eight lanes, b the high eight.
+// NOTE(review): presumably intended for boolean mask lanes - confirm with callers.
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    const v128_t limit = wasm_i16x8_splat(255);
+    const v128_t lo = wasm_v128_bitselect(limit, a.val, wasm_u16x8_gt(a.val, limit));
+    const v128_t hi = wasm_v128_bitselect(limit, b.val, wasm_u16x8_gt(b.val, limit));
+    return v_uint8x16(wasm_v8x16_shuffle(lo, hi, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+
+// Packs four uint32x4 vectors into one uint8x16: each lane is clamped to at
+// most 255 and its low byte kept. Output byte order is a0..a3, b0..b3,
+// c0..c3, d0..d3.
+// NOTE(review): presumably intended for boolean mask lanes - confirm with callers.
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    const v128_t limit = wasm_i32x4_splat(255);
+    const v128_t ca = wasm_v128_bitselect(limit, a.val, wasm_u32x4_gt(a.val, limit));
+    const v128_t cb = wasm_v128_bitselect(limit, b.val, wasm_u32x4_gt(b.val, limit));
+    const v128_t cc = wasm_v128_bitselect(limit, c.val, wasm_u32x4_gt(c.val, limit));
+    const v128_t cd = wasm_v128_bitselect(limit, d.val, wasm_u32x4_gt(d.val, limit));
+    // Gather the low byte of each lane; the low eight bytes of each pair hold
+    // the useful data.
+    const v128_t pairAB = wasm_v8x16_shuffle(ca, cb, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    const v128_t pairCD = wasm_v8x16_shuffle(cc, cd, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    // Concatenate the low halves of the two pairs.
+    return v_uint8x16(wasm_v8x16_shuffle(pairAB, pairCD, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+// Packs eight uint64x2 vectors into one uint8x16: each 64-bit lane is clamped
+// to at most 255 and its low byte kept. Output byte order is a0,a1,b0,b1,
+// c0,c1,...,h0,h1.
+// NOTE(review): presumably intended for boolean mask lanes (all-zeros or
+// all-ones), for which the result is the same as before this fix.
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{
+    // The limit is compared on 64-bit lanes, so it must be splatted per 64-bit
+    // lane. A 32-bit splat would make each lane 0x000000FF000000FF and leave
+    // values between 256 and that bound unclamped.
+    v128_t maxval = wasm_i64x2_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
+    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
+    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
+    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
+    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
+    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
+    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
+    // Low byte of each lane of a pair lands in bytes 0..3 (then repeats).
+    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
+    // Merge pairs: four useful bytes from each side into bytes 0..7.
+    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
+    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
+    // Concatenate the two eight-byte halves.
+    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+// Computes v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3 lane-wise: each lane of v is
+// broadcast and multiplied by the matching m vector. The products are summed
+// as (p0 + p1) + (p2 + p3).
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    const v128_t p0 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)), m0.val);
+    const v128_t p1 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)), m1.val);
+    const v128_t p2 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)), m2.val);
+    const v128_t p3 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3)), m3.val);
+
+    const v128_t sum01 = wasm_f32x4_add(p0, p1);
+    const v128_t sum23 = wasm_f32x4_add(p2, p3);
+    return v_float32x4(wasm_f32x4_add(sum01, sum23));
+}
+
+// Returns v[0]*m0 + v[1]*m1 + v[2]*m2 + a; lane 3 of v is not used.
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    // Scale each matrix vector by the matching broadcast lane of v.
+    const v128_t term0 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)), m0.val);
+    const v128_t term1 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)), m1.val);
+    const v128_t term2 = wasm_f32x4_mul(wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)), m2.val);
+
+    // Pairwise summation, (term0 + term1) + (term2 + a).
+    const v128_t sum01 = wasm_f32x4_add(term0, term1);
+    const v128_t sum2a = wasm_f32x4_add(term2, a.val);
+    return v_float32x4(wasm_f32x4_add(sum01, sum2a));
+}
+
+// Defines the binary operator `bin_op` and its compound-assignment form
+// `bin_op=` for the vector type _Tpvec by forwarding both operands' `val`
+// registers to the WASM intrinsic `intrin`.
+#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
+inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+} \
+inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+{ \
+    a.val = intrin(a.val, b.val); \
+    return a; \
+}
+
+// 8- and 16-bit integer +/- map to the saturating intrinsics; 32- and 64-bit
+// integer +/- map to the plain (wrapping) add/sub intrinsics.
+// 8/16-bit `*` is defined separately below; no `*` is defined here for 64-bit
+// integers and no `/` for any integer type.
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
+OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
+OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
+
+// saturating multiply 8-bit, 16-bit
+// Defines `*` and `*=` for _Tpvec: the operands are multiplied into two
+// vectors of the wider type _Tpwvec with v_mul_expand, and the two halves are
+// narrowed back to _Tpvec with v_pack.
+// NOTE(review): the saturation itself is presumably performed by v_pack, which
+// is defined outside this section - confirm there.
+#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec)        \
+inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
+{                                                            \
+    _Tpwvec c, d;                                            \
+    v_mul_expand(a, b, c, d);                                \
+    return v_pack(c, d);                                     \
+}                                                            \
+inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
+{ a = a * b; return a; }
+
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16,  v_int16x8)
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
+OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8,  v_int32x4)
+
+//  Multiply and expand
+// Unsigned 8-bit widening multiply: c receives the 16-bit products of the
+// first v_expand half of a and b, d the products of the second half.
+inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
+                         v_uint16x8& c, v_uint16x8& d)
+{
+    v_uint16x8 a_lo, a_hi;
+    v_uint16x8 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    // 255 * 255 fits in 16 bits, so the wrapping multiply is exact here.
+    c = v_mul_wrap(a_lo, b_lo);
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+// Signed 8-bit widening multiply: c receives the 16-bit products of the first
+// v_expand half of a and b, d the products of the second half.
+inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
+                         v_int16x8& c, v_int16x8& d)
+{
+    v_int16x8 a_lo, a_hi;
+    v_int16x8 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    // |product| <= 128 * 128, which fits in 16 bits, so no wrap-around occurs.
+    c = v_mul_wrap(a_lo, b_lo);
+    d = v_mul_wrap(a_hi, b_hi);
+}
+
+// Signed 16-bit widening multiply: both operands are expanded to 32-bit lanes
+// and multiplied; c holds the products of the first half, d of the second.
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    v_int32x4 a_lo, a_hi;
+    v_int32x4 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c.val = wasm_i32x4_mul(a_lo.val, b_lo.val);
+    d.val = wasm_i32x4_mul(a_hi.val, b_hi.val);
+}
+
+// Unsigned 16-bit widening multiply: both operands are expanded to 32-bit
+// lanes and multiplied; c holds the products of the first half, d of the second.
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    v_uint32x4 a_lo, a_hi;
+    v_uint32x4 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    // 65535 * 65535 fits in 32 bits, so the low 32 bits are the full product.
+    c.val = wasm_i32x4_mul(a_lo.val, b_lo.val);
+    d.val = wasm_i32x4_mul(a_hi.val, b_hi.val);
+}
+
+// Unsigned 32-bit widening multiply: both operands are expanded to 64-bit
+// lanes and multiplied with the compiler's vector-extension `*` operator.
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    v_uint64x2 a_lo, a_hi;
+    v_uint64x2 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    c.val = ((__u64x2)(a_lo.val) * (__u64x2)(b_lo.val));
+    d.val = ((__u64x2)(a_hi.val) * (__u64x2)(b_hi.val));
+}
+
+// Returns, per lane, bytes 2..3 of the 32-bit product of the signed 16-bit
+// inputs, i.e. the upper 16 bits of each product.
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
+{
+    v_int32x4 a_lo, a_hi;
+    v_int32x4 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    const v128_t prod_lo = wasm_i32x4_mul(a_lo.val, b_lo.val);
+    const v128_t prod_hi = wasm_i32x4_mul(a_hi.val, b_hi.val);
+    // Indices 0..15 address prod_lo, 16..31 address prod_hi.
+    return v_int16x8(wasm_v8x16_shuffle(prod_lo, prod_hi, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
+}
+// Returns, per lane, bytes 2..3 of the 32-bit product of the unsigned 16-bit
+// inputs, i.e. the upper 16 bits of each product.
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 a_lo, a_hi;
+    v_uint32x4 b_lo, b_hi;
+    v_expand(a, a_lo, a_hi);
+    v_expand(b, b_lo, b_hi);
+    const v128_t prod_lo = wasm_i32x4_mul(a_lo.val, b_lo.val);
+    const v128_t prod_hi = wasm_i32x4_mul(a_hi.val, b_hi.val);
+    // Indices 0..15 address prod_lo, 16..31 address prod_hi.
+    return v_uint16x8(wasm_v8x16_shuffle(prod_lo, prod_hi, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
+}
+
+//////// Dot Product ////////
+
+// 16 >> 32: multiplies corresponding signed 16-bit lanes and adds each
+// adjacent pair of products, producing four 32-bit sums.
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    // Even lanes: shift left, then shift right again to sign-extend the low 16 bits.
+    const v128_t a_even = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
+    const v128_t b_even = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    // Odd lanes: shifting right brings the high 16 bits down.
+    const v128_t a_odd = wasm_i32x4_shr(a.val, 16);
+    const v128_t b_odd = wasm_i32x4_shr(b.val, 16);
+
+    const v128_t prod_even = wasm_i32x4_mul(a_even, b_even);
+    const v128_t prod_odd = wasm_i32x4_mul(a_odd, b_odd);
+    return v_int32x4(wasm_i32x4_add(prod_even, prod_odd));
+}
+
+// Accumulating form: the pairwise dot product of a and b plus c.
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    return v_dotprod(a, b) + c;
+}
+
+// 32 >> 64: multiplies corresponding signed 32-bit lanes and adds each
+// adjacent pair of products, producing two 64-bit sums.
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
+{
+    // Even lanes: shift left, then shift right again to sign-extend the low 32 bits.
+    const v128_t a_even = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
+    const v128_t b_even = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
+    // Odd lanes: shifting right brings the high 32 bits down.
+    const v128_t a_odd = wasm_i64x2_shr(a.val, 32);
+    const v128_t b_odd = wasm_i64x2_shr(b.val, 32);
+
+    // 64-bit lane multiply through the compiler's vector-extension operator.
+    const v128_t prod_even = (v128_t)((__i64x2)a_even * (__i64x2)b_even);
+    const v128_t prod_odd = (v128_t)((__i64x2)a_odd * (__i64x2)b_odd);
+    return v_int64x2(wasm_i64x2_add(prod_even, prod_odd));
+}
+// Accumulating form: the pairwise dot product of a and b plus c.
+inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{
+    const v_int64x2 sums = v_dotprod(a, b);
+    return sums + c;
+}
+
+// 8 >> 32
+// Unsigned 8-bit dot product: each 32-bit output lane is the sum of the four
+// byte products that fall inside that 32-bit group.
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // Split every 16-bit lane into its low byte (even) and high byte (odd),
+    // zero-extended to 16 bits.
+    const v128_t a_even = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    const v128_t b_even = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    const v128_t a_odd = wasm_u16x8_shr(a.val, 8);
+    const v128_t b_odd = wasm_u16x8_shr(b.val, 8);
+
+    // Values are at most 255, so they are also valid signed 16-bit inputs.
+    const v_int32x4 even_sum = v_dotprod(v_int16x8(a_even), v_int16x8(b_even));
+    const v_int32x4 odd_sum = v_dotprod(v_int16x8(a_odd), v_int16x8(b_odd));
+    return v_uint32x4((even_sum + odd_sum).val);
+}
+// Accumulating form: the expanded dot product of a and b plus c.
+inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// Signed 8-bit dot product: each 32-bit output lane is the sum of the four
+// byte products that fall inside that 32-bit group.
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
+{
+    // Split every 16-bit lane into its low byte (even) and high byte (odd),
+    // sign-extended to 16 bits.
+    const v128_t a_even = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
+    const v128_t b_even = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
+    const v128_t a_odd = wasm_i16x8_shr(a.val, 8);
+    const v128_t b_odd = wasm_i16x8_shr(b.val, 8);
+
+    const v_int32x4 even_sum = v_dotprod(v_int16x8(a_even), v_int16x8(b_even));
+    const v_int32x4 odd_sum = v_dotprod(v_int16x8(a_odd), v_int16x8(b_odd));
+    return v_int32x4(even_sum + odd_sum);
+}
+// Accumulating form: the expanded dot product of a and b plus c.
+inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// 16 >> 64
+// Unsigned 16-bit dot product: each 64-bit output lane is the sum of the four
+// 16-bit products that fall inside that 64-bit group.
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // Split every 32-bit lane into its low half (even) and high half (odd),
+    // zero-extended to 32 bits.
+    const v128_t a_even = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
+    const v128_t b_even = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    const v128_t a_odd = wasm_u32x4_shr(a.val, 16);
+    const v128_t b_odd = wasm_u32x4_shr(b.val, 16);
+
+    // Values are at most 65535, so they are also valid signed 32-bit inputs.
+    const v_int64x2 even_sum = v_dotprod(v_int32x4(a_even), v_int32x4(b_even));
+    const v_int64x2 odd_sum = v_dotprod(v_int32x4(a_odd), v_int32x4(b_odd));
+    return v_uint64x2((even_sum + odd_sum).val);
+}
+// Accumulating form: the expanded dot product of a and b plus c.
+inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// Signed 16-bit dot product: each 64-bit output lane is the sum of the four
+// 16-bit products that fall inside that 64-bit group.
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
+{
+    // Split every 32-bit lane into its low half (even) and high half (odd),
+    // sign-extended to 32 bits.
+    const v128_t a_even = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
+    const v128_t b_even = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
+    const v128_t a_odd = wasm_i32x4_shr(a.val, 16);
+    const v128_t b_odd = wasm_i32x4_shr(b.val, 16);
+
+    const v_int64x2 even_sum = v_dotprod(v_int32x4(a_even), v_int32x4(b_even));
+    const v_int64x2 odd_sum = v_dotprod(v_int32x4(a_odd), v_int32x4(b_odd));
+    return v_int64x2(even_sum + odd_sum);
+}
+
+// Accumulating form: the expanded dot product of a and b plus c.
+inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{
+    return v_dotprod_expand(a, b) + c;
+}
+
+// 32 >> 64f
+// Pairwise 32-bit dot product (see v_dotprod) converted to double precision.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
+{
+    const v_int64x2 sums = v_dotprod(a, b);
+    return v_cvt_f64(sums);
+}
+// Accumulating form: the converted dot product of a and b plus c.
+inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{
+    const v_float64x2 sums = v_dotprod_expand(a, b);
+    return sums + c;
+}
+
+//////// Fast Dot Product ////////
+
+// In this backend every "_fast" variant simply forwards to the regular
+// v_dotprod / v_dotprod_expand overload with the same arguments, so the
+// results are identical to the non-fast functions.
+
+// 16 >> 32
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod(a, b); }
+inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{ return v_dotprod(a, b, c); }
+
+// 32 >> 64
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod(a, b); }
+inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
+{ return v_dotprod(a, b, c); }
+
+// 8 >> 32
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 16 >> 64
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
+{ return v_dotprod_expand(a, b); }
+inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// 32 >> 64f
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
+{ return v_dotprod_expand(a, b); }
+inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
+{ return v_dotprod_expand(a, b, c); }
+
+// Defines the bitwise operators &, |, ^ (with their compound-assignment
+// forms) and unary ~ for _Tpvec. All of them act on the whole 128-bit
+// register, so the same intrinsics serve every lane type, including the
+// floating-point vectors (which are treated as raw bits).
+#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
+OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
+OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
+OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
+inline _Tpvec operator ~ (const _Tpvec& a) \
+{ \
+    return _Tpvec(wasm_v128_not(a.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
+OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)
+
+// Square root of each single-precision lane.
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{
+    const v128_t roots = wasm_f32x4_sqrt(x.val);
+    return v_float32x4(roots);
+}
+
+// Reciprocal square root of each single-precision lane, computed as a full
+// division 1 / sqrt(x) rather than with an approximate instruction.
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
+    const v128_t ones = wasm_f32x4_splat(1.0);
+    const v128_t roots = wasm_f32x4_sqrt(x.val);
+    return v_float32x4(wasm_f32x4_div(ones, roots));
+}
+
+// Square root of each double-precision lane.
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{
+    const v128_t roots = wasm_f64x2_sqrt(x.val);
+    return v_float64x2(roots);
+}
+
+// Reciprocal square root of each double-precision lane, computed as a full
+// division 1 / sqrt(x) rather than with an approximate instruction.
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    const v128_t ones = wasm_f64x2_splat(1.0);
+    const v128_t roots = wasm_f64x2_sqrt(x.val);
+    return v_float64x2(wasm_f64x2_div(ones, roots));
+}
+
+// Defines v_abs for a signed integer vector, returning the unsigned vector type.
+// With shiftWidth = (lane bits - 1):
+//   s = x shifted right with the `suffix` (unsigned) intrinsic -> 1 for negative lanes, else 0
+//   f = x shifted right with the `zsuffix` (signed) intrinsic  -> all-ones for negative lanes, else 0
+// (x ^ f) + s is therefore ~x + 1 == -x for negative lanes and x otherwise.
+// The addition wraps, so the most negative value maps to its unsigned magnitude
+// (e.g. -128 -> 128 for 8-bit lanes).
+#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
+inline _Tpuvec v_abs(const _Tpsvec& x) \
+{ \
+    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
+    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
+    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
+}
+
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
+OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)
+
+// Absolute value of each single-precision lane.
+inline v_float32x4 v_abs(const v_float32x4& x)
+{
+    const v128_t magnitude = wasm_f32x4_abs(x.val);
+    return v_float32x4(magnitude);
+}
+// Absolute value of each double-precision lane.
+inline v_float64x2 v_abs(const v_float64x2& x)
+{
+    const v128_t magnitude = wasm_f64x2_abs(x.val);
+    return v_float64x2(magnitude);
+}
+
+// TODO: exp, log, sin, cos
+
+// Defines a named two-operand function `func` for _Tpvec that forwards both
+// operands' registers to the WASM intrinsic `intrin`.
+#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+// Floating-point lane-wise minimum / maximum.
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)
+
+// Defines lane-wise v_min / v_max for signed integer vectors.
+// The mask (a > b) is computed once per call; wasm_v128_bitselect(x, y, mask)
+// takes bits from x where the mask is set and from y elsewhere, so
+//   v_min picks b where a > b, otherwise a;
+//   v_max picks a where a > b, otherwise b.
+#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
+inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
+} \
+inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
+}
+
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)
+
+// Defines lane-wise v_min / v_max for unsigned integer vectors using the
+// signed greater-than intrinsic. deltaNum is the sign bit of the lane type;
+// XOR-ing both operands with it flips the top bit, which maps unsigned
+// ordering onto signed ordering so the signed compare gives the unsigned result.
+// The selection itself is made from the original (un-flipped) operands.
+#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
+inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t delta = wasm_##suffix##_splat(deltaNum); \
+    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
+    return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
+} \
+inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t delta = wasm_##suffix##_splat(deltaNum); \
+    v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
+OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
+
+// Defines the six lane-wise comparison operators for _Tpvec; each returns the
+// intrinsic's result wrapped in _Tpvec (a per-lane mask).
+//   suffix  - intrinsic family used for the ordered comparisons (<, >, <=, >=);
+//             the unsigned family (u8x16, ...) is passed for unsigned vectors.
+//   esuffix - intrinsic family used for == and !=; the signed-named family is
+//             passed for both signednesses.
+#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
+OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
+
+// Defines == and != for 64-bit integer vectors; each 64-bit lane of the result
+// is all-ones when the comparison holds and zero otherwise.
+// The comparison is done on the integer bit patterns: the operands are
+// compared as 32-bit lanes, then every 32-bit result is combined with the
+// result of the other half of the same 64-bit lane (AND for ==, OR for !=).
+// Comparing through a float64 reinterpretation is not valid for integers:
+// NaN bit patterns never compare equal to themselves, and 0x8000000000000000
+// (-0.0) compares equal to 0.
+// The `cast` parameter is unused; it is kept so existing instantiations compile.
+#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t eq32 = wasm_i32x4_eq(a.val, b.val); \
+    v128_t eq32_swapped = wasm_v8x16_shuffle(eq32, eq32, 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11); \
+    return _Tpvec(wasm_v128_and(eq32, eq32_swapped)); \
+} \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t ne32 = wasm_i32x4_ne(a.val, b.val); \
+    v128_t ne32_swapped = wasm_v8x16_shuffle(ne32, ne32, 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11); \
+    return _Tpvec(wasm_v128_or(ne32, ne32_swapped)); \
+}
+
+OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
+OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
+
+// Returns a per-lane mask that is all-ones where a is not NaN and zero where
+// it is NaN. A value compares equal to itself exactly when it is not NaN, so
+// +/-infinity is reported as "not NaN" (a strict '<' test against the exponent
+// mask 0x7f800000 would wrongly classify infinity as NaN).
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{
+    return v_float32x4(wasm_f32x4_eq(a.val, a.val));
+}
+// Returns a per-lane mask that is all-ones where a is not NaN and zero where
+// it is NaN. A value compares equal to itself exactly when it is not NaN, so
+// +/-infinity is reported as "not NaN" (a strict '<' test against the exponent
+// mask 0x7ff0000000000000 would wrongly classify infinity as NaN).
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{
+    return v_float64x2(wasm_f64x2_eq(a.val, a.val));
+}
+
+// Wrapping (non-saturating) add / subtract / multiply for 8- and 16-bit lanes.
+// The plain integer intrinsics are used, in contrast to the saturating
+// intrinsics behind operator+ / operator- for these types.
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
+// For Emscripten 1.39.12 and newer the 8-bit multiply is done lane by lane in
+// scalar code instead of calling wasm_i8x16_mul; older versions use the
+// intrinsic. NOTE(review): presumably the intrinsic is unavailable from that
+// version on - see the linked issues for the exact reason.
+#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
+// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
+// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
+inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
+{
+    // Spill both operands to memory, multiply element-wise, reload.
+    uchar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (uchar)(a_[i] * b_[i]);  // the cast keeps only the low 8 bits
+    return v_uint8x16(wasm_v128_load(a_));
+}
+inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
+{
+    // Spill both operands to memory, multiply element-wise, reload.
+    schar a_[16], b_[16];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    for (int i = 0; i < 16; i++)
+        a_[i] = (schar)(a_[i] * b_[i]);  // the cast keeps only the low 8 bits
+    return v_int8x16(wasm_v128_load(a_));
+}
+#else
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
+#endif
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
+OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
+
+
+/** Absolute difference **/
+
+// For 8- and 16-bit unsigned lanes operator- saturates at zero, so one of
+// (a - b) and (b - a) is zero and the other is |a - b|; adding them yields the
+// absolute difference.
+inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
+{ return v_add_wrap(a - b,  b - a); }
+inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
+{ return v_add_wrap(a - b,  b - a); }
+// 32-bit operator- wraps instead of saturating, so max - min is used here.
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_max(a, b) - v_min(a, b); }
+
+// Absolute difference of signed 8-bit lanes, returned as unsigned so that the
+// full range 0..255 is representable.
+inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_int8x16 diff = v_sub_wrap(a, b);
+    const v_int8x16 is_less = a < b;  // all-ones where a < b
+    // Conditional negate: (diff ^ -1) - (-1) == -diff, (diff ^ 0) - 0 == diff.
+    const v_int8x16 magnitude = v_sub_wrap(diff ^ is_less, is_less);
+    return v_reinterpret_as_u8(magnitude);
+}
+// Absolute difference of signed 16-bit lanes: wrapping (max - min),
+// reinterpreted as unsigned.
+inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int16x8 larger = v_max(a, b);
+    const v_int16x8 smaller = v_min(a, b);
+    return v_reinterpret_as_u16(v_sub_wrap(larger, smaller));
+}
+// Absolute difference of signed 32-bit lanes, returned as unsigned.
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{
+    const v_int32x4 diff = a - b;
+    const v_int32x4 is_less = a < b;  // all-ones where a < b
+    // Conditional negate: (diff ^ -1) - (-1) == -diff, (diff ^ 0) - 0 == diff.
+    const v_int32x4 magnitude = (diff ^ is_less) - is_less;
+    return v_reinterpret_as_u32(magnitude);
+}
+
+/** Saturating absolute difference **/
+// Signed 8-bit lanes; both subtractions use the saturating operator-, so the
+// result stays within the signed 8-bit range.
+inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
+{
+    const v_int8x16 diff = a - b;
+    const v_int8x16 is_less = a < b;  // all-ones where a < b
+    // Conditional negate of diff where a < b.
+    return (diff ^ is_less) - is_less;
+}
+// Signed 16-bit lanes: (max - min) with the saturating operator-.
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{
+    const v_int16x8 larger = v_max(a, b);
+    const v_int16x8 smaller = v_min(a, b);
+    return larger - smaller;
+}
+
+
+// Multiply-add helpers: each returns a * b + c using the vector type's own
+// operator* and operator+. As written this is a separate multiply followed by
+// an add, not a single fused instruction.
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c;
+}
+
+// Integer v_muladd is an alias for v_fma.
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return a * b + c;
+}
+
+// |a - b| for single-precision lanes: subtract, then clear the sign bit.
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    const v128_t sign_cleared_mask = wasm_i32x4_splat(0x7fffffff);
+    const v128_t diff = wasm_f32x4_sub(a.val, b.val);
+    return v_float32x4(wasm_v128_and(diff, sign_cleared_mask));
+}
+// |a - b| for double-precision lanes: subtract, then clear the sign bit.
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    // All-ones shifted right by one per 64-bit lane gives 0x7fff...ffff.
+    const v128_t sign_cleared_mask = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
+    const v128_t diff = wasm_f64x2_sub(a.val, b.val);
+    return v_float64x2(wasm_v128_and(diff, sign_cleared_mask));
+}
+
+// Defines three floating-point helpers for _Tpvec (lane-wise):
+//   v_magnitude(a, b)     = sqrt(a*a + b*b)
+//   v_sqr_magnitude(a, b) = a*a + b*b
+//   v_muladd(a, b, c)     = a*b + c   (separate multiply and add intrinsics)
+#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
+
+// Defines lane-wise shifts for an unsigned/signed vector pair, both as
+// operators taking a run-time count (<<, >>) and as templates taking a
+// compile-time count (v_shl<imm>, v_shr<imm>).
+//   suffix  - signed-named intrinsic family (i8x16, ...): used for every left
+//             shift and for the right shift of the signed type.
+//   ssuffix - unsigned intrinsic family (u8x16, ...): used for the right shift
+//             of the unsigned type.
+// Note the naming: despite the leading "s", ssuffix is the unsigned family.
+#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
+
+// Internal helper: byte-wise extraction of 16 consecutive bytes, starting at
+// byte offset imm, from the 32-byte concatenation of a (bytes 0..15) and
+// b (bytes 16..31).
+namespace hal_wasm_internal
+{
+    // Primary template. The defaulted bool parameters classify imm so that
+    // exactly one of the partial specialisations below is selected.
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 16)))>
+    class v_wasm_palignr_u8_class;
+
+    // imm outside [0, 16]: declared but never defined, so any use fails to compile.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, true, false, false, false>;
+
+    // imm == 0: the result is a unchanged; no shuffle is needed.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, true, false, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t&) const
+        {
+            return a;
+        }
+    };
+
+    // imm == 16: the result is b unchanged; no shuffle is needed.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, true, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t&, const v128_t& b) const
+        {
+            return b;
+        }
+    };
+
+    // 0 < imm < 16: select bytes imm .. imm+15 of the a|b concatenation.
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, false, true>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t& b) const
+        {
+            // NOTE(review): imm2 is not referenced anywhere in this function.
+            enum { imm2 = (sizeof(v128_t) - imm) };
+            return wasm_v8x16_shuffle(a, b,
+                                      imm, imm+1, imm+2, imm+3,
+                                      imm+4, imm+5, imm+6, imm+7,
+                                      imm+8, imm+9, imm+10, imm+11,
+                                      imm+12, imm+13, imm+14, imm+15);
+        }
+    };
+
+    // Entry point: checks the offset at compile time and dispatches to the
+    // matching specialisation.
+    template <int imm>
+    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
+        return v_wasm_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+// Moves the lanes of a towards lane 0 by imm positions (result[i] = a[i + imm]);
+// the vacated upper lanes are filled with zeros.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    // Lane count converted to the byte offset expected by the byte-wise helper.
+    enum { byte_offset = (imm * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t zeros = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<byte_offset>(a.val, zeros));
+}
+
+// Moves the lanes of a away from lane 0 by imm positions (result[i] = a[i - imm]);
+// the vacated lower lanes are filled with zeros.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    // Byte offset into the (zeros | a) concatenation at which the result starts.
+    enum { byte_offset = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t zeros = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<byte_offset>(zeros, a.val));
+}
+
+// Two-vector form: takes nlanes consecutive lanes, starting at lane imm, from
+// the concatenation (a | b); the upper lanes of the result come from b.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    // Lane count converted to the byte offset expected by the byte-wise helper.
+    enum { byte_offset = (imm * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t shifted = v_wasm_palignr_u8<byte_offset>(a.val, b.val);
+    return _Tpvec(shifted);
+}
+
+// Two-vector form: takes nlanes consecutive lanes, starting at lane
+// (nlanes - imm), from the concatenation (b | a); the lowest imm lanes of the
+// result come from the top of b.
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    // Byte offset into the (b | a) concatenation at which the result starts.
+    enum { byte_offset = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    const v128_t shifted = v_wasm_palignr_u8<byte_offset>(b.val, a.val);
+    return _Tpvec(shifted);
+}
+
+// Defines the load/store family for _Tpvec with element type _Tp:
+//   v_load / v_load_aligned  - both issue the same wasm_v128_load.
+//   v_load_low               - copies nlanes/2 elements into the lower half of
+//                              a zero-initialised buffer, then loads it.
+//   v_load_halves            - lower half from ptr0, upper half from ptr1.
+//   v_store / v_store_aligned / v_store_aligned_nocache / v_store(mode)
+//                            - all issue the same wasm_v128_store; the
+//                              StoreMode argument is ignored.
+//   v_store_low / v_store_high
+//                            - spill the register to a temporary array and
+//                              copy the lower / upper nlanes/2 elements to ptr.
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tp tmp[_Tpvec::nlanes] = {0}; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp tmp[_Tpvec::nlanes]; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr0[i]; \
+        tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    wasm_v128_store(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
+}
+
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
+
+
+/** Reverse **/
+// Each v_reverse returns its argument with the lane order reversed. The
+// unsigned overloads do the work with a single byte shuffle whose index list
+// keeps the bytes inside each lane in their original order; the signed and
+// floating-point overloads reinterpret to the unsigned type of the same lane
+// width, reverse, and reinterpret back.
+inline v_uint8x16 v_reverse(const v_uint8x16 &a)
+{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
+
+inline v_int8x16 v_reverse(const v_int8x16 &a)
+{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
+
+// 16-bit lanes: byte pairs are moved as a unit.
+inline v_uint16x8 v_reverse(const v_uint16x8 &a)
+{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
+
+inline v_int16x8 v_reverse(const v_int16x8 &a)
+{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
+
+// 32-bit lanes: groups of four bytes are moved as a unit.
+inline v_uint32x4 v_reverse(const v_uint32x4 &a)
+{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
+
+inline v_int32x4 v_reverse(const v_int32x4 &a)
+{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
+
+inline v_float32x4 v_reverse(const v_float32x4 &a)
+{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
+
+// 64-bit lanes: the two 8-byte halves are swapped.
+inline v_uint64x2 v_reverse(const v_uint64x2 &a)
+{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
+
+inline v_int64x2 v_reverse(const v_int64x2 &a)
+{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
+
+inline v_float64x2 v_reverse(const v_float64x2 &a)
+{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
+
+
+// Defines v_reduce_sum for four-lane vectors: returns the sum of all lanes.
+// Step 1 adds the register to itself rotated by 8 bytes (two lanes), step 2
+// adds the result to itself rotated by 4 bytes (one lane); after that every
+// lane holds the total and lane 0 is extracted.
+// For the float instantiation this fixes the summation order as
+// (a0 + a2) + (a1 + a3).
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
+    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
+}
+
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
+
+// To do: Optimize v_reduce_sum with wasm intrin.
+//        Now use fallback implementation as there is no widening op in wasm intrin.
+
+// Defines v_reduce_sum for 8- and 16-bit vectors in scalar code: the register
+// is stored to a temporary array and the lanes are accumulated into the wider
+// `scalartype` (unsigned / int), so the total cannot overflow the lane type.
+#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    scalartype c = a_[0]; \
+    for (int i = 1; i < _Tpvec::nlanes; i++) \
+        c += a_[i]; \
+    return c; \
+}
+
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
+OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
+
+
+// Defines v_reduce_sum for two-lane (64-bit) vectors: the register is added to
+// a copy of itself with the two 8-byte halves swapped, so lane 0 holds the sum
+// of both lanes, which is then extracted.
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
+    return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
+}
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64,  v128_t, i64x2, i64x2)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double,  v128_t, f64x2,f64x2)
+
+// Combines four vectors with two rounds of interleave-and-add.
+// NOTE(review): assuming wasm_unpacklo/unpackhi_i32x4 (defined outside this
+// section) interleave the low/high 32-bit lanes of their arguments, the result
+// is {sum(a), sum(b), sum(c), sum(d)} - confirm against their definitions.
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+    // Round 1: fold a with c, and b with d.
+    const v128_t ac_lo = wasm_unpacklo_i32x4(a.val, c.val);
+    const v128_t ac_hi = wasm_unpackhi_i32x4(a.val, c.val);
+    const v128_t ac = wasm_f32x4_add(ac_lo, ac_hi);
+
+    const v128_t bd_lo = wasm_unpacklo_i32x4(b.val, d.val);
+    const v128_t bd_hi = wasm_unpackhi_i32x4(b.val, d.val);
+    const v128_t bd = wasm_f32x4_add(bd_lo, bd_hi);
+
+    // Round 2: fold the two partial results together.
+    const v128_t folded_lo = wasm_unpacklo_i32x4(ac, bd);
+    const v128_t folded_hi = wasm_unpackhi_i32x4(ac, bd);
+    return v_float32x4(wasm_f32x4_add(folded_lo, folded_hi));
+}
+
+// Defines v_reduce_<func>() (instantiated below as v_reduce_max / v_reduce_min).
+// The vector is written to a scalar buffer with v_store and folded left to right with
+// scalar_func, starting from lane 0; the folded value is returned.
+//   _Tpvec      - vector wrapper type
+//   scalartype  - element type of the buffer and of the result
+//   func        - suffix of the generated function name
+//   scalar_func - binary scalar function used for the fold (std::max / std::min below)
+#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype buf[_Tpvec::nlanes]; \
+    v_store(buf, a); \
+    scalartype tmp = buf[0]; \
+    for (int i=1; i<_Tpvec::nlanes; ++i) { \
+        tmp = scalar_func(tmp, buf[i]); \
+    } \
+    return tmp; \
+}
+
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
+
+// v_reduce_sad: reduces v_absdiff(a, b) to a single scalar with v_reduce_sum.
+// 8-bit overloads widen the differences twice (8 -> 16 -> 32 bit) and add the four
+// resulting 32-bit vectors before the final reduction.
+inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
+{
+    v_uint16x8 lo16, hi16;
+    v_expand(v_absdiff(a, b), lo16, hi16);
+
+    v_uint32x4 q0, q1, q2, q3;
+    v_expand(lo16, q0, q1);
+    v_expand(hi16, q2, q3);
+
+    return v_reduce_sum(q0 + q1 + q2 + q3);
+}
+inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
+{
+    v_uint16x8 lo16, hi16;
+    v_expand(v_absdiff(a, b), lo16, hi16);
+
+    v_uint32x4 q0, q1, q2, q3;
+    v_expand(lo16, q0, q1);
+    v_expand(hi16, q2, q3);
+
+    return v_reduce_sum(q0 + q1 + q2 + q3);
+}
+// 16-bit overloads need a single widening step (16 -> 32 bit).
+inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v_uint32x4 lo32, hi32;
+    v_expand(v_absdiff(a, b), lo32, hi32);
+    return v_reduce_sum(lo32 + hi32);
+}
+inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
+{
+    v_uint32x4 lo32, hi32;
+    v_expand(v_absdiff(a, b), lo32, hi32);
+    return v_reduce_sum(lo32 + hi32);
+}
+// 32-bit and float overloads reduce the difference vector directly, without widening.
+inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
+{ return v_reduce_sum(v_absdiff(a, b)); }
+inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
+{ return v_reduce_sum(v_absdiff(a, b)); }
+inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
+{ return v_reduce_sum(v_absdiff(a, b)); }
+
+// Per-byte population count: every byte of the result holds the number of set bits of the
+// corresponding input byte. Bits are summed in pairs, then nibbles, then whole bytes.
+// The shifts operate on 32-bit lanes; the masks (0x55.., 0x33.., 0x0f..) discard any bits
+// that were shifted in from the neighbouring byte.
+inline v_uint8x16 v_popcount(const v_uint8x16& a)
+{
+    v128_t m1 = wasm_i32x4_splat(0x55555555);
+    v128_t m2 = wasm_i32x4_splat(0x33333333);
+    v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
+    v128_t p = a.val;
+    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
+    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
+    p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
+    return v_uint8x16(p);
+}
+// Per-16-bit-lane population count: starts from the per-byte counts, adds a copy rotated
+// by one byte, and keeps only the low byte of each 16-bit lane.
+inline v_uint16x8 v_popcount(const v_uint16x8& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
+}
+// Per-32-bit-lane population count: per-byte counts are combined with copies rotated by
+// one and then two bytes; only the low byte of each 32-bit lane is kept.
+inline v_uint32x4 v_popcount(const v_uint32x4& a)
+{
+    v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
+    p += v_rotate_right<1>(p);
+    p += v_rotate_right<2>(p);
+    return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
+}
+// Per-64-bit-lane population count, computed in scalar code: the register is spilled to
+// memory and the popCountTable entry of each of its 16 bytes is added to the counter of
+// the 64-bit lane that byte belongs to (bytes 0-7 -> lane 0, bytes 8-15 -> lane 1).
+inline v_uint64x2 v_popcount(const v_uint64x2& a)
+{
+    uint64 lanes[2];
+    uint64 counts[2] = { 0 };
+    wasm_v128_store(lanes, a.val);
+
+    const uint8_t* bytes = (const uint8_t*)lanes;
+    for (int k = 0; k < 16; ++k)
+    {
+        counts[k / 8] += popCountTable[bytes[k]];
+    }
+    return v_uint64x2(wasm_v128_load(counts));
+}
+
+// Signed overloads: reinterpret as the unsigned type of the same lane width and reuse
+// the unsigned implementation.
+inline v_uint8x16 v_popcount(const v_int8x16& a)
+{
+    return v_popcount(v_reinterpret_as_u8(a));
+}
+inline v_uint16x8 v_popcount(const v_int16x8& a)
+{
+    return v_popcount(v_reinterpret_as_u16(a));
+}
+inline v_uint32x4 v_popcount(const v_int32x4& a)
+{
+    return v_popcount(v_reinterpret_as_u32(a));
+}
+inline v_uint64x2 v_popcount(const v_int64x2& a)
+{
+    return v_popcount(v_reinterpret_as_u64(a));
+}
+
+// Sign tests for vectors with 8-, 16- or 32-bit lanes.
+//   v_signmask  - bit i of the result is set when lane i, viewed as a signed integer
+//                 through reinterpret_int, is negative
+//   v_check_all - true when every lane compares below zero
+//   v_check_any - true when at least one lane compares below zero
+// suffix selects the wasm_<suffix>_lt / wasm_<suffix>_splat intrinsics; the instantiations
+// below always pass a signed-integer suffix, so unsigned and float32 vectors are tested on
+// their raw sign bit. scalarType is not referenced by the expansion.
+#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    int mask = 0; \
+    for (int i = 0; i < _Tpvec::nlanes; i++) \
+        mask |= (reinterpret_int(a_[i]) < 0) << i; \
+    return mask; \
+} \
+inline bool v_check_all(const _Tpvec& a) \
+{ return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
+inline bool v_check_any(const _Tpvec& a) \
+{ return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }
+
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
+OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
+
+// v_float64x2 is written out by hand instead of going through the macro with the f64x2
+// suffix: a floating-point "< 0.0" compare is false for an all-ones lane (that bit pattern
+// is a NaN) and for -0.0, so v_check_all / v_check_any would disagree with v_signmask on
+// comparison masks. Both predicates are derived from the sign mask instead.
+inline int v_signmask(const v_float64x2& a)
+{
+    v_float64x2::lane_type a_[v_float64x2::nlanes];
+    wasm_v128_store(a_, a.val);
+    int mask = 0;
+    for (int i = 0; i < v_float64x2::nlanes; i++)
+        mask |= (reinterpret_int(a_[i]) < 0) << i;
+    return mask;
+}
+inline bool v_check_all(const v_float64x2& a)
+{ return v_signmask(a) == (1 << v_float64x2::nlanes) - 1; }
+inline bool v_check_any(const v_float64x2& a)
+{ return v_signmask(a) != 0; }
+
+// Defines v_check_all() / v_check_any() for vectors with 64-bit lanes using only 32-bit
+// comparisons. The vector is reinterpreted as four 32-bit lanes and lanes 0 and 2 are
+// overwritten with a value that cannot change the outcome (all ones for the "all" test,
+// zero for the "any" test), so the result depends on lanes 1 and 3 only.
+// NOTE(review): lanes 1 and 3 are presumably the upper, sign-carrying halves of the two
+// 64-bit elements (little-endian lane layout) - confirm.
+//   suffix  - selects the wasm_<suffix>_lt / wasm_<suffix>_splat intrinsics
+//   esuffix - selects the v_reinterpret_as_<esuffix> conversion
+#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
+inline bool v_check_all(const _Tpvec& a) \
+{ \
+    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
+    masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
+    masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
+    return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
+} \
+inline bool v_check_any(const _Tpvec& a) \
+{ \
+    v128_t masked = v_reinterpret_as_##esuffix(a).val; \
+    masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
+    masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
+    return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
+} \
+
+OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
+OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
+
+
+// v_scan_forward: every overload views the vector as 16 signed bytes, builds the per-byte
+// sign mask with v_signmask and counts its trailing zeros, i.e. finds the first byte whose
+// top bit is set. Overloads for wider lanes divide that byte index by the lane size in
+// bytes (2, 4 or 8) to express it as a lane index.
+inline int v_scan_forward(const v_int8x16& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+}
+inline int v_scan_forward(const v_uint8x16& a)
+{
+    return trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+}
+inline int v_scan_forward(const v_int16x8& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 2;
+}
+inline int v_scan_forward(const v_uint16x8& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 2;
+}
+inline int v_scan_forward(const v_int32x4& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 4;
+}
+inline int v_scan_forward(const v_uint32x4& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 4;
+}
+inline int v_scan_forward(const v_float32x4& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 4;
+}
+inline int v_scan_forward(const v_int64x2& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 8;
+}
+inline int v_scan_forward(const v_uint64x2& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 8;
+}
+inline int v_scan_forward(const v_float64x2& a)
+{
+    const int firstByte = trailingZeros32(v_signmask(v_reinterpret_as_s8(a)));
+    return firstByte / 8;
+}
+
+// Defines v_select(mask, a, b) for one vector type as a single wasm_v128_bitselect call
+// on the raw registers: the result is assembled bit by bit from a and b under control of
+// mask (presumably a-bits where the mask bit is 1 and b-bits where it is 0 - see the
+// wasm_v128_bitselect documentation).
+#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
+OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
+OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
+OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
+
+// Defines the widening helpers for one narrow/wide vector pair:
+//   v_expand(a, b0, b1) - b0 = intrin(a), b1 = intrin_high(a)
+//   v_expand_low(a)     - returns intrin(a)
+//   v_expand_high(a)    - returns intrin_high(a)
+//   v_load_expand(ptr)  - loads a register from ptr and returns intrin() of it
+// intrin and its "_high" companion (name built with __CV_CAT) are conversion helpers
+// defined elsewhere; presumably they widen the lower / upper half of the register.
+// NOTE(review): v_load_expand performs a full 16-byte wasm_v128_load even though only
+// intrin() (not the _high variant) is applied - confirm callers always provide 16
+// readable bytes at ptr.
+#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
+inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)      \
+{                                                                    \
+    b0.val = intrin(a.val);                                          \
+    b1.val = __CV_CAT(intrin, _high)(a.val);                         \
+}                                                                    \
+inline _Tpwvec v_expand_low(const _Tpvec& a)                         \
+{ return _Tpwvec(intrin(a.val)); }                                   \
+inline _Tpwvec v_expand_high(const _Tpvec& a)                        \
+{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }                  \
+inline _Tpwvec v_load_expand(const _Tp* ptr)                         \
+{                                                                    \
+    v128_t a = wasm_v128_load(ptr);                                  \
+    return _Tpwvec(intrin(a));                                       \
+}
+
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16,  v_int16x8,  schar, v128_cvti8x16_i16x8)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8,  v_int32x4,  short, v128_cvti16x8_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
+OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4,  v_int64x2,  int, v128_cvti32x4_i64x2)
+
+// Defines v_load_expand_q(ptr): loads a register from ptr and converts it with intrin
+// (an 8-bit -> 32-bit conversion helper in the instantiations below, defined elsewhere).
+// NOTE(review): a full 16-byte wasm_v128_load is issued; presumably only the first four
+// bytes contribute to the four 32-bit result lanes - confirm callers tolerate the wider read.
+#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin)  \
+inline _Tpvec v_load_expand_q(const _Tp* ptr)               \
+{                                                           \
+    v128_t a = wasm_v128_load(ptr);                         \
+    return _Tpvec(intrin(a));                               \
+}
+
+OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
+OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
+
+// Defines the lane-rearranging helpers for one vector type on top of the
+// wasm_unpacklo_* / wasm_unpackhi_* helpers (defined elsewhere; presumably they
+// interleave the lower / upper halves of their two operands):
+//   v_zip(a0, a1, b0, b1)   - b0 = unpacklo_<suffix>(a0, a1), b1 = unpackhi_<suffix>(a0, a1)
+//   v_combine_low(a, b)     - unpacklo_i64x2(a, b)
+//   v_combine_high(a, b)    - unpackhi_i64x2(a, b)
+//   v_recombine(a, b, c, d) - c = unpacklo_i64x2(a, b), d = unpackhi_i64x2(a, b)
+// suffix is only used by v_zip; the other three always operate on 64-bit halves.
+#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c.val = wasm_unpacklo_i64x2(a.val, b.val); \
+    d.val = wasm_unpackhi_i64x2(a.val, b.val); \
+}
+
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
+
+// v_extract<s>(a, b): thin alias for the two-operand v_rotate_right<s>(a, b).
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{ return v_rotate_right<s>(a, b); }
+
+// Rounds every float lane to the nearest integer; exact halves are rounded away from zero.
+// The 0.5 bias is given the sign of its lane before the truncating conversion: adding a
+// plain +0.5 is only correct for non-negative input (it maps -1.3 to 0 and -1.7 to -1).
+// Results for non-negative lanes are unchanged by this.
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    // isolate the IEEE-754 sign bit of each lane and stamp it onto 0.5
+    v128_t sign = wasm_v128_and(a.val, wasm_i32x4_splat((int)0x80000000));
+    v128_t h = wasm_v128_or(wasm_f32x4_splat(0.5f), sign);
+    // the conversion discards the fraction (toward zero), which completes the rounding
+    return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
+}
+
+// Floor of every float lane: convert with truncation, then add the comparison mask of
+// lanes where the input is below the converted-back value (the mask is added as an
+// integer, i.e. all-ones lanes contribute -1).
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    const v128_t roundTrip = wasm_f32x4_convert_i32x4(truncated);
+    const v128_t belowMask = wasm_f32x4_lt(a.val, roundTrip);
+    return v_int32x4(wasm_i32x4_add(truncated, belowMask));
+}
+
+// Ceiling of every float lane: convert with truncation, then subtract the comparison
+// mask of lanes where the input is above the converted-back value (subtracting an
+// all-ones lane, i.e. -1, adds one).
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    const v128_t roundTrip = wasm_f32x4_convert_i32x4(truncated);
+    const v128_t aboveMask = wasm_f32x4_gt(a.val, roundTrip);
+    return v_int32x4(wasm_i32x4_sub(truncated, aboveMask));
+}
+
+// Converts every float lane to int32 with the saturating truncation intrinsic.
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{
+    const v128_t truncated = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    return v_int32x4(truncated);
+}
+
+// Defines func(v_float64x2) -> v_int32x4 in scalar code: the two doubles are spilled to
+// memory, converted one by one with cfunc, and placed in lanes 0 and 1 of the result;
+// lanes 2 and 3 are set to zero.
+//   func  - name of the generated function
+//   cfunc - scalar conversion applied to each double (a function, or "int" for a cast)
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
+{ \
+    double a_[2]; \
+    wasm_v128_store(a_, a.val); \
+    int c_[4]; \
+    c_[0] = cfunc(a_[0]); \
+    c_[1] = cfunc(a_[1]); \
+    c_[2] = 0; \
+    c_[3] = 0; \
+    return v_int32x4(wasm_v128_load(c_)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
+
+// Rounds two double vectors into one int32 vector using scalar cvRound:
+// lanes 0-1 come from a, lanes 2-3 from b.
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double first[2];
+    double second[2];
+    wasm_v128_store(first, a.val);
+    wasm_v128_store(second, b.val);
+
+    int rounded[4];
+    for (int k = 0; k < 2; ++k)
+    {
+        rounded[k] = cvRound(first[k]);
+        rounded[k + 2] = cvRound(second[k]);
+    }
+    return v_int32x4(wasm_v128_load(rounded));
+}
+
+// Defines v_transpose4x4(): treats a0..a3 as the rows of a 4x4 matrix of 32-bit lanes and
+// writes the rearranged rows to b0..b3.
+// Stage 1 pairs rows (a0,a1) and (a2,a3) with the 32-bit unpacklo/unpackhi helpers;
+// stage 2 combines the 64-bit halves of those intermediates into the outputs.
+// (The unpack helpers are defined elsewhere; presumably they interleave the lower / upper
+// halves of their operands, which makes b0..b3 the columns of the input.)
+#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
+    v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
+    v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
+\
+    b0.val = wasm_unpacklo_i64x2(t0, t1); \
+    b1.val = wasm_unpackhi_i64x2(t0, t1); \
+    b2.val = wasm_unpacklo_i64x2(t2, t3); \
+    b3.val = wasm_unpackhi_i64x2(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
+
+// load deinterleave
+// 2-channel uchar: reads 32 interleaved bytes (a0 b0 a1 b1 ...) and splits them into
+// a (even byte positions) and b (odd byte positions).
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    v128_t t00 = wasm_v128_load(ptr);
+    v128_t t01 = wasm_v128_load(ptr + 16);
+
+    a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
+    b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
+}
+
+// 3-channel uchar: reads 48 interleaved bytes (a0 b0 c0 a1 ...) and splits them into a, b, c.
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{
+    v128_t t00 = wasm_v128_load(ptr);
+    v128_t t01 = wasm_v128_load(ptr + 16);
+    v128_t t02 = wasm_v128_load(ptr + 32);
+
+    // Gather the channel bytes contained in the first 32 input bytes at the front of each
+    // register (11 for a, 11 for b, 10 for c); the trailing indices are filler that the
+    // final shuffles below do not select.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
+
+    // Append the remaining channel bytes taken from the third 16-byte chunk.
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
+}
+
+// 4-channel uchar: reads 64 interleaved bytes and splits them into a, b, c, d.
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{
+    v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
+    v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
+    v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
+
+    // v0/v1: eight a bytes followed by eight b bytes; v2/v3: eight c bytes followed by eight d bytes
+    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
+    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
+    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
+    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
+
+    // join the matching 8-byte halves of each pair
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+}
+
+// 2-channel ushort: reads 16 interleaved elements and splits them into a and b.
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1 a2 b2 a3 b3
+    v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
+// 3-channel ushort: reads 24 interleaved elements and splits them into a, b, c.
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1 b1 c1 a2 b2
+    v128_t t01 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3 a4 b4 c4 a5
+    v128_t t02 = wasm_v128_load(ptr + 16);  // b5 c5 a6 b6 c6 a7 b7 c7
+
+    // Gather the elements available in t00/t01 at the front (six a, five b, five c);
+    // the trailing indices are filler not selected by the final shuffles.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
+
+    // Append the remaining elements from t02.
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
+}
+
+// 4-channel ushort: reads 32 interleaved elements and splits them into a, b, c, d.
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
+    v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
+    v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
+    v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
+
+    v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
+    v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
+    v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
+    v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
+
+    // join the matching 8-byte halves of each pair
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+}
+
+// 2-channel unsigned: reads 8 interleaved elements and splits them into a and b.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1
+    v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
+
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
+    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
+}
+
+// 3-channel unsigned: reads 12 interleaved elements and splits them into a, b, c.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
+    v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
+    v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
+
+    // Gather the elements available in t00/t01 at the front (three a, three b, two c);
+    // the trailing indices are filler not selected by the final shuffles.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
+
+    // Append the remaining elements from t02.
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
+}
+
+// 4-channel unsigned: loads four registers of one (a, b, c, d) group each and lets
+// v_transpose4x4 regroup them per channel.
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    v_uint32x4 s0(wasm_v128_load(ptr));      // a0 b0 c0 d0
+    v_uint32x4 s1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
+    v_uint32x4 s2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
+    v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
+
+    v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
+}
+
+// 2-channel float: reads 8 interleaved elements and splits them into a and b.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{
+    v128_t v0 = wasm_v128_load(ptr);       // a0 b0 a1 b1
+    v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
+
+    a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
+    b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
+}
+
+// 3-channel float: reads 12 interleaved elements and splits them into a, b, c.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{
+    v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
+    v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
+    v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
+
+    // Gather the elements available in t00/t01 at the front (three a, three b, two c);
+    // the trailing indices are filler not selected by the final shuffles.
+    v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
+    v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
+    v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
+
+    // Append the remaining elements from t02.
+    a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
+    b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
+    c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
+}
+
+// 4-channel float: loads four registers of one (a, b, c, d) group each and lets
+// v_transpose4x4 regroup them per channel.
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{
+    v_float32x4 s0(wasm_v128_load(ptr));      // a0 b0 c0 d0
+    v_float32x4 s1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
+    v_float32x4 s2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
+    v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
+
+    v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
+}
+
+// 2-channel uint64: reads 4 interleaved elements and splits them into a and b.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    v128_t t0 = wasm_v128_load(ptr);      // a0 b0
+    v128_t t1 = wasm_v128_load(ptr + 2);  // a1 b1
+
+    a.val = wasm_unpacklo_i64x2(t0, t1);
+    b.val = wasm_unpackhi_i64x2(t0, t1);
+}
+
+// 3-channel uint64: reads 6 interleaved elements and splits them into a, b, c.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    v128_t t0 = wasm_v128_load(ptr);     // a0, b0
+    v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
+    v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
+
+    a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);   // low half of t0, high half of t1
+    b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23); // high half of t0, low half of t2
+    c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);   // low half of t1, high half of t2
+}
+
+// 4-channel uint64: reads 8 interleaved elements and splits them into a, b, c, d.
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
+                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    v128_t t0 = wasm_v128_load(ptr);     // a0 b0
+    v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
+    v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
+    v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
+
+    a.val = wasm_unpacklo_i64x2(t0, t2);
+    b.val = wasm_unpackhi_i64x2(t0, t2);
+    c.val = wasm_unpacklo_i64x2(t1, t3);
+    d.val = wasm_unpackhi_i64x2(t1, t3);
+}
+
+// store interleave
+
+// 2-channel uchar: writes 32 bytes to ptr built by pairing the bytes of a and b.
+// The StoreMode argument is accepted for interface compatibility and ignored.
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
+    v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 16, v1);
+}
+
+// 3-channel uchar: writes 48 bytes to ptr in a0 b0 c0 a1 b1 c1 ... order.
+// The StoreMode argument is ignored.
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: place the a and b bytes at their final positions; every slot destined for a
+    // c byte receives a placeholder (index 0).
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
+
+    // Step 2: overwrite the placeholder slots with the c bytes (indices 16..31 select from c).
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
+
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 16, t11);
+    wasm_v128_store(ptr + 32, t12);
+}
+
+// 4-channel uchar: writes 64 bytes to ptr in a0 b0 c0 d0 a1 ... order using two rounds
+// of byte unpacking. The StoreMode argument is ignored.
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
+    v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
+    v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
+    v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
+
+    v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
+    v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
+    v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
+    v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 16, v1);
+    wasm_v128_store(ptr + 32, v2);
+    wasm_v128_store(ptr + 48, v3);
+}
+
+// 2-channel ushort: writes 16 elements to ptr built by pairing the lanes of a and b.
+// The StoreMode argument is accepted for interface compatibility and ignored.
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
+    v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 8, v1);
+}
+
+// 3-channel ushort: writes 24 elements to ptr in a0 b0 c0 a1 b1 c1 ... order.
+// The StoreMode argument is ignored.
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b, const v_uint16x8& c,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: place the a and b elements at their final positions; slots destined for a
+    // c element receive a placeholder (byte index 0 twice).
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
+
+    // Step 2: overwrite the placeholder slots with the c elements (indices 16..31 select from c).
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
+
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 8, t11);
+    wasm_v128_store(ptr + 16, t12);
+}
+
+// 4-channel ushort: writes 32 elements to ptr in a0 b0 c0 d0 a1 ... order using two
+// rounds of 16-bit unpacking. The StoreMode argument is ignored.
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
+    v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
+    v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
+    v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
+
+    v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
+    v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
+    v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
+    v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 8, v1);
+    wasm_v128_store(ptr + 16, v2);
+    wasm_v128_store(ptr + 24, v3);
+}
+
+// 2-channel unsigned: writes 8 elements to ptr built by pairing the lanes of a and b.
+// The StoreMode argument is accepted for interface compatibility and ignored.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
+    v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 4, v1);
+}
+
+// 3-channel unsigned: writes 12 elements to ptr in a0 b0 c0 a1 b1 c1 ... order.
+// The StoreMode argument is ignored.
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: place the a and b elements at their final positions; slots destined for a
+    // c element receive a placeholder (byte index 0 four times).
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
+
+    // Step 2: overwrite the placeholder slots with the c elements (indices 16..31 select from c).
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
+
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 4, t11);
+    wasm_v128_store(ptr + 8, t12);
+}
+
+// 4-channel unsigned: regroups a, b, c, d with v_transpose4x4 and writes the four
+// resulting registers to ptr back to back. The StoreMode argument is ignored.
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v_uint32x4 v0, v1, v2, v3;
+    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
+
+    wasm_v128_store(ptr, v0.val);
+    wasm_v128_store(ptr + 4, v1.val);
+    wasm_v128_store(ptr + 8, v2.val);
+    wasm_v128_store(ptr + 12, v3.val);
+}
+
+// 2-channel, float only
+// Writes 8 floats to ptr built by pairing the lanes of a and b.
+// The StoreMode argument is accepted for interface compatibility and ignored.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
+    v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 4, v1);
+}
+
+// 3-channel float: writes 12 floats to ptr in a0 b0 c0 a1 b1 c1 ... order.
+// The StoreMode argument is ignored.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    // Step 1: place the a and b elements at their final positions; slots destined for a
+    // c element receive a placeholder (byte index 0 four times).
+    v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
+    v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
+    v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
+
+    // Step 2: overwrite the placeholder slots with the c elements (indices 16..31 select from c).
+    v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
+    v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
+
+    wasm_v128_store(ptr, t10);
+    wasm_v128_store(ptr + 4, t11);
+    wasm_v128_store(ptr + 8, t12);
+}
+
+// 4-channel float: regroups a, b, c, d with v_transpose4x4 and writes the four
+// resulting registers to ptr back to back. The StoreMode argument is ignored.
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v_float32x4 v0, v1, v2, v3;
+    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
+
+    wasm_v128_store(ptr, v0.val);
+    wasm_v128_store(ptr + 4, v1.val);
+    wasm_v128_store(ptr + 8, v2.val);
+    wasm_v128_store(ptr + 12, v3.val);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{
+    v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
+    v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
+
+    wasm_v128_store(ptr, v0);
+    wasm_v128_store(ptr + 2, v1);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Stores a0 b0 | c0 a1 | b1 c1 at ptr; the mode argument is ignored.
+    v128_t a0_b0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
+    v128_t c0_a1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
+    v128_t b1_c1 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
+    // Byte indices 0-15 pick from the first shuffle operand, 16-31 from the second.
+    wasm_v128_store(ptr, a0_b0);
+    wasm_v128_store(ptr + 2, c0_a1);
+    wasm_v128_store(ptr + 4, b1_c1);
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
+{   // Four-channel interleave of 64-bit lanes; the mode argument is ignored.
+    v128_t ab_lo = wasm_unpacklo_i64x2(a.val, b.val);
+    v128_t cd_lo = wasm_unpacklo_i64x2(c.val, d.val);
+    v128_t ab_hi = wasm_unpackhi_i64x2(a.val, b.val);
+    v128_t cd_hi = wasm_unpackhi_i64x2(c.val, d.val);
+    // Store order: both "lo" pairs (a/b, then c/d) followed by both "hi" pairs.
+    wasm_v128_store(ptr, ab_lo);
+    wasm_v128_store(ptr + 2, cd_lo);
+    wasm_v128_store(ptr + 4, ab_hi);
+    wasm_v128_store(ptr + 6, cd_hi);
+}
+
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ /* 2 channels: load via the _Tp1 overload, then reinterpret each result as suffix0 */ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ /* 3 channels: same forwarding scheme as the 2-channel overload */ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ /* 4 channels: same forwarding scheme as the 2-channel overload */ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 2 channels: reinterpret the inputs as suffix1, then store via the _Tp1 overload */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 3 channels: same forwarding scheme as the 2-channel store */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ /* 4 channels: same forwarding scheme as the 2-channel store */ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)  // schar overloads forward to the uchar ones
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)  // short -> ushort
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)  // int -> unsigned
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)  // int64 -> uint64
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)  // double -> uint64 (bit reinterpretation)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{   // Converts the int32 lanes of a to float with a single wasm_f32x4_convert_i32x4 call.
+    return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    // Scalar path: spill both doubles, narrow each to float, zero the two upper result lanes.
+    double src[2];
+    wasm_v128_store(src, a.val);
+    float dst[4] = { static_cast<float>(src[0]),
+                     static_cast<float>(src[1]),
+                     0.0f,
+                     0.0f };
+    return v_float32x4(wasm_v128_load(dst));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    // Scalar path: result lanes 0-1 come from a, lanes 2-3 from b, each narrowed to float.
+    double lo[2], hi[2];
+    wasm_v128_store(lo, a.val);
+    wasm_v128_store(hi, b.val);
+    float dst[4] = { static_cast<float>(lo[0]),
+                     static_cast<float>(lo[1]),
+                     static_cast<float>(hi[0]),
+                     static_cast<float>(hi[1]) };
+    return v_float32x4(wasm_v128_load(dst));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+#ifdef __wasm_unimplemented_simd128__
+    v128_t widened = v128_cvti32x4_i64x2(a.val);
+    return v_float64x2(wasm_f64x2_convert_i64x2(widened));
+#else
+    // Scalar fallback: spill the lanes and convert lanes 0 and 1 individually.
+    int lanes[4];
+    wasm_v128_store(lanes, a.val);
+    double out[2] = { static_cast<double>(lanes[0]),
+                      static_cast<double>(lanes[1]) };
+    return v_float64x2(wasm_v128_load(out));
+#endif
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+#ifdef __wasm_unimplemented_simd128__
+    v128_t widened = v128_cvti32x4_i64x2_high(a.val);
+    return v_float64x2(wasm_f64x2_convert_i64x2(widened));
+#else
+    // Scalar fallback: spill the lanes and convert lanes 2 and 3 individually.
+    int lanes[4];
+    wasm_v128_store(lanes, a.val);
+    double out[2] = { static_cast<double>(lanes[2]),
+                      static_cast<double>(lanes[3]) };
+    return v_float64x2(wasm_v128_load(out));
+#endif
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    // Scalar path: spill the float lanes and widen lanes 0 and 1 to double.
+    float lanes[4];
+    wasm_v128_store(lanes, a.val);
+    double out[2] = { static_cast<double>(lanes[0]),
+                      static_cast<double>(lanes[1]) };
+    return v_float64x2(wasm_v128_load(out));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    // Scalar path: spill the float lanes and widen lanes 2 and 3 to double.
+    float lanes[4];
+    wasm_v128_store(lanes, a.val);
+    double out[2] = { static_cast<double>(lanes[2]),
+                      static_cast<double>(lanes[3]) };
+    return v_float64x2(wasm_v128_load(out));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int64x2& a)
+{
+#ifdef __wasm_unimplemented_simd128__
+    return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
+#else
+    // Scalar fallback: spill both int64 lanes and convert each to double.
+    int64 lanes[2];
+    wasm_v128_store(lanes, a.val);
+    double out[2] = { static_cast<double>(lanes[0]),
+                      static_cast<double>(lanes[1]) };
+    return v_float64x2(wasm_v128_load(out));
+#endif
+}
+
+////////////// Lookup table access ////////////////////
+
+inline v_int8x16 v_lut(const schar* tab, const int* idx)
+{   // Gathers 16 elements: lane i is tab[idx[i]] for i = 0..15.
+    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
+                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
+}
+inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
+{   // Gathers 8 adjacent pairs: lanes 2i and 2i+1 are tab[idx[i]] and tab[idx[i]+1] for i = 0..7.
+    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
+                     tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
+}
+inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
+{   // Gathers 4 runs of four: lanes 4i..4i+3 are tab[idx[i]] .. tab[idx[i]+3] for i = 0..3.
+    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
+                     tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
+}
+inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } // uchar overloads reuse the schar gathers
+inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
+inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
+
+inline v_int16x8 v_lut(const short* tab, const int* idx)
+{   // Gathers 8 elements: lane i is tab[idx[i]] for i = 0..7.
+    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
+                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
+}
+inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
+{   // Gathers 4 adjacent pairs: lanes 2i and 2i+1 are tab[idx[i]] and tab[idx[i]+1] for i = 0..3.
+    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
+                     tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
+}
+inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
+{   // Gathers 2 runs of four: lanes 0-3 start at tab[idx[0]], lanes 4-7 at tab[idx[1]].
+    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
+                     tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
+}
+inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } // ushort overloads reuse the short gathers
+inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
+inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
+
+inline v_int32x4 v_lut(const int* tab, const int* idx)
+{   // Gathers 4 elements: lane i is tab[idx[i]] for i = 0..3.
+    return v_int32x4(tab[idx[0]], tab[idx[1]],
+                     tab[idx[2]], tab[idx[3]]);
+}
+inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
+{   // Gathers 2 adjacent pairs: lanes 0-1 start at tab[idx[0]], lanes 2-3 at tab[idx[1]].
+    return v_int32x4(tab[idx[0]], tab[idx[0]+1],
+                     tab[idx[1]], tab[idx[1]+1]);
+}
+inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
+{   // One run of four ints is a full vector, so a single 128-bit load at tab + idx[0] suffices.
+    return v_int32x4(wasm_v128_load(tab + idx[0]));
+}
+inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } // unsigned overloads reuse the int gathers
+inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
+inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
+
+inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
+{   // Gathers 2 elements: tab[idx[0]] and tab[idx[1]].
+    return v_int64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
+{   // One pair of int64 is a full vector, so a single 128-bit load at tab + idx[0] suffices.
+    return v_int64x2(wasm_v128_load(tab + idx[0]));
+}
+inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } // uint64_t overloads reuse the int64_t gathers
+inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
+
+inline v_float32x4 v_lut(const float* tab, const int* idx)
+{   // Gathers 4 floats: lane i is tab[idx[i]] for i = 0..3.
+    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
+}
+inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); } // pairs/quads reuse the int gathers on the raw 32-bit values
+inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
+
+inline v_float64x2 v_lut(const double* tab, const int* idx)
+{   // Gathers 2 doubles: tab[idx[0]] and tab[idx[1]].
+    return v_float64x2(tab[idx[0]], tab[idx[1]]);
+}
+inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
+{   // One pair of doubles is a full vector, so a single 128-bit load at tab + idx[0] suffices.
+    return v_float64x2(wasm_v128_load(tab + idx[0]));
+}
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    // Pull the four indices out of the vector, then gather tab[] at each of them.
+    const int i0 = wasm_i32x4_extract_lane(idxvec.val, 0), i1 = wasm_i32x4_extract_lane(idxvec.val, 1);
+    const int i2 = wasm_i32x4_extract_lane(idxvec.val, 2), i3 = wasm_i32x4_extract_lane(idxvec.val, 3);
+    return v_int32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
+{   // Unsigned tables reuse the int gather; only the bit pattern is reinterpreted.
+    return v_reinterpret_as_u32(v_lut(reinterpret_cast<const int*>(tab), idxvec));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    // Pull the four indices out of the vector, then gather tab[] at each of them.
+    const int i0 = wasm_i32x4_extract_lane(idxvec.val, 0), i1 = wasm_i32x4_extract_lane(idxvec.val, 1);
+    const int i2 = wasm_i32x4_extract_lane(idxvec.val, 2), i3 = wasm_i32x4_extract_lane(idxvec.val, 3);
+    return v_float32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{   // Only index lanes 0 and 1 are used, because the result holds two doubles.
+    const int i0 = wasm_i32x4_extract_lane(idxvec.val, 0), i1 = wasm_i32x4_extract_lane(idxvec.val, 1);
+    return v_float64x2(tab[i0], tab[i1]);
+}
+
+// Loads pairs from the table and deinterleaves them, i.e. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// Note that the indices address individual floats, not float pairs.
+// In theory, this function can be used to implement bilinear interpolation
+// when idxvec holds the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    const int i0 = wasm_i32x4_extract_lane(idxvec.val, 0);
+    const int i1 = wasm_i32x4_extract_lane(idxvec.val, 1);
+    const int i2 = wasm_i32x4_extract_lane(idxvec.val, 2);
+    const int i3 = wasm_i32x4_extract_lane(idxvec.val, 3);
+    // x takes the element at each index, y the element right after it.
+    x = v_float32x4(tab[i0], tab[i1], tab[i2], tab[i3]);
+    y = v_float32x4(tab[i0 + 1], tab[i1 + 1],
+                    tab[i2 + 1], tab[i3 + 1]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{   // Loads the double pairs starting at tab[idxvec[0]] and tab[idxvec[1]] and deinterleaves them into x and y.
+    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
+    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
+    x.val = wasm_unpacklo_i64x2(xy0, xy1);  // first element of each pair
+    y.val = wasm_unpackhi_i64x2(xy0, xy1);  // second element of each pair (was unpacklo, which made y a copy of x)
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{   // Within every group of 4 bytes, reorders elements 0,1,2,3 to 0,2,1,3.
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{   // Within every group of 8 bytes, reorders elements 0..7 to 0,4,1,5,2,6,3,7.
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{   // Byte shuffle that reorders the 16-bit lanes 0..7 to 0,2,1,3,4,6,5,7.
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{   // Byte shuffle that reorders the 16-bit lanes 0..7 to 0,4,1,5,2,6,3,7.
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{   // Byte shuffle that reorders the 32-bit lanes 0,1,2,3 to 0,2,1,3.
+    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{   // Same lane order as the integer overload: 0,2,1,3.
+    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{   // Keeps bytes 0-2 of each 4-byte group, packed into lanes 0-11; lanes 12-15 are filler (shuffle index 16 = byte 0 of the second operand).
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{   // Resulting 16-bit lane order: 0,1,2,4,5,6 followed by the leftover lanes 7 and 3.
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } // 32-bit overloads return the input unchanged
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+template<int i, typename _Tp>
+inline typename _Tp::lane_type v_extract_n(const _Tp& a)
+{   // Applies v_rotate_right<i> (defined elsewhere) to a and returns what get0() reads from the result.
+    return v_rotate_right<i>(a).get0();
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{   // Extracts element i with v_extract_n and replicates it with v_setall_u32.
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{   // Signed variant: v_extract_n followed by v_setall_s32.
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{   // Float variant: v_extract_n followed by v_setall_f32.
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{   // Converts ptr[0..3] from float16_t to float one element at a time, then loads them as a vector.
+    float widened[4];
+    for (int lane = 0; lane < 4; ++lane)
+        widened[lane] = ptr[lane];
+    return v_float32x4(wasm_v128_load(widened));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{   // Converts the four float lanes of v to float16_t and writes them to ptr[0..3].
+    float v_[4];  // must be float: the 128-bit store writes 4 x 32-bit lanes (a double buffer misread them and left v_[2..3] unset)
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
+}
+
+inline void v_cleanup() {}  // intentionally empty in this backend
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/msa_macros.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/msa_macros.h
new file mode 100644
index 0000000..fad8c5a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/msa_macros.h
@@ -0,0 +1,1558 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define 64-bit vector types: each is 8 bytes wide and 8-byte aligned. */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load a 64-bit vector from the given memory address __a (plain dereference through a vector-typed pointer). */
+#define msa_ld1_s8(__a)  (*((v8i8*)(__a)))
+#define msa_ld1_s16(__a) (*((v4i16*)(__a)))
+#define msa_ld1_s32(__a) (*((v2i32*)(__a)))
+#define msa_ld1_s64(__a) (*((v1i64*)(__a)))
+#define msa_ld1_u8(__a)  (*((v8u8*)(__a)))
+#define msa_ld1_u16(__a) (*((v4u16*)(__a)))
+#define msa_ld1_u32(__a) (*((v2u32*)(__a)))
+#define msa_ld1_u64(__a) (*((v1u64*)(__a)))
+#define msa_ld1_f32(__a) (*((v2f32*)(__a)))
+#define msa_ld1_f64(__a) (*((v1f64*)(__a)))
+
+/* Load a 128-bit vector from the given memory address __a (MSA ld builtins with offset 0). */
+#define msa_ld1q_s8(__a)  ((v16i8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_u8(__a)  ((v16u8)__builtin_msa_ld_b(__a, 0))
+#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0))
+#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0))
+#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0))
+#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0))
+
+/* Store the 64-bit vector __b to the given memory address __a. */
+#define msa_st1_s8(__a, __b)  (*((v8i8*)(__a)) = __b)
+#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b)
+#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b)
+#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b)
+#define msa_st1_u8(__a, __b)  (*((v8u8*)(__a)) = __b)
+#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b)
+#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b)
+#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b)
+#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b)
+#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b)
+
+/* Store the 128-bit vector __b to the given memory address __a (MSA st builtins with offset 0). */
+#define msa_st1q_s8(__a, __b)  (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_u8(__a, __b)  (__builtin_msa_st_b((v16i8)(__b), __a, 0))
+#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0))
+#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0))
+#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0))
+
+/* Store the element with index __c of vector __b to the given memory address __a. The vector argument is parenthesized before subscripting so that expression arguments expand correctly. */
+#define msa_st1_lane_s8(__a, __b, __c)   (*((int8_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_s16(__a, __b, __c)  (*((int16_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_s32(__a, __b, __c)  (*((int32_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_s64(__a, __b, __c)  (*((int64_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_u8(__a, __b, __c)   (*((uint8_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_u16(__a, __b, __c)  (*((uint16_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_u32(__a, __b, __c)  (*((uint32_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_u64(__a, __b, __c)  (*((uint64_t*)(__a)) = (__b)[__c])
+#define msa_st1_lane_f32(__a, __b, __c)  (*((float*)(__a)) = (__b)[__c])
+#define msa_st1_lane_f64(__a, __b, __c)  (*((double*)(__a)) = (__b)[__c])
+#define msa_st1q_lane_s8(__a, __b, __c)  (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c))
+#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c))
+#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c))
+#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c))
+#define msa_st1q_lane_u8(__a, __b, __c)  (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c))
+#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c))
+#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c))
+#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c))
+#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = (__b)[__c])
+#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = (__b)[__c])
+
+/* Duplicate the scalar __a into every element of a 64-bit doubleword vector. */
+#define msa_dup_n_s8(__a)  ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_s32(__a) ((v2i32){__a, __a})
+#define msa_dup_n_s64(__a) ((v1i64){__a})
+#define msa_dup_n_u8(__a)  ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0))
+#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0))
+#define msa_dup_n_u32(__a) ((v2u32){__a, __a})
+#define msa_dup_n_u64(__a) ((v1u64){__a})
+#define msa_dup_n_f32(__a) ((v2f32){__a, __a})
+#define msa_dup_n_f64(__a) ((v1f64){__a})
+
+/* Duplicate elements for 128-bit quadword vectors: dupq_n_* replicates the scalar __a, dupq_lane_* forwards vector __a and index __b to the splat builtins. */
+#define msa_dupq_n_s8(__a)  (__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_u8(__a)  ((v16u8)__builtin_msa_fill_b((int32_t)(__a)))
+#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a)))
+#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a)))
+#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a)))
+#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a})
+#define msa_dupq_n_f64(__a) ((v2f64){__a, __a})
+#define msa_dupq_lane_s8(__a, __b)  (__builtin_msa_splat_b(__a, __b))
+#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b))
+#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b))
+#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b))
+#define msa_dupq_lane_u8(__a, __b)  ((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64-bit vector by casting __a to uint64_t and then to the vector type. */
+#define msa_create_s8(__a)  ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a)  ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign-extends or zero-extends each element of a 64-bit vector to twice its original width and places the results in a 128-bit vector. */
+/* Transform v8i8 to v8i16 */
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v8u8 to v8u16 */
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/* Transform v4i16 to v4i32 */
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2i32 to v2i64 */
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/* Transform v4u16 to v4u32 */
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/* Transform v2u32 to v2u64 */
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copies the least significant half of each element of a 128-bit vector into the corresponding element of a 64-bit vector (pckev against a zero vector, then the low doubleword is taken). */
+#define msa_movn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_movn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovn: same narrowing sequence as movn, but each element first goes through the sat_s/sat_u builtin for the target width. */
+#define msa_qmovn_s16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_s64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u16(__a) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u32(__a) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_qmovn_u64(__a) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* qmovun: signed input is first clamped from below at zero (max_s against a zero vector), then goes through sat_u and is narrowed to an unsigned 64-bit vector. */
+#define msa_qmovun_s16(__a) \
+({ \
+  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s32(__a) \
+({ \
+  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qmovun_s64(__a) \
+({ \
+  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Right-shifts each element of a 128-bit vector by the immediate __b (srai for signed, srli for unsigned) and places the narrowed results in a 64-bit vector. */
+#define msa_shrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_shrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift: shifts each element of a 128-bit vector by the immediate __b using the rounding builtins (srari for signed, srlri for unsigned) and places the narrowed results in a 64-bit vector. */
+#define msa_rshrn_n_s16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__b))); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__b))); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_s64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__b))); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u16(__a, __b) \
+({ \
+  v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b))); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u32(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b))); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+#define msa_rshrn_n_u64(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b))); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \
+})
+
+/* Rounding right shift of the elements in a 128 bits vector by an immediate value, saturate the results to the narrow width and place them in a 64 bits vector. */
+#define msa_qrshrn_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_srari_h((v8i16)(__a), (int)(__b)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h(__d, 7)); \
+  (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_srari_w((v4i32)(__a), (int)(__b)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w(__d, 15)); \
+  (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_srari_d((v2i64)(__a), (int)(__b)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d(__d, 31)); \
+  (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_srlri_h((v8i16)(__a), (int)(__b)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_srlri_w((v4i32)(__a), (int)(__b)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrn_n_u64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_srlri_d((v2i64)(__a), (int)(__b)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* Rounding right shift of the elements in a 128 bits vector by an immediate value, saturate the results and place them in a 64 bits vector.
+   Input is signed and output is unsigned: negative elements are clamped to 0 before the shift. */
+#define msa_qrshrun_n_s16(__a, __b) \
+({ \
+  v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \
+  v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h(__d, (int)(__b)), 7)); \
+  (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s32(__a, __b) \
+({ \
+  v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \
+  v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w(__d, (int)(__b)), 15)); \
+  (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+#define msa_qrshrun_n_s64(__a, __b) \
+({ \
+  v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \
+  v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d(__d, (int)(__b)), 31)); \
+  (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \
+})
+
+/* pack: narrow two vectors by truncation. The even-indexed narrow elements of __a fill the right half of the result and those of __b the left half (pckev with the operands swapped). */
+#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a)))
+
+/* qpack: saturating narrow. Every element of __a and __b is first saturated to the narrow width (sat_s for signed, sat_u for unsigned), then the two vectors are packed with pckev (__a in the right half, __b in the left half). */
+#define msa_qpack_s16(__a, __b) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)))
+#define msa_qpack_s32(__a, __b) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)))
+#define msa_qpack_s64(__a, __b) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)))
+#define msa_qpack_u16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)))
+#define msa_qpack_u32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)))
+#define msa_qpack_u64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)))
+
+/* qpacku: signed input, unsigned saturated narrow output. Negative elements are clamped to 0 (max_s against a zero vector) before the unsigned saturation and the pckev pack. */
+#define msa_qpacku_s16(__a, __b) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7)))
+#define msa_qpacku_s32(__a, __b) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15)))
+#define msa_qpacku_s64(__a, __b) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31)))
+
+/* packr: shift every element of __a and __b right by the immediate __c (srai for signed, srli for unsigned, no rounding), then narrow by truncation with pckev (__a in the right half, __b in the left half). */
+#define msa_packr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c))))
+#define msa_packr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c))))
+#define msa_packr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c))))
+#define msa_packr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c))))
+
+/* rpackr: rounding right shift of every element of __a and __b by the immediate __c (srari for signed, srlri for unsigned), then narrow by truncation with pckev (__a in the right half, __b in the left half). */
+#define msa_rpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c))))
+#define msa_rpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c))))
+#define msa_rpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c))))
+#define msa_rpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c))))
+
+/* qrpackr: rounding right shift of every element of __a and __b by the immediate __c, saturate to the narrow width, then pack with pckev (__a in the right half, __b in the left half). */
+#define msa_qrpackr_s16(__a, __b, __c) \
+(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \
+                       (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_s32(__a, __b, __c) \
+(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \
+                       (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_s64(__a, __b, __c) \
+(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \
+                       (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31)))
+#define msa_qrpackr_u16(__a, __b, __c) \
+((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \
+                              (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7)))
+#define msa_qrpackr_u32(__a, __b, __c) \
+((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \
+                              (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15)))
+#define msa_qrpackr_u64(__a, __b, __c) \
+((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \
+                              (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31)))
+
+/* qrpackru: signed input, unsigned output. Clamp negatives to 0, rounding right shift by the immediate __c, saturate to the narrow unsigned width, then pack (__a in the right half, __b in the left half). */
+#define msa_qrpackru_s16(__a, __b, __c) \
+({ \
+  v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)), 7); \
+  v8u16 __e = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)), 7); \
+  (v16u8)__builtin_msa_pckev_b((v16i8)__e, (v16i8)__d); \
+})
+
+#define msa_qrpackru_s32(__a, __b, __c) \
+({ \
+  v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)), 15); \
+  v4u32 __e = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)), 15); \
+  (v8u16)__builtin_msa_pckev_h((v8i16)__e, (v8i16)__d); \
+})
+
+#define msa_qrpackru_s64(__a, __b, __c) \
+({ \
+  v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)), 31); \
+  v2u64 __e = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)), 31); \
+  (v4u32)__builtin_msa_pckev_w((v4i32)__e, (v4i32)__d); \
+})
+
+/* Element-wise minimum: the smaller of each pair of corresponding elements is written to the returned vector. Arguments are forwarded without casts, so they must already have the vector type the builtin expects. */
+#define msa_minq_s8(__a, __b)  (__builtin_msa_min_s_b(__a, __b))
+#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b))
+#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b))
+#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b))
+#define msa_minq_u8(__a, __b)  ((v16u8)__builtin_msa_min_u_b(__a, __b))
+#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b))
+#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b))
+#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b))
+#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b))
+#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b))
+
+/* Element-wise maximum: the larger of each pair of corresponding elements is written to the returned vector. Arguments are forwarded without casts, so they must already have the vector type the builtin expects. */
+#define msa_maxq_s8(__a, __b)  (__builtin_msa_max_s_b(__a, __b))
+#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b))
+#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b))
+#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b))
+#define msa_maxq_u8(__a, __b)  ((v16u8)__builtin_msa_max_u_b(__a, __b))
+#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b))
+#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b))
+#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b))
+#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b))
+#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b))
+
+/* Vector type reinterpretation: a plain cast of Vec to the vector type _Tpv. */
+#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec))
+
+/* Add each odd element of vector __a to the corresponding even element of vector __b, producing double-width elements in the returned vector. */
+/* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */
+#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b)))
+/* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */
+#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b)))
+/* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */
+#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b)))
+
+/* Copy the even elements of __a to the left half and the even elements of __b to the right half of the result vector, and return it. */
+#define msa_pckev_s8(__a, __b)  (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b)))
+
+/* Copy the odd elements of __a to the left half and the odd elements of __b to the right half of the result vector, and return it. */
+#define msa_pckod_s8(__a, __b)  (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b)))
+
+#ifdef _MIPSEB /* big-endian build: wrap the lane index into range, then mirror it */
+#define LANE_IMM0_1(x)  (0x1 - ((x) & 0x1))
+#define LANE_IMM0_3(x)  (0x3 - ((x) & 0x3))
+#define LANE_IMM0_7(x)  (0x7 - ((x) & 0x7))
+#define LANE_IMM0_15(x) (0xF - ((x) & 0xF))
+#else /* otherwise: only wrap the lane index into range */
+#define LANE_IMM0_1(x)  ((x) & 0x1)
+#define LANE_IMM0_3(x)  ((x) & 0x3)
+#define LANE_IMM0_7(x)  ((x) & 0x7)
+#define LANE_IMM0_15(x) ((x) & 0xF)
+#endif
+
+#define msa_get_lane_u8(__a, __b)        ((uint8_t)(__a)[LANE_IMM0_7(__b)]) /* msa_get_lane_*: read lane __b of a 64 bits vector by subscripting */
+#define msa_get_lane_s8(__a, __b)        ((int8_t)(__a)[LANE_IMM0_7(__b)])
+#define msa_get_lane_u16(__a, __b)       ((uint16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_s16(__a, __b)       ((int16_t)(__a)[LANE_IMM0_3(__b)])
+#define msa_get_lane_u32(__a, __b)       ((uint32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_s32(__a, __b)       ((int32_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f32(__a, __b)       ((float)(__a)[LANE_IMM0_1(__b)]) /* two 32-bit lanes, same index mask as the u32/s32 variants */
+#define msa_get_lane_s64(__a, __b)       ((int64_t)(__a)[LANE_IMM0_1(__b)]) /* NOTE(review): a 64 bits vector has a single 64-bit lane; the 0..1 mask looks too wide - confirm */
+#define msa_get_lane_u64(__a, __b)       ((uint64_t)(__a)[LANE_IMM0_1(__b)])
+#define msa_get_lane_f64(__a, __b)       ((double)(__a)[LANE_IMM0_1(__b)])
+#define msa_getq_lane_u8(__a, imm0_15)   ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15)) /* msa_getq_lane_*: read one lane of a 128 bits vector; the parameter name states the valid lane range */
+#define msa_getq_lane_s8(__a, imm0_15)   ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15))
+#define msa_getq_lane_u16(__a, imm0_7)   ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7))
+#define msa_getq_lane_s16(__a, imm0_7)   ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7))
+#define msa_getq_lane_u32(__a, imm0_3)   __builtin_msa_copy_u_w((v4i32)(__a), imm0_3)
+#define msa_getq_lane_s32                __builtin_msa_copy_s_w
+#define msa_getq_lane_f32(__a, __b)      ((float)(__a)[LANE_IMM0_3(__b)])
+#define msa_getq_lane_f64(__a, __b)      ((double)(__a)[LANE_IMM0_1(__b)])
+#if (__mips == 64) /* 64-bit target: 64-bit lanes are read with the copy_[su]_d builtins */
+#define msa_getq_lane_u64(__a, imm0_1)   __builtin_msa_copy_u_d((v2i64)(__a), imm0_1)
+#define msa_getq_lane_s64                __builtin_msa_copy_s_d
+#else /* other targets: 64-bit lanes are read by subscripting the vector */
+#define msa_getq_lane_u64(__a, imm0_1)   ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#define msa_getq_lane_s64(__a, imm0_1)   ((int64_t)(__a)[LANE_IMM0_1(imm0_1)])
+#endif
+
+/* combine: join two 64 bits vectors into one 128 bits vector of __TYPE; a supplies the first (lowest-index) initializer elements and b the following ones. Built from 64-bit elements on 64-bit targets and from 32-bit elements otherwise. */
+#if (__mips == 64)
+#define __COMBINE_64_64(__TYPE, a, b)    ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]}))
+#else
+#define __COMBINE_64_64(__TYPE, a, b)    ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1],  \
+                                                           ((v2u32)(b))[0], ((v2u32)(b))[1]}))
+#endif
+
+/* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */
+#define msa_combine_s8(__a, __b)  __COMBINE_64_64(v16i8, __a, __b)
+
+/* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */
+#define msa_combine_s16(__a, __b)  __COMBINE_64_64(v8i16, __a, __b)
+
+/* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */
+#define msa_combine_s32(__a, __b)  __COMBINE_64_64(v4i32, __a, __b)
+
+/* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */
+#define msa_combine_s64(__a, __b)  __COMBINE_64_64(v2i64, __a, __b)
+
+/* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */
+#define msa_combine_f32(__a, __b)  __COMBINE_64_64(v4f32, __a, __b)
+
+/* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */
+#define msa_combine_u8(__a, __b)  __COMBINE_64_64(v16u8, __a, __b)
+
+/* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */
+#define msa_combine_u16(__a, __b)  __COMBINE_64_64(v8u16, __a, __b)
+
+/* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */
+#define msa_combine_u32(__a, __b)  __COMBINE_64_64(v4u32, __a, __b)
+
+/* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */
+#define msa_combine_u64(__a, __b)  __COMBINE_64_64(v2u64, __a, __b)
+
+/* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */
+#define msa_combine_f64(__a, __b)  __COMBINE_64_64(v2f64, __a, __b)
+
+/* get_low, get_high: take 64-bit element 0 (low) or element 1 (high) of a 128 bits vector and reinterpret it as the 64 bits vector type __TYPE. 64-bit targets use the copy_u_d builtin, other targets subscript the vector. */
+#if (__mips == 64)
+#define __GET_LOW(__TYPE, a)   ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0))))
+#define __GET_HIGH(__TYPE, a)  ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1))))
+#else
+#define __GET_LOW(__TYPE, a)   ((__TYPE)(((v2u64)(a))[0]))
+#define __GET_HIGH(__TYPE, a)  ((__TYPE)(((v2u64)(a))[1]))
+#endif
+
+/* v8i8 msa_get_low_s8(v16i8 __a) */
+#define msa_get_low_s8(__a)  __GET_LOW(v8i8, __a)
+
+/* v4i16 msa_get_low_s16(v8i16 __a) */
+#define msa_get_low_s16(__a)  __GET_LOW(v4i16, __a)
+
+/* v2i32 msa_get_low_s32(v4i32 __a) */
+#define msa_get_low_s32(__a)  __GET_LOW(v2i32, __a)
+
+/* v1i64 msa_get_low_s64(v2i64 __a) */
+#define msa_get_low_s64(__a)  __GET_LOW(v1i64, __a)
+
+/* v8u8 msa_get_low_u8(v16u8 __a) */
+#define msa_get_low_u8(__a)  __GET_LOW(v8u8, __a)
+
+/* v4u16 msa_get_low_u16(v8u16 __a) */
+#define msa_get_low_u16(__a)  __GET_LOW(v4u16, __a)
+
+/* v2u32 msa_get_low_u32(v4u32 __a) */
+#define msa_get_low_u32(__a)  __GET_LOW(v2u32, __a)
+
+/* v1u64 msa_get_low_u64(v2u64 __a) */
+#define msa_get_low_u64(__a)  __GET_LOW(v1u64, __a)
+
+/* v2f32 msa_get_low_f32(v4f32 __a) */
+#define msa_get_low_f32(__a)  __GET_LOW(v2f32, __a)
+
+/* v1f64 msa_get_low_f64(v2f64 __a) */
+#define msa_get_low_f64(__a)  __GET_LOW(v1f64, __a)
+
+/* v8i8 msa_get_high_s8(v16i8 __a) */
+#define msa_get_high_s8(__a)  __GET_HIGH(v8i8, __a)
+
+/* v4i16 msa_get_high_s16(v8i16 __a) */
+#define msa_get_high_s16(__a)  __GET_HIGH(v4i16, __a)
+
+/* v2i32 msa_get_high_s32(v4i32 __a) */
+#define msa_get_high_s32(__a)  __GET_HIGH(v2i32, __a)
+
+/* v1i64 msa_get_high_s64(v2i64 __a) */
+#define msa_get_high_s64(__a)  __GET_HIGH(v1i64, __a)
+
+/* v8u8 msa_get_high_u8(v16u8 __a) */
+#define msa_get_high_u8(__a)  __GET_HIGH(v8u8, __a)
+
+/* v4u16 msa_get_high_u16(v8u16 __a) */
+#define msa_get_high_u16(__a)  __GET_HIGH(v4u16, __a)
+
+/* v2u32 msa_get_high_u32(v4u32 __a) */
+#define msa_get_high_u32(__a)  __GET_HIGH(v2u32, __a)
+
+/* v1u64 msa_get_high_u64(v2u64 __a) */
+#define msa_get_high_u64(__a)  __GET_HIGH(v1u64, __a)
+
+/* v2f32 msa_get_high_f32(v4f32 __a) */
+#define msa_get_high_f32(__a)  __GET_HIGH(v2f32, __a)
+
+/* v1f64 msa_get_high_f64(v2f64 __a) */
+#define msa_get_high_f64(__a)  __GET_HIGH(v1f64, __a)
+
+/* ri = ai * b[lane]: every element of __a is multiplied by the single scalar read from lane __lane of __b */
+/* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */
+#define msa_mulq_lane_f32(__a, __b, __lane)  ((__a) * msa_getq_lane_f32(__b, __lane))
+
+/* ri = ai + bi * c[lane]: multiply-accumulate with the single scalar read from lane __lane of __c */
+/* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */
+#define msa_mlaq_lane_f32(__a, __b, __c, __lane)  ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane)))
+
+/* uint16_t msa_sum_u16(v8u16 __a): sum of all 8 elements. NOTE(review): the total is cast to uint16_t, so sums above 65535 wrap - confirm callers expect this. */
+#define msa_sum_u16(__a)                         \
+({                                               \
+  v4u32 _b;                                      \
+  v2u64 _c;                                      \
+  _b = __builtin_msa_hadd_u_w(__a, __a);         \
+  _c = __builtin_msa_hadd_u_d(_b, _b);           \
+  (uint16_t)(_c[0] + _c[1]);                     \
+})
+
+/* int32_t msa_sum_s16(v8i16 __a): sum of all 8 elements; the result is cast to int32_t. */
+#define msa_sum_s16(__a)                        \
+({                                              \
+  v4i32 _b;                                     \
+  v2i64 _c;                                     \
+  _b = __builtin_msa_hadd_s_w(__a, __a);        \
+  _c = __builtin_msa_hadd_s_d(_b, _b);          \
+  (int32_t)(_c[0] + _c[1]);                     \
+})
+
+
+/* uint32_t msa_sum_u32(v4u32 __a): sum of all 4 elements, accumulated in 64 bits and then cast to uint32_t. */
+#define msa_sum_u32(__a)                       \
+({                                             \
+  v2u64 _b;                                    \
+  _b = __builtin_msa_hadd_u_d(__a, __a);       \
+  (uint32_t)(_b[0] + _b[1]);                   \
+})
+
+/* int64_t msa_sum_s32(v4i32 __a): sum of all 4 elements; the result is cast to int64_t. */
+#define msa_sum_s32(__a)                       \
+({                                             \
+  v2i64 _b;                                    \
+  _b = __builtin_msa_hadd_s_d(__a, __a);       \
+  (int64_t)(_b[0] + _b[1]);                    \
+})
+
+/* uint8_t msa_sum_u8(v16u8 __a): sum of all 16 elements. NOTE(review): the total is cast to uint8_t, so sums above 255 wrap - confirm callers expect this. */
+#define msa_sum_u8(__a)                        \
+({                                             \
+  v8u16 _b16;                                    \
+  v4u32 _c32;                                    \
+  _b16 = __builtin_msa_hadd_u_h(__a, __a);       \
+  _c32 = __builtin_msa_hadd_u_w(_b16, _b16);         \
+  (uint8_t)msa_sum_u32(_c32);                    \
+})
+
+/* int16_t msa_sum_s8(v16i8 __a): sum of all 16 elements; the result is cast to int16_t. */
+#define msa_sum_s8(__a)                        \
+({                                             \
+  v8i16 _b16;                                    \
+  v4i32 _c32;                                    \
+  _b16 = __builtin_msa_hadd_s_h(__a, __a);       \
+  _c32 = __builtin_msa_hadd_s_w(_b16, _b16);         \
+  (int16_t)msa_sum_s32(_c32);                     \
+})
+
+/* float msa_sum_f32(v4f32 __a): lanes 0..3 added from left to right */
+#define msa_sum_f32(__a)  ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3])
+
+/* v8u16 msa_paddlq_u8(v16u8 __a): pairwise add of neighbouring elements into double-width elements (hadd of __a with itself) */
+#define msa_paddlq_u8(__a)  (__builtin_msa_hadd_u_h(__a, __a))
+
+/* v8i16 msa_paddlq_s8(v16i8 __a): signed pairwise widening add */
+#define msa_paddlq_s8(__a)  (__builtin_msa_hadd_s_h(__a, __a))
+
+/* v4u32 msa_paddlq_u16 (v8u16 __a): unsigned pairwise widening add */
+#define msa_paddlq_u16(__a)  (__builtin_msa_hadd_u_w(__a, __a))
+
+/* v4i32 msa_paddlq_s16 (v8i16 __a): signed pairwise widening add */
+#define msa_paddlq_s16(__a)  (__builtin_msa_hadd_s_w(__a, __a))
+
+/* v2u64 msa_paddlq_u32(v4u32 __a): unsigned pairwise widening add */
+#define msa_paddlq_u32(__a)  (__builtin_msa_hadd_u_d(__a, __a))
+
+/* v2i64 msa_paddlq_s32(v4i32 __a): signed pairwise widening add */
+#define msa_paddlq_s32(__a)  (__builtin_msa_hadd_s_d(__a, __a))
+
+#define V8U8_2_V8U16(x)   {(uint16_t)(x)[0], (uint16_t)(x)[1], (uint16_t)(x)[2], (uint16_t)(x)[3], \
+                           (uint16_t)(x)[4], (uint16_t)(x)[5], (uint16_t)(x)[6], (uint16_t)(x)[7]}
+#define V8U8_2_V8I16(x)   {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                           (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V8I8_2_V8I16(x)   {(int16_t)(x)[0], (int16_t)(x)[1], (int16_t)(x)[2], (int16_t)(x)[3], \
+                           (int16_t)(x)[4], (int16_t)(x)[5], (int16_t)(x)[6], (int16_t)(x)[7]}
+#define V4U16_2_V4U32(x)  {(uint32_t)(x)[0], (uint32_t)(x)[1], (uint32_t)(x)[2], (uint32_t)(x)[3]}
+#define V4U16_2_V4I32(x)  {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V4I16_2_V4I32(x)  {(int32_t)(x)[0], (int32_t)(x)[1], (int32_t)(x)[2], (int32_t)(x)[3]}
+#define V2U32_2_V2U64(x)  {(uint64_t)(x)[0], (uint64_t)(x)[1]}
+#define V2U32_2_V2I64(x)  {(int64_t)(x)[0], (int64_t)(x)[1]}
+
+/* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b): widen both 64 bits operands element by element, then multiply in the wide type */
+#define msa_mull_u8(__a, __b)  ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b)))
+
+/* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b)*/
+#define msa_mull_s8(__a, __b)  (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b)))
+
+/* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */
+#define msa_mull_u16(__a, __b)  ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b)))
+
+/* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */
+#define msa_mull_s16(__a, __b)  (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b)))
+
+/* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */
+#define msa_mull_u32(__a, __b)  ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b)))
+
+/* bitwise and: __builtin_msa_and_v; both operands are cast to v16u8 for the builtin and the result is cast back to the element type named in the macro */
+#define msa_andq_u8(__a, __b)  ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s8(__a, __b)  ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise or: __builtin_msa_or_v; both operands are cast to v16u8 for the builtin and the result is cast back to the element type named in the macro */
+#define msa_orrq_u8(__a, __b)  ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s8(__a, __b)  ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise xor: __builtin_msa_xor_v; both operands are cast to v16u8 for the builtin and the result is cast back to the element type named in the macro */
+#define msa_eorq_u8(__a, __b)  ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s8(__a, __b)  ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b)))
+
+/* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff). XOR of every byte with 0xFF inverts all 128 bits, so the same byte-wise builtin serves every element width. */
+#define msa_mvnq_u8(__a)  ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s8(__a)  ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF))
+
+/* compare equal: ceq -> ri = ai == bi ? 1...1:0...0. The result is always cast to an unsigned mask vector of the same element width, also for signed and floating-point inputs. */
+#define msa_ceqq_u8(__a, __b)  ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_s8(__a, __b)  ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b)))
+
+/* Compare less-than: clt -> ri = ai < bi ? 1...1:0...0. The result is always cast to an unsigned mask vector of the same element width. */
+#define msa_cltq_u8(__a, __b)  ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cltq_s8(__a, __b)  ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0. Implemented with the less-than builtins by swapping the operands (bi < ai). */
+#define msa_cgtq_u8(__a, __b)  ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgtq_s8(__a, __b)  ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a)))
+
+/* compare less-equal: cle -> ri = ai <= bi ? 1...1:0...0. The result is always cast to an unsigned mask vector of the same element width. */
+#define msa_cleq_u8(__a, __b)  ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b)))
+#define msa_cleq_s8(__a, __b)  ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b)))
+#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b)))
+#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b)))
+#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b)))
+#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b)))
+#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b)))
+#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b)))
+#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b)))
+#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b)))
+
+/* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0. Implemented with the less-equal builtins by swapping the operands (bi <= ai). */
+#define msa_cgeq_u8(__a, __b)  ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a)))
+#define msa_cgeq_s8(__a, __b)  ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a)))
+#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a)))
+#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a)))
+#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a)))
+#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a)))
+#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a)))
+#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a)))
+#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a)))
+#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a)))
+
+/* shlq: logical shift left by per-lane counts, ri = veci << cnti.
+ * Signed and unsigned variants share the same sll builtin; only the result cast differs. */
+#define msa_shlq_u8(vec, cnt)  ((v16u8)__builtin_msa_sll_b((v16i8)(vec), (v16i8)(cnt)))
+#define msa_shlq_s8(vec, cnt)  ((v16i8)__builtin_msa_sll_b((v16i8)(vec), (v16i8)(cnt)))
+#define msa_shlq_u16(vec, cnt) ((v8u16)__builtin_msa_sll_h((v8i16)(vec), (v8i16)(cnt)))
+#define msa_shlq_s16(vec, cnt) ((v8i16)__builtin_msa_sll_h((v8i16)(vec), (v8i16)(cnt)))
+#define msa_shlq_u32(vec, cnt) ((v4u32)__builtin_msa_sll_w((v4i32)(vec), (v4i32)(cnt)))
+#define msa_shlq_s32(vec, cnt) ((v4i32)__builtin_msa_sll_w((v4i32)(vec), (v4i32)(cnt)))
+#define msa_shlq_u64(vec, cnt) ((v2u64)__builtin_msa_sll_d((v2i64)(vec), (v2i64)(cnt)))
+#define msa_shlq_s64(vec, cnt) ((v2i64)__builtin_msa_sll_d((v2i64)(vec), (v2i64)(cnt)))
+
+/* shlq_n: logical shift left of every lane by the same immediate, ri = veci << imm. */
+#define msa_shlq_n_u8(vec, imm)  ((v16u8)__builtin_msa_slli_b((v16i8)(vec), imm))
+#define msa_shlq_n_s8(vec, imm)  ((v16i8)__builtin_msa_slli_b((v16i8)(vec), imm))
+#define msa_shlq_n_u16(vec, imm) ((v8u16)__builtin_msa_slli_h((v8i16)(vec), imm))
+#define msa_shlq_n_s16(vec, imm) ((v8i16)__builtin_msa_slli_h((v8i16)(vec), imm))
+#define msa_shlq_n_u32(vec, imm) ((v4u32)__builtin_msa_slli_w((v4i32)(vec), imm))
+#define msa_shlq_n_s32(vec, imm) ((v4i32)__builtin_msa_slli_w((v4i32)(vec), imm))
+#define msa_shlq_n_u64(vec, imm) ((v2u64)__builtin_msa_slli_d((v2i64)(vec), imm))
+#define msa_shlq_n_s64(vec, imm) ((v2i64)__builtin_msa_slli_d((v2i64)(vec), imm))
+
+/* shrq: shift right by per-lane counts, ri = veci >> cnti.
+ * Unsigned variants use the srl builtins, signed variants use the sra builtins. */
+#define msa_shrq_u8(vec, cnt)  ((v16u8)__builtin_msa_srl_b((v16i8)(vec), (v16i8)(cnt)))
+#define msa_shrq_s8(vec, cnt)  ((v16i8)__builtin_msa_sra_b((v16i8)(vec), (v16i8)(cnt)))
+#define msa_shrq_u16(vec, cnt) ((v8u16)__builtin_msa_srl_h((v8i16)(vec), (v8i16)(cnt)))
+#define msa_shrq_s16(vec, cnt) ((v8i16)__builtin_msa_sra_h((v8i16)(vec), (v8i16)(cnt)))
+#define msa_shrq_u32(vec, cnt) ((v4u32)__builtin_msa_srl_w((v4i32)(vec), (v4i32)(cnt)))
+#define msa_shrq_s32(vec, cnt) ((v4i32)__builtin_msa_sra_w((v4i32)(vec), (v4i32)(cnt)))
+#define msa_shrq_u64(vec, cnt) ((v2u64)__builtin_msa_srl_d((v2i64)(vec), (v2i64)(cnt)))
+#define msa_shrq_s64(vec, cnt) ((v2i64)__builtin_msa_sra_d((v2i64)(vec), (v2i64)(cnt)))
+
+/* shrq_n: shift right of every lane by the same immediate, ri = veci >> imm.
+ * Unsigned variants use the srli builtins, signed variants use the srai builtins. */
+#define msa_shrq_n_u8(vec, imm)  ((v16u8)__builtin_msa_srli_b((v16i8)(vec), imm))
+#define msa_shrq_n_s8(vec, imm)  ((v16i8)__builtin_msa_srai_b((v16i8)(vec), imm))
+#define msa_shrq_n_u16(vec, imm) ((v8u16)__builtin_msa_srli_h((v8i16)(vec), imm))
+#define msa_shrq_n_s16(vec, imm) ((v8i16)__builtin_msa_srai_h((v8i16)(vec), imm))
+#define msa_shrq_n_u32(vec, imm) ((v4u32)__builtin_msa_srli_w((v4i32)(vec), imm))
+#define msa_shrq_n_s32(vec, imm) ((v4i32)__builtin_msa_srai_w((v4i32)(vec), imm))
+#define msa_shrq_n_u64(vec, imm) ((v2u64)__builtin_msa_srli_d((v2i64)(vec), imm))
+#define msa_shrq_n_s64(vec, imm) ((v2i64)__builtin_msa_srai_d((v2i64)(vec), imm))
+
+/* rshrq_n: rounded shift right of every lane by the same immediate.
+ * Unsigned variants use the srlri builtins, signed variants use the srari builtins. */
+#define msa_rshrq_n_u8(vec, imm)  ((v16u8)__builtin_msa_srlri_b((v16i8)(vec), imm))
+#define msa_rshrq_n_s8(vec, imm)  ((v16i8)__builtin_msa_srari_b((v16i8)(vec), imm))
+#define msa_rshrq_n_u16(vec, imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(vec), imm))
+#define msa_rshrq_n_s16(vec, imm) ((v8i16)__builtin_msa_srari_h((v8i16)(vec), imm))
+#define msa_rshrq_n_u32(vec, imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(vec), imm))
+#define msa_rshrq_n_s32(vec, imm) ((v4i32)__builtin_msa_srari_w((v4i32)(vec), imm))
+#define msa_rshrq_n_u64(vec, imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(vec), imm))
+#define msa_rshrq_n_s64(vec, imm) ((v2i64)__builtin_msa_srari_d((v2i64)(vec), imm))
+
+/* qrshrq: rounded arithmetic shift RIGHT by per-lane counts (srar.w), ri = ai >> bi with rounding.
+ * The builtin is called directly, like every other wrapper in this header, instead of through the __msa_* alias.
+ * NOTE(review): the "q" prefix suggests saturation, but the srar builtin name carries no saturation marker; confirm what callers expect. */
+#define msa_qrshrq_s32(a, b)  ((v4i32)__builtin_msa_srar_w((v4i32)(a), (v4i32)(b)))
+
+/* Give the MSA builtins uniform msa_<op>q_<type> names for use by intrin_msa.hpp.
+ * Entries defined without parameters are plain aliases of the builtin; entries with
+ * parameters also cast the unsigned operands to the signed vector type the builtin
+ * is called with and cast the result back.
+ * qaddq maps to the adds_* builtins; addq maps to addv_* / fadd_*. */
+#define msa_qaddq_u8          __builtin_msa_adds_u_b
+#define msa_qaddq_s8          __builtin_msa_adds_s_b
+#define msa_qaddq_u16         __builtin_msa_adds_u_h
+#define msa_qaddq_s16         __builtin_msa_adds_s_h
+#define msa_qaddq_u32         __builtin_msa_adds_u_w
+#define msa_qaddq_s32         __builtin_msa_adds_s_w
+#define msa_qaddq_u64         __builtin_msa_adds_u_d
+#define msa_qaddq_s64         __builtin_msa_adds_s_d
+#define msa_addq_u8(a, b)     ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b)))
+#define msa_addq_s8           __builtin_msa_addv_b
+#define msa_addq_u16(a, b)    ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b)))
+#define msa_addq_s16          __builtin_msa_addv_h
+#define msa_addq_u32(a, b)    ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b)))
+#define msa_addq_s32          __builtin_msa_addv_w
+#define msa_addq_f32          __builtin_msa_fadd_w
+#define msa_addq_u64(a, b)    ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b)))
+#define msa_addq_s64          __builtin_msa_addv_d
+#define msa_addq_f64          __builtin_msa_fadd_d
+/* Subtraction wrappers: qsubq maps to the subs_* builtins (8- and 16-bit lanes only),
+ * subq maps to subv_* / fsub_*. Unsigned subq variants cast through the signed vector types. */
+#define msa_qsubq_u8          __builtin_msa_subs_u_b
+#define msa_qsubq_s8          __builtin_msa_subs_s_b
+#define msa_qsubq_u16         __builtin_msa_subs_u_h
+#define msa_qsubq_s16         __builtin_msa_subs_s_h
+#define msa_subq_u8(a, b)     ((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b)))
+#define msa_subq_s8           __builtin_msa_subv_b
+#define msa_subq_u16(a, b)    ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b)))
+#define msa_subq_s16          __builtin_msa_subv_h
+#define msa_subq_u32(a, b)    ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b)))
+#define msa_subq_s32          __builtin_msa_subv_w
+#define msa_subq_f32          __builtin_msa_fsub_w
+#define msa_subq_u64(a, b)    ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b)))
+#define msa_subq_s64          __builtin_msa_subv_d
+#define msa_subq_f64          __builtin_msa_fsub_d
+/* Multiplication and division wrappers: integer mulq calls mulv_* on operands cast to the
+ * signed vector type and casts the result to the requested type; the floating-point
+ * mulq / divq names are plain aliases of fmul_* / fdiv_*. */
+#define msa_mulq_u8(a, b)     ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
+#define msa_mulq_s8(a, b)     ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b)))
+#define msa_mulq_u16(a, b)    ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_s16(a, b)    ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b)))
+#define msa_mulq_u32(a, b)    ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_s32(a, b)    ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b)))
+#define msa_mulq_u64(a, b)    ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_s64(a, b)    ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b)))
+#define msa_mulq_f32          __builtin_msa_fmul_w
+#define msa_mulq_f64          __builtin_msa_fmul_d
+#define msa_divq_f32          __builtin_msa_fdiv_w
+#define msa_divq_f64          __builtin_msa_fdiv_d
+/* Plain aliases that drop the __builtin_ prefix from the dotp_* and dpadd_* builtins
+ * (signed and unsigned; h, w and d result formats). */
+#define msa_dotp_s_h          __builtin_msa_dotp_s_h
+#define msa_dotp_s_w          __builtin_msa_dotp_s_w
+#define msa_dotp_s_d          __builtin_msa_dotp_s_d
+#define msa_dotp_u_h          __builtin_msa_dotp_u_h
+#define msa_dotp_u_w          __builtin_msa_dotp_u_w
+#define msa_dotp_u_d          __builtin_msa_dotp_u_d
+#define msa_dpadd_s_h         __builtin_msa_dpadd_s_h
+#define msa_dpadd_s_w         __builtin_msa_dpadd_s_w
+#define msa_dpadd_s_d         __builtin_msa_dpadd_s_d
+#define msa_dpadd_u_h         __builtin_msa_dpadd_u_h
+#define msa_dpadd_u_w         __builtin_msa_dpadd_u_w
+#define msa_dpadd_u_d         __builtin_msa_dpadd_u_d
+
+/* ILVRL_B2: byte-lane interleave of src0 and src1. The ilvr.b result is assigned to out_r
+ * first, then the ilvl.b result to out_l; both are cast to OUT_T. The _UB/_SB/_UH/_SH/_SW
+ * wrappers fix OUT_T to the corresponding vector type. */
+#define ILVRL_B2(OUT_T, src0, src1, out_r, out_l) \
+    do { \
+        out_r = (OUT_T)__builtin_msa_ilvr_b((v16i8)(src0), (v16i8)(src1)); \
+        out_l = (OUT_T)__builtin_msa_ilvl_b((v16i8)(src0), (v16i8)(src1)); \
+    } while (0)
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
+
+/* ILVRL_H2: halfword-lane interleave of src0 and src1. The ilvr.h result is assigned to
+ * out_r first, then the ilvl.h result to out_l; both are cast to OUT_T. The suffixed
+ * wrappers fix OUT_T to the corresponding vector type. */
+#define ILVRL_H2(OUT_T, src0, src1, out_r, out_l) \
+    do { \
+        out_r = (OUT_T)__builtin_msa_ilvr_h((v8i16)(src0), (v8i16)(src1)); \
+        out_l = (OUT_T)__builtin_msa_ilvl_h((v8i16)(src0), (v8i16)(src1)); \
+    } while (0)
+#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
+#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
+#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__)
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
+
+/* ILVRL_W2: word-lane interleave of src0 and src1. The ilvr.w result is assigned to out_r
+ * first, then the ilvl.w result to out_l; both are cast to OUT_T. The suffixed wrappers
+ * fix OUT_T to the corresponding vector type. */
+#define ILVRL_W2(OUT_T, src0, src1, out_r, out_l) \
+    do { \
+        out_r = (OUT_T)__builtin_msa_ilvr_w((v4i32)(src0), (v4i32)(src1)); \
+        out_l = (OUT_T)__builtin_msa_ilvl_w((v4i32)(src0), (v4i32)(src1)); \
+    } while (0)
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
+
+/* absq / qabsq: r = |vec|.
+ * Integer variants pass vec and a zero-filled vector to the add_a_* (absq) or adds_a_* (qabsq) builtins.
+ * Floating-point variants clear the top bit of every lane (bit 31 for f32, bit 63 for f64). */
+#define msa_absq_s8(vec)      __builtin_msa_add_a_b(vec, __builtin_msa_fill_b(0))
+#define msa_absq_s16(vec)     __builtin_msa_add_a_h(vec, __builtin_msa_fill_h(0))
+#define msa_absq_s32(vec)     __builtin_msa_add_a_w(vec, __builtin_msa_fill_w(0))
+#define msa_absq_s64(vec)     __builtin_msa_add_a_d(vec, __builtin_msa_fill_d(0))
+#define msa_absq_f32(vec)     ((v4f32)__builtin_msa_bclri_w((v4u32)(vec), 31))
+#define msa_absq_f64(vec)     ((v2f64)__builtin_msa_bclri_d((v2u64)(vec), 63))
+#define msa_qabsq_s8(vec)     __builtin_msa_adds_a_b(vec, __builtin_msa_fill_b(0))
+#define msa_qabsq_s16(vec)    __builtin_msa_adds_a_h(vec, __builtin_msa_fill_h(0))
+#define msa_qabsq_s32(vec)    __builtin_msa_adds_a_w(vec, __builtin_msa_fill_w(0))
+#define msa_qabsq_s64(vec)    __builtin_msa_adds_a_d(vec, __builtin_msa_fill_d(0))
+
+/* abdq / qabdq: absolute difference, r = |a - b|.
+ * Integer abdq names are plain aliases of the asub_* builtins.
+ * Floating-point abdq subtracts with fsub_* and passes the result to msa_absq_f32/f64.
+ * qabdq subtracts with subs_s_* and passes the result to msa_qabsq_*. */
+#define msa_abdq_u8           __builtin_msa_asub_u_b
+#define msa_abdq_s8           __builtin_msa_asub_s_b
+#define msa_abdq_u16          __builtin_msa_asub_u_h
+#define msa_abdq_s16          __builtin_msa_asub_s_h
+#define msa_abdq_u32          __builtin_msa_asub_u_w
+#define msa_abdq_s32          __builtin_msa_asub_s_w
+#define msa_abdq_u64          __builtin_msa_asub_u_d
+#define msa_abdq_s64          __builtin_msa_asub_s_d
+#define msa_abdq_f32(a, b)    msa_absq_f32(__builtin_msa_fsub_w(a, b))
+#define msa_abdq_f64(a, b)    msa_absq_f64(__builtin_msa_fsub_d(a, b))
+#define msa_qabdq_s8(a, b)    msa_qabsq_s8(__builtin_msa_subs_s_b(a, b))
+#define msa_qabdq_s16(a, b)   msa_qabsq_s16(__builtin_msa_subs_s_h(a, b))
+#define msa_qabdq_s32(a, b)   msa_qabsq_s32(__builtin_msa_subs_s_w(a, b))
+#define msa_qabdq_s64(a, b)   msa_qabsq_s64(__builtin_msa_subs_s_d(a, b))
+
+/* sqrtq, rsqrtq: plain aliases of the fsqrt_* and frsqrt_* builtins for f32 and f64 vectors. */
+#define msa_sqrtq_f32         __builtin_msa_fsqrt_w
+#define msa_sqrtq_f64         __builtin_msa_fsqrt_d
+#define msa_rsqrtq_f32        __builtin_msa_frsqrt_w
+#define msa_rsqrtq_f64        __builtin_msa_frsqrt_d
+
+
+/* mlaq: r = a + b * c;
+ * msa_mlaq_s32 issues a single maddv.w through inline assembly on v4i32 operands.
+ * __a is bound as a read-write ("+f") operand, so it supplies the addend and receives
+ * the result, which is then returned by value; __b and __c are read-only inputs.
+ * NOTE(review): presumably inline asm is used deliberately instead of a builtin; confirm before replacing. */
+__extension__ extern __inline v4i32
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c)
+{
+  __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n"
+               // Outputs
+               : [__a] "+f"(__a)
+               // Inputs
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+/* msa_mlaq_s64: multiply-accumulate on v2i64 operands, r = a + b * c, issued as a single
+ * maddv.d through inline assembly. __a is a read-write ("+f") operand that supplies the
+ * addend and receives the result, which is returned by value; __b and __c are inputs. */
+__extension__ extern __inline v2i64
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c)
+{
+  __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n"
+               // Outputs
+               : [__a] "+f"(__a)
+               // Inputs
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+/* msa_mlaq_f32: multiply-accumulate on v4f32 operands, r = a + b * c, issued as a single
+ * fmadd.w through inline assembly. __a is a read-write ("+f") operand that supplies the
+ * addend and receives the result, which is returned by value; __b and __c are inputs.
+ * NOTE(review): a single fmadd instruction may round differently from a separate multiply and add; verify if bit-exactness matters. */
+__extension__ extern __inline v4f32
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c)
+{
+  __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n"
+               // Outputs
+               : [__a] "+f"(__a)
+               // Inputs
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+/* msa_mlaq_f64: multiply-accumulate on v2f64 operands, r = a + b * c, issued as a single
+ * fmadd.d through inline assembly. __a is a read-write ("+f") operand that supplies the
+ * addend and receives the result, which is returned by value; __b and __c are inputs. */
+__extension__ extern __inline v2f64
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c)
+{
+  __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n"
+               // Outputs
+               : [__a] "+f"(__a)
+               // Inputs
+               : [__b] "f"(__b), [__c] "f"(__c));
+  return __a;
+}
+
+/* cntq: plain aliases of the per-lane pcnt_* builtins. */
+#define msa_cntq_s8           __builtin_msa_pcnt_b
+#define msa_cntq_s16          __builtin_msa_pcnt_h
+#define msa_cntq_s32          __builtin_msa_pcnt_w
+#define msa_cntq_s64          __builtin_msa_pcnt_d
+
+/* bslq: select with mask a; r takes b where a == 0 and c where a == 1. Alias of bsel_v. */
+#define msa_bslq_u8           __builtin_msa_bsel_v
+
+/* ilvrq, ilvlq: interleave aliases. The element orders below hold for little-endian only:
+ * ilvrq yields b0, a0, b1, a1; ilvlq yields b2, a2, b3, a3. */
+#define msa_ilvrq_s8          __builtin_msa_ilvr_b
+#define msa_ilvrq_s16         __builtin_msa_ilvr_h
+#define msa_ilvrq_s32         __builtin_msa_ilvr_w
+#define msa_ilvrq_s64         __builtin_msa_ilvr_d
+#define msa_ilvlq_s8          __builtin_msa_ilvl_b
+#define msa_ilvlq_s16         __builtin_msa_ilvl_h
+#define msa_ilvlq_s32         __builtin_msa_ilvl_w
+#define msa_ilvlq_s64         __builtin_msa_ilvl_d
+
+/* ilvevq, ilvodq: even/odd interleave aliases.
+ * ilvevq yields b0, a0, b2, a2; ilvodq yields b1, a1, b3, a3. */
+#define msa_ilvevq_s8         __builtin_msa_ilvev_b
+#define msa_ilvevq_s16        __builtin_msa_ilvev_h
+#define msa_ilvevq_s32        __builtin_msa_ilvev_w
+#define msa_ilvevq_s64        __builtin_msa_ilvev_d
+#define msa_ilvodq_s8         __builtin_msa_ilvod_b
+#define msa_ilvodq_s16        __builtin_msa_ilvod_h
+#define msa_ilvodq_s32        __builtin_msa_ilvod_w
+#define msa_ilvodq_s64        __builtin_msa_ilvod_d
+
+/* extq: treat a and b as one concatenated vector (a || b) and extract a full vector of
+ * elements starting at index c.
+ * Implemented as a vshf shuffle whose index vector is a constant base table offset by c
+ * broadcast to every lane. On big-endian (_MIPSEB) c is subtracted from the table and the
+ * sources are passed as (a, b); on little-endian c is added and the sources are (b, a). */
+#ifdef _MIPSEB
+#define msa_extq_s8(a, b, c)  \
+(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b))
+#else
+#define msa_extq_s8(a, b, c)  \
+(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a))
+#define msa_extq_s16(a, b, c) \
+(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a))
+#define msa_extq_s32(a, b, c) \
+(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a))
+#define msa_extq_s64(a, b, c) \
+(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a))
+#endif /* _MIPSEB */
+
+/* Float-to-integer conversion aliases:
+ *   cvttruncq -> ftrunc_* builtins, cvttintq -> ftint_* builtins, cvtrintq -> frint_* builtins. */
+#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w
+#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w
+#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d
+#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d
+#define msa_cvttintq_u32_f32  __builtin_msa_ftint_u_w
+#define msa_cvttintq_s32_f32  __builtin_msa_ftint_s_w
+#define msa_cvttintq_u64_f64  __builtin_msa_ftint_u_d
+#define msa_cvttintq_s64_f64  __builtin_msa_ftint_s_d
+#define msa_cvtrintq_f32      __builtin_msa_frint_w
+#define msa_cvtrintq_f64      __builtin_msa_frint_d
+
+/* Integer-to-float and float-width conversion aliases:
+ *   cvtfintq -> ffint_* builtins; cvtfq (f64 -> f32) -> fexdo_w;
+ *   cvtflq / cvtfhq (f32 -> f64) -> fexupr_d / fexupl_d. */
+#define msa_cvtfintq_f32_u32  __builtin_msa_ffint_u_w
+#define msa_cvtfintq_f32_s32  __builtin_msa_ffint_s_w
+#define msa_cvtfintq_f64_u64  __builtin_msa_ffint_u_d
+#define msa_cvtfintq_f64_s64  __builtin_msa_ffint_s_d
+#define msa_cvtfq_f32_f64     __builtin_msa_fexdo_w
+#define msa_cvtflq_f64_f32    __builtin_msa_fexupr_d
+#define msa_cvtfhq_f64_f32    __builtin_msa_fexupl_d
+
+/* addl / subl: both operands are first converted to the wider lane type by the V*_2_V*
+ * helper macros, then added (addv_*) or subtracted (subv_*) in that wider type.
+ * recpeq is a plain alias of frcp_w; recpsq computes 2.0f - lhs * rhs per lane. */
+#define msa_addl_u8(lhs, rhs)     ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(lhs), (v8i16)V8U8_2_V8I16(rhs)))
+#define msa_addl_s8(lhs, rhs)     (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(lhs), (v8i16)V8I8_2_V8I16(rhs)))
+#define msa_addl_u16(lhs, rhs)    ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(lhs), (v4i32)V4U16_2_V4I32(rhs)))
+#define msa_addl_s16(lhs, rhs)    (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(lhs), (v4i32)V4I16_2_V4I32(rhs)))
+#define msa_subl_s16(lhs, rhs)    (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(lhs), (v4i32)V4I16_2_V4I32(rhs)))
+#define msa_recpeq_f32            __builtin_msa_frcp_w
+#define msa_recpsq_f32(lhs, rhs)  (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(lhs, rhs)))
+
+/* Generates msa_ld2q_<suffix> and msa_st2q_<suffix> for 2-way interleaved data.
+ *   _Tp    scalar element type          _Tpv   vector type exposed to callers
+ *   _Tpvs  vector type the builtins take df    builtin data-format letter (b/h/w/d)
+ *   nlanes number of elements per vector
+ * ld2q reads two consecutive vectors and writes their even-indexed elements (pckev) to *a
+ * and their odd-indexed elements (pckod) to *b. st2q interleaves a and b with ilvr / ilvl
+ * and writes the two resulting vectors to ptr and ptr + nlanes. */
+#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \
+{ \
+  const _Tpvs first_vec  = (_Tpvs)msa_ld1q_##suffix(ptr); \
+  const _Tpvs second_vec = (_Tpvs)msa_ld1q_##suffix(ptr + nlanes); \
+  *a = (_Tpv)__builtin_msa_pckev_##df(second_vec, first_vec); \
+  *b = (_Tpv)__builtin_msa_pckod_##df(second_vec, first_vec); \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \
+{ \
+  const _Tpv low_part  = (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a); \
+  const _Tpv high_part = (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a); \
+  msa_st1q_##suffix(ptr, low_part); \
+  msa_st1q_##suffix(ptr + nlanes, high_part); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2)
+MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2)
+
+/* Generates msa_ld3q_<suffix> for 8-bit elements (instantiated below for u8 and s8).
+ * The function loads three consecutive vectors (ptr, ptr + 16, ptr + 32) and fills *a, *b
+ * and *c, each with a two-stage vshf.b shuffle: the first stage combines v0 and v1 into a
+ * temporary, the second merges that temporary with v2.
+ * The shuffle index tables and source operand order differ between big-endian (_MIPSEB)
+ * and little-endian builds.
+ * NOTE(review): the little-endian first-stage tables step by 3 (0x00, 0x03, 0x06, ...),
+ * so this presumably de-interleaves 3-channel data into a / b / c; confirm against the
+ * vshf operand semantics. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \
+  _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8)
+
+/* Generates msa_ld3q_<suffix> for 16-bit elements (instantiated below for u16 and s16).
+ * The function loads three consecutive vectors (ptr, ptr + 8, ptr + 16) and fills *a, *b
+ * and *c, each with a two-stage vshf.h shuffle: the first stage combines v1 and v0 into a
+ * temporary, the second merges v2 with that temporary.
+ * Only the shuffle index tables differ between big-endian (_MIPSEB) and little-endian builds.
+ * NOTE(review): presumably de-interleaves 3-channel data into a / b / c; confirm against
+ * the vshf operand semantics. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \
+  _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \
+  *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \
+  *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \
+  v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \
+  *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16)
+
+/* Generates msa_ld3q_<suffix> for 32-bit elements (instantiated below for u32, s32 and f32).
+ * The function loads three consecutive vectors (ptr, ptr + 4, ptr + 8) and recombines them
+ * in two rounds of ilvr.w; ilvl.d of a vector with itself is used to supply that vector's
+ * upper 64-bit half as an ilvr.w operand. The results are written to *a, *b and *c.
+ * The same code is used for both endiannesses (no _MIPSEB branch).
+ * NOTE(review): presumably de-interleaves 3-channel data into a / b / c; confirm against
+ * the ilvr / ilvl lane ordering. */
+#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tpv v00 = msa_ld1q_##suffix(ptr); \
+  _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \
+  _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \
+  _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \
+  _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \
+  _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \
+  *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \
+  *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); \
+  *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32)
+MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32)
+
+/* Generates msa_ld3q_<suffix> for 64-bit elements using plain scalar copies.
+ * The six consecutive elements at ptr are distributed round-robin:
+ * ptr[0], ptr[3] -> lanes 0, 1 of *a; ptr[1], ptr[4] -> *b; ptr[2], ptr[5] -> *c. */
+#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \
+{ \
+  _Tp* const lanes_a = (_Tp*)a; \
+  _Tp* const lanes_b = (_Tp*)b; \
+  _Tp* const lanes_c = (_Tp*)c; \
+  lanes_a[0] = ptr[0]; \
+  lanes_b[0] = ptr[1]; \
+  lanes_c[0] = ptr[2]; \
+  lanes_a[1] = ptr[3]; \
+  lanes_b[1] = ptr[4]; \
+  lanes_c[1] = ptr[5]; \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64)
+
+/* Generates msa_st3q_<suffix> for 8-bit elements (instantiated below for u8 and s8).
+ * Each of the three output vectors is built with a two-stage vshf.b shuffle: the first
+ * stage gathers the needed bytes of b and a into a temporary, the second merges c with that
+ * temporary. The results are stored to ptr, ptr + 16 and ptr + 32, in that order.
+ * Only the shuffle index tables differ between big-endian (_MIPSEB) and little-endian builds.
+ * NOTE(review): presumably writes a / b / c as 3-way interleaved data; confirm against the
+ * vshf operand semantics. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8)
+MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8)
+
+/* Generates msa_st3q_<suffix> for 16-bit elements (instantiated below for u16 and s16).
+ * Each of the three output vectors is built with a two-stage vshf.h shuffle: the first
+ * stage gathers the needed halfwords of b and a into a temporary, the second merges c with
+ * that temporary. The results are stored to ptr, ptr + 8 and ptr + 16, in that order.
+ * Only the shuffle index tables differ between big-endian (_MIPSEB) and little-endian builds.
+ * NOTE(review): presumably writes a / b / c as 3-way interleaved data; confirm against the
+ * vshf operand semantics. */
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16)
+MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16)
+
+#ifdef _MIPSEB
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \
+}
+#else
+#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) /* generates msa_st3q_<suffix>: stores the three 4-lane vectors a, b, c as 12 elements starting at ptr */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ /* each stored vector is built in two vshf_w steps: v0 picks lanes from a/b, then v1 merges lanes of c into v0 */ \
+  _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \
+  _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr, (_Tpv)v1); /* presumably a0 b0 c0 a1, reading the v2i64 mask words in little-endian lane order -- TODO confirm against the #if guarding this branch */ \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)v1); /* presumably b1 c1 a2 b2 under the same assumption */ \
+  v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \
+  v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \
+  msa_st1q_##suffix(ptr + 8, (_Tpv)v1); /* presumably c2 a3 b3 c3 under the same assumption */ \
+}
+#endif
+
+MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32)  /* defines msa_st3q_u32 */
+MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32)  /* defines msa_st3q_s32 */
+MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32)  /* defines msa_st3q_f32; shuffles run on the v4i32 view of the float lanes */
+
+#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) /* generates msa_st3q_<suffix>: scalar 3-way interleaved store of 2-lane vectors, six elements at ptr */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \
+{ \
+  ptr[0] = a[0]; ptr[1] = b[0]; ptr[2] = c[0]; /* lane 0 of a, b, c */ \
+  ptr[3] = a[1]; ptr[4] = b[1]; ptr[5] = c[1]; /* lane 1 of a, b, c */ \
+}
+
+MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64)
+MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64)
+MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64)
+
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) /* generates msa_ld4q_<suffix> and msa_st4q_<suffix>: 4-way de-interleaving load / interleaving store over 4*nlanes elements */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); /* four consecutive vectors cover elements 0 .. 4*nlanes-1 */ \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \
+  _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \
+  _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); /* pass 1: even-indexed elements of v0 then v1 */ \
+  _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); /* pass 1: even-indexed elements of v2 then v3 */ \
+  _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); /* pass 1: odd-indexed elements of v0 then v1 */ \
+  _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); /* pass 1: odd-indexed elements of v2 then v3 */ \
+  *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); /* pass 2: source elements 0, 4, 8, ... */ \
+  *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); /* source elements 1, 5, 9, ... */ \
+  *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); /* source elements 2, 6, 10, ... */ \
+  *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); /* source elements 3, 7, 11, ... */ \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ \
+  _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); /* a/c interleaved, low-index half of the lanes */ \
+  _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); /* b/d interleaved, low-index half of the lanes */ \
+  _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); /* a/c interleaved, high-index half of the lanes */ \
+  _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); /* b/d interleaved, high-index half of the lanes */ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); /* second interleave yields memory order a0 b0 c0 d0 a1 b1 c1 d1 ... */ \
+  msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \
+  msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \
+  msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16)  /* byte lanes: 16 per vector */
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8)  /* halfword lanes: 8 per vector */
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4)  /* word lanes: 4 per vector */
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4)  /* float lanes are permuted through their v4i32 view */
+
+#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) /* 2-lane (64-bit) variant of the 4-way load/store pair: eight elements, a single interleave pass suffices */ \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \
+{ \
+  _Tpv v0 = msa_ld1q_##suffix(ptr); /* elements 0,1 */ \
+  _Tpv v1 = msa_ld1q_##suffix(ptr + 2); /* elements 2,3 */ \
+  _Tpv v2 = msa_ld1q_##suffix(ptr + 4); /* elements 4,5 */ \
+  _Tpv v3 = msa_ld1q_##suffix(ptr + 6); /* elements 6,7 */ \
+  *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); /* elements 0 and 4 */ \
+  *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); /* elements 1 and 5 */ \
+  *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); /* elements 2 and 6 */ \
+  *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); /* elements 3 and 7 */ \
+} \
+__extension__ extern __inline void \
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
+msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \
+{ \
+  msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); /* a0 b0 */ \
+  msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); /* c0 d0 */ \
+  msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); /* a1 b1 */ \
+  msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); /* c1 d1 */ \
+}
+
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64)
+MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64)  /* double lanes are permuted through their v2i64 view */
+
+__extension__ extern __inline v8i16
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+msa_qdmulhq_n_s16(v8i16 a, int16_t b)  // combines every 16-bit lane of a with the scalar b; name presumably mirrors NEON vqdmulhq_n_s16 -- confirm
+{
+  v8i16 a_lo, a_hi;
+  ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi);  // pairs a with an all-zero vector to fill a_lo / a_hi; macro is defined elsewhere, presumably a right/left halfword interleave -- confirm
+  return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1),  // per half: msa_paddlq_s16 (presumably widening to 32-bit), multiply by broadcast b, shift left by 1
+                       msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16);  // 16 is the last argument of msa_packr_s32; presumably keeps the upper halfword of each product -- confirm. NOTE(review): no saturation step appears in this body
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /*__mips_msa*/
+#endif /* OPENCV_CORE_MSA_MACROS_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/simd_utils.impl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/simd_utils.impl.hpp
new file mode 100644
index 0000000..fff8f94
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/hal/simd_utils.impl.hpp
@@ -0,0 +1,146 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
+#ifdef OPENCV_HAL_INTRIN_HPP  // defined in intrin.hpp
+
+
+#if CV_SIMD128 || CV_SIMD128_CPP
+
+template<typename _T> struct Type2Vec128_Traits;  // primary template is only declared; each supported element type gets an explicit specialization below
+#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) /* specializes the trait so that Type2Vec128_Traits<type_>::vec_type names vec_type_ */ \
+    template<> struct Type2Vec128_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4);  // spelled `unsigned` here but `uint` in the v_setall specialization below; presumably the same type via a typedef -- confirm
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2);
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2);
+#if CV_SIMD128_64F  // the double mapping exists only under CV_SIMD128_64F
+CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a);  // generic form has no body here; each specialization below forwards to the type-suffixed v_setall_* function
+
+template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const  uchar& a) { return v_setall_u8(a); }
+template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const  schar& a) { return v_setall_s8(a); }
+template<> inline Type2Vec128_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
+template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const  short& a) { return v_setall_s16(a); }
+template<> inline Type2Vec128_Traits<  uint>::vec_type v_setall<  uint>(const   uint& a) { return v_setall_u32(a); }
+template<> inline Type2Vec128_Traits<   int>::vec_type v_setall<   int>(const    int& a) { return v_setall_s32(a); }
+template<> inline Type2Vec128_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
+template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const  int64& a) { return v_setall_s64(a); }
+template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const  float& a) { return v_setall_f32(a); }
+#if CV_SIMD128_64F
+template<> inline Type2Vec128_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
+#endif
+
+#endif  // SIMD128
+
+
+#if CV_SIMD256
+
+template<typename _T> struct Type2Vec256_Traits;  // primary template is only declared; each supported element type gets an explicit specialization below
+#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) /* specializes the trait so that Type2Vec256_Traits<type_>::vec_type names vec_type_ */ \
+    template<> struct Type2Vec256_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8);  // spelled `unsigned` here but `uint` in the v256_setall specialization below; presumably the same type via a typedef -- confirm
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4);
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4);
+#if CV_SIMD256_64F  // the double mapping exists only under CV_SIMD256_64F
+CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a);  // generic form has no body here; each specialization below forwards to the type-suffixed v256_setall_* function
+
+template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const  uchar& a) { return v256_setall_u8(a); }
+template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const  schar& a) { return v256_setall_s8(a); }
+template<> inline Type2Vec256_Traits<ushort>::vec_type v256_setall<ushort>(const ushort& a) { return v256_setall_u16(a); }
+template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const  short& a) { return v256_setall_s16(a); }
+template<> inline Type2Vec256_Traits<  uint>::vec_type v256_setall<  uint>(const   uint& a) { return v256_setall_u32(a); }
+template<> inline Type2Vec256_Traits<   int>::vec_type v256_setall<   int>(const    int& a) { return v256_setall_s32(a); }
+template<> inline Type2Vec256_Traits<uint64>::vec_type v256_setall<uint64>(const uint64& a) { return v256_setall_u64(a); }
+template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const  int64& a) { return v256_setall_s64(a); }
+template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const  float& a) { return v256_setall_f32(a); }
+#if CV_SIMD256_64F
+template<> inline Type2Vec256_Traits<double>::vec_type v256_setall<double>(const double& a) { return v256_setall_f64(a); }
+#endif
+
+#endif  // SIMD256
+
+
+#if CV_SIMD512
+
+template<typename _T> struct Type2Vec512_Traits;  // primary template is only declared; each supported element type gets an explicit specialization below
+#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) /* specializes the trait so that Type2Vec512_Traits<type_>::vec_type names vec_type_ */ \
+    template<> struct Type2Vec512_Traits<type_> \
+    { \
+        typedef vec_type_ vec_type; \
+    }
+
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16);  // spelled `unsigned` here but `uint` in the v512_setall specialization below; presumably the same type via a typedef -- confirm
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8);
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8);
+#if CV_SIMD512_64F  // the double mapping exists only under CV_SIMD512_64F
+CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8);
+#endif
+
+template<typename _T> static inline
+typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a);  // generic form has no body here; each specialization below forwards to the type-suffixed v512_setall_* function
+
+template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const  uchar& a) { return v512_setall_u8(a); }
+template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const  schar& a) { return v512_setall_s8(a); }
+template<> inline Type2Vec512_Traits<ushort>::vec_type v512_setall<ushort>(const ushort& a) { return v512_setall_u16(a); }
+template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const  short& a) { return v512_setall_s16(a); }
+template<> inline Type2Vec512_Traits<  uint>::vec_type v512_setall<  uint>(const   uint& a) { return v512_setall_u32(a); }
+template<> inline Type2Vec512_Traits<   int>::vec_type v512_setall<   int>(const    int& a) { return v512_setall_s32(a); }
+template<> inline Type2Vec512_Traits<uint64>::vec_type v512_setall<uint64>(const uint64& a) { return v512_setall_u64(a); }
+template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const  int64& a) { return v512_setall_s64(a); }
+template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const  float& a) { return v512_setall_f32(a); }
+#if CV_SIMD512_64F
+template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const double& a) { return v512_setall_f64(a); }
+#endif
+
+#endif  // SIMD512
+
+
+#if CV_SIMD_WIDTH == 16  // vx_setall: width-agnostic front end that forwards to the setall helper paired with the configured CV_SIMD_WIDTH
+template<typename _T> static inline
+typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }  // width 16 -> 128-bit traits and v_setall
+#elif CV_SIMD_WIDTH == 32
+template<typename _T> static inline
+typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); }  // width 32 -> 256-bit traits and v256_setall
+#elif CV_SIMD_WIDTH == 64
+template<typename _T> static inline
+typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); }  // width 64 -> 512-bit traits and v512_setall
+#else  // any other CV_SIMD_WIDTH value stops the build
+#error "Build configuration error, unsupported CV_SIMD_WIDTH"
+#endif
+
+
+#endif  // OPENCV_HAL_INTRIN_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.hpp
new file mode 100644
index 0000000..2aba15c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.hpp
@@ -0,0 +1,3775 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MAT_HPP
+#define OPENCV_CORE_MAT_HPP
+
+#ifndef __cplusplus
+#  error mat.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/matx.hpp"
+#include "opencv2/core/types.hpp"
+
+#include "opencv2/core/bufferpool.hpp"
+
+#include <type_traits>
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+enum AccessFlag { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,  // single-bit flags: bit 24 (read) and bit 25 (write)
+    ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };  // ACCESS_RW equals ACCESS_READ | ACCESS_WRITE and doubles as the mask; ACCESS_FAST is bit 26
+CV_ENUM_FLAGS(AccessFlag)
+__CV_ENUM_FLAGS_BITWISE_AND(AccessFlag, int, AccessFlag)
+
+CV__DEBUG_NS_BEGIN
+
+class CV_EXPORTS _OutputArray;
+
+//////////////////////// Input/Output Array Arguments /////////////////////////////////
+
+/** @brief This is the proxy class for passing read-only input arrays into OpenCV functions.
+
+It is defined as:
+@code
+    typedef const _InputArray& InputArray;
+@endcode
+where _InputArray is a class that can be constructed from `Mat`, `Mat_<T>`, `Matx<T, m, n>`,
+`std::vector<T>`, `std::vector<std::vector<T> >`, `std::vector<Mat>`, `std::vector<Mat_<T> >`,
+`UMat`, `std::vector<UMat>` or `double`. It can also be constructed from a matrix expression.
+
+Since this is mostly an implementation-level class, and its interface may change in future versions, we
+do not describe it in detail. There are a few key things, though, that should be kept in mind:
+
+-   When you see in the reference manual or in OpenCV source code a function that takes
+    InputArray, it means that you can actually pass `Mat`, `Matx`, `vector<T>` etc. (see above the
+    complete list).
+-   Optional input arguments: If some of the input arrays may be empty, pass cv::noArray() (or
+    simply cv::Mat() as you probably did before).
+-   The class is designed solely for passing parameters. That is, normally you *should not*
+    declare class members, local and global variables of this type.
+-   If you want to design your own function or a class method that can operate on arrays of
+    multiple types, you can use InputArray (or OutputArray) for the respective parameters. Inside
+    a function you should use _InputArray::getMat() method to construct a matrix header for the
+    array (without copying data). _InputArray::kind() can be used to distinguish Mat from
+    `vector<>` etc., but normally it is not needed.
+
+Here is how you can use a function that takes InputArray :
+@code
+    std::vector<Point2f> vec;
+    // points on a circle
+    for( int i = 0; i < 30; i++ )
+        vec.push_back(Point2f((float)(100 + 30*cos(i*CV_PI*2/5)),
+                              (float)(100 - 30*sin(i*CV_PI*2/5))));
+    cv::transform(vec, vec, cv::Matx23f(0.707, -0.707, 10, 0.707, 0.707, 20));
+@endcode
+That is, we form an STL vector containing points, and apply in-place affine transformation to the
+vector using the 2x3 matrix created inline as `Matx<float, 2, 3>` instance.
+
+Here is how such a function can be implemented (for simplicity, we implement a very specific case of
+it, according to the assertion statement inside) :
+@code
+    void myAffineTransform(InputArray _src, OutputArray _dst, InputArray _m)
+    {
+        // get Mat headers for input arrays. This is O(1) operation,
+        // unless _src and/or _m are matrix expressions.
+        Mat src = _src.getMat(), m = _m.getMat();
+        CV_Assert( src.type() == CV_32FC2 && m.type() == CV_32F && m.size() == Size(3, 2) );
+
+        // [re]create the output array so that it has the proper size and type.
+        // In case of Mat it calls Mat::create, in case of STL vector it calls vector::resize.
+        _dst.create(src.size(), src.type());
+        Mat dst = _dst.getMat();
+
+        for( int i = 0; i < src.rows; i++ )
+            for( int j = 0; j < src.cols; j++ )
+            {
+                Point2f pt = src.at<Point2f>(i, j);
+                dst.at<Point2f>(i, j) = Point2f(m.at<float>(0, 0)*pt.x +
+                                                m.at<float>(0, 1)*pt.y +
+                                                m.at<float>(0, 2),
+                                                m.at<float>(1, 0)*pt.x +
+                                                m.at<float>(1, 1)*pt.y +
+                                                m.at<float>(1, 2));
+            }
+    }
+@endcode
+There is another related type, InputArrayOfArrays, which is currently defined as a synonym for
+InputArray:
+@code
+    typedef InputArray InputArrayOfArrays;
+@endcode
+It denotes function arguments that are either vectors of vectors or vectors of matrices. A separate
+synonym is needed to generate Python/Java etc. wrappers properly. At the function implementation
+level their use is similar, but _InputArray::getMat(idx) should be used to get header for the
+idx-th component of the outer vector and _InputArray::size().area() should be used to find the
+number of components (vectors/matrices) of the outer vector.
+
+In general, type support is limited to cv::Mat types. Other types are forbidden.
+But in some cases we need to support passing of custom non-general Mat types, like arrays of cv::KeyPoint, cv::DMatch, etc.
+This data is not intended to be interpreted as image data, or processed like a regular cv::Mat.
+To pass such custom type use rawIn() / rawOut() / rawInOut() wrappers.
+Custom type is wrapped as Mat-compatible `CV_8UC<N>` values (N = sizeof(T), N <= CV_CN_MAX).
+ */
+class CV_EXPORTS _InputArray
+{
+public:
+    enum KindFlag {
+        KIND_SHIFT = 16,  //!< every kind code below is stored shifted left by this many bits
+        FIXED_TYPE = 0x8000 << KIND_SHIFT,  //!< bit 31
+        FIXED_SIZE = 0x4000 << KIND_SHIFT,  //!< bit 30
+        KIND_MASK = 31 << KIND_SHIFT,  //!< five bits (16..20) that hold the kind code
+
+        NONE              = 0 << KIND_SHIFT,
+        MAT               = 1 << KIND_SHIFT,
+        MATX              = 2 << KIND_SHIFT,
+        STD_VECTOR        = 3 << KIND_SHIFT,
+        STD_VECTOR_VECTOR = 4 << KIND_SHIFT,
+        STD_VECTOR_MAT    = 5 << KIND_SHIFT,
+#if OPENCV_ABI_COMPATIBILITY < 500
+        EXPR              = 6 << KIND_SHIFT,  //!< removed: https://github.com/opencv/opencv/pull/17046
+#endif
+        OPENGL_BUFFER     = 7 << KIND_SHIFT,
+        CUDA_HOST_MEM     = 8 << KIND_SHIFT,
+        CUDA_GPU_MAT      = 9 << KIND_SHIFT,
+        UMAT              =10 << KIND_SHIFT,
+        STD_VECTOR_UMAT   =11 << KIND_SHIFT,
+        STD_BOOL_VECTOR   =12 << KIND_SHIFT,
+        STD_VECTOR_CUDA_GPU_MAT = 13 << KIND_SHIFT,
+#if OPENCV_ABI_COMPATIBILITY < 500
+        STD_ARRAY         =14 << KIND_SHIFT,  //!< removed: https://github.com/opencv/opencv/issues/18897
+#endif
+        STD_ARRAY_MAT     =15 << KIND_SHIFT
+    };
+
+    _InputArray();
+    _InputArray(int _flags, void* _obj);  // raw form: caller supplies the flags word and the object pointer directly
+    _InputArray(const Mat& m);
+    _InputArray(const MatExpr& expr);
+    _InputArray(const std::vector<Mat>& vec);
+    template<typename _Tp> _InputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _InputArray(const std::vector<_Tp>& vec);
+    _InputArray(const std::vector<bool>& vec);
+    template<typename _Tp> _InputArray(const std::vector<std::vector<_Tp> >& vec);
+    _InputArray(const std::vector<std::vector<bool> >&) = delete;  // not supported
+    template<typename _Tp> _InputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputArray(const Matx<_Tp, m, n>& matx);
+    _InputArray(const double& val);
+    _InputArray(const cuda::GpuMat& d_mat);
+    _InputArray(const std::vector<cuda::GpuMat>& d_mat_array);
+    _InputArray(const ogl::Buffer& buf);
+    _InputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
+    _InputArray(const UMat& um);
+    _InputArray(const std::vector<UMat>& umv);
+
+    template<typename _Tp, std::size_t _Nm> _InputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _InputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _InputArray rawIn(const std::vector<_Tp>& vec);  // static factory for custom element types; see the rawIn()/rawOut()/rawInOut() note in the class documentation
+    template<typename _Tp, std::size_t _Nm> static _InputArray rawIn(const std::array<_Tp, _Nm>& arr);
+
+    Mat getMat(int idx=-1) const;  // getMat(idx) yields the header of the idx-th component for array-of-arrays inputs (per the class documentation)
+    Mat getMat_(int idx=-1) const;
+    UMat getUMat(int idx=-1) const;
+    void getMatVector(std::vector<Mat>& mv) const;
+    void getUMatVector(std::vector<UMat>& umv) const;
+    void getGpuMatVector(std::vector<cuda::GpuMat>& gpumv) const;
+    cuda::GpuMat getGpuMat() const;
+    ogl::Buffer getOGlBuffer() const;
+
+    int getFlags() const;  // presumably returns the protected `flags` member -- definition is not in this part of the header
+    void* getObj() const;  // presumably returns the protected `obj` member -- confirm
+    Size getSz() const;  // presumably returns the protected `sz` member -- confirm
+
+    _InputArray::KindFlag kind() const;
+    int dims(int i=-1) const;
+    int cols(int i=-1) const;
+    int rows(int i=-1) const;
+    Size size(int i=-1) const;
+    int sizend(int* sz, int i=-1) const;
+    bool sameSize(const _InputArray& arr) const;
+    size_t total(int i=-1) const;
+    int type(int i=-1) const;
+    int depth(int i=-1) const;
+    int channels(int i=-1) const;
+    bool isContinuous(int i=-1) const;
+    bool isSubmatrix(int i=-1) const;
+    bool empty() const;
+    void copyTo(const _OutputArray& arr) const;
+    void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
+    size_t offset(int i=-1) const;
+    size_t step(int i=-1) const;
+    bool isMat() const;
+    bool isUMat() const;
+    bool isMatVector() const;
+    bool isUMatVector() const;
+    bool isMatx() const;
+    bool isVector() const;
+    bool isGpuMat() const;
+    bool isGpuMatVector() const;
+    ~_InputArray();
+
+protected:
+    int flags;  // KindFlag bits; presumably combined with type/access bits by the constructors -- confirm in the inline implementation
+    void* obj;  // untyped pointer; presumably to the wrapped container, whose real type is implied by the kind in `flags` -- confirm
+    Size sz;
+
+    void init(int _flags, const void* _obj);
+    void init(int _flags, const void* _obj, Size _sz);
+};
+CV_ENUM_FLAGS(_InputArray::KindFlag)
+__CV_ENUM_FLAGS_BITWISE_AND(_InputArray::KindFlag, int, _InputArray::KindFlag)
+
+/** @brief This type is very similar to InputArray except that it is used for input/output and output function
+parameters.
+
+Just like with InputArray, OpenCV users should not care about OutputArray, they just pass `Mat`,
+`vector<T>` etc. to the functions. The same limitation as for `InputArray`: *Do not explicitly
+create OutputArray instances* applies here too.
+
+If you want to make your function polymorphic (i.e. accept different arrays as output parameters),
+it is also not very difficult. Take the sample above as the reference. Note that
+_OutputArray::create() needs to be called before _OutputArray::getMat(). This way you guarantee
+that the output array is properly allocated.
+
+Optional output parameters. If you do not need a certain output array to be computed and returned to
+you, pass cv::noArray(), just like you would in the case of an optional input array. At the
+implementation level, use _OutputArray::needed() to check if a certain output array needs to be
+computed or not.
+
+There are several synonyms for OutputArray that are used to assist automatic Python/Java/... wrapper
+generators:
+@code
+    typedef OutputArray OutputArrayOfArrays;
+    typedef OutputArray InputOutputArray;
+    typedef OutputArray InputOutputArrayOfArrays;
+@endcode
+ */
+class CV_EXPORTS _OutputArray : public _InputArray
+{
+public:
+    enum DepthMask
+    {
+        DEPTH_MASK_8U = 1 << CV_8U,  //!< one bit per depth: the bit index is the matching CV_<depth> constant
+        DEPTH_MASK_8S = 1 << CV_8S,
+        DEPTH_MASK_16U = 1 << CV_16U,
+        DEPTH_MASK_16S = 1 << CV_16S,
+        DEPTH_MASK_32S = 1 << CV_32S,
+        DEPTH_MASK_32F = 1 << CV_32F,
+        DEPTH_MASK_64F = 1 << CV_64F,
+        DEPTH_MASK_16F = 1 << CV_16F,
+        DEPTH_MASK_ALL = (DEPTH_MASK_64F<<1)-1,  //!< every bit from 0 up to and including the CV_64F bit
+        DEPTH_MASK_ALL_BUT_8S = DEPTH_MASK_ALL & ~DEPTH_MASK_8S,  //!< DEPTH_MASK_ALL with the CV_8S bit cleared
+        DEPTH_MASK_ALL_16F = (DEPTH_MASK_16F<<1)-1,  //!< every bit from 0 up to and including the CV_16F bit
+        DEPTH_MASK_FLT = DEPTH_MASK_32F + DEPTH_MASK_64F  //!< the 32F and 64F bits together
+    };
+
+    _OutputArray();
+    _OutputArray(int _flags, void* _obj);
+    _OutputArray(Mat& m);
+    _OutputArray(std::vector<Mat>& vec);
+    _OutputArray(cuda::GpuMat& d_mat);
+    _OutputArray(std::vector<cuda::GpuMat>& d_mat);
+    _OutputArray(ogl::Buffer& buf);
+    _OutputArray(cuda::HostMem& cuda_mem);
+    template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
+    _OutputArray(std::vector<bool>& vec) = delete;  // not supported
+    template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
+    _OutputArray(std::vector<std::vector<bool> >&) = delete;  // not supported
+    template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(Mat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(_Tp* vec, int n);
+    template<typename _Tp, int m, int n> _OutputArray(Matx<_Tp, m, n>& matx);
+    _OutputArray(UMat& m);
+    _OutputArray(std::vector<UMat>& vec);
+
+    _OutputArray(const Mat& m);  // from here on: const-reference counterparts of the constructors above
+    _OutputArray(const std::vector<Mat>& vec);
+    _OutputArray(const cuda::GpuMat& d_mat);
+    _OutputArray(const std::vector<cuda::GpuMat>& d_mat);
+    _OutputArray(const ogl::Buffer& buf);
+    _OutputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
+    template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _OutputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _OutputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _OutputArray(const Matx<_Tp, m, n>& matx);
+    _OutputArray(const UMat& m);
+    _OutputArray(const std::vector<UMat>& vec);
+
+    template<typename _Tp, std::size_t _Nm> _OutputArray(std::array<_Tp, _Nm>& arr);
+    template<typename _Tp, std::size_t _Nm> _OutputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _OutputArray(std::array<Mat, _Nm>& arr);
+    template<std::size_t _Nm> _OutputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _OutputArray rawOut(std::vector<_Tp>& vec);  // static factory for custom element types; see the rawIn()/rawOut()/rawInOut() note in the InputArray documentation
+    template<typename _Tp, std::size_t _Nm> static _OutputArray rawOut(std::array<_Tp, _Nm>& arr);
+
+    bool fixedSize() const;
+    bool fixedType() const;
+    bool needed() const;  // implementation-side check for whether this (possibly optional) output has to be computed, per the class documentation
+    Mat& getMatRef(int i=-1) const;
+    UMat& getUMatRef(int i=-1) const;
+    cuda::GpuMat& getGpuMatRef() const;
+    std::vector<cuda::GpuMat>& getGpuMatVecRef() const;
+    ogl::Buffer& getOGlBufferRef() const;
+    cuda::HostMem& getHostMemRef() const;
+    void create(Size sz, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;  // create() must be called before getMat(), per the class documentation
+    void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
+    void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, _OutputArray::DepthMask fixedDepthMask=static_cast<_OutputArray::DepthMask>(0)) const;
+    void createSameSize(const _InputArray& arr, int mtype) const;
+    void release() const;
+    void clear() const;
+    void setTo(const _InputArray& value, const _InputArray & mask = _InputArray()) const;  // mask defaults to an empty _InputArray
+
+    void assign(const UMat& u) const;
+    void assign(const Mat& m) const;
+
+    void assign(const std::vector<UMat>& v) const;
+    void assign(const std::vector<Mat>& v) const;
+
+    void move(UMat& u) const;
+    void move(Mat& m) const;
+};
+
+
+class CV_EXPORTS _InputOutputArray : public _OutputArray  // proxy for parameters that are both read and written; adds constructors and rawInOut factories only
+{
+public:
+    _InputOutputArray();
+    _InputOutputArray(int _flags, void* _obj);  // raw form: caller supplies the flags word and the object pointer directly
+    _InputOutputArray(Mat& m);
+    _InputOutputArray(std::vector<Mat>& vec);
+    _InputOutputArray(cuda::GpuMat& d_mat);
+    _InputOutputArray(ogl::Buffer& buf);
+    _InputOutputArray(cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputOutputArray(cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(std::vector<_Tp>& vec);
+    _InputOutputArray(std::vector<bool>& vec) = delete;  // not supported
+    template<typename _Tp> _InputOutputArray(std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(Mat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(_Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputOutputArray(Matx<_Tp, m, n>& matx);
+    _InputOutputArray(UMat& m);
+    _InputOutputArray(std::vector<UMat>& vec);
+
+    _InputOutputArray(const Mat& m);  // from here on: const-reference counterparts of the constructors above
+    _InputOutputArray(const std::vector<Mat>& vec);
+    _InputOutputArray(const cuda::GpuMat& d_mat);
+    _InputOutputArray(const std::vector<cuda::GpuMat>& d_mat);
+    _InputOutputArray(const ogl::Buffer& buf);
+    _InputOutputArray(const cuda::HostMem& cuda_mem);
+    template<typename _Tp> _InputOutputArray(const cudev::GpuMat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(const std::vector<_Tp>& vec);
+    template<typename _Tp> _InputOutputArray(const std::vector<std::vector<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(const std::vector<Mat_<_Tp> >& vec);
+    template<typename _Tp> _InputOutputArray(const Mat_<_Tp>& m);
+    template<typename _Tp> _InputOutputArray(const _Tp* vec, int n);
+    template<typename _Tp, int m, int n> _InputOutputArray(const Matx<_Tp, m, n>& matx);
+    _InputOutputArray(const UMat& m);
+    _InputOutputArray(const std::vector<UMat>& vec);
+
+    template<typename _Tp, std::size_t _Nm> _InputOutputArray(std::array<_Tp, _Nm>& arr);
+    template<typename _Tp, std::size_t _Nm> _InputOutputArray(const std::array<_Tp, _Nm>& arr);
+    template<std::size_t _Nm> _InputOutputArray(std::array<Mat, _Nm>& arr);
+    template<std::size_t _Nm> _InputOutputArray(const std::array<Mat, _Nm>& arr);
+
+    template<typename _Tp> static _InputOutputArray rawInOut(std::vector<_Tp>& vec);  // static factory for custom element types held in a std::vector
+    template<typename _Tp, std::size_t _Nm> static _InputOutputArray rawInOut(std::array<_Tp, _Nm>& arr);  // static like the vector overload and like rawIn/rawOut, so it can be called as _InputOutputArray::rawInOut(arr) without an instance
+
+};
+
+/** Helper to wrap custom types as an input array. @see InputArray */
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v);
+/** Helper to wrap custom types as an output array. @see OutputArray */
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v);
+/** Helper to wrap custom types as an input/output array. @see InputOutputArray */
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v);
+
+CV__DEBUG_NS_END
+
+typedef const _InputArray& InputArray; // alias for a const reference to the _InputArray proxy class
+typedef InputArray InputArrayOfArrays; // exactly the same type as InputArray; only the name differs
+typedef const _OutputArray& OutputArray; // alias for a const reference to the _OutputArray proxy class
+typedef OutputArray OutputArrayOfArrays; // exactly the same type as OutputArray; only the name differs
+typedef const _InputOutputArray& InputOutputArray; // alias for a const reference to the _InputOutputArray proxy class
+typedef InputOutputArray InputOutputArrayOfArrays; // exactly the same type as InputOutputArray; only the name differs
+
+CV_EXPORTS InputOutputArray noArray(); // returns a const _InputOutputArray&; presumably an empty placeholder for optional array arguments — confirm against the implementation
+
+/////////////////////////////////// MatAllocator //////////////////////////////////////
+
+/** @brief  Usage flags for allocator
+
+ @warning  All flags except `USAGE_DEFAULT` are experimental.
+
+ @warning  For the OpenCL allocator, `USAGE_ALLOCATE_SHARED_MEMORY` depends on
+ OpenCV's optional, experimental integration with OpenCL SVM. To enable this
+ integration, build OpenCV using the `WITH_OPENCL_SVM=ON` CMake option and, at
+ runtime, call `cv::ocl::Context::getDefault().setUseSVM(true);` or similar
+ code. Note that SVM is incompatible with OpenCL 1.x.
+*/
+enum UMatUsageFlags
+{
+    USAGE_DEFAULT = 0, // no allocation-policy bit set
+
+    // buffer allocation policy is platform and usage specific
+    USAGE_ALLOCATE_HOST_MEMORY = 1 << 0, // bit 0 (value 1)
+    USAGE_ALLOCATE_DEVICE_MEMORY = 1 << 1, // bit 1 (value 2)
+    USAGE_ALLOCATE_SHARED_MEMORY = 1 << 2, // bit 2 (value 4). It is not equal to: USAGE_ALLOCATE_HOST_MEMORY | USAGE_ALLOCATE_DEVICE_MEMORY (which would be 3)
+
+    __UMAT_USAGE_FLAGS_32BIT = 0x7fffffff // Binary compatibility hint; presumably pins the enum to a 32-bit representation rather than acting as a usable flag — confirm
+};
+
+struct CV_EXPORTS UMatData; // forward declaration; the full definition appears later in this header
+
+/** @brief  Custom array allocator
+*/
+class CV_EXPORTS MatAllocator
+{
+public:
+    MatAllocator() {}
+    virtual ~MatAllocator() {} // virtual destructor, so deleting through a MatAllocator* reaches the derived destructor
+
+    // let's comment it off for now to detect and fix all the uses of allocator
+    //virtual void allocate(int dims, const int* sizes, int type, int*& refcount,
+    //                      uchar*& datastart, uchar*& data, size_t* step) = 0;
+    //virtual void deallocate(int* refcount, uchar* datastart, uchar* data) = 0;
+    virtual UMatData* allocate(int dims, const int* sizes, int type,
+                               void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const = 0; // pure virtual: every concrete allocator must implement it
+    virtual bool allocate(UMatData* data, AccessFlag accessflags, UMatUsageFlags usageFlags) const = 0; // pure virtual overload that takes an existing UMatData and reports a bool result
+    virtual void deallocate(UMatData* data) const = 0; // pure virtual
+    virtual void map(UMatData* data, AccessFlag accessflags) const; // map() through copy() are virtual but not pure, so base-class definitions are expected elsewhere
+    virtual void unmap(UMatData* data) const;
+    virtual void download(UMatData* data, void* dst, int dims, const size_t sz[],
+                          const size_t srcofs[], const size_t srcstep[],
+                          const size_t dststep[]) const; // takes a UMatData source and a raw void* destination
+    virtual void upload(UMatData* data, const void* src, int dims, const size_t sz[],
+                        const size_t dstofs[], const size_t dststep[],
+                        const size_t srcstep[]) const; // takes a raw const void* source and a UMatData destination
+    virtual void copy(UMatData* srcdata, UMatData* dstdata, int dims, const size_t sz[],
+                      const size_t srcofs[], const size_t srcstep[],
+                      const size_t dstofs[], const size_t dststep[], bool sync) const; // both endpoints are UMatData; meaning of `sync` is not visible here
+
+    // default implementation returns DummyBufferPoolController
+    virtual BufferPoolController* getBufferPoolController(const char* id = NULL) const;
+};
+
+
+//////////////////////////////// MatCommaInitializer //////////////////////////////////
+
+/** @brief  Comma-separated Matrix Initializer
+
+ The class instances are usually not created explicitly.
+ Instead, they are created by the "matrix << firstValue" operator.
+
+ The sample below initializes a 2x2 rotation matrix:
+
+ \code
+ double angle = 30, a = cos(angle*CV_PI/180), b = sin(angle*CV_PI/180);
+ Mat R = (Mat_<double>(2,2) << a, -b, b, a);
+ \endcode
+*/
+template<typename _Tp> class MatCommaInitializer_
+{
+public:
+    //! the constructor, created by "matrix << firstValue" operator, where matrix is cv::Mat
+    MatCommaInitializer_(Mat_<_Tp>* _m);
+    //! the operator that takes the next value and puts it into the matrix; returns *this-style reference so commas can be chained
+    template<typename T2> MatCommaInitializer_<_Tp>& operator , (T2 v);
+    //! another form of conversion operator
+    operator Mat_<_Tp>() const;
+protected:
+    MatIterator_<_Tp> it; // the only data member; presumably the current write position in the matrix — confirm in the inline implementation
+};
+
+
+/////////////////////////////////////// Mat ///////////////////////////////////////////
+
+// note that umatdata might be allocated together
+// with the matrix data, not as a separate object.
+// NOTE(review): this comment previously said the struct has no constructor or destructor and must be
+// initialized via init(); a constructor and destructor are declared below and no init() is — confirm.
+struct CV_EXPORTS UMatData
+{
+    enum MemoryFlag { COPY_ON_MAP=1, HOST_COPY_OBSOLETE=2, // single-bit values 1 and 2
+        DEVICE_COPY_OBSOLETE=4, TEMP_UMAT=8, TEMP_COPIED_UMAT=24, // 24 == 16|8, so TEMP_COPIED_UMAT includes the TEMP_UMAT bit
+        USER_ALLOCATED=32, DEVICE_MEM_MAPPED=64, // single-bit values 32 and 64
+        ASYNC_CLEANUP=128 // single-bit value 128
+    };
+    UMatData(const MatAllocator* allocator); // constructed from an allocator pointer
+    ~UMatData();
+
+    // provide atomic access to the structure
+    void lock();
+    void unlock();
+
+    bool hostCopyObsolete() const; // the const queries below share names with MemoryFlag values; presumably they test bits of `flags` — confirm
+    bool deviceCopyObsolete() const;
+    bool deviceMemMapped() const;
+    bool copyOnMap() const;
+    bool tempUMat() const;
+    bool tempCopiedUMat() const;
+    void markHostCopyObsolete(bool flag); // the mark*() setters take the desired state as a bool
+    void markDeviceCopyObsolete(bool flag);
+    void markDeviceMemMapped(bool flag);
+
+    const MatAllocator* prevAllocator; // two allocator pointers are kept: previous and current
+    const MatAllocator* currAllocator;
+    int urefcount; // two separate counters; presumably UMat-side vs Mat-side references — confirm
+    int refcount;
+    uchar* data; // byte pointers; the distinction between data and origdata is not visible here
+    uchar* origdata;
+    size_t size;
+
+    UMatData::MemoryFlag flags; // holds MemoryFlag values (see the enum above)
+    void* handle; // untyped pointers; their meaning is defined by the code that sets them
+    void* userdata;
+    int allocatorFlags_;
+    int mapcount;
+    UMatData* originalUMatData; // pointer to another UMatData instance
+    std::shared_ptr<void> allocatorContext; // type-erased shared ownership of some allocator-provided object
+};
+CV_ENUM_FLAGS(UMatData::MemoryFlag) // macro defined elsewhere; presumably generates bitwise operators for the enum — confirm
+
+
+struct CV_EXPORTS MatSize // wrapper around a single int* with size-style accessors
+{
+    explicit MatSize(int* _p) CV_NOEXCEPT; // explicit: no implicit conversion from int*
+    int dims() const CV_NOEXCEPT; // how the value is derived from p is not visible here
+    Size operator()() const; // call operator returning a Size object
+    const int& operator[](int i) const; // read-only access to entry i
+    int& operator[](int i); // writable access to entry i
+    operator const int*() const CV_NOEXCEPT;  // TODO OpenCV 4.0: drop this
+    bool operator == (const MatSize& sz) const CV_NOEXCEPT; // comparison against another MatSize
+    bool operator != (const MatSize& sz) const CV_NOEXCEPT;
+
+    int* p; // raw pointer; no destructor is declared, so MatSize itself never frees it
+};
+
+struct CV_EXPORTS MatStep // holds a size_t* plus a two-element inline buffer
+{
+    MatStep() CV_NOEXCEPT;
+    explicit MatStep(size_t s) CV_NOEXCEPT; // explicit: no implicit conversion from size_t on construction
+    const size_t& operator[](int i) const CV_NOEXCEPT; // read-only access to entry i
+    size_t& operator[](int i) CV_NOEXCEPT; // writable access to entry i
+    operator size_t() const; // implicit conversion to a single size_t value
+    MatStep& operator = (size_t s); // assignment from a single size_t value is public
+
+    size_t* p; // presumably points either at buf or at external storage — confirm in the implementation
+    size_t buf[2];
+protected:
+    MatStep& operator = (const MatStep&); // copy assignment is protected, so outside code cannot assign one MatStep to another
+};
+
+/** @example samples/cpp/cout_mat.cpp
+An example demonstrating the serial out capabilities of cv::Mat
+*/
+
+ /** @brief n-dimensional dense array class \anchor CVMat_Details
+
+The class Mat represents an n-dimensional dense numerical single-channel or multi-channel array. It
+can be used to store real or complex-valued vectors and matrices, grayscale or color images, voxel
+volumes, vector fields, point clouds, tensors, histograms (though, very high-dimensional histograms
+may be better stored in a SparseMat ). The data layout of the array `M` is defined by the array
+`M.step[]`, so that the address of element \f$(i_0,...,i_{M.dims-1})\f$, where \f$0\leq i_k<M.size[k]\f$, is
+computed as:
+\f[addr(M_{i_0,...,i_{M.dims-1}}) = M.data + M.step[0]*i_0 + M.step[1]*i_1 + ... + M.step[M.dims-1]*i_{M.dims-1}\f]
+In case of a 2-dimensional array, the above formula is reduced to:
+\f[addr(M_{i,j}) = M.data + M.step[0]*i + M.step[1]*j\f]
+Note that `M.step[i] >= M.step[i+1]` (in fact, `M.step[i] >= M.step[i+1]*M.size[i+1]` ). This means
+that 2-dimensional matrices are stored row-by-row, 3-dimensional matrices are stored plane-by-plane,
+and so on. M.step[M.dims-1] is minimal and always equal to the element size M.elemSize() .
+
+So, the data layout in Mat is compatible with the majority of dense array types from the standard
+toolkits and SDKs, such as Numpy (ndarray), Win32 (independent device bitmaps), and others,
+that is, with any array that uses *steps* (or *strides*) to compute the position of a pixel.
+Due to this compatibility, it is possible to make a Mat header for user-allocated data and process
+it in-place using OpenCV functions.
+
+There are many different ways to create a Mat object. The most popular options are listed below:
+
+- Use the create(nrows, ncols, type) method or the similar Mat(nrows, ncols, type[, fillValue])
+constructor. A new array of the specified size and type is allocated. type has the same meaning as
+in the cvCreateMat method. For example, CV_8UC1 means a 8-bit single-channel array, CV_32FC2
+means a 2-channel (complex) floating-point array, and so on.
+@code
+    // make a 7x7 complex matrix filled with 1+3j.
+    Mat M(7,7,CV_32FC2,Scalar(1,3));
+    // and now turn M to a 100x60 15-channel 8-bit matrix.
+    // The old content will be deallocated
+    M.create(100,60,CV_8UC(15));
+@endcode
+As noted in the introduction to this chapter, create() allocates a new array only when the shape
+or type of the current array differs from the specified ones.
+
+- Create a multi-dimensional array:
+@code
+    // create a 100x100x100 8-bit array
+    int sz[] = {100, 100, 100};
+    Mat bigCube(3, sz, CV_8U, Scalar::all(0));
+@endcode
+If you pass the number of dimensions =1 to the Mat constructor, the created array will still be
+2-dimensional, with the number of columns set to 1. So, Mat::dims is always \>= 2 (can also be 0
+when the array is empty).
+
+- Use a copy constructor or assignment operator where there can be an array or expression on the
+right side (see below). As noted in the introduction, the array assignment is an O(1) operation
+because it only copies the header and increases the reference counter. The Mat::clone() method can
+be used to get a full (deep) copy of the array when you need it.
+
+- Construct a header for a part of another array. It can be a single row, single column, several
+rows, several columns, rectangular region in the array (called a *minor* in algebra) or a
+diagonal. Such operations are also O(1) because the new header references the same data. You can
+actually modify a part of the array using this feature, for example:
+@code
+    // add the 5-th row, multiplied by 3 to the 3rd row
+    M.row(3) = M.row(3) + M.row(5)*3;
+    // now copy the 7-th column to the 1-st column
+    // M.col(1) = M.col(7); // this will not work
+    Mat M1 = M.col(1);
+    M.col(7).copyTo(M1);
+    // create a new 320x240 image
+    Mat img(Size(320,240),CV_8UC3);
+    // select a ROI
+    Mat roi(img, Rect(10,10,100,100));
+    // fill the ROI with (0,255,0) (which is green in RGB space);
+    // the original 320x240 image will be modified
+    roi = Scalar(0,255,0);
+@endcode
+Due to the additional datastart and dataend members, it is possible to compute a relative
+sub-array position in the main *container* array using locateROI():
+@code
+    Mat A = Mat::eye(10, 10, CV_32S);
+    // extracts A columns, 1 (inclusive) to 3 (exclusive).
+    Mat B = A(Range::all(), Range(1, 3));
+    // extracts B rows, 5 (inclusive) to 9 (exclusive).
+    // that is, C \~ A(Range(5, 9), Range(1, 3))
+    Mat C = B(Range(5, 9), Range::all());
+    Size size; Point ofs;
+    C.locateROI(size, ofs);
+    // size will be (width=10,height=10) and the ofs will be (x=1, y=5)
+@endcode
+As in case of whole matrices, if you need a deep copy, use the `clone()` method of the extracted
+sub-matrices.
+
+- Make a header for user-allocated data. It can be useful to do the following:
+    -# Process "foreign" data using OpenCV (for example, when you implement a DirectShow\* filter or
+    a processing module for gstreamer, and so on). For example:
+    @code
+        Mat process_video_frame(const unsigned char* pixels,
+                                int width, int height, int step)
+        {
+            // wrap input buffer
+            Mat img(height, width, CV_8UC3, (unsigned char*)pixels, step);
+
+            Mat result;
+            GaussianBlur(img, result, Size(7, 7), 1.5, 1.5);
+
+            return result;
+        }
+    @endcode
+    -# Quickly initialize small matrices and/or get a super-fast element access.
+    @code
+        double m[3][3] = {{a, b, c}, {d, e, f}, {g, h, i}};
+        Mat M = Mat(3, 3, CV_64F, m).inv();
+    @endcode
+    .
+
+- Use MATLAB-style array initializers, zeros(), ones(), eye(), for example:
+@code
+    // create a double-precision identity matrix and add it to M.
+    M += Mat::eye(M.rows, M.cols, CV_64F);
+@endcode
+
+- Use a comma-separated initializer:
+@code
+    // create a 3x3 double-precision identity matrix
+    Mat M = (Mat_<double>(3,3) << 1, 0, 0, 0, 1, 0, 0, 0, 1);
+@endcode
+With this approach, you first call a constructor of the Mat class with the proper parameters, and
+then you just put `<< operator` followed by comma-separated values that can be constants,
+variables, expressions, and so on. Also, note the extra parentheses required to avoid compilation
+errors.
+
+Once the array is created, it is automatically managed via a reference-counting mechanism. If the
+array header is built on top of user-allocated data, you should handle the data by yourself. The
+array data is deallocated when no one points to it. If you want to release the data pointed to by an
+array header before the array destructor is called, use Mat::release().
+
+The next important thing to learn about the array class is element access. This manual already
+described how to compute an address of each array element. Normally, you are not required to use the
+formula directly in the code. If you know the array element type (which can be retrieved using the
+method Mat::type() ), you can access the element \f$M_{ij}\f$ of a 2-dimensional array as:
+@code
+    M.at<double>(i,j) += 1.f;
+@endcode
+assuming that `M` is a double-precision floating-point array. There are several variants of the method
+at for a different number of dimensions.
+
+If you need to process a whole row of a 2D array, the most efficient way is to get the pointer to
+the row first, and then just use the plain C operator [] :
+@code
+    // compute sum of positive matrix elements
+    // (assuming that M is a double-precision matrix)
+    double sum=0;
+    for(int i = 0; i < M.rows; i++)
+    {
+        const double* Mi = M.ptr<double>(i);
+        for(int j = 0; j < M.cols; j++)
+            sum += std::max(Mi[j], 0.);
+    }
+@endcode
+Some operations, like the one above, do not actually depend on the array shape. They just process
+elements of an array one by one (or elements from multiple arrays that have the same coordinates,
+for example, array addition). Such operations are called *element-wise*. It makes sense to check
+whether all the input/output arrays are continuous, namely, have no gaps at the end of each row. If
+yes, process them as a long single row:
+@code
+    // compute the sum of positive matrix elements, optimized variant
+    double sum=0;
+    int cols = M.cols, rows = M.rows;
+    if(M.isContinuous())
+    {
+        cols *= rows;
+        rows = 1;
+    }
+    for(int i = 0; i < rows; i++)
+    {
+        const double* Mi = M.ptr<double>(i);
+        for(int j = 0; j < cols; j++)
+            sum += std::max(Mi[j], 0.);
+    }
+@endcode
+In case of the continuous matrix, the outer loop body is executed just once. So, the overhead is
+smaller, which is especially noticeable in case of small matrices.
+
+Finally, there are STL-style iterators that are smart enough to skip gaps between successive rows:
+@code
+    // compute sum of positive matrix elements, iterator-based variant
+    double sum=0;
+    MatConstIterator_<double> it = M.begin<double>(), it_end = M.end<double>();
+    for(; it != it_end; ++it)
+        sum += std::max(*it, 0.);
+@endcode
+The matrix iterators are random-access iterators, so they can be passed to any STL algorithm,
+including std::sort().
+
+@note Matrix Expressions and arithmetic see MatExpr
+*/
+class CV_EXPORTS Mat
+{
+public:
+    /**
+    These are various constructors that form a matrix. As noted in the AutomaticAllocation, often
+    the default constructor is enough, and the proper matrix will be allocated by an OpenCV function.
+    The constructed matrix can further be assigned to another matrix or matrix expression or can be
+    allocated with Mat::create . In the former case, the old content is de-referenced.
+     */
+    Mat() CV_NOEXCEPT;
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(int rows, int cols, int type);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+      */
+    Mat(Size size, int type);
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(int rows, int cols, int type, const Scalar& s);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+      */
+    Mat(Size size, int type, const Scalar& s);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(int ndims, const int* sizes, int type);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    */
+    Mat(const std::vector<int>& sizes, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(int ndims, const int* sizes, int type, const Scalar& s);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param s An optional value to initialize each matrix element with. To set all the matrix elements to
+    the particular value after the construction, use the assignment operator
+    Mat::operator=(const Scalar& value) .
+    */
+    Mat(const std::vector<int>& sizes, int type, const Scalar& s);
+
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    */
+    Mat(const Mat& m);
+
+    /** @overload
+    @param rows Number of rows in a 2D array.
+    @param cols Number of columns in a 2D array.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Number of bytes each matrix row occupies. The value should include the padding bytes at
+    the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+    and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+    */
+    Mat(int rows, int cols, int type, void* data, size_t step=AUTO_STEP);
+
+    /** @overload
+    @param size 2D array size: Size(cols, rows) . In the Size() constructor, the number of rows and the
+    number of columns go in the reverse order.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param step Number of bytes each matrix row occupies. The value should include the padding bytes at
+    the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+    and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+    */
+    Mat(Size size, int type, void* data, size_t step=AUTO_STEP);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param steps Array of ndims-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    Mat(int ndims, const int* sizes, int type, void* data, const size_t* steps=0);
+
+    /** @overload
+    @param sizes Array of integers specifying an n-dimensional array shape.
+    @param type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+    CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+    @param data Pointer to the user data. Matrix constructors that take data and step parameters do not
+    allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+    data, which means that no data is copied. This operation is very efficient and can be used to
+    process external data using OpenCV functions. The external data is not automatically deallocated, so
+    you should take care of it.
+    @param steps Array of ndims-1 steps in case of a multi-dimensional array (the last step is always
+    set to the element size). If not specified, the matrix is assumed to be continuous.
+    */
+    Mat(const std::vector<int>& sizes, int type, void* data, const size_t* steps=0);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param rowRange Range of the m rows to take. As usual, the range start is inclusive and the range
+    end is exclusive. Use Range::all() to take all the rows.
+    @param colRange Range of the m columns to take. Use Range::all() to take all the columns.
+    */
+    Mat(const Mat& m, const Range& rowRange, const Range& colRange=Range::all());
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param roi Region of interest.
+    */
+    Mat(const Mat& m, const Rect& roi);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param ranges Array of selected ranges of m along each dimensionality.
+    */
+    Mat(const Mat& m, const Range* ranges);
+
+    /** @overload
+    @param m Array that (as a whole or partly) is assigned to the constructed matrix. No data is copied
+    by these constructors. Instead, the header pointing to m data or its sub-array is constructed and
+    associated with it. The reference counter, if any, is incremented. So, when you modify the matrix
+    formed using such a constructor, you also modify the corresponding elements of m . If you want to
+    have an independent copy of the sub-array, use Mat::clone() .
+    @param ranges Array of selected ranges of m along each dimensionality.
+    */
+    Mat(const Mat& m, const std::vector<Range>& ranges);
+
+    /** @overload
+    @param vec STL vector whose elements form the matrix. The matrix has a single column and the number
+    of rows equal to the number of vector elements. Type of the matrix matches the type of vector
+    elements. The constructor can handle arbitrary types, for which there is a properly declared
+    DataType . This means that the vector elements must be primitive numbers or uni-type numerical
+    tuples of numbers. Mixed-type structures are not supported. The corresponding constructor is
+    explicit. Since STL vectors are not automatically converted to Mat instances, you should write
+    Mat(vec) explicitly. Unless you copy the data into the matrix ( copyData=true ), no new elements
+    should be added to the vector, because doing so can reallocate the vector data and, thus,
+    leave the matrix data pointer invalid.
+    @param copyData Flag to specify whether the underlying data of the STL vector should be copied
+    to (true) or shared with (false) the newly constructed matrix. When the data is copied, the
+    allocated buffer is managed using Mat reference counting mechanism. While the data is shared,
+    the reference counter is NULL, and you should not deallocate the data until the matrix is
+    destructed.
+    */
+    template<typename _Tp> explicit Mat(const std::vector<_Tp>& vec, bool copyData=false);
+
+    /** @overload
+    */
+    template<typename _Tp, typename = typename std::enable_if<std::is_arithmetic<_Tp>::value>::type>
+    explicit Mat(const std::initializer_list<_Tp> list);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list);
+
+    /** @overload
+    */
+    template<typename _Tp, size_t _Nm> explicit Mat(const std::array<_Tp, _Nm>& arr, bool copyData=false);
+
+    /** @overload
+    */
+    template<typename _Tp, int n> explicit Mat(const Vec<_Tp, n>& vec, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp, int m, int n> explicit Mat(const Matx<_Tp, m, n>& mtx, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const Point_<_Tp>& pt, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
+
+    /** @overload
+    */
+    template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);
+
+    //! download data from GpuMat
+    explicit Mat(const cuda::GpuMat& m);
+
+    //! destructor - calls release()
+    ~Mat();
+
+    /** @brief assignment operators
+
+    These are available assignment operators. Since they all are very different, make sure to read the
+    operator parameters description.
+    @param m Assigned, right-hand-side matrix. Matrix assignment is an O(1) operation. This means that
+    no data is copied but the data is shared and the reference counter, if any, is incremented. Before
+    assigning new data, the old data is de-referenced via Mat::release .
+     */
+    Mat& operator = (const Mat& m);
+
+    /** @overload
+    @param expr Assigned matrix expression object. As opposed to the first form of the assignment
+    operation, the second form can reuse an already allocated matrix if it has the right size and type
+    to fit the matrix expression result. It is automatically handled by the real function that the
+    matrix expression is expanded to. For example, C=A+B is expanded to add(A, B, C), and add takes
+    care of automatic C reallocation.
+    */
+    Mat& operator = (const MatExpr& expr);
+
+    //! retrieve UMat from Mat
+    UMat getUMat(AccessFlag accessFlags, UMatUsageFlags usageFlags = USAGE_DEFAULT) const;
+
+    /** @brief Creates a matrix header for the specified matrix row.
+
+    The method makes a new header for the specified matrix row and returns it. This is an O(1)
+    operation, regardless of the matrix size. The underlying data of the new matrix is shared with the
+    original matrix. Here is the example of one of the classical basic matrix processing operations,
+    axpy, used by LU and many other algorithms:
+    @code
+        inline void matrix_axpy(Mat& A, int i, int j, double alpha)
+        {
+            A.row(i) += A.row(j)*alpha;
+        }
+    @endcode
+    @note In the current implementation, the following code does not work as expected:
+    @code
+        Mat A;
+        ...
+        A.row(i) = A.row(j); // will not work
+    @endcode
+    This happens because A.row(i) forms a temporary header that is further assigned to another header.
+    Remember that each of these operations is O(1), that is, no data is copied. Thus, the above
+    assignment does not do what you may have expected, namely copy the j-th row to the i-th row. To
+    achieve that, you should either turn this simple assignment into an expression or use the
+    Mat::copyTo method:
+    @code
+        Mat A;
+        ...
+        // works, but looks a bit obscure.
+        A.row(i) = A.row(j) + 0;
+        // this is a bit longer, but the recommended method.
+        A.row(j).copyTo(A.row(i));
+    @endcode
+    @param y A 0-based row index.
+     */
+    Mat row(int y) const;
+
+    /** @brief Creates a matrix header for the specified matrix column.
+
+    The method makes a new header for the specified matrix column and returns it. This is an O(1)
+    operation, regardless of the matrix size. The underlying data of the new matrix is shared with the
+    original matrix. See also the Mat::row description.
+    @param x A 0-based column index.
+     */
+    Mat col(int x) const;
+
+    /** @brief Creates a matrix header for the specified row span.
+
+    The method makes a new header for the specified row span of the matrix. Similarly to Mat::row and
+    Mat::col , this is an O(1) operation.
+    @param startrow An inclusive 0-based start index of the row span.
+    @param endrow An exclusive 0-based ending index of the row span.
+     */
+    Mat rowRange(int startrow, int endrow) const;
+
+    /** @overload
+    @param r Range structure containing both the start and the end indices.
+    */
+    Mat rowRange(const Range& r) const;
+
+    /** @brief Creates a matrix header for the specified column span.
+
+    The method makes a new header for the specified column span of the matrix. Similarly to Mat::row and
+    Mat::col , this is an O(1) operation.
+    @param startcol An inclusive 0-based start index of the column span.
+    @param endcol An exclusive 0-based ending index of the column span.
+     */
+    Mat colRange(int startcol, int endcol) const;
+
+    /** @overload
+    @param r Range structure containing both the start and the end indices.
+    */
+    Mat colRange(const Range& r) const;
+
+    /** @brief Extracts a diagonal from a matrix
+
+    The method makes a new header for the specified matrix diagonal. The new matrix is represented as a
+    single-column matrix. Similarly to Mat::row and Mat::col, this is an O(1) operation.
+    @param d index of the diagonal, with the following values:
+    - `d=0` is the main diagonal.
+    - `d<0` is a diagonal from the lower half. For example, d=-1 means the diagonal is set
+      immediately below the main one.
+    - `d>0` is a diagonal from the upper half. For example, d=1 means the diagonal is set
+      immediately above the main one.
+    For example:
+    @code
+        Mat m = (Mat_<int>(3,3) <<
+                    1,2,3,
+                    4,5,6,
+                    7,8,9);
+        Mat d0 = m.diag(0);
+        Mat d1 = m.diag(1);
+        Mat d_1 = m.diag(-1);
+    @endcode
+    The resulting matrices are
+    @code
+     d0 =
+       [1;
+        5;
+        9]
+     d1 =
+       [2;
+        6]
+     d_1 =
+       [4;
+        8]
+    @endcode
+     */
+    Mat diag(int d=0) const;
+
+    /** @brief creates a diagonal matrix
+
+    The method creates a square diagonal matrix from specified main diagonal.
+    @param d One-dimensional matrix that represents the main diagonal.
+     */
+    CV_NODISCARD_STD static Mat diag(const Mat& d);
+
+    /** @brief Creates a full copy of the array and the underlying data.
+
+    The method creates a full copy of the array. The original step[] is not taken into account. So, the
+    array copy is a continuous array occupying total()*elemSize() bytes.
+     */
+    CV_NODISCARD_STD Mat clone() const;
+
+    /** @brief Copies the matrix to another one.
+
+    The method copies the matrix data to another matrix. Before copying the data, the method invokes :
+    @code
+        m.create(this->size(), this->type());
+    @endcode
+    so that the destination matrix is reallocated if needed. While m.copyTo(m); works flawlessly, the
+    function does not handle the case of a partial overlap between the source and the destination
+    matrices.
+
+    When the operation mask is specified, if the Mat::create call shown above reallocates the matrix,
+    the newly allocated matrix is initialized with all zeros before copying the data.
+    @param m Destination matrix. If it does not have a proper size or type before the operation, it is
+    reallocated.
+     */
+    void copyTo( OutputArray m ) const;
+
+    /** @overload
+    @param m Destination matrix. If it does not have a proper size or type before the operation, it is
+    reallocated.
+    @param mask Operation mask of the same size as \*this. Its non-zero elements indicate which matrix
+    elements need to be copied. The mask has to be of type CV_8U and can have 1 or multiple channels.
+    */
+    void copyTo( OutputArray m, InputArray mask ) const;
+
+    /** @brief Converts an array to another data type with optional scaling.
+
+    The method converts source pixel values to the target data type. saturate_cast\<\> is applied at
+    the end to avoid possible overflows:
+
+    \f[m(x,y) = saturate \_ cast<rType>( \alpha (*this)(x,y) +  \beta )\f]
+    @param m output matrix; if it does not have a proper size or type before the operation, it is
+    reallocated.
+    @param rtype desired output matrix type or, rather, the depth since the number of channels is the
+    same as in the input; if rtype is negative, the output matrix will have the same type as the input.
+    @param alpha optional scale factor.
+    @param beta optional delta added to the scaled values.
+     */
+    void convertTo( OutputArray m, int rtype, double alpha=1, double beta=0 ) const;
+
+    /** @brief Provides a functional form of convertTo.
+
+    This is an internally used method called by the @ref MatrixExpressions engine.
+    @param m Destination array.
+    @param type Desired destination array depth (or -1 if it should be the same as the source type).
+     */
+    void assignTo( Mat& m, int type=-1 ) const;
+
+    /** @brief Sets all or some of the array elements to the specified value.
+    @param s Assigned scalar converted to the actual array type.
+    */
+    Mat& operator = (const Scalar& s);
+
+    /** @brief Sets all or some of the array elements to the specified value.
+
+    This is an advanced variant of the Mat::operator=(const Scalar& s) operator.
+    @param value Assigned scalar converted to the actual array type.
+    @param mask Operation mask of the same size as \*this. Its non-zero elements indicate which matrix
+    elements need to be set. The mask has to be of type CV_8U and can have 1 or multiple channels.
+     */
+    Mat& setTo(InputArray value, InputArray mask=noArray());
+
+    /** @brief Changes the shape and/or the number of channels of a 2D matrix without copying the data.
+
+    The method makes a new matrix header for \*this elements. The new matrix may have a different size
+    and/or different number of channels. Any combination is possible if:
+    -   No extra elements are included into the new matrix and no elements are excluded. Consequently,
+        the product rows\*cols\*channels() must stay the same after the transformation.
+    -   No data is copied. That is, this is an O(1) operation. Consequently, if you change the number of
+        rows, or the operation changes the row indices of elements in some other way, the matrix must be
+        continuous. See Mat::isContinuous .
+
+    For example, if there is a set of 3D points stored as an STL vector, and you want to represent the
+    points as a 3xN matrix, do the following:
+    @code
+        std::vector<Point3f> vec;
+        ...
+        Mat pointMat = Mat(vec). // convert vector to Mat, O(1) operation
+                          reshape(1). // make Nx3 1-channel matrix out of Nx1 3-channel.
+                                      // Also, an O(1) operation
+                             t(); // finally, transpose the Nx3 matrix.
+                                  // This involves copying all the elements
+    @endcode
+    @param cn New number of channels. If the parameter is 0, the number of channels remains the same.
+    @param rows New number of rows. If the parameter is 0, the number of rows remains the same.
+     */
+    Mat reshape(int cn, int rows=0) const;
+
+    /** @overload */
+    Mat reshape(int cn, int newndims, const int* newsz) const;
+
+    /** @overload */
+    Mat reshape(int cn, const std::vector<int>& newshape) const;
+
+    /** @brief Transposes a matrix.
+
+    The method performs matrix transposition by means of matrix expressions. It does not perform the
+    actual transposition but returns a temporary matrix transposition object that can be further used as
+    a part of more complex matrix expressions or can be assigned to a matrix:
+    @code
+        Mat A1 = A + Mat::eye(A.size(), A.type())*lambda;
+        Mat C = A1.t()*A1; // compute (A + lambda*I)^t * (A + lambda*I)
+    @endcode
+     */
+    MatExpr t() const;
+
+    /** @brief Inverts a matrix.
+
+    The method performs a matrix inversion by means of matrix expressions. This means that a temporary
+    matrix inversion object is returned by the method and can be used further as a part of more complex
+    matrix expressions or can be assigned to a matrix.
+    @param method Matrix inversion method. One of cv::DecompTypes
+     */
+    MatExpr inv(int method=DECOMP_LU) const;
+
+    /** @brief Performs an element-wise multiplication or division of the two matrices.
+
+    The method returns a temporary object encoding per-element array multiplication, with optional
+    scale. Note that this is not a matrix multiplication that corresponds to a simpler "\*" operator.
+
+    Example:
+    @code
+        Mat C = A.mul(5/B); // equivalent to divide(A, B, C, 5)
+    @endcode
+    @param m Another array of the same type and the same size as \*this, or a matrix expression.
+    @param scale Optional scale factor.
+     */
+    MatExpr mul(InputArray m, double scale=1) const;
+
+    /** @brief Computes a cross-product of two 3-element vectors.
+
+    The method computes a cross-product of two 3-element vectors. The vectors must be 3-element
+    floating-point vectors of the same shape and size. The result is another 3-element vector of the
+    same shape and type as operands.
+    @param m Another cross-product operand.
+     */
+    Mat cross(InputArray m) const;
+
+    /** @brief Computes a dot-product of two vectors.
+
+    The method computes a dot-product of two matrices. If the matrices are not single-column or
+    single-row vectors, the top-to-bottom left-to-right scan ordering is used to treat them as 1D
+    vectors. The vectors must have the same size and type. If the matrices have more than one channel,
+    the dot products from all the channels are summed together.
+    @param m another dot-product operand.
+     */
+    double dot(InputArray m) const;
+
+    /** @brief Returns a zero array of the specified size and type.
+
+    The method returns a Matlab-style zero array initializer. It can be used to quickly form a constant
+    array as a function parameter, part of a matrix expression, or as a matrix initializer:
+    @code
+        Mat A;
+        A = Mat::zeros(3, 3, CV_32F);
+    @endcode
+    In the example above, a new matrix is allocated only if A is not a 3x3 floating-point matrix.
+    Otherwise, the existing matrix A is filled with zeros.
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr zeros(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative to the matrix size specification Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr zeros(Size size, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sz Array of integers specifying the array shape.
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr zeros(int ndims, const int* sz, int type);
+
+    /** @brief Returns an array of all 1's of the specified size and type.
+
+    The method returns a Matlab-style 1's array initializer, similarly to Mat::zeros. Note that using
+    this method you can initialize an array with an arbitrary value, using the following Matlab idiom:
+    @code
+        Mat A = Mat::ones(100, 100, CV_8U)*3; // make 100x100 matrix filled with 3.
+    @endcode
+    The above operation does not form a 100x100 matrix of 1's and then multiply it by 3. Instead, it
+    just remembers the scale factor (3 in this case) and uses it when actually invoking the matrix
+    initializer.
+    @note In case of multi-channels type, only the first channel will be initialized with 1's, the
+    others will be set to 0's.
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr ones(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative to the matrix size specification Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr ones(Size size, int type);
+
+    /** @overload
+    @param ndims Array dimensionality.
+    @param sz Array of integers specifying the array shape.
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr ones(int ndims, const int* sz, int type);
+
+    /** @brief Returns an identity matrix of the specified size and type.
+
+    The method returns a Matlab-style identity matrix initializer, similarly to Mat::zeros. Similarly to
+    Mat::ones, you can use a scale operation to create a scaled identity matrix efficiently:
+    @code
+        // make a 4x4 diagonal matrix with 0.1's on the diagonal.
+        Mat A = Mat::eye(4, 4, CV_32F)*0.1;
+    @endcode
+    @note In case of multi-channels type, identity matrix will be initialized only for the first channel,
+    the others will be set to 0's
+    @param rows Number of rows.
+    @param cols Number of columns.
+    @param type Created matrix type.
+     */
+    CV_NODISCARD_STD static MatExpr eye(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative matrix size specification as Size(cols, rows) .
+    @param type Created matrix type.
+    */
+    CV_NODISCARD_STD static MatExpr eye(Size size, int type);
+
+    /** @brief Allocates new array data if needed.
+
+    This is one of the key Mat methods. Most new-style OpenCV functions and methods that produce arrays
+    call this method for each output array. The method uses the following algorithm:
+
+    -# If the current array shape and the type match the new ones, return immediately. Otherwise,
+       de-reference the previous data by calling Mat::release.
+    -# Initialize the new header.
+    -# Allocate the new data of total()\*elemSize() bytes.
+    -# Allocate the new, associated with the data, reference counter and set it to 1.
+
+    Such a scheme makes the memory management robust and efficient at the same time and helps avoid
+    extra typing for you. This means that usually there is no need to explicitly allocate output arrays.
+    That is, instead of writing:
+    @code
+        Mat color;
+        ...
+        Mat gray(color.rows, color.cols, color.depth());
+        cvtColor(color, gray, COLOR_BGR2GRAY);
+    @endcode
+    you can simply write:
+    @code
+        Mat color;
+        ...
+        Mat gray;
+        cvtColor(color, gray, COLOR_BGR2GRAY);
+    @endcode
+    because cvtColor, as well as most OpenCV functions, calls Mat::create() for the output array
+    internally.
+    @param rows New number of rows.
+    @param cols New number of columns.
+    @param type New matrix type.
+     */
+    void create(int rows, int cols, int type);
+
+    /** @overload
+    @param size Alternative new matrix size specification: Size(cols, rows)
+    @param type New matrix type.
+    */
+    void create(Size size, int type);
+
+    /** @overload
+    @param ndims New array dimensionality.
+    @param sizes Array of integers specifying a new array shape.
+    @param type New matrix type.
+    */
+    void create(int ndims, const int* sizes, int type);
+
+    /** @overload
+    @param sizes Array of integers specifying a new array shape.
+    @param type New matrix type.
+    */
+    void create(const std::vector<int>& sizes, int type);
+
+    /** @brief Increments the reference counter.
+
+    The method increments the reference counter associated with the matrix data. If the matrix header
+    points to an external data set (see Mat::Mat ), the reference counter is NULL, and the method has no
+    effect in this case. Normally, to avoid memory leaks, the method should not be called explicitly. It
+    is called implicitly by the matrix assignment operator. The reference counter increment is an atomic
+    operation on the platforms that support it. Thus, it is safe to operate on the same matrices
+    asynchronously in different threads.
+     */
+    void addref();
+
+    /** @brief Decrements the reference counter and deallocates the matrix if needed.
+
+    The method decrements the reference counter associated with the matrix data. When the reference
+    counter reaches 0, the matrix data is deallocated and the data and the reference counter pointers
+    are set to NULL's. If the matrix header points to an external data set (see Mat::Mat ), the
+    reference counter is NULL, and the method has no effect in this case.
+
+    This method can be called manually to force the matrix data deallocation. But since this method is
+    automatically called in the destructor, or by any other method that changes the data pointer, it is
+    usually not needed. The reference counter decrement and check for 0 is an atomic operation on the
+    platforms that support it. Thus, it is safe to operate on the same matrices asynchronously in
+    different threads.
+     */
+    void release();
+
+    //! internal use function, consider using the 'release' method instead; deallocates the matrix data
+    void deallocate();
+    //! internal use function; properly re-allocates _size, _step arrays
+    void copySize(const Mat& m);
+
+    /** @brief Reserves space for a certain number of rows.
+
+    The method reserves space for sz rows. If the matrix already has enough space to store sz rows,
+    nothing happens. If the matrix is reallocated, the first Mat::rows rows are preserved. The method
+    emulates the corresponding method of the STL vector class.
+    @param sz Number of rows.
+     */
+    void reserve(size_t sz);
+
+    /** @brief Reserves space for a certain number of bytes.
+
+    The method reserves space for sz bytes. If the matrix already has enough space to store sz bytes,
+    nothing happens. If the matrix has to be reallocated, its previous content could be lost.
+    @param sz Number of bytes.
+    */
+    void reserveBuffer(size_t sz);
+
+    /** @brief Changes the number of matrix rows.
+
+    The methods change the number of matrix rows. If the matrix is reallocated, the first
+    min(Mat::rows, sz) rows are preserved. The methods emulate the corresponding methods of the STL
+    vector class.
+    @param sz New number of rows.
+     */
+    void resize(size_t sz);
+
+    /** @overload
+    @param sz New number of rows.
+    @param s Value assigned to the newly added elements.
+     */
+    void resize(size_t sz, const Scalar& s);
+
+    //! internal function
+    void push_back_(const void* elem);
+
+    /** @brief Adds elements to the bottom of the matrix.
+
+    The methods add one or more elements to the bottom of the matrix. They emulate the corresponding
+    method of the STL vector class. When elem is Mat , its type and the number of columns must be the
+    same as in the container matrix.
+    @param elem Added element(s).
+     */
+    template<typename _Tp> void push_back(const _Tp& elem);
+
+    /** @overload
+    @param elem Added element(s).
+    */
+    template<typename _Tp> void push_back(const Mat_<_Tp>& elem);
+
+    /** @overload
+    @param elem Added element(s).
+    */
+    template<typename _Tp> void push_back(const std::vector<_Tp>& elem);
+
+    /** @overload
+    @param m Added line(s).
+    */
+    void push_back(const Mat& m);
+
+    /** @brief Removes elements from the bottom of the matrix.
+
+    The method removes one or more rows from the bottom of the matrix.
+    @param nelems Number of removed rows. If it is greater than the total number of rows, an exception
+    is thrown.
+     */
+    void pop_back(size_t nelems=1);
+
+    /** @brief Locates the matrix header within a parent matrix.
+
+    After you extracted a submatrix from a matrix using Mat::row, Mat::col, Mat::rowRange,
+    Mat::colRange, and others, the resultant submatrix points just to the part of the original big
+    matrix. However, each submatrix contains information (represented by datastart and dataend
+    fields) that helps reconstruct the original matrix size and the position of the extracted
+    submatrix within the original matrix. The method locateROI does exactly that.
+    @param wholeSize Output parameter that contains the size of the whole matrix containing *this*
+    as a part.
+    @param ofs Output parameter that contains an offset of *this* inside the whole matrix.
+     */
+    void locateROI( Size& wholeSize, Point& ofs ) const;
+
+    /** @brief Adjusts a submatrix size and position within the parent matrix.
+
+    The method is complementary to Mat::locateROI . The typical use of these functions is to determine
+    the submatrix position within the parent matrix and then shift the position somehow. Typically, it
+    can be required for filtering operations when pixels outside of the ROI should be taken into
+    account. When all the method parameters are positive, the ROI needs to grow in all directions by the
+    specified amount, for example:
+    @code
+        A.adjustROI(2, 2, 2, 2);
+    @endcode
+    In this example, the matrix size is increased by 4 elements in each dimension. The matrix is shifted
+    by 2 elements to the left and 2 elements up, which brings in all the necessary pixels for the
+    filtering with the 5x5 kernel.
+
+    adjustROI forces the adjusted ROI to be inside of the parent matrix, that is, the boundaries of
+    the adjusted ROI are constrained by the boundaries of the parent matrix. For example, if the
+    submatrix A is located in the first row of a parent matrix and you called A.adjustROI(2, 2, 2, 2)
+    then A will not be increased in the upward direction.
+
+    The function is used internally by the OpenCV filtering functions, like filter2D , morphological
+    operations, and so on.
+    @param dtop Shift of the top submatrix boundary upwards.
+    @param dbottom Shift of the bottom submatrix boundary downwards.
+    @param dleft Shift of the left submatrix boundary to the left.
+    @param dright Shift of the right submatrix boundary to the right.
+    @sa copyMakeBorder
+     */
+    Mat& adjustROI( int dtop, int dbottom, int dleft, int dright );
+
+    /** @brief Extracts a rectangular submatrix.
+
+    The operators make a new header for the specified sub-array of \*this . They are the most
+    generalized forms of Mat::row, Mat::col, Mat::rowRange, and Mat::colRange . For example,
+    `A(Range(0, 10), Range::all())` is equivalent to `A.rowRange(0, 10)`. Similarly to all of the above,
+    the operators are O(1) operations, that is, no matrix data is copied.
+    @param rowRange Start and end row of the extracted submatrix. The upper boundary is not included. To
+    select all the rows, use Range::all().
+    @param colRange Start and end column of the extracted submatrix. The upper boundary is not included.
+    To select all the columns, use Range::all().
+     */
+    Mat operator()( Range rowRange, Range colRange ) const;
+
+    /** @overload
+    @param roi Extracted submatrix specified as a rectangle.
+    */
+    Mat operator()( const Rect& roi ) const;
+
+    /** @overload
+    @param ranges Array of selected ranges along each array dimension.
+    */
+    Mat operator()( const Range* ranges ) const;
+
+    /** @overload
+    @param ranges Array of selected ranges along each array dimension.
+    */
+    Mat operator()(const std::vector<Range>& ranges) const;
+
+    template<typename _Tp> operator std::vector<_Tp>() const;
+    template<typename _Tp, int n> operator Vec<_Tp, n>() const;
+    template<typename _Tp, int m, int n> operator Matx<_Tp, m, n>() const;
+
+    template<typename _Tp, std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
+
+    /** @brief Reports whether the matrix is continuous or not.
+
+    The method returns true if the matrix elements are stored continuously without gaps at the end of
+    each row. Otherwise, it returns false. Obviously, 1x1 or 1xN matrices are always continuous.
+    Matrices created with Mat::create are always continuous. But if you extract a part of the matrix
+    using Mat::col, Mat::diag, and so on, or constructed a matrix header for externally allocated data,
+    such matrices may no longer have this property.
+
+    The continuity flag is stored as a bit in the Mat::flags field and is computed automatically when
+    you construct a matrix header. Thus, the continuity check is a very fast operation, though
+    theoretically it could be done as follows:
+    @code
+        // alternative implementation of Mat::isContinuous()
+        bool myCheckMatContinuity(const Mat& m)
+        {
+            //return (m.flags & Mat::CONTINUOUS_FLAG) != 0;
+            return m.rows == 1 || m.step == m.cols*m.elemSize();
+        }
+    @endcode
+    The method is used in quite a few OpenCV functions. The point is that element-wise operations
+    (such as arithmetic and logical operations, math functions, alpha blending, color space
+    transformations, and others) do not depend on the image geometry. Thus, if all the input and output
+    arrays are continuous, the functions can process them as very long single-row vectors. The example
+    below illustrates how an alpha-blending function can be implemented:
+    @code
+        template<typename T>
+        void alphaBlendRGBA(const Mat& src1, const Mat& src2, Mat& dst)
+        {
+            const float alpha_scale = (float)std::numeric_limits<T>::max(),
+                        inv_scale = 1.f/alpha_scale;
+
+            CV_Assert( src1.type() == src2.type() &&
+                       src1.type() == CV_MAKETYPE(traits::Depth<T>::value, 4) &&
+                       src1.size() == src2.size());
+            Size size = src1.size();
+            dst.create(size, src1.type());
+
+            // here is the idiom: check the arrays for continuity and,
+            // if this is the case,
+            // treat the arrays as 1D vectors
+            if( src1.isContinuous() && src2.isContinuous() && dst.isContinuous() )
+            {
+                size.width *= size.height;
+                size.height = 1;
+            }
+            size.width *= 4;
+
+            for( int i = 0; i < size.height; i++ )
+            {
+                // when the arrays are continuous,
+                // the outer loop is executed only once
+                const T* ptr1 = src1.ptr<T>(i);
+                const T* ptr2 = src2.ptr<T>(i);
+                T* dptr = dst.ptr<T>(i);
+
+                for( int j = 0; j < size.width; j += 4 )
+                {
+                    float alpha = ptr1[j+3]*inv_scale, beta = ptr2[j+3]*inv_scale;
+                    dptr[j] = saturate_cast<T>(ptr1[j]*alpha + ptr2[j]*beta);
+                    dptr[j+1] = saturate_cast<T>(ptr1[j+1]*alpha + ptr2[j+1]*beta);
+                    dptr[j+2] = saturate_cast<T>(ptr1[j+2]*alpha + ptr2[j+2]*beta);
+                    dptr[j+3] = saturate_cast<T>((1 - (1-alpha)*(1-beta))*alpha_scale);
+                }
+            }
+        }
+    @endcode
+    This approach, while being very simple, can boost the performance of a simple element-operation by
+    10-20 percent, especially if the image is rather small and the operation is quite simple.
+
+    Another OpenCV idiom in this function is the call of Mat::create for the destination array, which
+    allocates the destination array unless it already has the proper size and type. And while the newly
+    allocated arrays are always continuous, you still need to check the destination array because
+    Mat::create does not always allocate a new matrix.
+     */
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a submatrix of another matrix
+    bool isSubmatrix() const;
+
+    /** @brief Returns the matrix element size in bytes.
+
+    The method returns the matrix element size in bytes. For example, if the matrix type is CV_16SC3 ,
+    the method returns 3\*sizeof(short) or 6.
+     */
+    size_t elemSize() const;
+
+    /** @brief Returns the size of each matrix element channel in bytes.
+
+    The method returns the matrix element channel size in bytes, that is, it ignores the number of
+    channels. For example, if the matrix type is CV_16SC3 , the method returns sizeof(short) or 2.
+     */
+    size_t elemSize1() const;
+
+    /** @brief Returns the type of a matrix element.
+
+    The method returns a matrix element type. This is an identifier compatible with the CvMat type
+    system, like CV_16SC3 or 16-bit signed 3-channel array, and so on.
+     */
+    int type() const;
+
+    /** @brief Returns the depth of a matrix element.
+
+    The method returns the identifier of the matrix element depth (the type of each individual channel).
+    For example, for a 16-bit signed element array, the method returns CV_16S . A complete list of
+    matrix types contains the following values:
+    -   CV_8U - 8-bit unsigned integers ( 0..255 )
+    -   CV_8S - 8-bit signed integers ( -128..127 )
+    -   CV_16U - 16-bit unsigned integers ( 0..65535 )
+    -   CV_16S - 16-bit signed integers ( -32768..32767 )
+    -   CV_32S - 32-bit signed integers ( -2147483648..2147483647 )
+    -   CV_32F - 32-bit floating-point numbers ( -FLT_MAX..FLT_MAX, INF, NAN )
+    -   CV_64F - 64-bit floating-point numbers ( -DBL_MAX..DBL_MAX, INF, NAN )
+     */
+    int depth() const;
+
+    /** @brief Returns the number of matrix channels.
+
+    The method returns the number of matrix channels.
+     */
+    int channels() const;
+
+    /** @brief Returns a normalized step.
+
+    The method returns a matrix step divided by Mat::elemSize1() . It can be useful to quickly access an
+    arbitrary matrix element.
+     */
+    size_t step1(int i=0) const;
+
+    /** @brief Returns true if the array has no elements.
+
+    The method returns true if Mat::total() is 0 or if Mat::data is NULL. Because of pop_back() and
+    resize() methods `M.total() == 0` does not imply that `M.data == NULL`.
+     */
+    bool empty() const;
+
+    /** @brief Returns the total number of array elements.
+
+    The method returns the number of array elements (a number of pixels if the array represents an
+    image).
+     */
+    size_t total() const;
+
+    /** @brief Returns the number of array elements within a range of dimensions.
+
+     The method returns the number of elements within a certain sub-array slice with startDim <= dim < endDim
+     */
+    size_t total(int startDim, int endDim=INT_MAX) const;
+
+    /**
+     * @param elemChannels Number of channels or number of columns the matrix should have.
+     *                     For a 2-D matrix, when the matrix has only 1 column, then it should have
+     *                     elemChannels channels; When the matrix has only 1 channel,
+     *                     then it should have elemChannels columns.
+     *                     For a 3-D matrix, it should have only one channel. Furthermore,
+     *                     if the number of planes is not one, then the number of rows
+     *                     within every plane has to be 1; if the number of rows within
+     *                     every plane is not 1, then the number of planes has to be 1.
+     * @param depth The depth the matrix should have. Set it to -1 when any depth is fine.
+     * @param requireContinuous Set it to true to require the matrix to be continuous
+     * @return -1 if the requirement is not satisfied.
+     *         Otherwise, it returns the number of elements in the matrix. Note
+     *         that an element may have multiple channels.
+     *
+     * The following code demonstrates its usage for a 2-d matrix:
+     * @snippet snippets/core_mat_checkVector.cpp example-2d
+     *
+     * The following code demonstrates its usage for a 3-d matrix:
+     * @snippet snippets/core_mat_checkVector.cpp example-3d
+     */
+    int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
+
+    /** @brief Returns a pointer to the specified matrix row.
+
+    The methods return `uchar*` or typed pointer to the specified matrix row. See the sample in
+    Mat::isContinuous to know how to use these methods.
+    @param i0 A 0-based row index.
+     */
+    uchar* ptr(int i0=0);
+    /** @overload */
+    const uchar* ptr(int i0=0) const;
+
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    uchar* ptr(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    const uchar* ptr(int row, int col) const;
+
+    /** @overload */
+    uchar* ptr(int i0, int i1, int i2);
+    /** @overload */
+    const uchar* ptr(int i0, int i1, int i2) const;
+
+    /** @overload */
+    uchar* ptr(const int* idx);
+    /** @overload */
+    const uchar* ptr(const int* idx) const;
+    /** @overload */
+    template<int n> uchar* ptr(const Vec<int, n>& idx);
+    /** @overload */
+    template<int n> const uchar* ptr(const Vec<int, n>& idx) const;
+
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(int i0=0);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(int i0=0) const;
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> _Tp* ptr(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> const _Tp* ptr(int row, int col) const;
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(int i0, int i1, int i2);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(int i0, int i1, int i2) const;
+    /** @overload */
+    template<typename _Tp> _Tp* ptr(const int* idx);
+    /** @overload */
+    template<typename _Tp> const _Tp* ptr(const int* idx) const;
+    /** @overload */
+    template<typename _Tp, int n> _Tp* ptr(const Vec<int, n>& idx);
+    /** @overload */
+    template<typename _Tp, int n> const _Tp* ptr(const Vec<int, n>& idx) const;
+
+    /** @brief Returns a reference to the specified array element.
+
+    The template methods return a reference to the specified array element. For the sake of higher
+    performance, the index range checks are only performed in the Debug configuration.
+
+    Note that the variants with a single index (i) can be used to access elements of single-row or
+    single-column 2-dimensional arrays. That is, if, for example, A is a 1 x N floating-point matrix and
+    B is an M x 1 integer matrix, you can simply write `A.at<float>(k+4)` and `B.at<int>(2*i+1)`
+    instead of `A.at<float>(0,k+4)` and `B.at<int>(2*i+1,0)`, respectively.
+
+    The example below initializes a Hilbert matrix:
+    @code
+        Mat H(100, 100, CV_64F);
+        for(int i = 0; i < H.rows; i++)
+            for(int j = 0; j < H.cols; j++)
+                H.at<double>(i,j)=1./(i+j+1);
+    @endcode
+
+    Keep in mind that the size identifier used in the at operator cannot be chosen at random. It depends
+    on the image from which you are trying to retrieve the data. The table below gives a better insight in this:
+     - If matrix is of type `CV_8U` then use `Mat.at<uchar>(y,x)`.
+     - If matrix is of type `CV_8S` then use `Mat.at<schar>(y,x)`.
+     - If matrix is of type `CV_16U` then use `Mat.at<ushort>(y,x)`.
+     - If matrix is of type `CV_16S` then use `Mat.at<short>(y,x)`.
+     - If matrix is of type `CV_32S`  then use `Mat.at<int>(y,x)`.
+     - If matrix is of type `CV_32F`  then use `Mat.at<float>(y,x)`.
+     - If matrix is of type `CV_64F` then use `Mat.at<double>(y,x)`.
+
+    @param i0 Index along the dimension 0
+     */
+    template<typename _Tp> _Tp& at(int i0=0);
+    /** @overload
+    @param i0 Index along the dimension 0
+    */
+    template<typename _Tp> const _Tp& at(int i0=0) const;
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> _Tp& at(int row, int col);
+    /** @overload
+    @param row Index along the dimension 0
+    @param col Index along the dimension 1
+    */
+    template<typename _Tp> const _Tp& at(int row, int col) const;
+
+    /** @overload
+    @param i0 Index along the dimension 0
+    @param i1 Index along the dimension 1
+    @param i2 Index along the dimension 2
+    */
+    template<typename _Tp> _Tp& at(int i0, int i1, int i2);
+    /** @overload
+    @param i0 Index along the dimension 0
+    @param i1 Index along the dimension 1
+    @param i2 Index along the dimension 2
+    */
+    template<typename _Tp> const _Tp& at(int i0, int i1, int i2) const;
+
+    /** @overload
+    @param idx Array of Mat::dims indices.
+    */
+    template<typename _Tp> _Tp& at(const int* idx);
+    /** @overload
+    @param idx Array of Mat::dims indices.
+    */
+    template<typename _Tp> const _Tp& at(const int* idx) const;
+
+    /** @overload */
+    template<typename _Tp, int n> _Tp& at(const Vec<int, n>& idx);
+    /** @overload */
+    template<typename _Tp, int n> const _Tp& at(const Vec<int, n>& idx) const;
+
+    /** @overload
+    special versions for 2D arrays (especially convenient for referencing image pixels)
+    @param pt Element position specified as Point(j,i) .
+    */
+    template<typename _Tp> _Tp& at(Point pt);
+    /** @overload
+    special versions for 2D arrays (especially convenient for referencing image pixels)
+    @param pt Element position specified as Point(j,i) .
+    */
+    template<typename _Tp> const _Tp& at(Point pt) const;
+
+    /** @brief Returns the matrix iterator and sets it to the first matrix element.
+
+    The methods return the matrix read-only or read-write iterators. The use of matrix iterators is very
+    similar to the use of bi-directional STL iterators. In the example below, the alpha blending
+    function is rewritten using the matrix iterators:
+    @code
+        template<typename T>
+        void alphaBlendRGBA(const Mat& src1, const Mat& src2, Mat& dst)
+        {
+            typedef Vec<T, 4> VT;
+
+            const float alpha_scale = (float)std::numeric_limits<T>::max(),
+                        inv_scale = 1.f/alpha_scale;
+
+            CV_Assert( src1.type() == src2.type() &&
+                       src1.type() == traits::Type<VT>::value &&
+                       src1.size() == src2.size());
+            Size size = src1.size();
+            dst.create(size, src1.type());
+
+            MatConstIterator_<VT> it1 = src1.begin<VT>(), it1_end = src1.end<VT>();
+            MatConstIterator_<VT> it2 = src2.begin<VT>();
+            MatIterator_<VT> dst_it = dst.begin<VT>();
+
+            for( ; it1 != it1_end; ++it1, ++it2, ++dst_it )
+            {
+                VT pix1 = *it1, pix2 = *it2;
+                float alpha = pix1[3]*inv_scale, beta = pix2[3]*inv_scale;
+                *dst_it = VT(saturate_cast<T>(pix1[0]*alpha + pix2[0]*beta),
+                             saturate_cast<T>(pix1[1]*alpha + pix2[1]*beta),
+                             saturate_cast<T>(pix1[2]*alpha + pix2[2]*beta),
+                             saturate_cast<T>((1 - (1-alpha)*(1-beta))*alpha_scale));
+            }
+        }
+    @endcode
+     */
+    template<typename _Tp> MatIterator_<_Tp> begin();
+    template<typename _Tp> MatConstIterator_<_Tp> begin() const;
+
+    /** @brief Same as begin() but for inverse traversal
+     */
+    template<typename _Tp> std::reverse_iterator<MatIterator_<_Tp>> rbegin();
+    template<typename _Tp> std::reverse_iterator<MatConstIterator_<_Tp>> rbegin() const;
+
+    /** @brief Returns the matrix iterator and sets it to the after-last matrix element.
+
+    The methods return the matrix read-only or read-write iterators, set to the point following the last
+    matrix element.
+     */
+    template<typename _Tp> MatIterator_<_Tp> end();
+    template<typename _Tp> MatConstIterator_<_Tp> end() const;
+
+    /** @brief Same as end() but for inverse traversal
+     */
+    template<typename _Tp> std::reverse_iterator< MatIterator_<_Tp>> rend();
+    template<typename _Tp> std::reverse_iterator< MatConstIterator_<_Tp>> rend() const;
+
+
+    /** @brief Runs the given functor over all matrix elements in parallel.
+
+    The operation passed as argument has to be a function pointer, a function object or a lambda(C++11).
+
+    Example 1. All of the operations below put 0xFF into the first channel of all matrix elements:
+    @code
+        Mat image(1920, 1080, CV_8UC3);
+        typedef cv::Point3_<uint8_t> Pixel;
+
+        // first. raw pointer access.
+        for (int r = 0; r < image.rows; ++r) {
+            Pixel* ptr = image.ptr<Pixel>(r, 0);
+            const Pixel* ptr_end = ptr + image.cols;
+            for (; ptr != ptr_end; ++ptr) {
+                ptr->x = 255;
+            }
+        }
+
+        // Using MatIterator. (Simple, but there is iterator overhead)
+        for (Pixel &p : cv::Mat_<Pixel>(image)) {
+            p.x = 255;
+        }
+
+        // Parallel execution with function object.
+        struct Operator {
+            void operator ()(Pixel &pixel, const int * position) {
+                pixel.x = 255;
+            }
+        };
+        image.forEach<Pixel>(Operator());
+
+        // Parallel execution using C++11 lambda.
+        image.forEach<Pixel>([](Pixel &p, const int * position) -> void {
+            p.x = 255;
+        });
+    @endcode
+    Example 2. Using the pixel's position:
+    @code
+        // Creating 3D matrix (255 x 255 x 255) typed uint8_t
+        // and initialize all elements by the value which equals elements position.
+        // i.e. pixels (x,y,z) = (1,2,3) is (b,g,r) = (1,2,3).
+
+        int sizes[] = { 255, 255, 255 };
+        typedef cv::Point3_<uint8_t> Pixel;
+
+        Mat_<Pixel> image = Mat::zeros(3, sizes, CV_8UC3);
+
+        image.forEach<Pixel>([](Pixel& pixel, const int position[]) -> void {
+            pixel.x = position[0];
+            pixel.y = position[1];
+            pixel.z = position[2];
+        });
+    @endcode
+     */
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation);
+    /** @overload */
+    template<typename _Tp, typename Functor> void forEach(const Functor& operation) const;
+
+    Mat(Mat&& m);                //!< move constructor
+    Mat& operator = (Mat&& m);   //!< move assignment operator
+
+    enum { MAGIC_VAL  = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
+    enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };
+
+    /*! includes several bit-fields:
+         - the magic signature
+         - continuity flag
+         - depth
+         - number of channels
+     */
+    int flags;
+    //! the matrix dimensionality, >= 2
+    int dims;
+    //! the number of rows and columns or (-1, -1) when the matrix has more than 2 dimensions
+    int rows, cols;
+    //! pointer to the data
+    uchar* data;
+
+    //! helper fields used in locateROI and adjustROI
+    const uchar* datastart;
+    const uchar* dataend;
+    const uchar* datalimit;
+
+    //! custom allocator
+    MatAllocator* allocator;
+    //! and the standard allocator
+    static MatAllocator* getStdAllocator();
+    static MatAllocator* getDefaultAllocator();
+    static void setDefaultAllocator(MatAllocator* allocator);
+
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
+    //! interaction with UMat
+    UMatData* u;
+
+    MatSize size;
+    MatStep step;
+
+protected:
+    template<typename _Tp, typename Functor> void forEach_impl(const Functor& operation);
+};
+
+
+///////////////////////////////// Mat_<_Tp> ////////////////////////////////////
+
+/** @brief Template matrix class derived from Mat
+
+@code{.cpp}
+    template<typename _Tp> class Mat_ : public Mat
+    {
+    public:
+        // ... some specific methods
+        //         and
+        // no new extra fields
+    };
+@endcode
+The class `Mat_<_Tp>` is a *thin* template wrapper on top of the Mat class. It does not have any
+extra data fields. Neither this class nor Mat has any virtual methods. Thus, references or pointers to
+these two classes can be freely but carefully converted to one another. For example:
+@code{.cpp}
+    // create a 100x100 8-bit matrix
+    Mat M(100,100,CV_8U);
+    // this compiles fine; no data conversion is done.
+    Mat_<float>& M1 = (Mat_<float>&)M;
+    // the program is likely to crash at the statement below
+    M1(99,99) = 1.f;
+@endcode
+While Mat is sufficient in most cases, Mat_ can be more convenient if you use a lot of element
+access operations and if you know matrix type at the compilation time. Note that
+`Mat::at(int y,int x)` and `Mat_::operator()(int y,int x)` do absolutely the same
+and run at the same speed, but the latter is certainly shorter:
+@code{.cpp}
+    Mat_<double> M(20,20);
+    for(int i = 0; i < M.rows; i++)
+        for(int j = 0; j < M.cols; j++)
+            M(i,j) = 1./(i+j+1);
+    Mat E, V;
+    eigen(M,E,V);
+    cout << E.at<double>(0,0)/E.at<double>(M.rows-1,0);
+@endcode
+To use Mat_ for multi-channel images/matrices, pass Vec as a Mat_ parameter:
+@code{.cpp}
+    // allocate a 320x240 color image and fill it with green (in RGB space)
+    Mat_<Vec3b> img(240, 320, Vec3b(0,255,0));
+    // now draw a diagonal white line
+    for(int i = 0; i < 100; i++)
+        img(i,i)=Vec3b(255,255,255);
+    // and now scramble the 2nd (red) channel of each pixel
+    for(int i = 0; i < img.rows; i++)
+        for(int j = 0; j < img.cols; j++)
+            img(i,j)[2] ^= (uchar)(i ^ j);
+@endcode
+Mat_ is fully compatible with C++11 range-based for loop. For example such loop
+can be used to safely apply look-up table:
+@code{.cpp}
+void applyTable(Mat_<uchar>& I, const uchar* const table)
+{
+    for(auto& pixel : I)
+    {
+        pixel = table[pixel];
+    }
+}
+@endcode
+ */
+template<typename _Tp> class Mat_ : public Mat
+{
+public:
+    typedef _Tp value_type;
+    typedef typename DataType<_Tp>::channel_type channel_type;
+    typedef MatIterator_<_Tp> iterator;
+    typedef MatConstIterator_<_Tp> const_iterator;
+
+    //! default constructor
+    Mat_() CV_NOEXCEPT;
+    //! equivalent to Mat(_rows, _cols, DataType<_Tp>::type)
+    Mat_(int _rows, int _cols);
+    //! constructor that sets each matrix element to specified value
+    Mat_(int _rows, int _cols, const _Tp& value);
+    //! equivalent to Mat(_size, DataType<_Tp>::type)
+    explicit Mat_(Size _size);
+    //! constructor that sets each matrix element to specified value
+    Mat_(Size _size, const _Tp& value);
+    //! n-dim array constructor
+    Mat_(int _ndims, const int* _sizes);
+    //! n-dim array constructor that sets each matrix element to specified value
+    Mat_(int _ndims, const int* _sizes, const _Tp& value);
+    //! copy/conversion constructor. If m is of different type, it's converted
+    Mat_(const Mat& m);
+    //! copy constructor
+    Mat_(const Mat_& m);
+    //! constructs a matrix on top of user-allocated data. step is in bytes(!!!), regardless of the type
+    Mat_(int _rows, int _cols, _Tp* _data, size_t _step=AUTO_STEP);
+    //! constructs n-dim matrix on top of user-allocated data. steps are in bytes(!!!), regardless of the type
+    Mat_(int _ndims, const int* _sizes, _Tp* _data, const size_t* _steps=0);
+    //! selects a submatrix
+    Mat_(const Mat_& m, const Range& rowRange, const Range& colRange=Range::all());
+    //! selects a submatrix
+    Mat_(const Mat_& m, const Rect& roi);
+    //! selects a submatrix, n-dim version
+    Mat_(const Mat_& m, const Range* ranges);
+    //! selects a submatrix, n-dim version
+    Mat_(const Mat_& m, const std::vector<Range>& ranges);
+    //! from a matrix expression
+    explicit Mat_(const MatExpr& e);
+    //! makes a matrix out of Vec, std::vector, Point_ or Point3_. The matrix will have a single column
+    explicit Mat_(const std::vector<_Tp>& vec, bool copyData=false);
+    template<int n> explicit Mat_(const Vec<typename DataType<_Tp>::channel_type, n>& vec, bool copyData=true);
+    template<int m, int n> explicit Mat_(const Matx<typename DataType<_Tp>::channel_type, m, n>& mtx, bool copyData=true);
+    explicit Mat_(const Point_<typename DataType<_Tp>::channel_type>& pt, bool copyData=true);
+    explicit Mat_(const Point3_<typename DataType<_Tp>::channel_type>& pt, bool copyData=true);
+    explicit Mat_(const MatCommaInitializer_<_Tp>& commaInitializer);
+
+    Mat_(std::initializer_list<_Tp> values);
+    explicit Mat_(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> values);
+
+    template <std::size_t _Nm> explicit Mat_(const std::array<_Tp, _Nm>& arr, bool copyData=false);
+
+    Mat_& operator = (const Mat& m);
+    Mat_& operator = (const Mat_& m);
+    //! set all the elements to s.
+    Mat_& operator = (const _Tp& s);
+    //! assign a matrix expression
+    Mat_& operator = (const MatExpr& e);
+
+    //! iterators; they are smart enough to skip gaps in the end of rows
+    iterator begin();
+    iterator end();
+    const_iterator begin() const;
+    const_iterator end() const;
+
+    //! reverse iterators
+    std::reverse_iterator<iterator> rbegin();
+    std::reverse_iterator<iterator> rend();
+    std::reverse_iterator<const_iterator> rbegin() const;
+    std::reverse_iterator<const_iterator> rend() const;
+
+    //! template methods for operation over all matrix elements.
+    // the operations take care of skipping gaps in the end of rows (if any)
+    template<typename Functor> void forEach(const Functor& operation);
+    template<typename Functor> void forEach(const Functor& operation) const;
+
+    //! equivalent to Mat::create(_rows, _cols, DataType<_Tp>::type)
+    void create(int _rows, int _cols);
+    //! equivalent to Mat::create(_size, DataType<_Tp>::type)
+    void create(Size _size);
+    //! equivalent to Mat::create(_ndims, _sizes, DataType<_Tp>::type)
+    void create(int _ndims, const int* _sizes);
+    //! equivalent to Mat::release()
+    void release();
+    //! cross-product
+    Mat_ cross(const Mat_& m) const;
+    //! data type conversion
+    template<typename T2> operator Mat_<T2>() const;
+    //! overridden forms of Mat::row() etc.
+    Mat_ row(int y) const;
+    Mat_ col(int x) const;
+    Mat_ diag(int d=0) const;
+    CV_NODISCARD_STD Mat_ clone() const;
+
+    //! overridden forms of Mat::elemSize() etc.
+    size_t elemSize() const;
+    size_t elemSize1() const;
+    int type() const;
+    int depth() const;
+    int channels() const;
+    size_t step1(int i=0) const;
+    //! returns step()/sizeof(_Tp)
+    size_t stepT(int i=0) const;
+
+    //! overridden forms of Mat::zeros() etc. Data type is omitted, of course
+    CV_NODISCARD_STD static MatExpr zeros(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr zeros(Size size);
+    CV_NODISCARD_STD static MatExpr zeros(int _ndims, const int* _sizes);
+    CV_NODISCARD_STD static MatExpr ones(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr ones(Size size);
+    CV_NODISCARD_STD static MatExpr ones(int _ndims, const int* _sizes);
+    CV_NODISCARD_STD static MatExpr eye(int rows, int cols);
+    CV_NODISCARD_STD static MatExpr eye(Size size);
+
+    //! some more overridden methods
+    Mat_& adjustROI( int dtop, int dbottom, int dleft, int dright );
+    Mat_ operator()( const Range& rowRange, const Range& colRange ) const;
+    Mat_ operator()( const Rect& roi ) const;
+    Mat_ operator()( const Range* ranges ) const;
+    Mat_ operator()(const std::vector<Range>& ranges) const;
+
+    //! more convenient forms of row and element access operators
+    _Tp* operator [](int y);
+    const _Tp* operator [](int y) const;
+
+    //! returns reference to the specified element
+    _Tp& operator ()(const int* idx);
+    //! returns read-only reference to the specified element
+    const _Tp& operator ()(const int* idx) const;
+
+    //! returns reference to the specified element
+    template<int n> _Tp& operator ()(const Vec<int, n>& idx);
+    //! returns read-only reference to the specified element
+    template<int n> const _Tp& operator ()(const Vec<int, n>& idx) const;
+
+    //! returns reference to the specified element (1D case)
+    _Tp& operator ()(int idx0);
+    //! returns read-only reference to the specified element (1D case)
+    const _Tp& operator ()(int idx0) const;
+    //! returns reference to the specified element (2D case)
+    _Tp& operator ()(int row, int col);
+    //! returns read-only reference to the specified element (2D case)
+    const _Tp& operator ()(int row, int col) const;
+    //! returns reference to the specified element (3D case)
+    _Tp& operator ()(int idx0, int idx1, int idx2);
+    //! returns read-only reference to the specified element (3D case)
+    const _Tp& operator ()(int idx0, int idx1, int idx2) const;
+
+    _Tp& operator ()(Point pt);             //!< returns reference to the element at position pt
+    const _Tp& operator ()(Point pt) const; //!< returns read-only reference to the element at position pt
+
+    //! conversion to vector.
+    operator std::vector<_Tp>() const;
+
+    //! conversion to array.
+    template<std::size_t _Nm> operator std::array<_Tp, _Nm>() const;
+
+    //! conversion to Vec
+    template<int n> operator Vec<typename DataType<_Tp>::channel_type, n>() const;
+    //! conversion to Matx
+    template<int m, int n> operator Matx<typename DataType<_Tp>::channel_type, m, n>() const;
+
+    Mat_(Mat_&& m);               //!< move constructor
+    Mat_& operator = (Mat_&& m);  //!< move assignment operator
+
+    Mat_(Mat&& m);                //!< constructs from an rvalue Mat
+    Mat_& operator = (Mat&& m);   //!< assigns from an rvalue Mat
+
+    Mat_(MatExpr&& e);            //!< constructs from an rvalue matrix expression
+};
+
+typedef Mat_<uchar> Mat1b;  //!< 'b' aliases: uchar / Vec2b / Vec3b / Vec4b elements
+typedef Mat_<Vec2b> Mat2b;
+typedef Mat_<Vec3b> Mat3b;
+typedef Mat_<Vec4b> Mat4b;
+
+typedef Mat_<short> Mat1s;  //!< 's' aliases: short / Vec2s / Vec3s / Vec4s elements
+typedef Mat_<Vec2s> Mat2s;
+typedef Mat_<Vec3s> Mat3s;
+typedef Mat_<Vec4s> Mat4s;
+
+typedef Mat_<ushort> Mat1w;  //!< 'w' aliases: ushort / Vec2w / Vec3w / Vec4w elements
+typedef Mat_<Vec2w> Mat2w;
+typedef Mat_<Vec3w> Mat3w;
+typedef Mat_<Vec4w> Mat4w;
+
+typedef Mat_<int>   Mat1i;  //!< 'i' aliases: int / Vec2i / Vec3i / Vec4i elements
+typedef Mat_<Vec2i> Mat2i;
+typedef Mat_<Vec3i> Mat3i;
+typedef Mat_<Vec4i> Mat4i;
+
+typedef Mat_<float> Mat1f;  //!< 'f' aliases: float / Vec2f / Vec3f / Vec4f elements
+typedef Mat_<Vec2f> Mat2f;
+typedef Mat_<Vec3f> Mat3f;
+typedef Mat_<Vec4f> Mat4f;
+
+typedef Mat_<double> Mat1d;  //!< 'd' aliases: double / Vec2d / Vec3d / Vec4d elements
+typedef Mat_<Vec2d> Mat2d;
+typedef Mat_<Vec3d> Mat3d;
+typedef Mat_<Vec4d> Mat4d;
+
+/** @todo document */
+class CV_EXPORTS UMat
+{
+public:
+    //! default constructor
+    UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT) CV_NOEXCEPT;
+    //! constructs 2D matrix of the specified size and type
+    // (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
+    UMat(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    //! constructs 2D matrix and fills it with the specified value _s.
+    UMat(int rows, int cols, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(Size size, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! constructs n-dimensional matrix
+    UMat(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    UMat(int ndims, const int* sizes, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! copy constructor
+    UMat(const UMat& m);
+
+    //! creates a matrix header for a part of the bigger matrix
+    UMat(const UMat& m, const Range& rowRange, const Range& colRange=Range::all());
+    UMat(const UMat& m, const Rect& roi);
+    UMat(const UMat& m, const Range* ranges);
+    UMat(const UMat& m, const std::vector<Range>& ranges);
+
+    // FIXIT copyData=false is not implemented, drop this in favor of cv::Mat (OpenCV 5.0)
+    //! builds matrix from std::vector with or without copying the data
+    template<typename _Tp> explicit UMat(const std::vector<_Tp>& vec, bool copyData=false);
+
+    //! destructor - calls release()
+    ~UMat();
+    //! assignment operators
+    UMat& operator = (const UMat& m);
+
+    Mat getMat(AccessFlag flags) const;
+
+    //! returns a new matrix header for the specified row
+    UMat row(int y) const;
+    //! returns a new matrix header for the specified column
+    UMat col(int x) const;
+    //! ... for the specified row span
+    UMat rowRange(int startrow, int endrow) const;
+    UMat rowRange(const Range& r) const;
+    //! ... for the specified column span
+    UMat colRange(int startcol, int endcol) const;
+    UMat colRange(const Range& r) const;
+    //! ... for the specified diagonal
+    //! (d=0 - the main diagonal,
+    //!  >0 - a diagonal from the upper half,
+    //!  <0 - a diagonal from the lower half)
+    UMat diag(int d=0) const;
+    //! constructs a square diagonal matrix which main diagonal is vector "d"
+    CV_NODISCARD_STD static UMat diag(const UMat& d, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat diag(const UMat& d) { return diag(d, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+
+    //! returns deep copy of the matrix, i.e. the data is copied
+    CV_NODISCARD_STD UMat clone() const;
+    //! copies the matrix content to "m".
+    // It calls m.create(this->size(), this->type()).
+    void copyTo( OutputArray m ) const;
+    //! copies those matrix elements to "m" that are marked with non-zero mask elements.
+    void copyTo( OutputArray m, InputArray mask ) const;
+    //! converts matrix to another datatype with optional scaling. See cvConvertScale.
+    void convertTo( OutputArray m, int rtype, double alpha=1, double beta=0 ) const;
+
+    void assignTo( UMat& m, int type=-1 ) const;
+
+    //! sets every matrix element to s
+    UMat& operator = (const Scalar& s);
+    //! sets some of the matrix elements to s, according to the mask
+    UMat& setTo(InputArray value, InputArray mask=noArray());
+    //! creates alternative matrix header for the same data, with different
+    // number of channels and/or different number of rows. see cvReshape.
+    UMat reshape(int cn, int rows=0) const;
+    UMat reshape(int cn, int newndims, const int* newsz) const;
+
+    //! matrix transposition by means of matrix expressions
+    UMat t() const;
+    //! matrix inversion by means of matrix expressions
+    UMat inv(int method=DECOMP_LU) const;
+    //! per-element matrix multiplication by means of matrix expressions
+    UMat mul(InputArray m, double scale=1) const;
+
+    //! computes dot-product
+    double dot(InputArray m) const;
+
+    //! Matlab-style matrix initialization
+    CV_NODISCARD_STD static UMat zeros(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat zeros(int rows, int cols, int type) { return zeros(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat zeros(Size size, int type) { return zeros(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat zeros(int ndims, const int* sz, int type) { return zeros(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat ones(int rows, int cols, int type) { return ones(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(Size size, int type) { return ones(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat ones(int ndims, const int* sz, int type) { return ones(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat eye(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat eye(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    CV_NODISCARD_STD static UMat eye(int rows, int cols, int type) { return eye(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    CV_NODISCARD_STD static UMat eye(Size size, int type) { return eye(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+
+    //! allocates new matrix data unless the matrix already has specified size and type.
+    // previous data is unreferenced if needed.
+    void create(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(int ndims, const int* sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+    void create(const std::vector<int>& sizes, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
+
+    //! increases the reference counter; use with care to avoid memleaks
+    void addref();
+    //! decreases reference counter;
+    // deallocates the data when reference counter reaches 0.
+    void release();
+
+    //! deallocates the matrix data
+    void deallocate();
+    //! internal use function; properly re-allocates _size, _step arrays
+    void copySize(const UMat& m);
+
+    //! locates matrix header within a parent matrix. See below
+    void locateROI( Size& wholeSize, Point& ofs ) const;
+    //! moves/resizes the current matrix ROI inside the parent matrix.
+    UMat& adjustROI( int dtop, int dbottom, int dleft, int dright );
+    //! extracts a rectangular sub-matrix
+    // (this is a generalized form of row, rowRange etc.)
+    UMat operator()( Range rowRange, Range colRange ) const;
+    UMat operator()( const Rect& roi ) const;
+    UMat operator()( const Range* ranges ) const;
+    UMat operator()(const std::vector<Range>& ranges) const;
+
+    //! returns true iff the matrix data is continuous
+    // (i.e. when there are no gaps between successive rows).
+    // similar to CV_IS_MAT_CONT(cvmat->type)
+    bool isContinuous() const;
+
+    //! returns true if the matrix is a submatrix of another matrix
+    bool isSubmatrix() const;
+
+    //! returns element size in bytes,
+    // similar to CV_ELEM_SIZE(cvmat->type)
+    size_t elemSize() const;
+    //! returns the size of element channel in bytes.
+    size_t elemSize1() const;
+    //! returns element type, similar to CV_MAT_TYPE(cvmat->type)
+    int type() const;
+    //! returns element depth, similar to CV_MAT_DEPTH(cvmat->type)
+    int depth() const;
+    //! returns the number of channels, similar to CV_MAT_CN(cvmat->type)
+    int channels() const;
+    //! returns step/elemSize1()
+    size_t step1(int i=0) const;
+    //! returns true if matrix data is NULL
+    bool empty() const;
+    //! returns the total number of matrix elements
+    size_t total() const;
+
+    //! returns N if the matrix is 1-channel (N x ptdim) or ptdim-channel (1 x N) or (N x 1); negative number otherwise
+    int checkVector(int elemChannels, int depth=-1, bool requireContinuous=true) const;
+
+    UMat(UMat&& m);
+    UMat& operator = (UMat&& m);
+
+    /*! Returns the OpenCL buffer handle on which UMat operates on.
+        The UMat instance should be kept alive during the use of the handle to prevent the buffer from
+        being returned to the OpenCV buffer pool.
+     */
+    void* handle(AccessFlag accessFlags) const;
+    void ndoffset(size_t* ofs) const;
+
+    enum { MAGIC_VAL  = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG };
+    enum { MAGIC_MASK = 0xFFFF0000, TYPE_MASK = 0x00000FFF, DEPTH_MASK = 7 };
+
+    /*! includes several bit-fields:
+         - the magic signature
+         - continuity flag
+         - depth
+         - number of channels
+     */
+    int flags;
+
+    //! the matrix dimensionality, >= 2
+    int dims;
+
+    //! number of rows in the matrix; -1 when the matrix has more than 2 dimensions
+    int rows;
+
+    //! number of columns in the matrix; -1 when the matrix has more than 2 dimensions
+    int cols;
+
+    //! custom allocator
+    MatAllocator* allocator;
+
+    //! usage flags for allocator; it is recommended not to set this directly, but during construct/create/getUMat instead
+    UMatUsageFlags usageFlags;
+
+    //! and the standard allocator
+    static MatAllocator* getStdAllocator();
+
+    //! internal use method: updates the continuity flag
+    void updateContinuityFlag();
+
+    //! black-box container of UMat data
+    UMatData* u;
+
+    //! offset of the submatrix (or 0)
+    size_t offset;
+
+    //! dimensional size of the matrix; accessible in various formats
+    MatSize size;
+
+    //! number of bytes each matrix element/row/plane/dimension occupies
+    MatStep step;
+
+protected:
+};
+
+
+/////////////////////////// multi-dimensional sparse matrix //////////////////////////
+
+/** @brief The class SparseMat represents multi-dimensional sparse numerical arrays.
+
+Such a sparse array can store elements of any type that Mat can store. *Sparse* means that only
+non-zero elements are stored (though, as a result of operations on a sparse matrix, some of its
+stored elements can actually become 0. It is up to you to detect such elements and delete them
+using SparseMat::erase ). The non-zero elements are stored in a hash table that grows when it is
+filled so that the search time is O(1) on average (regardless of whether element is there or not).
+Elements can be accessed using the following methods:
+-   Query operations (SparseMat::ptr and the higher-level SparseMat::ref, SparseMat::value and
+    SparseMat::find), for example:
+    @code
+        const int dims = 5;
+        int size[5] = {10, 10, 10, 10, 10};
+        SparseMat sparse_mat(dims, size, CV_32F);
+        for(int i = 0; i < 1000; i++)
+        {
+            int idx[dims];
+            for(int k = 0; k < dims; k++)
+                idx[k] = rand() % size[k];
+            sparse_mat.ref<float>(idx) += 1.f;
+        }
+        cout << "nnz = " << sparse_mat.nzcount() << endl;
+    @endcode
+-   Sparse matrix iterators. They are similar to MatIterator but different from NAryMatIterator.
+    That is, the iteration loop is familiar to STL users:
+    @code
+        // prints elements of a sparse floating-point matrix
+        // and the sum of elements.
+        SparseMatConstIterator_<float>
+            it = sparse_mat.begin<float>(),
+            it_end = sparse_mat.end<float>();
+        double s = 0;
+        int dims = sparse_mat.dims();
+        for(; it != it_end; ++it)
+        {
+            // print element indices and the element value
+            const SparseMat::Node* n = it.node();
+            printf("(");
+            for(int i = 0; i < dims; i++)
+                printf("%d%s", n->idx[i], i < dims-1 ? ", " : ")");
+            printf(": %g\n", it.value<float>());
+            s += *it;
+        }
+        printf("Element sum is %g\n", s);
+    @endcode
+    If you run this loop, you will notice that elements are not enumerated in a logical order
+    (lexicographical, and so on). They come in the same order as they are stored in the hash table
+    (semi-randomly). You may collect pointers to the nodes and sort them to get the proper ordering.
+    Note, however, that pointers to the nodes may become invalid when you add more elements to the
+    matrix. This may happen due to possible buffer reallocation.
+-   Combination of the above 2 methods when you need to process 2 or more sparse matrices
+    simultaneously. For example, this is how you can compute unnormalized cross-correlation of the 2
+    floating-point sparse matrices:
+    @code
+        double cross_corr(const SparseMat& a, const SparseMat& b)
+        {
+            const SparseMat *_a = &a, *_b = &b;
+            // if b contains fewer elements than a,
+            // it is faster to iterate through b
+            if(_a->nzcount() > _b->nzcount())
+                std::swap(_a, _b);
+            SparseMatConstIterator_<float> it = _a->begin<float>(),
+                                           it_end = _a->end<float>();
+            double ccorr = 0;
+            for(; it != it_end; ++it)
+            {
+                // take the next element from the first matrix
+                float avalue = *it;
+                const SparseMat::Node* anode = it.node();
+                // and try to find an element with the same index in the second matrix.
+                // since the hash value depends only on the element index,
+                // reuse the hash value stored in the node
+                float bvalue = _b->value<float>(anode->idx,&anode->hashval);
+                ccorr += avalue*bvalue;
+            }
+            return ccorr;
+        }
+    @endcode
+ */
+class CV_EXPORTS SparseMat
+{
+public:
+    typedef SparseMatIterator iterator;
+    typedef SparseMatConstIterator const_iterator;
+
+    enum { MAGIC_VAL=0x42FD0000, MAX_DIM=32, HASH_SCALE=0x5bd1e995, HASH_BIT=0x80000000 };
+
+    //! the sparse matrix header
+    struct CV_EXPORTS Hdr
+    {
+        Hdr(int _dims, const int* _sizes, int _type);
+        void clear();
+        int refcount;
+        int dims;
+        int valueOffset;
+        size_t nodeSize;
+        size_t nodeCount;
+        size_t freeList;
+        std::vector<uchar> pool;
+        std::vector<size_t> hashtab;
+        int size[MAX_DIM];
+    };
+
+    //! sparse matrix node - element of a hash table
+    struct CV_EXPORTS Node
+    {
+        //! hash value
+        size_t hashval;
+        //! index of the next node in the same hash table entry
+        size_t next;
+        //! index of the matrix element
+        int idx[MAX_DIM];
+    };
+
+    /** @brief Various SparseMat constructors.
+     */
+    SparseMat();
+
+    /** @overload
+    @param dims Array dimensionality.
+    @param _sizes Sparse matrix size in all dimensions.
+    @param _type Sparse matrix data type.
+    */
+    SparseMat(int dims, const int* _sizes, int _type);
+
+    /** @overload
+    @param m Source sparse matrix for the copy constructor. Conversion from a dense matrix is
+    provided by the SparseMat(const Mat&) overload instead.
+    */
+    SparseMat(const SparseMat& m);
+
+    /** @overload
+    @param m Source dense matrix (Mat) for the converting constructor; it will be converted
+    to sparse representation.
+    */
+    explicit SparseMat(const Mat& m);
+
+    //! the destructor
+    ~SparseMat();
+
+    //! assignment operator. This is O(1) operation, i.e. no data is copied
+    SparseMat& operator = (const SparseMat& m);
+    //! equivalent to the corresponding constructor
+    SparseMat& operator = (const Mat& m);
+
+    //! creates full copy of the matrix
+    CV_NODISCARD_STD SparseMat clone() const;
+
+    //! copies all the data to the destination matrix. All the previous content of m is erased
+    void copyTo( SparseMat& m ) const;
+    //! converts sparse matrix to dense matrix.
+    void copyTo( Mat& m ) const;
+    //! multiplies all the matrix elements by the specified scale factor alpha and converts the results to the specified data type
+    void convertTo( SparseMat& m, int rtype, double alpha=1 ) const;
+    //! converts sparse matrix to dense n-dim matrix with optional type conversion and scaling.
+    /*!
+        @param [out] m - output matrix; if it does not have a proper size or type before the operation,
+            it is reallocated
+        @param [in] rtype - desired output matrix type or, rather, the depth since the number of channels
+            is the same as in the input; if rtype is negative, the output matrix will have the
+            same type as the input.
+        @param [in] alpha - optional scale factor
+        @param [in] beta - optional delta added to the scaled values
+    */
+    void convertTo( Mat& m, int rtype, double alpha=1, double beta=0 ) const;
+
+    // not used now
+    void assignTo( SparseMat& m, int type=-1 ) const;
+
+    //! reallocates sparse matrix.
+    /*!
+        If the matrix already had the proper size and type,
+        it is simply cleared with clear(), otherwise,
+        the old matrix is released (using release()) and the new one is allocated.
+    */
+    void create(int dims, const int* _sizes, int _type);
+    //! sets all the sparse matrix elements to 0, which means clearing the hash table.
+    void clear();
+    //! manually increments the reference counter to the header.
+    void addref();
+    //! decrements the header reference counter. When the counter reaches 0, the header and all the underlying data are deallocated.
+    void release();
+
+    //! converts sparse matrix to the old-style representation; all the elements are copied.
+    //operator CvSparseMat*() const;
+    //! returns the size of each element in bytes (not including the overhead - the space occupied by SparseMat::Node elements)
+    size_t elemSize() const;
+    //! returns elemSize()/channels()
+    size_t elemSize1() const;
+
+    //! returns type of sparse matrix elements
+    int type() const;
+    //! returns the depth of sparse matrix elements
+    int depth() const;
+    //! returns the number of channels
+    int channels() const;
+
+    //! returns the array of sizes, or NULL if the matrix is not allocated
+    const int* size() const;
+    //! returns the size of i-th matrix dimension (or 0)
+    int size(int i) const;
+    //! returns the matrix dimensionality
+    int dims() const;
+    //! returns the number of non-zero elements (=the number of hash table nodes)
+    size_t nzcount() const;
+
+    //! computes the element hash value (1D case)
+    size_t hash(int i0) const;
+    //! computes the element hash value (2D case)
+    size_t hash(int i0, int i1) const;
+    //! computes the element hash value (3D case)
+    size_t hash(int i0, int i1, int i2) const;
+    //! computes the element hash value (nD case)
+    size_t hash(const int* idx) const;
+
+    //!@{
+    /*!
+     specialized variants for 1D, 2D, 3D cases and the generic one for n-D case.
+     return pointer to the matrix element.
+      - if the element is there (it's non-zero), the pointer to it is returned
+      - if it's not there and createMissing=false, NULL pointer is returned
+      - if it's not there and createMissing=true, then the new element
+        is created and initialized with 0. Pointer to it is returned
+      - if the optional hashval pointer is not NULL, the element hash value is
+        not computed, but *hashval is taken instead.
+    */
+    //! returns pointer to the specified element (1D case)
+    uchar* ptr(int i0, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (2D case)
+    uchar* ptr(int i0, int i1, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (3D case)
+    uchar* ptr(int i0, int i1, int i2, bool createMissing, size_t* hashval=0);
+    //! returns pointer to the specified element (nD case)
+    uchar* ptr(const int* idx, bool createMissing, size_t* hashval=0);
+    //!@}
+
+    //!@{
+    /*!
+     return read-write reference to the specified sparse matrix element.
+
+     `ref<_Tp>(i0,...[,hashval])` is equivalent to `*(_Tp*)ptr(i0,...,true[,hashval])`.
+     The methods always return a valid reference.
+     If the element did not exist, it is created and initialized with 0.
+    */
+    //! returns reference to the specified element (1D case)
+    template<typename _Tp> _Tp& ref(int i0, size_t* hashval=0);
+    //! returns reference to the specified element (2D case)
+    template<typename _Tp> _Tp& ref(int i0, int i1, size_t* hashval=0);
+    //! returns reference to the specified element (3D case)
+    template<typename _Tp> _Tp& ref(int i0, int i1, int i2, size_t* hashval=0);
+    //! returns reference to the specified element (nD case)
+    template<typename _Tp> _Tp& ref(const int* idx, size_t* hashval=0);
+    //!@}
+
+    //!@{
+    /*!
+     return value of the specified sparse matrix element.
+
+     `value<_Tp>(i0,...[,hashval])` is equivalent to
+     @code
+     { const _Tp* p = find<_Tp>(i0,...[,hashval]); return p ? *p : _Tp(); }
+     @endcode
+
+     That is, if the element did not exist, the methods return 0.
+     */
+    //! returns value of the specified element (1D case)
+    template<typename _Tp> _Tp value(int i0, size_t* hashval=0) const;
+    //! returns value of the specified element (2D case)
+    template<typename _Tp> _Tp value(int i0, int i1, size_t* hashval=0) const;
+    //! returns value of the specified element (3D case)
+    template<typename _Tp> _Tp value(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! returns value of the specified element (nD case)
+    template<typename _Tp> _Tp value(const int* idx, size_t* hashval=0) const;
+    //!@}
+
+    //!@{
+    /*!
+     Return pointer to the specified sparse matrix element if it exists
+
+     `find<_Tp>(i0,...[,hashval])` is equivalent to `(const _Tp*)ptr(i0,...,false[,hashval])`.
+
+     If the specified element does not exist, the methods return NULL.
+    */
+    //! returns pointer to the specified element (1D case)
+    template<typename _Tp> const _Tp* find(int i0, size_t* hashval=0) const;
+    //! returns pointer to the specified element (2D case)
+    template<typename _Tp> const _Tp* find(int i0, int i1, size_t* hashval=0) const;
+    //! returns pointer to the specified element (3D case)
+    template<typename _Tp> const _Tp* find(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! returns pointer to the specified element (nD case)
+    template<typename _Tp> const _Tp* find(const int* idx, size_t* hashval=0) const;
+    //!@}
+
+    //! erases the specified element (2D case)
+    void erase(int i0, int i1, size_t* hashval=0);
+    //! erases the specified element (3D case)
+    void erase(int i0, int i1, int i2, size_t* hashval=0);
+    //! erases the specified element (nD case)
+    void erase(const int* idx, size_t* hashval=0);
+
+    //!@{
+    /*!
+       return the sparse matrix iterator pointing to the first sparse matrix element
+    */
+    //! returns the sparse matrix iterator at the matrix beginning
+    SparseMatIterator begin();
+    //! returns the typed sparse matrix iterator at the matrix beginning
+    template<typename _Tp> SparseMatIterator_<_Tp> begin();
+    //! returns the read-only sparse matrix iterator at the matrix beginning
+    SparseMatConstIterator begin() const;
+    //! returns the typed read-only sparse matrix iterator at the matrix beginning
+    template<typename _Tp> SparseMatConstIterator_<_Tp> begin() const;
+    //!@}
+    /*!
+       return the sparse matrix iterator pointing to the element following the last sparse matrix element
+    */
+    //! returns the sparse matrix iterator at the matrix end
+    SparseMatIterator end();
+    //! returns the read-only sparse matrix iterator at the matrix end
+    SparseMatConstIterator end() const;
+    //! returns the typed sparse matrix iterator at the matrix end
+    template<typename _Tp> SparseMatIterator_<_Tp> end();
+    //! returns the typed read-only sparse matrix iterator at the matrix end
+    template<typename _Tp> SparseMatConstIterator_<_Tp> end() const;
+
+    //! returns the value stored in the sparse matrix node
+    template<typename _Tp> _Tp& value(Node* n);
+    //! returns the value stored in the sparse matrix node (read-only overload)
+    template<typename _Tp> const _Tp& value(const Node* n) const;
+
+    ////////////// some internal-use methods ///////////////
+    Node* node(size_t nidx);
+    const Node* node(size_t nidx) const;
+
+    uchar* newNode(const int* idx, size_t hashval);
+    void removeNode(size_t hidx, size_t nidx, size_t previdx);
+    void resizeHashTab(size_t newsize);
+
+    int flags;
+    Hdr* hdr;
+};
+
+
+
+///////////////////////////////// SparseMat_<_Tp> ////////////////////////////////////
+
+/** @brief Template sparse n-dimensional array class derived from SparseMat
+
+SparseMat_ is a thin wrapper on top of SparseMat created in the same way as Mat_ . It simplifies
+notation of some operations:
+@code
+    int sz[] = {10, 20, 30};
+    SparseMat_<double> M(3, sz);
+    ...
+    M.ref(1, 2, 3) = M(4, 5, 6) + M(7, 8, 9);
+@endcode
+ */
+template<typename _Tp> class SparseMat_ : public SparseMat
+{
+public:
+    typedef SparseMatIterator_<_Tp> iterator;
+    typedef SparseMatConstIterator_<_Tp> const_iterator;
+
+    //! the default constructor
+    SparseMat_();
+    //! the full constructor equivalent to SparseMat(dims, _sizes, DataType<_Tp>::type)
+    SparseMat_(int dims, const int* _sizes);
+    //! the copy constructor. If DataType<_Tp>::type != m.type(), the m elements are converted
+    SparseMat_(const SparseMat& m);
+    //! the copy constructor. This is O(1) operation - no data is copied
+    SparseMat_(const SparseMat_& m);
+    //! converts dense matrix to the sparse form
+    SparseMat_(const Mat& m);
+    //! converts the old-style sparse matrix to the C++ class. All the elements are copied
+    //SparseMat_(const CvSparseMat* m);
+    //! the assignment operator. If DataType<_Tp>::type != m.type(), the m elements are converted
+    SparseMat_& operator = (const SparseMat& m);
+    //! the assignment operator. This is O(1) operation - no data is copied
+    SparseMat_& operator = (const SparseMat_& m);
+    //! converts dense matrix to the sparse form
+    SparseMat_& operator = (const Mat& m);
+
+    //! makes full copy of the matrix. All the elements are duplicated
+    CV_NODISCARD_STD SparseMat_ clone() const;
+    //! equivalent to cv::SparseMat::create(dims, _sizes, DataType<_Tp>::type)
+    void create(int dims, const int* _sizes);
+    //! converts sparse matrix to the old-style CvSparseMat. All the elements are copied
+    //operator CvSparseMat*() const;
+
+    //! returns type of the matrix elements
+    int type() const;
+    //! returns depth of the matrix elements
+    int depth() const;
+    //! returns the number of channels in each matrix element
+    int channels() const;
+
+    //! equivalent to SparseMat::ref<_Tp>(i0, hashval)
+    _Tp& ref(int i0, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(i0, i1, hashval)
+    _Tp& ref(int i0, int i1, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(i0, i1, i2, hashval)
+    _Tp& ref(int i0, int i1, int i2, size_t* hashval=0);
+    //! equivalent to SparseMat::ref<_Tp>(idx, hashval)
+    _Tp& ref(const int* idx, size_t* hashval=0);
+
+    //! equivalent to SparseMat::value<_Tp>(i0, hashval)
+    _Tp operator()(int i0, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(i0, i1, hashval)
+    _Tp operator()(int i0, int i1, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(i0, i1, i2, hashval)
+    _Tp operator()(int i0, int i1, int i2, size_t* hashval=0) const;
+    //! equivalent to SparseMat::value<_Tp>(idx, hashval)
+    _Tp operator()(const int* idx, size_t* hashval=0) const;
+
+    //! returns sparse matrix iterator pointing to the first sparse matrix element
+    SparseMatIterator_<_Tp> begin();
+    //! returns read-only sparse matrix iterator pointing to the first sparse matrix element
+    SparseMatConstIterator_<_Tp> begin() const;
+    //! returns sparse matrix iterator pointing to the element following the last sparse matrix element
+    SparseMatIterator_<_Tp> end();
+    //! returns read-only sparse matrix iterator pointing to the element following the last sparse matrix element
+    SparseMatConstIterator_<_Tp> end() const;
+};
+
+
+
+////////////////////////////////// MatConstIterator //////////////////////////////////
+
+class CV_EXPORTS MatConstIterator
+{
+public:
+    typedef uchar* value_type;
+    typedef ptrdiff_t difference_type;
+    typedef const uchar** pointer;
+    typedef uchar* reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! default constructor
+    MatConstIterator();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatConstIterator(const Mat* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatConstIterator(const Mat* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the element of the matrix specified by a point
+    MatConstIterator(const Mat* _m, Point _pt);
+    //! constructor that sets the iterator to the element of the matrix specified by an index array
+    MatConstIterator(const Mat* _m, const int* _idx);
+    //! copy constructor
+    MatConstIterator(const MatConstIterator& it);
+
+    //! copy operator
+    MatConstIterator& operator = (const MatConstIterator& it);
+    //! returns the current matrix element
+    const uchar* operator *() const;
+    //! returns the i-th matrix element, relative to the current
+    const uchar* operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatConstIterator& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatConstIterator& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatConstIterator& operator --();
+    //! decrements the iterator (postfix form)
+    MatConstIterator operator --(int);
+    //! increments the iterator (prefix form)
+    MatConstIterator& operator ++();
+    //! increments the iterator (postfix form)
+    MatConstIterator operator ++(int);
+    //! returns the current iterator position
+    Point pos() const;
+    //! stores the current iterator position in the _idx output array (presumably one index per dimension - confirm)
+    void pos(int* _idx) const;
+
+    ptrdiff_t lpos() const;
+    void seek(ptrdiff_t ofs, bool relative = false);
+    void seek(const int* _idx, bool relative = false);
+
+    const Mat* m;
+    size_t elemSize;
+    const uchar* ptr;
+    const uchar* sliceStart;
+    const uchar* sliceEnd;
+};
+
+
+
+////////////////////////////////// MatConstIterator_ /////////////////////////////////
+
+/** @brief Matrix read-only iterator
+ */
+template<typename _Tp>
+class MatConstIterator_ : public MatConstIterator
+{
+public:
+    typedef _Tp value_type;
+    typedef ptrdiff_t difference_type;
+    typedef const _Tp* pointer;
+    typedef const _Tp& reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! default constructor
+    MatConstIterator_();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatConstIterator_(const Mat_<_Tp>* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the element of the matrix specified by a point
+    MatConstIterator_(const Mat_<_Tp>* _m, Point _pt);
+    //! constructor that sets the iterator to the element of the matrix specified by an index array
+    MatConstIterator_(const Mat_<_Tp>* _m, const int* _idx);
+    //! copy constructor
+    MatConstIterator_(const MatConstIterator_& it);
+
+    //! copy operator
+    MatConstIterator_& operator = (const MatConstIterator_& it);
+    //! returns the current matrix element (read-only reference)
+    const _Tp& operator *() const;
+    //! returns the i-th matrix element, relative to the current (read-only reference)
+    const _Tp& operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatConstIterator_& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatConstIterator_& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatConstIterator_& operator --();
+    //! decrements the iterator (postfix form)
+    MatConstIterator_ operator --(int);
+    //! increments the iterator (prefix form)
+    MatConstIterator_& operator ++();
+    //! increments the iterator (postfix form)
+    MatConstIterator_ operator ++(int);
+    //! returns the current iterator position
+    Point pos() const;
+};
+
+
+
+//////////////////////////////////// MatIterator_ ////////////////////////////////////
+
+/** @brief Matrix read-write iterator
+*/
+template<typename _Tp>
+class MatIterator_ : public MatConstIterator_<_Tp>
+{
+public:
+    typedef _Tp* pointer;
+    typedef _Tp& reference;
+
+    typedef std::random_access_iterator_tag iterator_category;
+
+    //! the default constructor
+    MatIterator_();
+    //! constructor that sets the iterator to the beginning of the matrix
+    MatIterator_(Mat_<_Tp>* _m);
+    //! constructor that sets the iterator to the specified element (row, column) of the matrix
+    MatIterator_(Mat_<_Tp>* _m, int _row, int _col=0);
+    //! constructor that sets the iterator to the element of the matrix specified by a point
+    MatIterator_(Mat_<_Tp>* _m, Point _pt);
+    //! constructor that sets the iterator to the element of the matrix specified by an index array
+    MatIterator_(Mat_<_Tp>* _m, const int* _idx);
+    //! copy constructor
+    MatIterator_(const MatIterator_& it);
+    //! copy operator
+    MatIterator_& operator = (const MatIterator_<_Tp>& it );
+
+    //! returns the current matrix element (read-write reference)
+    _Tp& operator *() const;
+    //! returns the i-th matrix element, relative to the current (read-write reference)
+    _Tp& operator [](ptrdiff_t i) const;
+
+    //! shifts the iterator forward by the specified number of elements
+    MatIterator_& operator += (ptrdiff_t ofs);
+    //! shifts the iterator backward by the specified number of elements
+    MatIterator_& operator -= (ptrdiff_t ofs);
+    //! decrements the iterator (prefix form)
+    MatIterator_& operator --();
+    //! decrements the iterator (postfix form)
+    MatIterator_ operator --(int);
+    //! increments the iterator (prefix form)
+    MatIterator_& operator ++();
+    //! increments the iterator (postfix form)
+    MatIterator_ operator ++(int);
+};
+
+
+
+/////////////////////////////// SparseMatConstIterator ///////////////////////////////
+
+/**  @brief Read-Only Sparse Matrix Iterator.
+
+ Here is how to use the iterator to compute the sum of floating-point sparse matrix elements:
+
+ \code
+ SparseMatConstIterator it = m.begin(), it_end = m.end();
+ double s = 0;
+ CV_Assert( m.type() == CV_32F );
+ for( ; it != it_end; ++it )
+    s += it.value<float>();
+ \endcode
+*/
+class CV_EXPORTS SparseMatConstIterator
+{
+public:
+    //! the default constructor
+    SparseMatConstIterator();
+    //! the full constructor setting the iterator to the first sparse matrix element
+    SparseMatConstIterator(const SparseMat* _m);
+    //! the copy constructor
+    SparseMatConstIterator(const SparseMatConstIterator& it);
+
+    //! the assignment operator
+    SparseMatConstIterator& operator = (const SparseMatConstIterator& it);
+
+    //! template method returning the current matrix element
+    template<typename _Tp> const _Tp& value() const;
+    //! returns the current node of the sparse matrix. it.node()->idx is the current element index
+    const SparseMat::Node* node() const;
+
+    //! moves iterator to the previous element (prefix form)
+    SparseMatConstIterator& operator --();
+    //! moves iterator to the previous element (postfix form)
+    SparseMatConstIterator operator --(int);
+    //! moves iterator to the next element (prefix form)
+    SparseMatConstIterator& operator ++();
+    //! moves iterator to the next element (postfix form)
+    SparseMatConstIterator operator ++(int);
+
+    //! moves iterator to the element after the last element
+    void seekEnd();
+
+    const SparseMat* m;
+    size_t hashidx;
+    uchar* ptr;
+};
+
+
+
+////////////////////////////////// SparseMatIterator /////////////////////////////////
+
+/** @brief  Read-write Sparse Matrix Iterator
+
+ The class is similar to cv::SparseMatConstIterator,
+ but can be used for in-place modification of the matrix elements.
+*/
+class CV_EXPORTS SparseMatIterator : public SparseMatConstIterator
+{
+public:
+    //! the default constructor
+    SparseMatIterator();
+    //! the full constructor setting the iterator to the first sparse matrix element
+    SparseMatIterator(SparseMat* _m);
+    //! the full constructor setting the iterator to the specified sparse matrix element
+    SparseMatIterator(SparseMat* _m, const int* idx);
+    //! the copy constructor
+    SparseMatIterator(const SparseMatIterator& it);
+
+    //! the assignment operator
+    SparseMatIterator& operator = (const SparseMatIterator& it);
+    //! returns read-write reference to the current sparse matrix element
+    template<typename _Tp> _Tp& value() const;
+    //! returns pointer to the current sparse matrix node. it.node()->idx is the index of the current element (do not modify it!)
+    SparseMat::Node* node() const;
+
+    //! moves iterator to the next element (prefix form)
+    SparseMatIterator& operator ++();
+    //! moves iterator to the next element (postfix form)
+    SparseMatIterator operator ++(int);
+};
+
+
+
+/////////////////////////////// SparseMatConstIterator_ //////////////////////////////
+
+/** @brief  Template Read-Only Sparse Matrix Iterator Class.
+
+ This class is derived from cv::SparseMatConstIterator and
+ introduces a more convenient operator *() for accessing the current element.
+*/
+template<typename _Tp> class SparseMatConstIterator_ : public SparseMatConstIterator
+{
+public:
+    //! the iterator is advertised to generic code as a forward iterator
+    typedef std::forward_iterator_tag iterator_category;
+
+    //! the default constructor
+    SparseMatConstIterator_();
+    //! the full constructors setting the iterator to the first sparse matrix element
+    SparseMatConstIterator_(const SparseMat_<_Tp>* _m);
+    SparseMatConstIterator_(const SparseMat* _m); //!< overload taking a plain (non-template) SparseMat pointer
+    //! the copy constructor
+    SparseMatConstIterator_(const SparseMatConstIterator_& it);
+
+    //! the assignment operator
+    SparseMatConstIterator_& operator = (const SparseMatConstIterator_& it);
+    //! the element access operator (read-only reference to the current element)
+    const _Tp& operator *() const;
+
+    //! moves iterator to the next element (prefix form, returns a reference to this iterator)
+    SparseMatConstIterator_& operator ++();
+    //! moves iterator to the next element (postfix form, returns an iterator by value)
+    SparseMatConstIterator_ operator ++(int);
+};
+
+
+
+///////////////////////////////// SparseMatIterator_ /////////////////////////////////
+
+/** @brief  Template Read-Write Sparse Matrix Iterator Class.
+
+ This class is derived from cv::SparseMatConstIterator_ and
+ introduces a more convenient operator *() for accessing the current element.
+*/
+template<typename _Tp> class SparseMatIterator_ : public SparseMatConstIterator_<_Tp>
+{
+public:
+    //! the iterator is advertised to generic code as a forward iterator
+    typedef std::forward_iterator_tag iterator_category;
+
+    //! the default constructor
+    SparseMatIterator_();
+    //! the full constructors setting the iterator to the first sparse matrix element
+    SparseMatIterator_(SparseMat_<_Tp>* _m);
+    SparseMatIterator_(SparseMat* _m); //!< overload taking a plain (non-template) SparseMat pointer
+    //! the copy constructor
+    SparseMatIterator_(const SparseMatIterator_& it);
+
+    //! the assignment operator
+    SparseMatIterator_& operator = (const SparseMatIterator_& it);
+    //! returns the read-write reference to the current element
+    _Tp& operator *() const;
+
+    //! moves the iterator to the next element (prefix form, returns a reference to this iterator)
+    SparseMatIterator_& operator ++();
+    //! moves the iterator to the next element (postfix form, returns an iterator by value)
+    SparseMatIterator_ operator ++(int);
+};
+
+
+
+/////////////////////////////////// NAryMatIterator //////////////////////////////////
+
+/** @brief n-ary multi-dimensional array iterator.
+
+Use the class to implement unary, binary, and, generally, n-ary element-wise operations on
+multi-dimensional arrays. Some of the arguments of an n-ary function may be continuous arrays, some
+may be not. It is possible to use conventional MatIterator 's for each array but incrementing all of
+the iterators after each small operation may be a big overhead. In this case consider using
+NAryMatIterator to iterate through several matrices simultaneously as long as they have the same
+geometry (dimensionality and all the dimension sizes are the same). On each iteration `it.planes[0]`,
+`it.planes[1]`,... will be the slices of the corresponding matrices.
+
+The example below illustrates how you can compute a normalized and thresholded 3D color histogram:
+@code
+    void computeNormalizedColorHist(const Mat& image, Mat& hist, int N, double minProb)
+    {
+        const int histSize[] = {N, N, N};
+
+        // make sure that the histogram has a proper size and type
+        hist.create(3, histSize, CV_32F);
+
+        // and clear it
+        hist = Scalar(0);
+
+        // the loop below assumes that the image
+        // is a 8-bit 3-channel. check it.
+        CV_Assert(image.type() == CV_8UC3);
+        MatConstIterator_<Vec3b> it = image.begin<Vec3b>(),
+                                 it_end = image.end<Vec3b>();
+        for( ; it != it_end; ++it )
+        {
+            const Vec3b& pix = *it;
+            hist.at<float>(pix[0]*N/256, pix[1]*N/256, pix[2]*N/256) += 1.f;
+        }
+
+        minProb *= image.rows*image.cols;
+
+        // initialize iterator (the style is different from STL).
+        // after initialization the iterator will contain
+        // the number of slices or planes the iterator will go through.
+        // it simultaneously increments iterators for several matrices
+        // supplied as a null terminated list of pointers
+        const Mat* arrays[] = {&hist, 0};
+        Mat planes[1];
+        NAryMatIterator itNAry(arrays, planes, 1);
+        double s = 0;
+        // iterate through the matrix. on each iteration
+        // itNAry.planes[i] (of type Mat) will be set to the current plane
+        // of the i-th n-dim matrix passed to the iterator constructor.
+        for(size_t p = 0; p < itNAry.nplanes; p++, ++itNAry)
+        {
+            threshold(itNAry.planes[0], itNAry.planes[0], minProb, 0, THRESH_TOZERO);
+            s += sum(itNAry.planes[0])[0];
+        }
+
+        s = 1./s;
+        itNAry = NAryMatIterator(arrays, planes, 1);
+        for(size_t p = 0; p < itNAry.nplanes; p++, ++itNAry)
+            itNAry.planes[0] *= s;
+    }
+@endcode
+ */
+class CV_EXPORTS NAryMatIterator
+{
+public:
+    //! the default constructor
+    NAryMatIterator();
+    //! the full constructor taking arbitrary number of n-dim matrices; variant working with raw data pointers (ptrs)
+    NAryMatIterator(const Mat** arrays, uchar** ptrs, int narrays=-1);
+    //! the full constructor taking arbitrary number of n-dim matrices; variant working with Mat planes
+    NAryMatIterator(const Mat** arrays, Mat* planes, int narrays=-1);
+    //! the separate iterator initialization method (takes both the planes and the ptrs arguments)
+    void init(const Mat** arrays, Mat* planes, uchar** ptrs, int narrays=-1);
+
+    //! proceeds to the next plane of every iterated matrix (prefix increment operator)
+    NAryMatIterator& operator ++();
+    //! proceeds to the next plane of every iterated matrix (postfix increment operator)
+    NAryMatIterator operator ++(int);
+
+    //! the iterated arrays
+    const Mat** arrays;
+    //! the current planes
+    Mat* planes;
+    //! data pointers
+    uchar** ptrs;
+    //! the number of arrays
+    int narrays;
+    //! the number of hyper-planes that the iterator steps through (note: unsigned, size_t)
+    size_t nplanes;
+    //! the size of each segment (in elements)
+    size_t size;
+protected:
+    int iterdepth;
+    size_t idx;
+};
+
+
+
+///////////////////////////////// Matrix Expressions /////////////////////////////////
+//! Interface of a matrix-expression operation (MatExpr::op points to one); assign() is its only pure virtual method.
+class CV_EXPORTS MatOp
+{
+public:
+    MatOp();
+    virtual ~MatOp();
+
+    virtual bool elementWise(const MatExpr& expr) const;
+    virtual void assign(const MatExpr& expr, Mat& m, int type=-1) const = 0;
+    virtual void roi(const MatExpr& expr, const Range& rowRange,
+                     const Range& colRange, MatExpr& res) const;
+    virtual void diag(const MatExpr& expr, int d, MatExpr& res) const;
+    virtual void augAssignAdd(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignSubtract(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignMultiply(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignDivide(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignAnd(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignOr(const MatExpr& expr, Mat& m) const;
+    virtual void augAssignXor(const MatExpr& expr, Mat& m) const;
+    //! expression-building hooks: the operands come first, res is the non-const output expression
+    virtual void add(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void add(const MatExpr& expr1, const Scalar& s, MatExpr& res) const;
+
+    virtual void subtract(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void subtract(const Scalar& s, const MatExpr& expr, MatExpr& res) const;
+
+    virtual void multiply(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res, double scale=1) const;
+    virtual void multiply(const MatExpr& expr1, double s, MatExpr& res) const;
+
+    virtual void divide(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res, double scale=1) const;
+    virtual void divide(double s, const MatExpr& expr, MatExpr& res) const;
+
+    virtual void abs(const MatExpr& expr, MatExpr& res) const;
+
+    virtual void transpose(const MatExpr& expr, MatExpr& res) const;
+    virtual void matmul(const MatExpr& expr1, const MatExpr& expr2, MatExpr& res) const;
+    virtual void invert(const MatExpr& expr, int method, MatExpr& res) const;
+    //! size and type queries for the given expression
+    virtual Size size(const MatExpr& expr) const;
+    virtual int type(const MatExpr& expr) const;
+};
+
+/** @brief Matrix expression representation
+@anchor MatrixExpressions
+This is a list of implemented matrix operations that can be combined in arbitrarily complex
+expressions (here A, B stand for matrices ( Mat ), s for a scalar ( Scalar ), alpha for a
+real-valued scalar ( double )):
+-   Addition, subtraction, negation: `A+B`, `A-B`, `A+s`, `A-s`, `s+A`, `s-A`, `-A`
+-   Scaling: `A*alpha`
+-   Per-element multiplication and division: `A.mul(B)`, `A/B`, `alpha/A`
+-   Matrix multiplication: `A*B`
+-   Transposition: `A.t()` (means A<sup>T</sup>)
+-   Matrix inversion and pseudo-inversion, solving linear systems and least-squares problems:
+    `A.inv([method]) (~ A<sup>-1</sup>)`,   `A.inv([method])*B (~ X: AX=B)`
+-   Comparison: `A cmpop B`, `A cmpop alpha`, `alpha cmpop A`, where *cmpop* is one of
+    `>`, `>=`, `==`, `!=`, `<=`, `<`. The result of comparison is an 8-bit single channel mask whose
+    elements are set to 255 (if the particular element or pair of elements satisfy the condition) or
+    0.
+-   Bitwise logical operations: `A logicop B`, `A logicop s`, `s logicop A`, `~A`, where *logicop* is one of
+    `&`, `|`, `^`.
+-   Element-wise minimum and maximum: `min(A, B)`, `min(A, alpha)`, `max(A, B)`, `max(A, alpha)`
+-   Element-wise absolute value: `abs(A)`
+-   Cross-product, dot-product: `A.cross(B)`, `A.dot(B)`
+-   Any function of matrix or matrices and scalars that returns a matrix or a scalar, such as norm,
+    mean, sum, countNonZero, trace, determinant, repeat, and others.
+-   Matrix initializers ( Mat::eye(), Mat::zeros(), Mat::ones() ), matrix comma-separated
+    initializers, matrix constructors and operators that extract sub-matrices (see Mat description).
+-   Mat_<destination_type>() constructors to cast the result to the proper type.
+@note Comma-separated initializers and probably some other operations may require additional
+explicit Mat() or Mat_<T>() constructor calls to resolve a possible ambiguity.
+
+Here are examples of matrix expressions:
+@code
+    // compute pseudo-inverse of A, equivalent to A.inv(DECOMP_SVD)
+    SVD svd(A);
+    Mat pinvA = svd.vt.t()*Mat::diag(1./svd.w)*svd.u.t();
+
+    // compute the new vector of parameters in the Levenberg-Marquardt algorithm
+    x -= (A.t()*A + lambda*Mat::eye(A.cols,A.cols,A.type())).inv(DECOMP_CHOLESKY)*(A.t()*err);
+
+    // sharpen image using "unsharp mask" algorithm
+    Mat blurred; double sigma = 1, threshold = 5, amount = 1;
+    GaussianBlur(img, blurred, Size(), sigma, sigma);
+    Mat lowContrastMask = abs(img - blurred) < threshold;
+    Mat sharpened = img*(1+amount) + blurred*(-amount);
+    img.copyTo(sharpened, lowContrastMask);
+@endcode
+*/
+class CV_EXPORTS MatExpr
+{
+public:
+    MatExpr();
+    explicit MatExpr(const Mat& m);
+    //! full constructor: takes the operation, its flags, up to three matrix operands, two double factors and a Scalar
+    MatExpr(const MatOp* _op, int _flags, const Mat& _a = Mat(), const Mat& _b = Mat(),
+            const Mat& _c = Mat(), double _alpha = 1, double _beta = 1, const Scalar& _s = Scalar());
+
+    operator Mat() const;
+    template<typename _Tp> operator Mat_<_Tp>() const;
+
+    Size size() const;
+    int type() const;
+
+    MatExpr row(int y) const;
+    MatExpr col(int x) const;
+    MatExpr diag(int d = 0) const;
+    MatExpr operator()( const Range& rowRange, const Range& colRange ) const;
+    MatExpr operator()( const Rect& roi ) const;
+
+    MatExpr t() const;
+    MatExpr inv(int method = DECOMP_LU) const;
+    MatExpr mul(const MatExpr& e, double scale=1) const;
+    MatExpr mul(const Mat& m, double scale=1) const;
+
+    Mat cross(const Mat& m) const;
+    double dot(const Mat& m) const;
+
+    void swap(MatExpr& b);
+    //! the operation (see MatOp) and its flags; how the operands below are interpreted presumably depends on op - confirm in the MatOp implementations
+    const MatOp* op;
+    int flags;
+
+    Mat a, b, c;
+    double alpha, beta;
+    Scalar s;
+};
+
+//! @} core_basic
+
+//! @relates cv::MatExpr
+//! @{
+CV_EXPORTS MatExpr operator + (const Mat& a, const Mat& b); // addition: A+B, A+s, s+A (see MatrixExpressions)
+CV_EXPORTS MatExpr operator + (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator + (const Scalar& s, const Mat& a);
+CV_EXPORTS MatExpr operator + (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator + (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator + (const MatExpr& e, const Scalar& s);
+CV_EXPORTS MatExpr operator + (const Scalar& s, const MatExpr& e);
+CV_EXPORTS MatExpr operator + (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Mat& a, const Matx<_Tp, m, n>& b) { return a + Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator + (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) + b; }
+// Subtraction: A-B, A-s, s-A. The Matx overloads convert the Matx operand to Mat and forward.
+CV_EXPORTS MatExpr operator - (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator - (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator - (const Scalar& s, const Mat& a);
+CV_EXPORTS MatExpr operator - (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator - (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator - (const MatExpr& e, const Scalar& s);
+CV_EXPORTS MatExpr operator - (const Scalar& s, const MatExpr& e);
+CV_EXPORTS MatExpr operator - (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Mat& a, const Matx<_Tp, m, n>& b) { return a - Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator - (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) - b; }
+// Unary negation: -A.
+CV_EXPORTS MatExpr operator - (const Mat& m);
+CV_EXPORTS MatExpr operator - (const MatExpr& e);
+// Per the MatExpr documentation A*B is matrix multiplication and A*alpha is scaling; the per-element product is A.mul(B).
+CV_EXPORTS MatExpr operator * (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator * (const Mat& a, double s);
+CV_EXPORTS MatExpr operator * (double s, const Mat& a);
+CV_EXPORTS MatExpr operator * (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator * (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator * (const MatExpr& e, double s);
+CV_EXPORTS MatExpr operator * (double s, const MatExpr& e);
+CV_EXPORTS MatExpr operator * (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Mat& a, const Matx<_Tp, m, n>& b) { return a * Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator * (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) * b; }
+// Per the MatExpr documentation, unlike operator *, A/B and alpha/A are per-element divisions.
+CV_EXPORTS MatExpr operator / (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator / (const Mat& a, double s);
+CV_EXPORTS MatExpr operator / (double s, const Mat& a);
+CV_EXPORTS MatExpr operator / (const MatExpr& e, const Mat& m);
+CV_EXPORTS MatExpr operator / (const Mat& m, const MatExpr& e);
+CV_EXPORTS MatExpr operator / (const MatExpr& e, double s);
+CV_EXPORTS MatExpr operator / (double s, const MatExpr& e);
+CV_EXPORTS MatExpr operator / (const MatExpr& e1, const MatExpr& e2);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Mat& a, const Matx<_Tp, m, n>& b) { return a / Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator / (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) / b; }
+// Comparisons: per the MatExpr documentation the result is an 8-bit single-channel mask, 255 where the condition holds and 0 elsewhere.
+CV_EXPORTS MatExpr operator < (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator < (const Mat& a, double s);
+CV_EXPORTS MatExpr operator < (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Mat& a, const Matx<_Tp, m, n>& b) { return a < Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator < (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) < b; }
+
+CV_EXPORTS MatExpr operator <= (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator <= (const Mat& a, double s);
+CV_EXPORTS MatExpr operator <= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Mat& a, const Matx<_Tp, m, n>& b) { return a <= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator <= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) <= b; }
+
+CV_EXPORTS MatExpr operator == (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator == (const Mat& a, double s);
+CV_EXPORTS MatExpr operator == (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Mat& a, const Matx<_Tp, m, n>& b) { return a == Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator == (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) == b; }
+
+CV_EXPORTS MatExpr operator != (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator != (const Mat& a, double s);
+CV_EXPORTS MatExpr operator != (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Mat& a, const Matx<_Tp, m, n>& b) { return a != Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator != (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) != b; }
+
+CV_EXPORTS MatExpr operator >= (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator >= (const Mat& a, double s);
+CV_EXPORTS MatExpr operator >= (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Mat& a, const Matx<_Tp, m, n>& b) { return a >= Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator >= (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) >= b; }
+
+CV_EXPORTS MatExpr operator > (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator > (const Mat& a, double s);
+CV_EXPORTS MatExpr operator > (double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Mat& a, const Matx<_Tp, m, n>& b) { return a > Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator > (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) > b; }
+// Bitwise logical operations: A op B, A op s, s op A for &, |, ^, plus the unary ~A declared last.
+CV_EXPORTS MatExpr operator & (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator & (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator & (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Mat& a, const Matx<_Tp, m, n>& b) { return a & Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator & (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) & b; }
+
+CV_EXPORTS MatExpr operator | (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator | (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator | (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Mat& a, const Matx<_Tp, m, n>& b) { return a | Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator | (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) | b; }
+
+CV_EXPORTS MatExpr operator ^ (const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr operator ^ (const Mat& a, const Scalar& s);
+CV_EXPORTS MatExpr operator ^ (const Scalar& s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Mat& a, const Matx<_Tp, m, n>& b) { return a ^ Mat(b); }
+template<typename _Tp, int m, int n> static inline
+MatExpr operator ^ (const Matx<_Tp, m, n>& a, const Mat& b) { return Mat(a) ^ b; }
+
+CV_EXPORTS MatExpr operator ~(const Mat& m);
+// Element-wise minimum and maximum: min(A, B), min(A, alpha), max(A, B), max(A, alpha) and the mirrored scalar-first forms.
+CV_EXPORTS MatExpr min(const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr min(const Mat& a, double s);
+CV_EXPORTS MatExpr min(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Mat& a, const Matx<_Tp, m, n>& b) { return min(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr min (const Matx<_Tp, m, n>& a, const Mat& b) { return min(Mat(a), b); }
+
+CV_EXPORTS MatExpr max(const Mat& a, const Mat& b);
+CV_EXPORTS MatExpr max(const Mat& a, double s);
+CV_EXPORTS MatExpr max(double s, const Mat& a);
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Mat& a, const Matx<_Tp, m, n>& b) { return max(a, Mat(b)); }
+template<typename _Tp, int m, int n> static inline
+MatExpr max (const Matx<_Tp, m, n>& a, const Mat& b) { return max(Mat(a), b); }
+
+/** @brief Calculates the absolute value of each matrix element.
+
+abs is a meta-function that is expanded to one of absdiff or convertScaleAbs forms:
+- C = abs(A-B) is equivalent to `absdiff(A, B, C)`
+- C = abs(A) is equivalent to `absdiff(A, Scalar::all(0), C)`
+- C = `Mat_<Vec<uchar,n> >(abs(A*alpha + beta))` is equivalent to `convertScaleAbs(A, C, alpha,
+beta)`
+
+The output matrix has the same size and the same type as the input one except for the last case,
+where C has depth CV_8U.
+@param m matrix.
+@sa @ref MatrixExpressions, absdiff, convertScaleAbs
+ */
+CV_EXPORTS MatExpr abs(const Mat& m);
+/** @overload
+@param e matrix expression.
+*/
+CV_EXPORTS MatExpr abs(const MatExpr& e);
+//! @} relates cv::MatExpr
+
+} // cv
+
+#include "opencv2/core/mat.inl.hpp"
+
+#endif // OPENCV_CORE_MAT_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.inl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.inl.hpp
new file mode 100644
index 0000000..886b82c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/mat.inl.hpp
@@ -0,0 +1,3422 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MATRIX_OPERATIONS_HPP
+#define OPENCV_CORE_MATRIX_OPERATIONS_HPP
+
+#ifndef __cplusplus
+#  error mat.inl.hpp header must be compiled as C++
+#endif
+
+#ifdef _MSC_VER
+#pragma warning( push )
+#pragma warning( disable: 4127 )
+#endif
+
+#if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS)
+  // nothing
+#elif defined(CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS)
+  #define CV_DISABLE_CLANG_ENUM_WARNINGS
+#elif defined(__clang__) && defined(__has_warning)
+  #if __has_warning("-Wdeprecated-enum-enum-conversion") && __has_warning("-Wdeprecated-anon-enum-enum-conversion")
+    #define CV_DISABLE_CLANG_ENUM_WARNINGS
+  #endif
+#endif
+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
+#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
+#endif
+
+namespace cv
+{
+CV__DEBUG_NS_BEGIN
+
+
+//! @cond IGNORED
+
+////////////////////////// Custom (raw) type wrapper //////////////////////////
+
+//! Type code for wrapping arbitrary elements as raw bytes: CV_MAKETYPE(CV_8U, sizeof(_Tp)).
+template<typename _Tp> static inline
+int rawType()
+{
+    CV_StaticAssert(sizeof(_Tp) <= CV_CN_MAX, "sizeof(_Tp) is too large");
+    return static_cast<int>(CV_MAKETYPE(CV_8U, static_cast<int>(sizeof(_Tp))));
+}
+
+//////////////////////// Input/Output Arrays ////////////////////////
+
+inline void _InputArray::init(int _flags, const void* _obj)
+{ obj = const_cast<void*>(_obj); flags = _flags; }
+//! Overload that also records the size passed by the caller.
+inline void _InputArray::init(int _flags, const void* _obj, Size _sz)
+{ obj = const_cast<void*>(_obj); sz = _sz; flags = _flags; }
+//! Plain accessors for the stored object pointer, flags and size.
+inline void* _InputArray::getObj() const { return this->obj; }
+inline int _InputArray::getFlags() const { return this->flags; }
+inline Size _InputArray::getSz() const { return this->sz; }
+
+inline _InputArray::_InputArray() { init(0 + NONE, 0); } // empty wrapper: kind NONE, null object pointer
+inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); } // flags are passed through unchanged
+inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); }
+inline _InputArray::_InputArray(const std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); }
+inline _InputArray::_InputArray(const UMat& m) { init(UMAT+ACCESS_READ, &m); }
+inline _InputArray::_InputArray(const std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_READ, &vec); }
+// The typed wrappers below add FIXED_TYPE and the element type code traits::Type<_Tp>::value to the kind flag.
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+// std::array is tagged as a FIXED_SIZE MATX: arr.data() and Size(1, _Nm) are stored rather than the container address.
+template<typename _Tp, std::size_t _Nm> inline
+_InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline
+_InputArray::_InputArray(const std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); }
+
+inline
+_InputArray::_InputArray(const std::vector<bool>& vec)
+{ init(FIXED_TYPE + STD_BOOL_VECTOR + traits::Type<bool>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_READ, &vec); }
+
+template<typename _Tp, int m, int n> inline
+_InputArray::_InputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, &mtx, Size(n, m)); }
+// A raw pointer with a length is tagged as MATX as well, with Size(n, 1).
+template<typename _Tp> inline
+_InputArray::_InputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, vec, Size(n, 1)); }
+
+template<typename _Tp> inline
+_InputArray::_InputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_READ, &m); }
+// A single double is wrapped by address as a CV_64F MATX with Size(1,1).
+inline _InputArray::_InputArray(const double& val)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F + ACCESS_READ, &val, Size(1,1)); }
+
+inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
+{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
+
+inline _InputArray::_InputArray(const std::vector<cuda::GpuMat>& d_mat)
+{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);}
+
+inline _InputArray::_InputArray(const ogl::Buffer& buf)
+{ init(OPENGL_BUFFER + ACCESS_READ, &buf); }
+
+inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem)
+{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); }
+
+//! Wraps a vector of arbitrary elements for reading; the element type code comes from rawType<_Tp>().
+template<typename _Tp> inline _InputArray _InputArray::rawIn(const std::vector<_Tp>& vec)
+{
+    _InputArray wrapped;
+    wrapped.obj = (void*)&vec;
+    wrapped.flags = FIXED_TYPE + STD_VECTOR + rawType<_Tp>() + ACCESS_READ;
+    return wrapped;
+}
+
+//! std::array overload of rawIn(); note that it uses traits::Type<_Tp>::value rather than rawType<_Tp>().
+template<typename _Tp, std::size_t _Nm> inline _InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr)
+{
+    _InputArray wrapped;
+    wrapped.sz = Size(1, _Nm);
+    wrapped.obj = (void*)arr.data();
+    wrapped.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ;
+    return wrapped;
+}
+
+inline _InputArray::~_InputArray() {}
+//! Returns the Mat that obj points to when kind() is MAT and i is negative; every other case is delegated to getMat_(i).
+inline Mat _InputArray::getMat(int i) const
+{
+    if( kind() != MAT || i >= 0 )
+        return getMat_(i);
+    return *(const Mat*)obj;
+}
+
+//! Kind predicates: all but isVector() compare kind() with a single container tag.
+inline bool _InputArray::isMat() const { return kind() == MAT; }
+inline bool _InputArray::isUMat() const  { return kind() == UMAT; }
+inline bool _InputArray::isMatVector() const { return kind() == STD_VECTOR_MAT; }
+inline bool _InputArray::isUMatVector() const  { return kind() == STD_VECTOR_UMAT; }
+inline bool _InputArray::isMatx() const { return kind() == MATX; }
+inline bool _InputArray::isVector() const { if( kind() == STD_VECTOR || kind() == STD_BOOL_VECTOR ) return true;
+                                            return kind() == MATX && (sz.width <= 1 || sz.height <= 1); }
+inline bool _InputArray::isGpuMat() const { return kind() == CUDA_GPU_MAT; }
+inline bool _InputArray::isGpuMatVector() const { return kind() == STD_VECTOR_CUDA_GPU_MAT; }
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+inline _OutputArray::_OutputArray() { init(NONE + ACCESS_WRITE, 0); }
+inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags + ACCESS_WRITE, _obj); }
+inline _OutputArray::_OutputArray(Mat& m) { init(MAT+ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
+inline _OutputArray::_OutputArray(UMat& m) { init(UMAT + ACCESS_WRITE, &m); }
+inline _OutputArray::_OutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline
+_OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline
+_OutputArray::_OutputArray(std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &m); }
+
+template<typename _Tp, int m, int n> inline
+_OutputArray::_OutputArray(Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline
+_OutputArray::_OutputArray(_Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); }
+
+template<typename _Tp> inline  // const std::vector<_Tp>: same as the non-const overload plus FIXED_SIZE
+_OutputArray::_OutputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline  // const std::array<_Tp, _Nm>: same flags and Size(1, _Nm) as the non-const overload
+_OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline  // const std::array<Mat, _Nm>: STD_ARRAY_MAT plus FIXED_SIZE
+_OutputArray::_OutputArray(const std::array<Mat, _Nm>& arr)
+{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline  // const std::vector<std::vector<_Tp>>: STD_VECTOR_VECTOR with FIXED_TYPE and FIXED_SIZE
+_OutputArray::_OutputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline  // const std::vector<Mat_<_Tp>>: STD_VECTOR_MAT with FIXED_TYPE and FIXED_SIZE
+_OutputArray::_OutputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &vec); }
+
+template<typename _Tp> inline  // const Mat_<_Tp>: MAT with FIXED_TYPE and FIXED_SIZE
+_OutputArray::_OutputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + traits::Type<_Tp>::value + ACCESS_WRITE, &m); }
+
+template<typename _Tp, int m, int n> inline  // const Matx<_Tp, m, n>: same flags and Size(n, m) as the non-const overload
+_OutputArray::_OutputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline  // const raw pointer plus count n: same flags and Size(n, 1) as the non-const overload
+_OutputArray::_OutputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); }
+
+inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat)  // kind CUDA_GPU_MAT, object = &d_mat
+{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
+
+inline _OutputArray::_OutputArray(std::vector<cuda::GpuMat>& d_mat)  // kind STD_VECTOR_CUDA_GPU_MAT, no FIXED_* flags
+{	init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);}
+
+inline _OutputArray::_OutputArray(ogl::Buffer& buf)  // kind OPENGL_BUFFER, object = &buf
+{ init(OPENGL_BUFFER + ACCESS_WRITE, &buf); }
+
+inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem)  // kind CUDA_HOST_MEM, object = &cuda_mem
+{ init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
+
+inline _OutputArray::_OutputArray(const Mat& m)  // const Mat: MAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); }
+
+inline _OutputArray::_OutputArray(const std::vector<Mat>& vec)  // const vector of Mat: FIXED_SIZE only (no FIXED_TYPE)
+{ init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_WRITE, &vec); }
+
+inline _OutputArray::_OutputArray(const UMat& m)  // const UMat: UMAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_WRITE, &m); }
+
+inline _OutputArray::_OutputArray(const std::vector<UMat>& vec)  // const vector of UMat: FIXED_SIZE only (no FIXED_TYPE)
+{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_WRITE, &vec); }
+
+inline _OutputArray::_OutputArray(const cuda::GpuMat& d_mat)  // const GpuMat: CUDA_GPU_MAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); }
+
+
+inline _OutputArray::_OutputArray(const ogl::Buffer& buf)  // const ogl::Buffer: OPENGL_BUFFER with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_WRITE, &buf); }
+
+inline _OutputArray::_OutputArray(const cuda::HostMem& cuda_mem)  // const HostMem: CUDA_HOST_MEM with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); }
+
+template<typename _Tp> inline  // Write-only STD_VECTOR proxy over vec whose element type comes from rawType<_Tp>() instead of traits::Type.
+_OutputArray _OutputArray::rawOut(std::vector<_Tp>& vec)
+{
+    _OutputArray proxy;  // default-constructed first, then the fields are overwritten directly
+    proxy.obj = (void*)&vec;
+    proxy.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_WRITE;
+    return proxy;
+}
+
+template<typename _Tp, std::size_t _Nm> inline  // Write-only MATX-kind proxy over arr.data() sized Size(1, _Nm); type taken from traits::Type<_Tp>.
+_OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr)
+{
+    _OutputArray proxy;  // default-constructed first, then the fields are overwritten directly
+    proxy.sz = Size(1, _Nm);
+    proxy.obj = (void*)arr.data();
+    proxy.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE;
+    return proxy;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+
+inline _InputOutputArray::_InputOutputArray() { init(0+ACCESS_RW, 0); }  // empty proxy: no kind bits, null object pointer
+inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags+ACCESS_RW, _obj); }  // caller-chosen flags; ACCESS_RW is added with '+', so _flags should not already carry it
+inline _InputOutputArray::_InputOutputArray(Mat& m) { init(MAT+ACCESS_RW, &m); }  // kind MAT, object = &m
+inline _InputOutputArray::_InputOutputArray(std::vector<Mat>& vec) { init(STD_VECTOR_MAT+ACCESS_RW, &vec); }  // kind STD_VECTOR_MAT, object = &vec
+inline _InputOutputArray::_InputOutputArray(UMat& m) { init(UMAT+ACCESS_RW, &m); }  // kind UMAT, object = &m
+inline _InputOutputArray::_InputOutputArray(std::vector<UMat>& vec) { init(STD_VECTOR_UMAT+ACCESS_RW, &vec); }  // kind STD_VECTOR_UMAT, object = &vec
+
+template<typename _Tp> inline  // std::vector<_Tp>: kind STD_VECTOR with FIXED_TYPE only; element type from traits::Type<_Tp>
+_InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline  // std::array<_Tp, _Nm>: handled as kind MATX over arr.data() with Size(1, _Nm); FIXED_TYPE and FIXED_SIZE
+_InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline  // std::array<Mat, _Nm>: kind STD_ARRAY_MAT over arr.data(); no FIXED_* flags
+_InputOutputArray::_InputOutputArray(std::array<Mat, _Nm>& arr)
+{ init(STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline  // std::vector<std::vector<_Tp>>: kind STD_VECTOR_VECTOR with FIXED_TYPE
+_InputOutputArray::_InputOutputArray(std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp> inline  // std::vector<Mat_<_Tp>>: kind STD_VECTOR_MAT with FIXED_TYPE
+_InputOutputArray::_InputOutputArray(std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp> inline  // Mat_<_Tp>: kind MAT with FIXED_TYPE
+_InputOutputArray::_InputOutputArray(Mat_<_Tp>& m)
+{ init(FIXED_TYPE + MAT + traits::Type<_Tp>::value + ACCESS_RW, &m); }
+
+template<typename _Tp, int m, int n> inline  // Matx<_Tp, m, n>: kind MATX with FIXED_TYPE and FIXED_SIZE; size passed as Size(n, m)
+_InputOutputArray::_InputOutputArray(Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline  // raw pointer plus element count n: kind MATX over vec with Size(n, 1)
+_InputOutputArray::_InputOutputArray(_Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); }
+
+template<typename _Tp> inline  // const std::vector<_Tp>: same as the non-const overload plus FIXED_SIZE
+_InputOutputArray::_InputOutputArray(const std::vector<_Tp>& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp, std::size_t _Nm> inline  // const std::array<_Tp, _Nm>: same flags and Size(1, _Nm) as the non-const overload
+_InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+template<std::size_t _Nm> inline  // const std::array<Mat, _Nm>: STD_ARRAY_MAT plus FIXED_SIZE
+_InputOutputArray::_InputOutputArray(const std::array<Mat, _Nm>& arr)
+{ init(FIXED_SIZE + STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); }
+
+template<typename _Tp> inline  // const std::vector<std::vector<_Tp>>: STD_VECTOR_VECTOR with FIXED_TYPE and FIXED_SIZE
+_InputOutputArray::_InputOutputArray(const std::vector<std::vector<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_VECTOR + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp> inline  // const std::vector<Mat_<_Tp>>: STD_VECTOR_MAT with FIXED_TYPE and FIXED_SIZE
+_InputOutputArray::_InputOutputArray(const std::vector<Mat_<_Tp> >& vec)
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_MAT + traits::Type<_Tp>::value + ACCESS_RW, &vec); }
+
+template<typename _Tp> inline  // const Mat_<_Tp>: MAT with FIXED_TYPE and FIXED_SIZE
+_InputOutputArray::_InputOutputArray(const Mat_<_Tp>& m)
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + traits::Type<_Tp>::value + ACCESS_RW, &m); }
+
+template<typename _Tp, int m, int n> inline  // const Matx<_Tp, m, n>: same flags and Size(n, m) as the non-const overload
+_InputOutputArray::_InputOutputArray(const Matx<_Tp, m, n>& mtx)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, &mtx, Size(n, m)); }
+
+template<typename _Tp> inline  // const raw pointer plus count n: same flags and Size(n, 1) as the non-const overload
+_InputOutputArray::_InputOutputArray(const _Tp* vec, int n)
+{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); }
+
+inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat)  // kind CUDA_GPU_MAT, object = &d_mat
+{ init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf)  // kind OPENGL_BUFFER, object = &buf
+{ init(OPENGL_BUFFER + ACCESS_RW, &buf); }
+
+inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem)  // kind CUDA_HOST_MEM, object = &cuda_mem
+{ init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
+
+inline _InputOutputArray::_InputOutputArray(const Mat& m)  // const Mat: MAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); }
+
+inline _InputOutputArray::_InputOutputArray(const std::vector<Mat>& vec)  // const vector of Mat: FIXED_SIZE only (no FIXED_TYPE)
+{ init(FIXED_SIZE + STD_VECTOR_MAT + ACCESS_RW, &vec); }
+
+inline _InputOutputArray::_InputOutputArray(const UMat& m)  // const UMat: UMAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + UMAT + ACCESS_RW, &m); }
+
+inline _InputOutputArray::_InputOutputArray(const std::vector<UMat>& vec)  // const vector of UMat: FIXED_SIZE only (no FIXED_TYPE)
+{ init(FIXED_SIZE + STD_VECTOR_UMAT + ACCESS_RW, &vec); }
+
+inline _InputOutputArray::_InputOutputArray(const cuda::GpuMat& d_mat)  // const GpuMat: CUDA_GPU_MAT with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_GPU_MAT + ACCESS_RW, &d_mat); }
+
+inline _InputOutputArray::_InputOutputArray(const std::vector<cuda::GpuMat>& d_mat)  // const vector of GpuMat: FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_CUDA_GPU_MAT + ACCESS_RW, &d_mat);}
+
+template<> inline _InputOutputArray::_InputOutputArray(std::vector<cuda::GpuMat>& d_mat)  // NOTE(review): non-const overload also sets FIXED_TYPE + FIXED_SIZE, unlike _OutputArray(std::vector<cuda::GpuMat>&) - confirm intended
+{ init(FIXED_TYPE + FIXED_SIZE + STD_VECTOR_CUDA_GPU_MAT + ACCESS_RW, &d_mat);}
+
+inline _InputOutputArray::_InputOutputArray(const ogl::Buffer& buf)  // const ogl::Buffer: OPENGL_BUFFER with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER + ACCESS_RW, &buf); }
+
+inline _InputOutputArray::_InputOutputArray(const cuda::HostMem& cuda_mem)  // const HostMem: CUDA_HOST_MEM with FIXED_TYPE and FIXED_SIZE
+{ init(FIXED_TYPE + FIXED_SIZE + CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); }
+
+template<typename _Tp> inline  // Read-write STD_VECTOR proxy over vec whose element type comes from rawType<_Tp>() instead of traits::Type.
+_InputOutputArray _InputOutputArray::rawInOut(std::vector<_Tp>& vec)
+{
+    _InputOutputArray proxy;  // default-constructed first, then the fields are overwritten directly
+    proxy.obj = (void*)&vec;
+    proxy.flags = _InputArray::FIXED_TYPE + _InputArray::STD_VECTOR + rawType<_Tp>() + ACCESS_RW;
+    return proxy;
+}
+
+template<typename _Tp, std::size_t _Nm> inline  // Read-write MATX-kind proxy over arr.data() sized Size(1, _Nm); type taken from traits::Type<_Tp>.
+_InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr)
+{
+    _InputOutputArray proxy;  // default-constructed first, then the fields are overwritten directly
+    proxy.sz = Size(1, _Nm);
+    proxy.obj = (void*)arr.data();
+    proxy.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW;
+    return proxy;
+}
+
+
+template<typename _Tp> static inline _InputArray rawIn(_Tp& v) { return _InputArray::rawIn(v); }  // free-function shorthand for _InputArray::rawIn
+template<typename _Tp> static inline _OutputArray rawOut(_Tp& v) { return _OutputArray::rawOut(v); }  // free-function shorthand for _OutputArray::rawOut
+template<typename _Tp> static inline _InputOutputArray rawInOut(_Tp& v) { return _InputOutputArray::rawInOut(v); }  // free-function shorthand for _InputOutputArray::rawInOut
+
+CV__DEBUG_NS_END
+
+//////////////////////////////////////////// Mat //////////////////////////////////////////
+
+template<typename _Tp> inline  // vec.size() x 1 matrix over a std::vector: points at vec's storage unless copyData is true.
+Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
+      cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( vec.empty() )
+        return;  // nothing to wrap or copy: the header keeps its null data pointers
+    if( copyData )
+        Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this);
+    else  // wrap the vector's buffer in place: one element per row, so both steps are sizeof(_Tp)
+    {
+        step[0] = step[1] = sizeof(_Tp);
+        datastart = data = (uchar*)&vec[0];
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+template<typename _Tp, typename> inline  // list.size() x 1 matrix holding a copy of the initializer list; an empty list fails CV_Assert.
+Mat::Mat(const std::initializer_list<_Tp> list) : Mat()
+{
+    CV_Assert(list.size() != 0);
+    Mat listView((int)list.size(), 1, traits::Type<_Tp>::value, (uchar*)list.begin());  // temporary header over the list's storage
+    listView.copyTo(*this);
+}
+
+template<typename _Tp> inline  // Matrix with sizes.size() dimensions filled with a copy of list; list length must equal the product of sizes.
+Mat::Mat(const std::initializer_list<int> sizes, const std::initializer_list<_Tp> list) : Mat()
+{
+    size_t size_total = 1;  // product of all requested dimension sizes
+    for( const int* dim = sizes.begin(); dim != sizes.end(); ++dim )
+        size_total *= *dim;
+    CV_Assert(list.size() != 0);
+    CV_Assert(size_total == list.size());
+    Mat listView((int)sizes.size(), (int*)sizes.begin(), traits::Type<_Tp>::value, (uchar*)list.begin());
+    listView.copyTo(*this);
+}
+
+template<typename _Tp, std::size_t _Nm> inline  // arr.size() x 1 matrix over a std::array: points at arr's storage unless copyData is true.
+Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()),
+      cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( arr.empty() )
+        return;  // nothing to wrap or copy: the header keeps its null data pointers
+    if( copyData )
+        Mat((int)arr.size(), 1, traits::Type<_Tp>::value, (uchar*)arr.data()).copyTo(*this);
+    else  // wrap the array's buffer in place: one element per row, so both steps are sizeof(_Tp)
+    {
+        step[0] = step[1] = sizeof(_Tp);
+        datastart = data = (uchar*)arr.data();
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+template<typename _Tp, int n> inline  // n x 1 matrix over a Vec: wraps vec.val in place, or copies it when copyData is true.
+Mat::Mat(const Vec<_Tp, n>& vec, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+        Mat(n, 1, traits::Type<_Tp>::value, (void*)vec.val).copyTo(*this);
+    else
+    {
+        step[0] = step[1] = sizeof(_Tp);  // one element per row
+        datastart = data = (uchar*)vec.val;
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+
+template<typename _Tp, int m, int n> inline  // m x n matrix over a Matx: wraps M.val in place, or copies it when copyData is true.
+Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+        Mat(m, n, traits::Type<_Tp>::value, (uchar*)M.val).copyTo(*this);
+    else
+    {
+        step[1] = sizeof(_Tp);         // distance between neighbouring elements of a row
+        step[0] = cols * sizeof(_Tp);  // distance between rows: cols tightly packed elements
+        datastart = data = (uchar*)M.val;
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+template<typename _Tp> inline  // 2 x 1 matrix from a 2D point: wraps the point's own members, or stores copies of x and y when copyData is true.
+Mat::Mat(const Point_<_Tp>& pt, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        create(2, 1, traits::Type<_Tp>::value);
+        ((_Tp*)data)[0] = pt.x;
+        ((_Tp*)data)[1] = pt.y;
+    }
+    else
+    {
+        step[0] = step[1] = sizeof(_Tp);
+        datastart = data = (uchar*)&pt.x;  // treats &pt.x as the start of two consecutive _Tp values (x then y)
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+template<typename _Tp> inline  // 3 x 1 matrix from a 3D point: wraps the point's own members, or stores copies of x, y, z when copyData is true.
+Mat::Mat(const Point3_<_Tp>& pt, bool copyData)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)
+{
+    if( copyData )
+    {
+        create(3, 1, traits::Type<_Tp>::value);
+        ((_Tp*)data)[0] = pt.x;
+        ((_Tp*)data)[1] = pt.y;
+        ((_Tp*)data)[2] = pt.z;
+    }
+    else
+    {
+        step[0] = step[1] = sizeof(_Tp);
+        datastart = data = (uchar*)&pt.x;  // treats &pt.x as the start of three consecutive _Tp values (x, y, z)
+        datalimit = dataend = datastart + rows * step[0];
+    }
+}
+
+template<typename _Tp> inline  // Builds a Mat from a comma initializer by converting it to Mat_<_Tp> and assigning the result to *this.
+Mat::Mat(const MatCommaInitializer_<_Tp>& commaInitializer)
+    : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0),
+      datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0)  // every pointer member starts null, as in the sibling constructors
+{
+    *this = commaInitializer.operator Mat_<_Tp>();  // the assignment replaces the empty header set up above
+}
+
+inline  // Returns Mat(*this, Range(y, y + 1), Range::all()): the single row y across all columns.
+Mat Mat::row(int y) const
+{
+    return Mat(*this, Range(y, y + 1), Range::all());
+}
+
+inline  // Returns Mat(*this, Range::all(), Range(x, x + 1)): the single column x across all rows.
+Mat Mat::col(int x) const
+{
+    return Mat(*this, Range::all(), Range(x, x + 1));
+}
+
+inline  // Returns the rows selected by Range(startrow, endrow), with Range::all() for the columns.
+Mat Mat::rowRange(int startrow, int endrow) const
+{
+    return Mat(*this, Range(startrow, endrow), Range::all());
+}
+
+inline  // Range overload: rows selected by r, with Range::all() for the columns.
+Mat Mat::rowRange(const Range& r) const
+{
+    return Mat(*this, r, Range::all());
+}
+
+inline  // Returns the columns selected by Range(startcol, endcol), with Range::all() for the rows.
+Mat Mat::colRange(int startcol, int endcol) const
+{
+    return Mat(*this, Range::all(), Range(startcol, endcol));
+}
+
+inline  // Range overload: columns selected by r, with Range::all() for the rows.
+Mat Mat::colRange(const Range& r) const
+{
+    return Mat(*this, Range::all(), r);
+}
+
+inline  // Call operator: forwards both ranges to the Mat(const Mat&, rowRange, colRange) constructor.
+Mat Mat::operator()( Range _rowRange, Range _colRange ) const
+{
+    return Mat(*this, _rowRange, _colRange);
+}
+
+inline  // Call operator: forwards the rectangle to the Mat(const Mat&, const Rect&) constructor.
+Mat Mat::operator()( const Rect& roi ) const
+{
+    return Mat(*this, roi);
+}
+
+inline  // Call operator: forwards a pointer to an array of ranges to the matching Mat constructor (presumably one Range per dimension - confirm).
+Mat Mat::operator()(const Range* ranges) const
+{
+    return Mat(*this, ranges);
+}
+
+inline  // Call operator: forwards a vector of ranges to the matching Mat constructor.
+Mat Mat::operator()(const std::vector<Range>& ranges) const
+{
+    return Mat(*this, ranges);
+}
+
+inline  // True when the CONTINUOUS_FLAG bit is set in flags.
+bool Mat::isContinuous() const
+{
+    return (flags & CONTINUOUS_FLAG) != 0;
+}
+
+inline  // True when the SUBMATRIX_FLAG bit is set in flags.
+bool Mat::isSubmatrix() const
+{
+    return (flags & SUBMATRIX_FLAG) != 0;
+}
+
+inline  // Returns step.p[dims - 1], or 0 when dims <= 0 (a zero result trips the debug-only assert).
+size_t Mat::elemSize() const {
+    size_t res = 0;
+    if( dims > 0 ) res = step.p[dims - 1];
+    CV_DbgAssert(res != 0);
+    return res;
+}
+
+inline  // Applies the CV_ELEM_SIZE1 macro to flags.
+size_t Mat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline  // Applies the CV_MAT_TYPE macro to flags.
+int Mat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline  // Applies the CV_MAT_DEPTH macro to flags.
+int Mat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline  // Applies the CV_MAT_CN macro to flags.
+int Mat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline  // Byte pointer to row y: data advanced by y * step.p[0]. The index is checked only by CV_DbgAssert; y == 0 always passes.
+uchar* Mat::ptr(int y) {
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    const size_t rowOffset = step.p[0] * y;
+    return data + rowOffset;
+}
+
+inline  // Const byte pointer to row y: data advanced by y * step.p[0]. The index is checked only by CV_DbgAssert; y == 0 always passes.
+const uchar* Mat::ptr(int y) const {
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    const size_t rowOffset = step.p[0] * y;
+    return data + rowOffset;
+}
+
+template<typename _Tp> inline  // Typed pointer to row y: (data + y * step.p[0]) cast to _Tp*. No check that _Tp matches the stored type.
+_Tp* Mat::ptr(int y) {
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    const size_t rowOffset = step.p[0] * y;
+    return (_Tp*)(data + rowOffset);
+}
+
+template<typename _Tp> inline  // Typed const pointer to row y: (data + y * step.p[0]) cast to const _Tp*. No check that _Tp matches the stored type.
+const _Tp* Mat::ptr(int y) const {
+    CV_DbgAssert( y == 0 || (data && dims >= 1 && (unsigned)y < (unsigned)size.p[0]) );
+    const size_t rowOffset = step.p[0] * y;
+    return (const _Tp*)(data + rowOffset);
+}
+
+inline  // Byte pointer to element (i0, i1): data + i0 * step.p[0] + i1 * step.p[1]; all checks are debug-only asserts.
+uchar* Mat::ptr(int i0, int i1) {
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    uchar* const rowStart = data + i0 * step.p[0];
+    return rowStart + i1 * step.p[1];
+}
+
+inline  // Const byte pointer to element (i0, i1): data + i0 * step.p[0] + i1 * step.p[1]; all checks are debug-only asserts.
+const uchar* Mat::ptr(int i0, int i1) const {
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    const uchar* const rowStart = data + i0 * step.p[0];
+    return rowStart + i1 * step.p[1];
+}
+
+template<typename _Tp> inline  // Typed pointer to element (i0, i1); the byte offset is i0 * step.p[0] + i1 * step.p[1], checks are debug-only.
+_Tp* Mat::ptr(int i0, int i1) {
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    uchar* const rowStart = data + i0 * step.p[0];
+    return (_Tp*)(rowStart + i1 * step.p[1]);
+}
+
+template<typename _Tp> inline  // Typed const pointer to element (i0, i1); the byte offset is i0 * step.p[0] + i1 * step.p[1], checks are debug-only.
+const _Tp* Mat::ptr(int i0, int i1) const {
+    CV_DbgAssert(dims >= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    const uchar* const rowStart = data + i0 * step.p[0];
+    return (const _Tp*)(rowStart + i1 * step.p[1]);
+}
+
+inline  // Byte pointer to element (i0, i1, i2): data plus i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2]; checks are debug-only.
+uchar* Mat::ptr(int i0, int i1, int i2) {
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    uchar* const outerStart = data + i0 * step.p[0] + i1 * step.p[1];
+    return outerStart + i2 * step.p[2];
+}
+
+inline  // Const byte pointer to element (i0, i1, i2): data plus i0 * step.p[0] + i1 * step.p[1] + i2 * step.p[2]; checks are debug-only.
+const uchar* Mat::ptr(int i0, int i1, int i2) const {
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    const uchar* const outerStart = data + i0 * step.p[0] + i1 * step.p[1];
+    return outerStart + i2 * step.p[2];
+}
+
+template<typename _Tp> inline  // Typed pointer to element (i0, i1, i2); same byte offset as the untyped overload, checks are debug-only.
+_Tp* Mat::ptr(int i0, int i1, int i2) {
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    uchar* const outerStart = data + i0 * step.p[0] + i1 * step.p[1];
+    return (_Tp*)(outerStart + i2 * step.p[2]);
+}
+
+template<typename _Tp> inline  // Typed const pointer to element (i0, i1, i2); same byte offset as the untyped overload, checks are debug-only.
+const _Tp* Mat::ptr(int i0, int i1, int i2) const {
+    CV_DbgAssert(dims >= 3);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert((unsigned)i2 < (unsigned)size.p[2]);
+    const uchar* const outerStart = data + i0 * step.p[0] + i1 * step.p[1];
+    return (const _Tp*)(outerStart + i2 * step.p[2]);
+}
+
+inline  // Byte pointer to the element addressed by idx[0..dims-1]: adds idx[i] * step.p[i] to data for every dimension.
+uchar* Mat::ptr(const int* idx)
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return p;
+}
+
+inline  // Const byte pointer to the element addressed by idx[0..dims-1]: adds idx[i] * step.p[i] to data for every dimension.
+const uchar* Mat::ptr(const int* idx) const
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return p;
+}
+
+template<typename _Tp> inline  // Typed pointer to the element addressed by idx[0..dims-1]; offsets accumulate as idx[i] * step.p[i].
+_Tp* Mat::ptr(const int* idx)
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return (_Tp*)p;
+}
+
+template<typename _Tp> inline  // Typed const pointer to the element addressed by idx[0..dims-1]; offsets accumulate as idx[i] * step.p[i].
+const _Tp* Mat::ptr(const int* idx) const
+{
+    const int d = dims;
+    uchar* p = data;
+    CV_DbgAssert( d >= 1 && p );
+    for( int i = 0; i < d; ++i )
+    {
+        CV_DbgAssert( (unsigned)idx[i] < (unsigned)size.p[i] );
+        p += idx[i] * step.p[i];
+    }
+    return (const _Tp*)p;
+}
+
+template<int n> inline  // Vec index overload: forwards idx.val to ptr(const int*).
+uchar* Mat::ptr(const Vec<int, n>& idx)
+{
+    return Mat::ptr(idx.val);
+}
+
+template<int n> inline  // Const Vec index overload: forwards idx.val to ptr(const int*) const.
+const uchar* Mat::ptr(const Vec<int, n>& idx) const
+{
+    return Mat::ptr(idx.val);
+}
+
+template<typename _Tp, int n> inline  // Typed Vec index overload: forwards idx.val to ptr<_Tp>(const int*).
+_Tp* Mat::ptr(const Vec<int, n>& idx)
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );  // debug-only: _Tp must have the same size as one stored element
+    return Mat::ptr<_Tp>(idx.val);
+}
+
+template<typename _Tp, int n> inline  // Typed const Vec index overload: forwards idx.val to ptr<_Tp>(const int*) const.
+const _Tp* Mat::ptr(const Vec<int, n>& idx) const
+{
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );  // debug-only: _Tp must have the same size as one stored element
+    return Mat::ptr<_Tp>(idx.val);
+}
+
+
+template<typename _Tp> inline  // Element at row i0, column i1 (dims <= 2). Debug asserts check bounds and per-channel size only, not the full type.
+_Tp& Mat::at(int i0, int i1) {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    _Tp* const rowElems = (_Tp*)(data + step.p[0] * i0);
+    return rowElems[i1];
+}
+
+template<typename _Tp> inline  // Const element at row i0, column i1 (dims <= 2). Debug asserts check bounds and per-channel size only, not the full type.
+const _Tp& Mat::at(int i0, int i1) const {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(i1 * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    const _Tp* const rowElems = (const _Tp*)(data + step.p[0] * i0);
+    return rowElems[i1];
+}
+
+template<typename _Tp> inline  // Element at point pt: pt.y selects the row, pt.x the column (dims <= 2); all checks are debug-only asserts.
+_Tp& Mat::at(Point pt) {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(pt.x * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    _Tp* const rowElems = (_Tp*)(data + step.p[0] * pt.y);
+    return rowElems[pt.x];
+}
+
+template<typename _Tp> inline  // Const element at point pt: pt.y selects the row, pt.x the column (dims <= 2); all checks are debug-only asserts.
+const _Tp& Mat::at(Point pt) const {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)(pt.x * DataType<_Tp>::channels) < (unsigned)(size.p[1] * channels()));
+    CV_DbgAssert(CV_ELEM_SIZE1(traits::Depth<_Tp>::value) == elemSize1());
+    const _Tp* const rowElems = (const _Tp*)(data + step.p[0] * pt.y);
+    return rowElems[pt.x];
+}
+
+template<typename _Tp> inline  // Element by linear index i0 over a matrix with dims <= 2; all checks are debug-only asserts.
+_Tp& Mat::at(int i0) {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)(size.p[0] * size.p[1]));
+    CV_DbgAssert(elemSize() == sizeof(_Tp));
+    if( isContinuous() || size.p[0] == 1 )
+        return ((_Tp*)data)[i0];                 // continuous flag set or a single row: index data directly
+    if( size.p[1] == 1 )
+        return *(_Tp*)(data + step.p[0] * i0);   // single column: one element per row step
+    const int rowIdx = i0 / cols;
+    const int colIdx = i0 - rowIdx * cols;
+    return ((_Tp*)(data + step.p[0] * rowIdx))[colIdx];
+}
+
+template<typename _Tp> inline  // Const element by linear index i0 over a matrix with dims <= 2; all checks are debug-only asserts.
+const _Tp& Mat::at(int i0) const {
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)(size.p[0] * size.p[1]));
+    CV_DbgAssert(elemSize() == sizeof(_Tp));
+    if( isContinuous() || size.p[0] == 1 )
+        return ((const _Tp*)data)[i0];                 // continuous flag set or a single row: index data directly
+    if( size.p[1] == 1 )
+        return *(const _Tp*)(data + step.p[0] * i0);   // single column: one element per row step
+    const int rowIdx = i0 / cols;
+    const int colIdx = i0 - rowIdx * cols;
+    return ((const _Tp*)(data + step.p[0] * rowIdx))[colIdx];
+}
+
+template<typename _Tp> inline  // Element (i0, i1, i2): dereferences the address from ptr(i0, i1, i2) as _Tp.
+_Tp& Mat::at(int i0, int i1, int i2) {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    uchar* const elemAddr = ptr(i0, i1, i2);
+    return *(_Tp*)elemAddr;
+}
+
+template<typename _Tp> inline  // Const element (i0, i1, i2): dereferences the address from ptr(i0, i1, i2) as const _Tp.
+const _Tp& Mat::at(int i0, int i1, int i2) const {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    const uchar* const elemAddr = ptr(i0, i1, i2);
+    return *(const _Tp*)elemAddr;
+}
+
+template<typename _Tp> inline  // Element addressed by an index array: dereferences the address from ptr(idx) as _Tp.
+_Tp& Mat::at(const int* idx) {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    uchar* const elemAddr = ptr(idx);
+    return *(_Tp*)elemAddr;
+}
+
+template<typename _Tp> inline  // Const element addressed by an index array: dereferences the address from ptr(idx) as const _Tp.
+const _Tp& Mat::at(const int* idx) const {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    const uchar* const elemAddr = ptr(idx);
+    return *(const _Tp*)elemAddr;
+}
+
+template<typename _Tp, int n> inline  // Element addressed by a Vec of indices: dereferences the address from ptr(idx.val) as _Tp.
+_Tp& Mat::at(const Vec<int, n>& idx) {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    uchar* const elemAddr = ptr(idx.val);
+    return *(_Tp*)elemAddr;
+}
+
+template<typename _Tp, int n> inline  // Const element addressed by a Vec of indices: dereferences the address from ptr(idx.val) as const _Tp.
+const _Tp& Mat::at(const Vec<int, n>& idx) const {
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    const uchar* const elemAddr = ptr(idx.val);
+    return *(const _Tp*)elemAddr;
+}
+
+template<typename _Tp> inline  // Const iterator built over this matrix viewed as Mat_<_Tp>; a default-constructed iterator when empty().
+MatConstIterator_<_Tp> Mat::begin() const
+{
+    typedef MatConstIterator_<_Tp> ConstIter;
+    if( empty() ) return ConstIter();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return ConstIter((const Mat_<_Tp>*)this);
+}
+
+template<typename _Tp> inline  // Const reverse iterator wrapping the iterator advanced by total(); default-constructed when empty().
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rbegin() const
+{
+    typedef MatConstIterator_<_Tp> ConstIter;
+    if( empty() ) return std::reverse_iterator<ConstIter>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    ConstIter pastLast((const Mat_<_Tp>*)this);
+    pastLast += total();
+    return std::reverse_iterator<ConstIter>(pastLast);
+}
+
+template<typename _Tp> inline  // Const iterator over this matrix advanced by total(); a default-constructed iterator when empty().
+MatConstIterator_<_Tp> Mat::end() const
+{
+    typedef MatConstIterator_<_Tp> ConstIter;
+    if( empty() ) return ConstIter();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    ConstIter pastLast((const Mat_<_Tp>*)this);
+    pastLast += total();
+    return pastLast;
+}
+
+template<typename _Tp> inline  // Const reverse iterator wrapping the un-advanced iterator over this matrix; default-constructed when empty().
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat::rend() const
+{
+    typedef MatConstIterator_<_Tp> ConstIter;
+    if( empty() ) return std::reverse_iterator<ConstIter>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return std::reverse_iterator<ConstIter>(ConstIter((const Mat_<_Tp>*)this));
+}
+
+template<typename _Tp> inline  // Mutable iterator built over this matrix viewed as Mat_<_Tp>; a default-constructed iterator when empty().
+MatIterator_<_Tp> Mat::begin()
+{
+    typedef MatIterator_<_Tp> Iter;
+    if( empty() ) return Iter();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return Iter((Mat_<_Tp>*)this);
+}
+
+template<typename _Tp> inline  // Mutable reverse iterator wrapping the iterator advanced by total(); default-constructed when empty().
+std::reverse_iterator<MatIterator_<_Tp>> Mat::rbegin()
+{
+    typedef MatIterator_<_Tp> Iter;
+    if( empty() ) return std::reverse_iterator<Iter>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    Iter pastLast((Mat_<_Tp>*)this);
+    pastLast += total();
+    return std::reverse_iterator<Iter>(pastLast);
+}
+
+template<typename _Tp> inline  // Mutable iterator over this matrix advanced by total(); a default-constructed iterator when empty().
+MatIterator_<_Tp> Mat::end()
+{
+    typedef MatIterator_<_Tp> Iter;
+    if( empty() ) return Iter();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    Iter pastLast((Mat_<_Tp>*)this);
+    pastLast += total();
+    return pastLast;
+}
+
+template<typename _Tp> inline  // Mutable reverse iterator wrapping the un-advanced iterator over this matrix; default-constructed when empty().
+std::reverse_iterator<MatIterator_<_Tp>> Mat::rend()
+{
+    typedef MatIterator_<_Tp> Iter;
+    if( empty() ) return std::reverse_iterator<Iter>();
+    CV_DbgAssert( elemSize() == sizeof(_Tp) );
+    return std::reverse_iterator<Iter>(Iter((Mat_<_Tp>*)this));
+}
+
+template<typename _Tp, typename Functor> inline  // Forwards operation to forEach_impl<_Tp>.
+void Mat::forEach(const Functor& operation) {
+    this->forEach_impl<_Tp>(operation);
+}
+
+template<typename _Tp, typename Functor> inline  // Const overload: casts away constness of this and reuses the non-const forEach.
+void Mat::forEach(const Functor& operation) const {
+    // Delegate to the non-const overload. NOTE(review): the functor can therefore reach elements of a const Mat mutably - confirm intended.
+    (const_cast<Mat*>(this))->forEach<_Tp>(operation);
+}
+
+template<typename _Tp> inline  // Conversion to std::vector<_Tp>: copyTo() fills a fresh vector, which is returned by value.
+Mat::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> elems;
+    copyTo(elems);
+    return elems;
+}
+
+template<typename _Tp, std::size_t _Nm> inline  // Conversion to std::array<_Tp, _Nm>: copyTo() fills a local array, which is returned by value.
+Mat::operator std::array<_Tp, _Nm>() const
+{
+    std::array<_Tp, _Nm> elems;
+    copyTo(elems);
+    return elems;
+}
+
+template<typename _Tp, int n> inline  // Converts a single-channel 1 x n or n x 1 matrix (dims <= 2) to Vec<_Tp, n>; other shapes fail CV_Assert.
+Mat::operator Vec<_Tp, n>() const
+{
+    CV_Assert( data && dims <= 2 && (rows == 1 || cols == 1) &&
+               rows + cols - 1 == n && channels() == 1 );
+    const int wantedType = traits::Type<_Tp>::value;
+    if( isContinuous() && type() == wantedType )
+        return Vec<_Tp, n>((_Tp*)data);  // same type and continuous: build the Vec straight from data
+    Vec<_Tp, n> result;
+    Mat resultView(rows, cols, wantedType, result.val);  // header over result.val, so convertTo() writes into result
+    convertTo(resultView, resultView.type());
+    return result;
+}
+
+template<typename _Tp, int m, int n> inline  // Converts a single-channel m x n matrix (dims <= 2) to Matx<_Tp, m, n>; other shapes fail CV_Assert.
+Mat::operator Matx<_Tp, m, n>() const
+{
+    CV_Assert( data && dims <= 2 && rows == m && cols == n && channels() == 1 );
+    const int wantedType = traits::Type<_Tp>::value;
+    if( isContinuous() && type() == wantedType )
+        return Matx<_Tp, m, n>((_Tp*)data);  // same type and continuous: build the Matx straight from data
+    Matx<_Tp, m, n> result;
+    Mat resultView(rows, cols, wantedType, result.val);  // header over result.val, so convertTo() writes into result
+    convertTo(resultView, resultView.type());
+    return result;
+}
+
+template<typename _Tp> inline  // Appends elem as a new row of a single-column matrix; an empty matrix becomes a fresh 1 x 1 clone of elem.
+void Mat::push_back(const _Tp& elem)
+{
+    if( !data )
+    {
+        *this = Mat(1, 1, traits::Type<_Tp>::value, (void*)&elem).clone();
+        return;
+    }
+    // cols == 1 implies dims == 2, so dims is not checked separately
+    CV_Assert(traits::Type<_Tp>::value == type() && cols == 1);
+    const uchar* const grownEnd = dataend + step[0];
+    if( isSubmatrix() || !isContinuous() || grownEnd > datalimit )
+    {
+        push_back_(&elem);  // no room for an in-place append: defer to the out-of-line helper
+        return;
+    }
+    *(_Tp*)(data + (size.p[0]++) * step.p[0]) = elem;  // write into the spare slot and bump the row count
+    dataend = grownEnd;
+}
+
+template<typename _Tp> inline  // Mat_ overload: forwards to push_back(const Mat&) via a reference cast.
+void Mat::push_back(const Mat_<_Tp>& m)
+{
+    push_back((const Mat&)m);
+}
+
+template<> inline  // MatExpr specialization: converts the expression to a Mat first, then pushes that Mat.
+void Mat::push_back(const MatExpr& expr)
+{
+    push_back(static_cast<Mat>(expr));
+}
+
+
+template<typename _Tp> inline  // std::vector overload: wraps v in a temporary Mat(v) and pushes that Mat.
+void Mat::push_back(const std::vector<_Tp>& v)
+{
+    push_back(Mat(v));
+}
+
+
+///////////////////////////// MatSize ////////////////////////////
+
+inline  // Stores the given pointer as p; no copy of the pointed-to sizes is made.
+MatSize::MatSize(int* _p) CV_NOEXCEPT
+    : p(_p) {}
+
+inline  // Reads the int stored immediately before p. NOTE(review): for a Mat built with size(&rows) this is presumably Mat::dims - confirm against the class layout.
+int MatSize::dims() const CV_NOEXCEPT
+{
+    return (p - 1)[0];
+}
+
+inline  // Returns the first two size entries as Size(p[1], p[0]); only meaningful for dims() <= 2 (debug-only assert).
+Size MatSize::operator()() const
+{
+    CV_DbgAssert(dims() <= 2);
+    return Size(p[1], p[0]);
+}
+
+inline  // Read-only access to entry i of the size array p; the upper bound is asserted in debug builds only.
+const int& MatSize::operator[](int i) const
+{
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);  // lower-bound check exists only when __OPENCV_BUILD is defined
+#endif
+    return p[i];
+}
+
+inline  // Mutable access to entry i of the size array p; the upper bound is asserted in debug builds only.
+int& MatSize::operator[](int i)
+{
+    CV_DbgAssert(i < dims());
+#ifdef __OPENCV_BUILD
+    CV_DbgAssert(i >= 0);  // lower-bound check exists only when __OPENCV_BUILD is defined
+#endif
+    return p[i];
+}
+
+inline  // Implicit conversion exposing the underlying size array pointer p.
+MatSize::operator const int*() const CV_NOEXCEPT
+{
+    return p;
+}
+
+inline  // Inequality defined as the negation of operator==.
+bool MatSize::operator != (const MatSize& sz) const CV_NOEXCEPT
+{
+    return !(*this == sz);
+}
+
+
+
+///////////////////////////// MatStep ////////////////////////////
+
+inline  // Default constructor: points p at the internal buf and zeroes its first two entries.
+MatStep::MatStep() CV_NOEXCEPT
+{
+    p = buf; p[0] = p[1] = 0;
+}
+
+inline  // Points p at the internal buf, stores s as the first step and zeroes the second.
+MatStep::MatStep(size_t s) CV_NOEXCEPT
+{
+    p = buf; p[0] = s; p[1] = 0;
+}
+
+inline  // Read-only access to step entry i through p; no bounds check of any kind.
+const size_t& MatStep::operator[](int i) const CV_NOEXCEPT
+{
+    return p[i];
+}
+
+inline  // Mutable access to step entry i through p; no bounds check of any kind.
+size_t& MatStep::operator[](int i) CV_NOEXCEPT
+{
+    return p[i];
+}
+
+inline MatStep::operator size_t() const  // Implicit conversion yielding the first step, buf[0].
+{
+    CV_DbgAssert( p == buf );  // debug-only: valid only while p still refers to the internal buffer
+    return buf[0];
+}
+
+inline MatStep& MatStep::operator = (size_t s)  // Assigns s to the first step, buf[0]; buf[1] is left untouched.
+{
+    CV_DbgAssert( p == buf );  // debug-only: valid only while p still refers to the internal buffer
+    buf[0] = s;
+    return *this;
+}
+
+
+
+////////////////////////////// Mat_<_Tp> ////////////////////////////
+
+template<typename _Tp> inline  // Default constructor: empty Mat whose type bits in flags are set to traits::Type<_Tp>.
+Mat_<_Tp>::Mat_() CV_NOEXCEPT
+    : Mat()
+{
+    flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;  // clear the type field, then add this template's type
+}
+
+template<typename _Tp> inline  // _rows x _cols matrix of element type traits::Type<_Tp>; elements are not assigned here.
+Mat_<_Tp>::Mat_(int _rows, int _cols)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value)
+{
+}
+
+template<typename _Tp> inline  // _rows x _cols matrix, then value is assigned through Mat_'s operator=.
+Mat_<_Tp>::Mat_(int _rows, int _cols, const _Tp& value)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value)
+{
+    *this = value;
+}
+
+template<typename _Tp> inline  // Size overload: _sz.height rows by _sz.width columns.
+Mat_<_Tp>::Mat_(Size _sz)
+    : Mat(_sz.height, _sz.width, traits::Type<_Tp>::value)
+{}
+
+template<typename _Tp> inline  // Size overload with a value: _sz.height x _sz.width, then value is assigned through operator=.
+Mat_<_Tp>::Mat_(Size _sz, const _Tp& value)
+    : Mat(_sz.height, _sz.width, traits::Type<_Tp>::value)
+{
+    *this = value;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(int _dims, const int* _sz)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(int _dims, const int* _sz, const _Tp& _s)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value, Scalar(_s))
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(int _dims, const int* _sz, _Tp* _data, const size_t* _steps)
+    : Mat(_dims, _sz, traits::Type<_Tp>::value, _data, _steps)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat_<_Tp>& m, const Range* ranges)
+    : Mat(m, ranges)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat_<_Tp>& m, const std::vector<Range>& ranges)
+    : Mat(m, ranges)
+{}
+
+// Builds a typed matrix from an untyped Mat: the header is first tagged with
+// the element type of _Tp, then the work is delegated to operator=(const Mat&).
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat& m)
+    : Mat()
+{
+    flags &= ~CV_MAT_TYPE_MASK;
+    flags += traits::Type<_Tp>::value;
+    *this = m;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat_& m)
+    : Mat(m)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(int _rows, int _cols, _Tp* _data, size_t steps)
+    : Mat(_rows, _cols, traits::Type<_Tp>::value, _data, steps)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat_& m, const Range& _rowRange, const Range& _colRange)
+    : Mat(m, _rowRange, _colRange)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Mat_& m, const Rect& roi)
+    : Mat(m, roi)
+{}
+
+template<typename _Tp> template<int n> inline
+Mat_<_Tp>::Mat_(const Vec<typename DataType<_Tp>::channel_type, n>& vec, bool copyData)
+    : Mat(n / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&vec)
+{
+    CV_Assert(n%DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+template<typename _Tp> template<int m, int n> inline
+Mat_<_Tp>::Mat_(const Matx<typename DataType<_Tp>::channel_type, m, n>& M, bool copyData)
+    : Mat(m, n / DataType<_Tp>::channels, traits::Type<_Tp>::value, (void*)&M)
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Point_<typename DataType<_Tp>::channel_type>& pt, bool copyData)
+    : Mat(2 / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&pt)
+{
+    CV_Assert(2 % DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const Point3_<typename DataType<_Tp>::channel_type>& pt, bool copyData)
+    : Mat(3 / DataType<_Tp>::channels, 1, traits::Type<_Tp>::value, (void*)&pt)
+{
+    CV_Assert(3 % DataType<_Tp>::channels == 0);
+    if( copyData )
+        *this = clone();
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const MatCommaInitializer_<_Tp>& commaInitializer)
+    : Mat(commaInitializer)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const std::vector<_Tp>& vec, bool copyData)
+    : Mat(vec, copyData)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(std::initializer_list<_Tp> list)
+    : Mat(list)
+{}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(const std::initializer_list<int> sizes, std::initializer_list<_Tp> list)
+    : Mat(sizes, list)
+{}
+
+template<typename _Tp> template<std::size_t _Nm> inline
+Mat_<_Tp>::Mat_(const std::array<_Tp, _Nm>& arr, bool copyData)
+    : Mat(arr, copyData)
+{}
+
+// Assigns an untyped Mat to this typed matrix. Cases, in order:
+//  - empty source: this matrix is released;
+//  - identical type: plain Mat assignment;
+//  - identical depth only: the source is reshaped to _Tp's channel count
+//    and the result is assigned;
+//  - otherwise: the channel counts must agree and the data is converted
+//    to this matrix's type.
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat& m)
+{
+    if (m.empty())
+    {
+        release();
+    }
+    else if( traits::Type<_Tp>::value == m.type() )
+    {
+        Mat::operator = (m);
+    }
+    else if( traits::Depth<_Tp>::value == m.depth() )
+    {
+        *this = m.reshape(DataType<_Tp>::channels, m.dims, 0);
+    }
+    else
+    {
+        CV_Assert(DataType<_Tp>::channels == m.channels() || m.empty());
+        m.convertTo(*this, type());
+    }
+    return *this;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat_& m)
+{
+    Mat::operator=(m);
+    return *this;
+}
+
+// Assigns the value s through Mat::operator=(Scalar): s is viewed as
+// DataType<_Tp>::vec_type and wrapped in a Scalar.
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (const _Tp& s)
+{
+    typedef typename DataType<_Tp>::vec_type VT;
+    const VT& asVec = (const VT&)s;
+    Mat::operator=(Scalar(asVec));
+    return *this;
+}
+
+template<typename _Tp> inline
+void Mat_<_Tp>::create(int _rows, int _cols)
+{
+    Mat::create(_rows, _cols, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline
+void Mat_<_Tp>::create(Size _sz)
+{
+    Mat::create(_sz, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline
+void Mat_<_Tp>::create(int _dims, const int* _sz)
+{
+    Mat::create(_dims, _sz, traits::Type<_Tp>::value);
+}
+
+// Releases the matrix via Mat::release() and then re-stamps the element type
+// of _Tp into the type bits of the header flags.
+template<typename _Tp> inline
+void Mat_<_Tp>::release()
+{
+    Mat::release();
+    flags &= ~CV_MAT_TYPE_MASK;
+    flags += traits::Type<_Tp>::value;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::cross(const Mat_& m) const
+{
+    return Mat_<_Tp>(Mat::cross(m));
+}
+
+template<typename _Tp> template<typename T2> inline
+Mat_<_Tp>::operator Mat_<T2>() const
+{
+    return Mat_<T2>(static_cast<const Mat&>(*this));
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::row(int y) const
+{
+    return Mat_(*this, Range(y, y+1), Range::all());
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::col(int x) const
+{
+    return Mat_(*this, Range::all(), Range(x, x+1));
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::diag(int d) const
+{
+    return Mat_(Mat::diag(d));
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::clone() const
+{
+    return Mat_(Mat::clone());
+}
+
+template<typename _Tp> inline
+size_t Mat_<_Tp>::elemSize() const
+{
+    CV_DbgAssert( Mat::elemSize() == sizeof(_Tp) );
+    return sizeof(_Tp);
+}
+
+template<typename _Tp> inline
+size_t Mat_<_Tp>::elemSize1() const
+{
+    CV_DbgAssert( Mat::elemSize1() == sizeof(_Tp) / DataType<_Tp>::channels );
+    return sizeof(_Tp) / DataType<_Tp>::channels;
+}
+
+template<typename _Tp> inline
+int Mat_<_Tp>::type() const
+{
+    CV_DbgAssert( Mat::type() == traits::Type<_Tp>::value );
+    return traits::Type<_Tp>::value;
+}
+
+template<typename _Tp> inline
+int Mat_<_Tp>::depth() const
+{
+    CV_DbgAssert( Mat::depth() == traits::Depth<_Tp>::value );
+    return traits::Depth<_Tp>::value;
+}
+
+template<typename _Tp> inline
+int Mat_<_Tp>::channels() const
+{
+    CV_DbgAssert( Mat::channels() == DataType<_Tp>::channels );
+    return DataType<_Tp>::channels;
+}
+
+template<typename _Tp> inline
+size_t Mat_<_Tp>::stepT(int i) const
+{
+    return step.p[i] / elemSize();
+}
+
+template<typename _Tp> inline
+size_t Mat_<_Tp>::step1(int i) const
+{
+    return step.p[i] / elemSize1();
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::adjustROI( int dtop, int dbottom, int dleft, int dright )
+{
+    return (Mat_<_Tp>&)(Mat::adjustROI(dtop, dbottom, dleft, dright));
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::operator()( const Range& _rowRange, const Range& _colRange ) const
+{
+    return Mat_<_Tp>(*this, _rowRange, _colRange);
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::operator()( const Rect& roi ) const
+{
+    return Mat_<_Tp>(*this, roi);
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::operator()( const Range* ranges ) const
+{
+    return Mat_<_Tp>(*this, ranges);
+}
+
+template<typename _Tp> inline
+Mat_<_Tp> Mat_<_Tp>::operator()(const std::vector<Range>& ranges) const
+{
+    return Mat_<_Tp>(*this, ranges);
+}
+
+template<typename _Tp> inline
+_Tp* Mat_<_Tp>::operator [](int y)
+{
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
+    return (_Tp*)(data + y*step.p[0]);
+}
+
+template<typename _Tp> inline
+const _Tp* Mat_<_Tp>::operator [](int y) const
+{
+    CV_DbgAssert( 0 <= y && y < size.p[0] );
+    return (const _Tp*)(data + y*step.p[0]);
+}
+
+// Element access by (row i0, column i1) for matrices with at most two
+// dimensions. All checks are debug-only assertions.
+template<typename _Tp> inline
+_Tp& Mat_<_Tp>::operator ()(int i0, int i1)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    // Locate the start of row i0, then index the column within it.
+    _Tp* const rowStart = (_Tp*)(data + step.p[0] * i0);
+    return rowStart[i1];
+}
+
+// Read-only element access by (row i0, column i1) for matrices with at most
+// two dimensions. All checks are debug-only assertions.
+template<typename _Tp> inline
+const _Tp& Mat_<_Tp>::operator ()(int i0, int i1) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)i0 < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)i1 < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    // Locate the start of row i0, then index the column within it.
+    const _Tp* const rowStart = (const _Tp*)(data + step.p[0] * i0);
+    return rowStart[i1];
+}
+
+// Element access by point: pt.y selects the row and pt.x the column.
+// Valid for matrices with at most two dimensions; checks are debug-only.
+template<typename _Tp> inline
+_Tp& Mat_<_Tp>::operator ()(Point pt)
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)pt.x < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    _Tp* const rowStart = (_Tp*)(data + step.p[0] * pt.y);
+    return rowStart[pt.x];
+}
+
+// Read-only element access by point: pt.y selects the row and pt.x the column.
+// Valid for matrices with at most two dimensions; checks are debug-only.
+template<typename _Tp> inline
+const _Tp& Mat_<_Tp>::operator ()(Point pt) const
+{
+    CV_DbgAssert(dims <= 2);
+    CV_DbgAssert(data);
+    CV_DbgAssert((unsigned)pt.y < (unsigned)size.p[0]);
+    CV_DbgAssert((unsigned)pt.x < (unsigned)size.p[1]);
+    CV_DbgAssert(type() == traits::Type<_Tp>::value);
+    const _Tp* const rowStart = (const _Tp*)(data + step.p[0] * pt.y);
+    return rowStart[pt.x];
+}
+
+template<typename _Tp> inline
+_Tp& Mat_<_Tp>::operator ()(const int* idx)
+{
+    return Mat::at<_Tp>(idx);
+}
+
+template<typename _Tp> inline
+const _Tp& Mat_<_Tp>::operator ()(const int* idx) const
+{
+    return Mat::at<_Tp>(idx);
+}
+
+template<typename _Tp> template<int n> inline
+_Tp& Mat_<_Tp>::operator ()(const Vec<int, n>& idx)
+{
+    return Mat::at<_Tp>(idx);
+}
+
+template<typename _Tp> template<int n> inline
+const _Tp& Mat_<_Tp>::operator ()(const Vec<int, n>& idx) const
+{
+    return Mat::at<_Tp>(idx);
+}
+
+template<typename _Tp> inline
+_Tp& Mat_<_Tp>::operator ()(int i0)
+{
+    return this->at<_Tp>(i0);
+}
+
+template<typename _Tp> inline
+const _Tp& Mat_<_Tp>::operator ()(int i0) const
+{
+    return this->at<_Tp>(i0);
+}
+
+template<typename _Tp> inline
+_Tp& Mat_<_Tp>::operator ()(int i0, int i1, int i2)
+{
+    return this->at<_Tp>(i0, i1, i2);
+}
+
+template<typename _Tp> inline
+const _Tp& Mat_<_Tp>::operator ()(int i0, int i1, int i2) const
+{
+    return this->at<_Tp>(i0, i1, i2);
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::operator std::vector<_Tp>() const
+{
+    std::vector<_Tp> v;
+    copyTo(v);
+    return v;
+}
+
+template<typename _Tp> template<std::size_t _Nm> inline
+Mat_<_Tp>::operator std::array<_Tp, _Nm>() const
+{
+    std::array<_Tp, _Nm> a;
+    copyTo(a);
+    return a;
+}
+
+template<typename _Tp> template<int n> inline
+Mat_<_Tp>::operator Vec<typename DataType<_Tp>::channel_type, n>() const
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+
+#if defined _MSC_VER
+    const Mat* pMat = (const Mat*)this; // workaround for MSVS <= 2012 compiler bugs (but GCC 4.6 dislikes this workaround)
+    return pMat->operator Vec<typename DataType<_Tp>::channel_type, n>();
+#else
+    return this->Mat::operator Vec<typename DataType<_Tp>::channel_type, n>();
+#endif
+}
+
+template<typename _Tp> template<int m, int n> inline
+Mat_<_Tp>::operator Matx<typename DataType<_Tp>::channel_type, m, n>() const
+{
+    CV_Assert(n % DataType<_Tp>::channels == 0);
+
+#if defined _MSC_VER
+    const Mat* pMat = (const Mat*)this; // workaround for MSVS <= 2012 compiler bugs (but GCC 4.6 dislikes this workaround)
+    Matx<typename DataType<_Tp>::channel_type, m, n> res = pMat->operator Matx<typename DataType<_Tp>::channel_type, m, n>();
+    return res;
+#else
+    Matx<typename DataType<_Tp>::channel_type, m, n> res = this->Mat::operator Matx<typename DataType<_Tp>::channel_type, m, n>();
+    return res;
+#endif
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp> Mat_<_Tp>::begin() const
+{
+    return Mat::begin<_Tp>();
+}
+
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rbegin() const
+{
+    return Mat::rbegin<_Tp>();
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp> Mat_<_Tp>::end() const
+{
+    return Mat::end<_Tp>();
+}
+
+template<typename _Tp> inline
+std::reverse_iterator<MatConstIterator_<_Tp>> Mat_<_Tp>::rend() const
+{
+    return Mat::rend<_Tp>();
+}
+
+template<typename _Tp> inline
+MatIterator_<_Tp> Mat_<_Tp>::begin()
+{
+    return Mat::begin<_Tp>();
+}
+
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rbegin()
+{
+    return Mat::rbegin<_Tp>();
+}
+
+template<typename _Tp> inline
+MatIterator_<_Tp> Mat_<_Tp>::end()
+{
+    return Mat::end<_Tp>();
+}
+
+template<typename _Tp> inline
+std::reverse_iterator<MatIterator_<_Tp>> Mat_<_Tp>::rend()
+{
+    return Mat::rend<_Tp>();
+}
+
+template<typename _Tp> template<typename Functor> inline
+void Mat_<_Tp>::forEach(const Functor& operation) {
+    Mat::forEach<_Tp, Functor>(operation);
+}
+
+template<typename _Tp> template<typename Functor> inline
+void Mat_<_Tp>::forEach(const Functor& operation) const {
+    Mat::forEach<_Tp, Functor>(operation);
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(Mat_&& m)
+    : Mat(std::move(m))
+{
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (Mat_&& m)
+{
+    Mat::operator = (std::move(m));
+    return *this;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(Mat&& m)
+    : Mat()
+{
+    flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
+    *this = std::move(m);
+}
+
+// Move-assigns an untyped Mat to this typed matrix. Cases, in order:
+//  - empty source: this matrix is released;
+//  - identical type: the source is moved in via Mat's move assignment;
+//  - identical depth only: the source is reshaped to _Tp's channel count
+//    and the resulting temporary is moved in;
+//  - otherwise: the channel counts must agree and the data is converted
+//    to this matrix's type.
+template<typename _Tp> inline
+Mat_<_Tp>& Mat_<_Tp>::operator = (Mat&& m)
+{
+    if (m.empty())
+    {
+        release();
+        return *this;
+    }
+    if( traits::Type<_Tp>::value == m.type() )
+    {
+        Mat::operator = (std::move(m));
+        return *this;
+    }
+    if( traits::Depth<_Tp>::value == m.depth() )
+    {
+        // reshape() yields a temporary, so Mat's move assignment is selected.
+        Mat::operator = (m.reshape(DataType<_Tp>::channels, m.dims, 0));
+        return *this;
+    }
+    // Checked in every build configuration, matching operator=(const Mat&):
+    // a channel-count mismatch is reported here instead of only in debug builds.
+    CV_Assert(DataType<_Tp>::channels == m.channels());
+    m.convertTo(*this, type());
+    return *this;
+}
+
+template<typename _Tp> inline
+Mat_<_Tp>::Mat_(MatExpr&& e)
+    : Mat()
+{
+    flags = (flags & ~CV_MAT_TYPE_MASK) + traits::Type<_Tp>::value;
+    *this = Mat(e);
+}
+
+
+///////////////////////////// SparseMat /////////////////////////////
+
+// Returns a new SparseMat that receives this matrix's contents via copyTo().
+inline
+SparseMat SparseMat::clone() const
+{
+    SparseMat duplicate;
+    copyTo(duplicate);
+    return duplicate;
+}
+
+inline
+size_t SparseMat::elemSize() const
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t SparseMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+int SparseMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline
+int SparseMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline
+int SparseMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+// Returns the header's size array, or a null pointer when there is no header.
+inline
+const int* SparseMat::size() const
+{
+    if( !hdr )
+        return 0;
+    return hdr->size;
+}
+
+// Returns the extent of dimension i, or 0 when there is no header.
+// The index is range-checked against hdr->dims in debug builds only.
+inline
+int SparseMat::size(int i) const
+{
+    if( !hdr )
+        return 0;
+    CV_DbgAssert((unsigned)i < (unsigned)hdr->dims);
+    return hdr->size[i];
+}
+
+// Returns the header's dimension count, or 0 when there is no header.
+inline
+int SparseMat::dims() const
+{
+    if( !hdr )
+        return 0;
+    return hdr->dims;
+}
+
+// Returns the header's node count, or 0 when there is no header.
+inline
+size_t SparseMat::nzcount() const
+{
+    if( !hdr )
+        return 0;
+    return hdr->nodeCount;
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat::ref(int i0, size_t* hashval)
+{
+    return *(_Tp*)((SparseMat*)this)->ptr(i0, true, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat::ref(int i0, int i1, size_t* hashval)
+{
+    return *(_Tp*)((SparseMat*)this)->ptr(i0, i1, true, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat::ref(int i0, int i1, int i2, size_t* hashval)
+{
+    return *(_Tp*)((SparseMat*)this)->ptr(i0, i1, i2, true, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat::ref(const int* idx, size_t* hashval)
+{
+    return *(_Tp*)((SparseMat*)this)->ptr(idx, true, hashval);
+}
+
+// Looks up element i0 without creating it (ptr() is called with false).
+// Returns a copy of the element, or a value-initialized _Tp when ptr()
+// returns null.
+template<typename _Tp> inline
+_Tp SparseMat::value(int i0, size_t* hashval) const
+{
+    const _Tp* const found = (const _Tp*)((SparseMat*)this)->ptr(i0, false, hashval);
+    if( !found )
+        return _Tp();
+    return *found;
+}
+
+// Looks up element (i0, i1) without creating it (ptr() is called with false).
+// Returns a copy of the element, or a value-initialized _Tp when ptr()
+// returns null.
+template<typename _Tp> inline
+_Tp SparseMat::value(int i0, int i1, size_t* hashval) const
+{
+    const _Tp* const found = (const _Tp*)((SparseMat*)this)->ptr(i0, i1, false, hashval);
+    if( !found )
+        return _Tp();
+    return *found;
+}
+
+// Looks up element (i0, i1, i2) without creating it (ptr() is called with
+// false). Returns a copy of the element, or a value-initialized _Tp when
+// ptr() returns null.
+template<typename _Tp> inline
+_Tp SparseMat::value(int i0, int i1, int i2, size_t* hashval) const
+{
+    const _Tp* const found = (const _Tp*)((SparseMat*)this)->ptr(i0, i1, i2, false, hashval);
+    if( !found )
+        return _Tp();
+    return *found;
+}
+
+// Looks up the element addressed by the index array idx without creating it
+// (ptr() is called with false). Returns a copy of the element, or a
+// value-initialized _Tp when ptr() returns null.
+template<typename _Tp> inline
+_Tp SparseMat::value(const int* idx, size_t* hashval) const
+{
+    const _Tp* const found = (const _Tp*)((SparseMat*)this)->ptr(idx, false, hashval);
+    if( !found )
+        return _Tp();
+    return *found;
+}
+
+template<typename _Tp> inline
+const _Tp* SparseMat::find(int i0, size_t* hashval) const
+{
+    return (const _Tp*)((SparseMat*)this)->ptr(i0, false, hashval);
+}
+
+template<typename _Tp> inline
+const _Tp* SparseMat::find(int i0, int i1, size_t* hashval) const
+{
+    return (const _Tp*)((SparseMat*)this)->ptr(i0, i1, false, hashval);
+}
+
+template<typename _Tp> inline
+const _Tp* SparseMat::find(int i0, int i1, int i2, size_t* hashval) const
+{
+    return (const _Tp*)((SparseMat*)this)->ptr(i0, i1, i2, false, hashval);
+}
+
+template<typename _Tp> inline
+const _Tp* SparseMat::find(const int* idx, size_t* hashval) const
+{
+    return (const _Tp*)((SparseMat*)this)->ptr(idx, false, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat::value(Node* n)
+{
+    return *(_Tp*)((uchar*)n + hdr->valueOffset);
+}
+
+template<typename _Tp> inline
+const _Tp& SparseMat::value(const Node* n) const
+{
+    return *(const _Tp*)((const uchar*)n + hdr->valueOffset);
+}
+
+inline
+SparseMat::Node* SparseMat::node(size_t nidx)
+{
+    return (Node*)(void*)&hdr->pool[nidx];
+}
+
+inline
+const SparseMat::Node* SparseMat::node(size_t nidx) const
+{
+    return (const Node*)(const void*)&hdr->pool[nidx];
+}
+
+inline
+SparseMatIterator SparseMat::begin()
+{
+    return SparseMatIterator(this);
+}
+
+inline
+SparseMatConstIterator SparseMat::begin() const
+{
+    return SparseMatConstIterator(this);
+}
+
+// Returns an iterator over this matrix that has been moved with seekEnd().
+inline
+SparseMatIterator SparseMat::end()
+{
+    SparseMatIterator pastLast(this);
+    pastLast.seekEnd();
+    return pastLast;
+}
+
+// Returns a const iterator over this matrix that has been moved with seekEnd().
+inline
+SparseMatConstIterator SparseMat::end() const
+{
+    SparseMatConstIterator pastLast(this);
+    pastLast.seekEnd();
+    return pastLast;
+}
+
+template<typename _Tp> inline
+SparseMatIterator_<_Tp> SparseMat::begin()
+{
+    return SparseMatIterator_<_Tp>(this);
+}
+
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp> SparseMat::begin() const
+{
+    return SparseMatConstIterator_<_Tp>(this);
+}
+
+template<typename _Tp> inline
+SparseMatIterator_<_Tp> SparseMat::end()
+{
+    SparseMatIterator_<_Tp> it(this);
+    it.seekEnd();
+    return it;
+}
+
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp> SparseMat::end() const
+{
+    SparseMatConstIterator_<_Tp> it(this);
+    it.seekEnd();
+    return it;
+}
+
+
+
+///////////////////////////// SparseMat_ ////////////////////////////
+
+template<typename _Tp> inline
+SparseMat_<_Tp>::SparseMat_()
+{
+    flags = MAGIC_VAL + traits::Type<_Tp>::value;
+}
+
+template<typename _Tp> inline
+SparseMat_<_Tp>::SparseMat_(int _dims, const int* _sizes)
+    : SparseMat(_dims, _sizes, traits::Type<_Tp>::value)
+{}
+
+// Builds a typed sparse matrix from an untyped one: when the element types
+// differ the source is converted into this object, otherwise it is assigned
+// through the typed assignment operator.
+template<typename _Tp> inline
+SparseMat_<_Tp>::SparseMat_(const SparseMat& m)
+{
+    if( m.type() != traits::Type<_Tp>::value )
+    {
+        m.convertTo(*this, traits::Type<_Tp>::value);
+    }
+    else
+    {
+        *this = (const SparseMat_<_Tp>&)m;
+    }
+}
+
+// Copy constructor: copies the flags and header pointer from m and, when a
+// header is present, increments its reference counter.
+template<typename _Tp> inline
+SparseMat_<_Tp>::SparseMat_(const SparseMat_<_Tp>& m)
+{
+    this->hdr = m.hdr;
+    this->flags = m.flags;
+    if( !this->hdr )
+        return;
+    CV_XADD(&this->hdr->refcount, 1);
+}
+
+template<typename _Tp> inline
+SparseMat_<_Tp>::SparseMat_(const Mat& m)
+{
+    SparseMat sm(m);
+    *this = sm;
+}
+
+// Copy assignment. Self-assignment is a no-op. Otherwise the source header's
+// reference counter is incremented first, the current contents are released,
+// and then the flags and header pointer are taken from m.
+template<typename _Tp> inline
+SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const SparseMat_<_Tp>& m)
+{
+    if( this == &m )
+        return *this;
+
+    // Take the new reference before dropping the old one.
+    if( m.hdr )
+        CV_XADD(&m.hdr->refcount, 1);
+    release();
+    flags = m.flags;
+    hdr = m.hdr;
+    return *this;
+}
+
+// Assigns an untyped sparse matrix: when the element types differ the source
+// is converted into this object, otherwise the typed assignment operator is
+// used.
+template<typename _Tp> inline
+SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const SparseMat& m)
+{
+    if( m.type() != traits::Type<_Tp>::value )
+    {
+        m.convertTo(*this, traits::Type<_Tp>::value);
+        return *this;
+    }
+    return (*this = (const SparseMat_<_Tp>&)m);
+}
+
+template<typename _Tp> inline
+SparseMat_<_Tp>& SparseMat_<_Tp>::operator = (const Mat& m)
+{
+    return (*this = SparseMat(m));
+}
+
+template<typename _Tp> inline
+SparseMat_<_Tp> SparseMat_<_Tp>::clone() const
+{
+    SparseMat_<_Tp> m;
+    this->copyTo(m);
+    return m;
+}
+
+template<typename _Tp> inline
+void SparseMat_<_Tp>::create(int _dims, const int* _sizes)
+{
+    SparseMat::create(_dims, _sizes, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline
+int SparseMat_<_Tp>::type() const
+{
+    return traits::Type<_Tp>::value;
+}
+
+template<typename _Tp> inline
+int SparseMat_<_Tp>::depth() const
+{
+    return traits::Depth<_Tp>::value;
+}
+
+template<typename _Tp> inline
+int SparseMat_<_Tp>::channels() const
+{
+    return DataType<_Tp>::channels;
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat_<_Tp>::ref(int i0, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, hashval);
+}
+
+template<typename _Tp> inline
+_Tp SparseMat_<_Tp>::operator()(int i0, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat_<_Tp>::ref(int i0, int i1, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, i1, hashval);
+}
+
+template<typename _Tp> inline
+_Tp SparseMat_<_Tp>::operator()(int i0, int i1, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, i1, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat_<_Tp>::ref(int i0, int i1, int i2, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(i0, i1, i2, hashval);
+}
+
+template<typename _Tp> inline
+_Tp SparseMat_<_Tp>::operator()(int i0, int i1, int i2, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(i0, i1, i2, hashval);
+}
+
+template<typename _Tp> inline
+_Tp& SparseMat_<_Tp>::ref(const int* idx, size_t* hashval)
+{
+    return SparseMat::ref<_Tp>(idx, hashval);
+}
+
+template<typename _Tp> inline
+_Tp SparseMat_<_Tp>::operator()(const int* idx, size_t* hashval) const
+{
+    return SparseMat::value<_Tp>(idx, hashval);
+}
+
+template<typename _Tp> inline
+SparseMatIterator_<_Tp> SparseMat_<_Tp>::begin()
+{
+    return SparseMatIterator_<_Tp>(this);
+}
+
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp> SparseMat_<_Tp>::begin() const
+{
+    return SparseMatConstIterator_<_Tp>(this);
+}
+
+template<typename _Tp> inline
+SparseMatIterator_<_Tp> SparseMat_<_Tp>::end()
+{
+    SparseMatIterator_<_Tp> it(this);
+    it.seekEnd();
+    return it;
+}
+
+template<typename _Tp> inline
+SparseMatConstIterator_<_Tp> SparseMat_<_Tp>::end() const
+{
+    SparseMatConstIterator_<_Tp> it(this);
+    it.seekEnd();
+    return it;
+}
+
+
+
+////////////////////////// MatConstIterator /////////////////////////
+
+inline
+MatConstIterator::MatConstIterator()
+    : m(0), elemSize(0), ptr(0), sliceStart(0), sliceEnd(0)
+{}
+
+// Creates an iterator attached to *_m and seeks it with a null index array.
+// For a continuous matrix the slice bounds are set to cover the whole data
+// range; such a matrix must not be empty.
+//
+// A null _m is tolerated: the element size is taken as 0 and the iterator is
+// left with the same member values the default constructor sets (the original
+// code dereferenced _m in the initializer list before its own null test).
+inline
+MatConstIterator::MatConstIterator(const Mat* _m)
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0)
+{
+    if( !m )
+        return;
+    if( m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    seek((const int*)0);
+}
+
+// Creates an iterator attached to *_m and seeks it to (_row, _col).
+// _m must be non-null and have at most two dimensions (CV_Assert). For a
+// continuous matrix the slice bounds are set to cover the whole data range;
+// such a matrix must not be empty.
+inline
+MatConstIterator::MatConstIterator(const Mat* _m, int _row, int _col)
+    // Guard the dereference so that a null _m reaches the CV_Assert below
+    // instead of being dereferenced in the initializer list.
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0)
+{
+    CV_Assert(m && m->dims <= 2);
+    if( m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    int idx[] = {_row, _col};
+    seek(idx);
+}
+
+// Creates an iterator attached to *_m and seeks it to row _pt.y, column _pt.x.
+// _m must be non-null and have at most two dimensions (CV_Assert). For a
+// continuous matrix the slice bounds are set to cover the whole data range;
+// such a matrix must not be empty.
+inline
+MatConstIterator::MatConstIterator(const Mat* _m, Point _pt)
+    // Guard the dereference so that a null _m reaches the CV_Assert below
+    // instead of being dereferenced in the initializer list.
+    : m(_m), elemSize(_m ? _m->elemSize() : 0), ptr(0), sliceStart(0), sliceEnd(0)
+{
+    CV_Assert(m && m->dims <= 2);
+    if( m->isContinuous() )
+    {
+        CV_Assert(!m->empty());
+        sliceStart = m->ptr();
+        sliceEnd = sliceStart + m->total()*elemSize;
+    }
+    int idx[] = {_pt.y, _pt.x};
+    seek(idx);
+}
+
+inline
+MatConstIterator::MatConstIterator(const MatConstIterator& it)
+    : m(it.m), elemSize(it.elemSize), ptr(it.ptr), sliceStart(it.sliceStart), sliceEnd(it.sliceEnd)
+{}
+
+// Member-wise copy assignment of all five iterator fields.
+inline
+MatConstIterator& MatConstIterator::operator = (const MatConstIterator& it )
+{
+    m          = it.m;
+    elemSize   = it.elemSize;
+    ptr        = it.ptr;
+    sliceStart = it.sliceStart;
+    sliceEnd   = it.sliceEnd;
+    return *this;
+}
+
+inline
+const uchar* MatConstIterator::operator *() const
+{
+    return ptr;
+}
+
+// Advances the iterator by ofs elements (ofs may be negative).
+// Does nothing when the iterator has no matrix or ofs is zero. If the shifted
+// pointer stays inside [sliceStart, sliceEnd) it is simply stored; otherwise
+// the pointer is left unchanged and a relative seek(ofs, true) is performed.
+inline MatConstIterator& MatConstIterator::operator += (ptrdiff_t ofs)
+{
+    if( m && ofs != 0 )
+    {
+        const ptrdiff_t byteShift = ofs*elemSize;
+        ptr += byteShift;
+        const bool insideSlice = !(ptr < sliceStart || sliceEnd <= ptr);
+        if( !insideSlice )
+        {
+            // Undo the tentative move before the relative seek.
+            ptr -= byteShift;
+            seek(ofs, true);
+        }
+    }
+    return *this;
+}
+
+inline
+MatConstIterator& MatConstIterator::operator -= (ptrdiff_t ofs)
+{
+    return (*this += -ofs);
+}
+
+// Pre-decrement: steps back one element. When the pointer would fall before
+// sliceStart it is restored and a relative seek(-1, true) is performed.
+// Does nothing when the iterator has no matrix.
+inline
+MatConstIterator& MatConstIterator::operator --()
+{
+    if( m )
+    {
+        ptr -= elemSize;
+        if( ptr < sliceStart )
+        {
+            ptr += elemSize;
+            seek(-1, true);
+        }
+    }
+    return *this;
+}
+
+// Post-decrement: moves back by one element and returns a copy of the
+// iterator as it was before the move.
+inline
+MatConstIterator MatConstIterator::operator --(int)
+{
+    MatConstIterator previous(*this);
+    *this += -1;
+    return previous;
+}
+
+// Pre-increment: steps forward one element. When the pointer would reach or
+// pass sliceEnd it is restored and a relative seek(1, true) is performed.
+// Does nothing when the iterator has no matrix.
+inline
+MatConstIterator& MatConstIterator::operator ++()
+{
+    if( m )
+    {
+        ptr += elemSize;
+        if( ptr >= sliceEnd )
+        {
+            ptr -= elemSize;
+            seek(1, true);
+        }
+    }
+    return *this;
+}
+
+// Post-increment: moves forward by one element and returns a copy of the
+// iterator as it was before the move.
+inline MatConstIterator MatConstIterator::operator ++(int)
+{
+    MatConstIterator previous(*this);
+    *this += 1;
+    return previous;
+}
+
+
+static inline
+bool operator == (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.m == b.m && a.ptr == b.ptr;
+}
+
+static inline
+bool operator != (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return !(a == b);
+}
+
+static inline
+bool operator < (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr < b.ptr;
+}
+
+static inline
+bool operator > (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr > b.ptr;
+}
+
+static inline
+bool operator <= (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr <= b.ptr;
+}
+
+static inline
+bool operator >= (const MatConstIterator& a, const MatConstIterator& b)
+{
+    return a.ptr >= b.ptr;
+}
+
+// Distance, in elements, from iterator a to iterator b.
+// Iterators over different matrices yield (size_t)(-1) >> 1. When both share
+// the same sliceEnd the distance is the pointer difference divided by the
+// element size; otherwise it is the difference of their lpos() values.
+static inline
+ptrdiff_t operator - (const MatConstIterator& b, const MatConstIterator& a)
+{
+    const bool sameMatrix = (a.m == b.m);
+    if( !sameMatrix )
+        return ((size_t)(-1) >> 1);
+
+    const bool sameSlice = (a.sliceEnd == b.sliceEnd);
+    if( !sameSlice )
+        return b.lpos() - a.lpos();
+
+    return (b.ptr - a.ptr)/static_cast<ptrdiff_t>(b.elemSize);
+}
+
+static inline
+MatConstIterator operator + (const MatConstIterator& a, ptrdiff_t ofs)
+{
+    MatConstIterator b = a;
+    return b += ofs;
+}
+
+static inline
+MatConstIterator operator + (ptrdiff_t ofs, const MatConstIterator& a)
+{
+    MatConstIterator b = a;
+    return b += ofs;
+}
+
+static inline
+MatConstIterator operator - (const MatConstIterator& a, ptrdiff_t ofs)
+{
+    MatConstIterator b = a;
+    return b += -ofs;
+}
+
+
+inline
+const uchar* MatConstIterator::operator [](ptrdiff_t i) const
+{
+    return *(*this + i);
+}
+
+
+
+///////////////////////// MatConstIterator_ /////////////////////////
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>::MatConstIterator_()
+{}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m)
+    : MatConstIterator(_m)
+{}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m, int _row, int _col)
+    : MatConstIterator(_m, _row, _col)
+{}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>::MatConstIterator_(const Mat_<_Tp>* _m, Point _pt)
+    : MatConstIterator(_m, _pt)
+{}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>::MatConstIterator_(const MatConstIterator_& it)
+    : MatConstIterator(it)
+{}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator = (const MatConstIterator_& it )
+{
+    MatConstIterator::operator = (it);
+    return *this;
+}
+
+template<typename _Tp> inline
+const _Tp& MatConstIterator_<_Tp>::operator *() const
+{
+    return *(_Tp*)(this->ptr);
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator += (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (ofs);
+    return *this;
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator -= (ptrdiff_t ofs)
+{
+    return (*this += -ofs);
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator --()
+{
+    MatConstIterator::operator --();
+    return *this;
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp> MatConstIterator_<_Tp>::operator --(int)
+{
+    MatConstIterator_ b = *this;
+    MatConstIterator::operator --();
+    return b;
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator ++()
+{
+    MatConstIterator::operator ++();
+    return *this;
+}
+
+template<typename _Tp> inline
+MatConstIterator_<_Tp> MatConstIterator_<_Tp>::operator ++(int)
+{
+    MatConstIterator_ b = *this;
+    MatConstIterator::operator ++();
+    return b;
+}
+
+
+// Returns the current position as Point(x, y) (column, row), or Point() when
+// the iterator is not attached to a matrix. The matrix is expected to have at
+// most two dimensions (debug-only assertion).
+template<typename _Tp> inline
+Point MatConstIterator_<_Tp>::pos() const
+{
+    if( !m )
+        return Point();
+    CV_DbgAssert( m->dims <= 2 );
+
+    if( !m->isContinuous() )
+    {
+        // Non-continuous storage: work from the byte offset and the row step.
+        const ptrdiff_t byteOfs = (uchar*)ptr - m->data;
+        const int row = (int)(byteOfs / m->step);
+        const int col = (int)((byteOfs - row * m->step)/sizeof(_Tp));
+        return Point(col, row);
+    }
+
+    // Continuous storage: split the element offset by the column count.
+    const ptrdiff_t elemOfs = (const _Tp*)ptr - (const _Tp*)m->data;
+    const int row = (int)(elemOfs / m->cols);
+    const int col = (int)(elemOfs - (ptrdiff_t)row * m->cols);
+    return Point(col, row);
+}
+
+
+template<typename _Tp> static inline
+bool operator == (const MatConstIterator_<_Tp>& a, const MatConstIterator_<_Tp>& b)
+{
+    return a.m == b.m && a.ptr == b.ptr;
+}
+
+// Two typed const iterators differ unless they refer to the same matrix and
+// hold the same element pointer.
+template<typename _Tp> static inline
+bool operator != (const MatConstIterator_<_Tp>& a, const MatConstIterator_<_Tp>& b)
+{
+    const bool identical = (a.m == b.m && a.ptr == b.ptr);
+    return !identical;
+}
+
+template<typename _Tp> static inline
+MatConstIterator_<_Tp> operator + (const MatConstIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatConstIterator t = (const MatConstIterator&)a + ofs;
+    return (MatConstIterator_<_Tp>&)t;
+}
+
+template<typename _Tp> static inline
+MatConstIterator_<_Tp> operator + (ptrdiff_t ofs, const MatConstIterator_<_Tp>& a)
+{
+    MatConstIterator t = (const MatConstIterator&)a + ofs;
+    return (MatConstIterator_<_Tp>&)t;
+}
+
+template<typename _Tp> static inline
+MatConstIterator_<_Tp> operator - (const MatConstIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatConstIterator t = (const MatConstIterator&)a - ofs;
+    return (MatConstIterator_<_Tp>&)t;
+}
+
+template<typename _Tp> inline
+const _Tp& MatConstIterator_<_Tp>::operator [](ptrdiff_t i) const
+{
+    return *(_Tp*)MatConstIterator::operator [](i);
+}
+
+
+
+//////////////////////////// MatIterator_ ///////////////////////////
+
+template<typename _Tp> inline // default constructor: forwards to the MatConstIterator_ default constructor
+MatIterator_<_Tp>::MatIterator_()
+    : MatConstIterator_<_Tp>()
+{}
+
+template<typename _Tp> inline // from a matrix pointer: forwards _m to the base constructor
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m)
+    : MatConstIterator_<_Tp>(_m)
+{}
+
+template<typename _Tp> inline // from a matrix pointer plus a (row, col) pair: forwarded unchanged to the base
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, int _row, int _col)
+    : MatConstIterator_<_Tp>(_m, _row, _col)
+{}
+
+template<typename _Tp> inline // from a matrix pointer plus a Point: forwarded unchanged to the base
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, Point _pt)
+    : MatConstIterator_<_Tp>(_m, _pt)
+{}
+
+template<typename _Tp> inline // from a matrix pointer plus an index array: forwarded unchanged to the base
+MatIterator_<_Tp>::MatIterator_(Mat_<_Tp>* _m, const int* _idx)
+    : MatConstIterator_<_Tp>(_m, _idx)
+{}
+
+template<typename _Tp> inline // copy constructor: copies through the base-class constructor
+MatIterator_<_Tp>::MatIterator_(const MatIterator_& it)
+    : MatConstIterator_<_Tp>(it)
+{}
+
+template<typename _Tp> inline // copy assignment
+MatIterator_<_Tp>& MatIterator_<_Tp>::operator = (const MatIterator_<_Tp>& it )
+{
+    MatConstIterator::operator = (it); // the assignment is delegated to the untyped base's operator =
+    return *this;
+}
+
+template<typename _Tp> inline // dereference: mutable reference to the element at ptr
+_Tp& MatIterator_<_Tp>::operator *() const
+{
+    return *(_Tp*)(this->ptr); // ptr is cast to a non-const _Tp*, so the result is writable although the method is const
+}
+
+template<typename _Tp> inline // moves the iterator by ofs and returns the typed iterator
+MatIterator_<_Tp>& MatIterator_<_Tp>::operator += (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (ofs); // the movement is delegated to the untyped base
+    return *this;
+}
+
+template<typename _Tp> inline // moves the iterator by -ofs and returns the typed iterator
+MatIterator_<_Tp>& MatIterator_<_Tp>::operator -= (ptrdiff_t ofs)
+{
+    MatConstIterator::operator += (-ofs); // implemented as the base += with the negated offset
+    return *this;
+}
+
+template<typename _Tp> inline // pre-decrement: step back, then return this iterator
+MatIterator_<_Tp>& MatIterator_<_Tp>::operator --()
+{
+    this->MatConstIterator::operator --();
+    return *this;
+}
+
+template<typename _Tp> inline // post-decrement: step back, but return the position held before
+MatIterator_<_Tp> MatIterator_<_Tp>::operator --(int)
+{
+    MatIterator_ previous(*this); // snapshot taken before the base iterator moves
+    this->MatConstIterator::operator --();
+    return previous;
+}
+
+template<typename _Tp> inline // pre-increment: advance, then return this iterator
+MatIterator_<_Tp>& MatIterator_<_Tp>::operator ++()
+{
+    this->MatConstIterator::operator ++();
+    return *this;
+}
+
+template<typename _Tp> inline // post-increment: advance, but return the position held before
+MatIterator_<_Tp> MatIterator_<_Tp>::operator ++(int)
+{
+    MatIterator_ previous(*this); // snapshot taken before the base iterator moves
+    this->MatConstIterator::operator ++();
+    return previous;
+}
+
+template<typename _Tp> inline // random access relative to the current position
+_Tp& MatIterator_<_Tp>::operator [](ptrdiff_t i) const
+{
+    return *(*this + i); // builds the iterator at offset i with operator + and dereferences it
+}
+
+
+template<typename _Tp> static inline
+bool operator == (const MatIterator_<_Tp>& a, const MatIterator_<_Tp>& b)
+{
+    return !(a.m != b.m || a.ptr != b.ptr); // equal only when both the matrix and the element pointer match
+}
+
+template<typename _Tp> static inline
+bool operator != (const MatIterator_<_Tp>& a, const MatIterator_<_Tp>& b)
+{
+    return !(a.m == b.m && a.ptr == b.ptr); // different as soon as the matrix or the element pointer differs
+}
+
+template<typename _Tp> static inline // iterator + offset
+MatIterator_<_Tp> operator + (const MatIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatConstIterator t = (const MatConstIterator&)a + ofs; // the addition is done on the untyped base view of a
+    return (MatIterator_<_Tp>&)t; // the base-class result is reinterpreted as the typed iterator and copied out
+}
+
+template<typename _Tp> static inline // offset + iterator: same computation with the operands swapped
+MatIterator_<_Tp> operator + (ptrdiff_t ofs, const MatIterator_<_Tp>& a)
+{
+    MatConstIterator t = (const MatConstIterator&)a + ofs;
+    return (MatIterator_<_Tp>&)t;
+}
+
+template<typename _Tp> static inline // iterator - offset
+MatIterator_<_Tp> operator - (const MatIterator_<_Tp>& a, ptrdiff_t ofs)
+{
+    MatConstIterator t = (const MatConstIterator&)a - ofs; // the subtraction is done on the untyped base view of a
+    return (MatIterator_<_Tp>&)t; // the base-class result is reinterpreted as the typed iterator and copied out
+}
+
+
+
+/////////////////////// SparseMatConstIterator //////////////////////
+
+inline // default constructor: m, hashidx and ptr all start as zero
+SparseMatConstIterator::SparseMatConstIterator()
+    : m(0), hashidx(0), ptr(0)
+{}
+
+inline // copy constructor: member-wise copy of m, hashidx and ptr
+SparseMatConstIterator::SparseMatConstIterator(const SparseMatConstIterator& it)
+    : m(it.m), hashidx(it.hashidx), ptr(it.ptr)
+{}
+
+inline SparseMatConstIterator& SparseMatConstIterator::operator = (const SparseMatConstIterator& it)
+{
+    // self-assignment leaves the iterator untouched
+    if( this == &it )
+        return *this;
+    m = it.m;
+    hashidx = it.hashidx;
+    ptr = it.ptr;
+    return *this;
+}
+
+template<typename _Tp> inline // read-only access to the current element as _Tp
+const _Tp& SparseMatConstIterator::value() const
+{
+    return *(const _Tp*)ptr; // ptr is reinterpreted as const _Tp*; no null or type check is made here
+}
+
+inline // node owning the current element, or 0 when ptr, m or m->hdr is null
+const SparseMat::Node* SparseMatConstIterator::node() const
+{
+    return (ptr && m && m->hdr) ? (const SparseMat::Node*)(const void*)(ptr - m->hdr->valueOffset) : 0; // node address = ptr - hdr->valueOffset
+}
+
+inline // post-increment: advance, but return the position held before
+SparseMatConstIterator SparseMatConstIterator::operator ++(int)
+{
+    SparseMatConstIterator before(*this); // copy of the current position to hand back
+    this->operator ++();
+    return before;
+}
+
+inline
+void SparseMatConstIterator::seekEnd()
+{
+    if( !m || !m->hdr )
+        return;
+    // end position: bucket index equal to the hash table size, no current element
+    hashidx = m->hdr->hashtab.size();
+    ptr = 0;
+}
+
+
+static inline
+bool operator == (const SparseMatConstIterator& it1, const SparseMatConstIterator& it2)
+{
+    return !(it1.m != it2.m || it1.ptr != it2.ptr); // equal only when both the matrix and the element pointer match
+}
+
+static inline
+bool operator != (const SparseMatConstIterator& it1, const SparseMatConstIterator& it2)
+{
+    return it1.m != it2.m || it1.ptr != it2.ptr; // negation of operator ==, written out directly
+}
+
+
+
+///////////////////////// SparseMatIterator /////////////////////////
+
+inline // default constructor: relies on the base-class default constructor
+SparseMatIterator::SparseMatIterator()
+{}
+
+inline // from a matrix pointer: forwards _m to the base constructor
+SparseMatIterator::SparseMatIterator(SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{}
+
+inline // copy constructor: copies through the base-class copy constructor
+SparseMatIterator::SparseMatIterator(const SparseMatIterator& it)
+    : SparseMatConstIterator(it)
+{}
+
+inline // copy assignment
+SparseMatIterator& SparseMatIterator::operator = (const SparseMatIterator& it)
+{
+    (SparseMatConstIterator&)*this = it; // the assignment is performed by the base-class operator =
+    return *this;
+}
+
+template<typename _Tp> inline // writable access to the current element as _Tp
+_Tp& SparseMatIterator::value() const
+{
+    return *(_Tp*)ptr; // ptr is reinterpreted as a non-const _Tp*; no null or type check is made here
+}
+
+inline // mutable counterpart of SparseMatConstIterator::node()
+SparseMat::Node* SparseMatIterator::node() const
+{
+    return (SparseMat::Node*)SparseMatConstIterator::node(); // same pointer as the const version, with const cast away
+}
+
+inline // pre-increment: advance, then return this iterator
+SparseMatIterator& SparseMatIterator::operator ++()
+{
+    this->SparseMatConstIterator::operator ++();
+    return *this;
+}
+
+inline // post-increment: advance, but return the position held before
+SparseMatIterator SparseMatIterator::operator ++(int)
+{
+    SparseMatIterator before(*this); // copy of the current position to hand back
+    SparseMatConstIterator::operator ++();
+    return before;
+}
+
+
+
+////////////////////// SparseMatConstIterator_ //////////////////////
+
+template<typename _Tp> inline // default constructor: relies on the base-class default constructor
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_()
+{}
+
+template<typename _Tp> inline // from a typed sparse matrix: forwards _m to the base constructor
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat_<_Tp>* _m)
+    : SparseMatConstIterator(_m)
+{}
+
+template<typename _Tp> inline // from an untyped sparse matrix: additionally checks the element type
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMat* _m)
+    : SparseMatConstIterator(_m)
+{
+    CV_Assert( _m->type() == traits::Type<_Tp>::value ); // NOTE(review): _m is dereferenced unconditionally - confirm callers never pass null
+}
+
+template<typename _Tp> inline // copy constructor: copies through the base-class copy constructor
+SparseMatConstIterator_<_Tp>::SparseMatConstIterator_(const SparseMatConstIterator_<_Tp>& it)
+    : SparseMatConstIterator(it)
+{}
+
+template<typename _Tp> inline // copy assignment, carried out entirely by the untyped base's operator =
+SparseMatConstIterator_<_Tp>& SparseMatConstIterator_<_Tp>::operator = (const SparseMatConstIterator_<_Tp>& it)
+{
+    return reinterpret_cast<SparseMatConstIterator_<_Tp>&> // the base reference returned by operator = is viewed as the typed iterator again
+         (*reinterpret_cast<SparseMatConstIterator*>(this) =
+           reinterpret_cast<const SparseMatConstIterator&>(it));
+}
+
+template<typename _Tp> inline // dereference: read-only reference to the element at ptr
+const _Tp& SparseMatConstIterator_<_Tp>::operator *() const
+{
+    return *(const _Tp*)this->ptr; // ptr is reinterpreted as const _Tp*; no null check is made here
+}
+
+template<typename _Tp> inline // pre-increment: advance, then return this iterator
+SparseMatConstIterator_<_Tp>& SparseMatConstIterator_<_Tp>::operator ++()
+{
+    this->SparseMatConstIterator::operator ++();
+    return *this;
+}
+
+template<typename _Tp> inline // post-increment: advance, but return the position held before
+SparseMatConstIterator_<_Tp> SparseMatConstIterator_<_Tp>::operator ++(int)
+{
+    SparseMatConstIterator_<_Tp> before(*this); // copy of the current position to hand back
+    this->SparseMatConstIterator::operator ++();
+    return before;
+}
+
+
+
+///////////////////////// SparseMatIterator_ ////////////////////////
+
+template<typename _Tp> inline // default constructor: relies on the base-class default constructor
+SparseMatIterator_<_Tp>::SparseMatIterator_()
+{}
+
+template<typename _Tp> inline // from a typed sparse matrix: forwards _m to SparseMatConstIterator_
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat_<_Tp>* _m)
+    : SparseMatConstIterator_<_Tp>(_m)
+{}
+
+template<typename _Tp> inline // from an untyped sparse matrix: forwards _m to SparseMatConstIterator_
+SparseMatIterator_<_Tp>::SparseMatIterator_(SparseMat* _m)
+    : SparseMatConstIterator_<_Tp>(_m)
+{}
+
+template<typename _Tp> inline // copy constructor: copies through SparseMatConstIterator_
+SparseMatIterator_<_Tp>::SparseMatIterator_(const SparseMatIterator_<_Tp>& it)
+    : SparseMatConstIterator_<_Tp>(it)
+{}
+
+template<typename _Tp> inline // copy assignment, carried out entirely by the untyped base's operator =
+SparseMatIterator_<_Tp>& SparseMatIterator_<_Tp>::operator = (const SparseMatIterator_<_Tp>& it)
+{
+    return reinterpret_cast<SparseMatIterator_<_Tp>&> // the base reference returned by operator = is viewed as the typed iterator again
+         (*reinterpret_cast<SparseMatConstIterator*>(this) =
+           reinterpret_cast<const SparseMatConstIterator&>(it));
+}
+
+template<typename _Tp> inline // dereference: writable reference to the element at ptr
+_Tp& SparseMatIterator_<_Tp>::operator *() const
+{
+    return *(_Tp*)this->ptr; // ptr is reinterpreted as a non-const _Tp*; no null check is made here
+}
+
+template<typename _Tp> inline // pre-increment: advance, then return this iterator
+SparseMatIterator_<_Tp>& SparseMatIterator_<_Tp>::operator ++()
+{
+    this->SparseMatConstIterator::operator ++();
+    return *this;
+}
+
+template<typename _Tp> inline // post-increment: advance, but return the position held before
+SparseMatIterator_<_Tp> SparseMatIterator_<_Tp>::operator ++(int)
+{
+    SparseMatIterator_<_Tp> before(*this); // copy of the current position to hand back
+    this->SparseMatConstIterator::operator ++();
+    return before;
+}
+
+
+
+//////////////////////// MatCommaInitializer_ ///////////////////////
+
+template<typename _Tp> inline // builds the internal iterator `it` from the target matrix _m
+MatCommaInitializer_<_Tp>::MatCommaInitializer_(Mat_<_Tp>* _m)
+    : it(_m)
+{}
+
+template<typename _Tp> template<typename T2> inline // stores one more value and moves to the next element
+MatCommaInitializer_<_Tp>& MatCommaInitializer_<_Tp>::operator , (T2 v)
+{
+    CV_DbgAssert( this->it < ((const Mat_<_Tp>*)this->it.m)->end() ); // debug-only check that the write position is still before end()
+    *this->it = _Tp(v); // the value is converted to _Tp and written at the current position
+    ++this->it;
+    return *this; // returning *this lets further ", value" terms chain
+}
+
+template<typename _Tp> inline // conversion back to the matrix that was being filled
+MatCommaInitializer_<_Tp>::operator Mat_<_Tp>() const
+{
+    CV_DbgAssert( this->it == ((const Mat_<_Tp>*)this->it.m)->end() ); // debug-only check that the iterator has reached end()
+    return Mat_<_Tp>(*this->it.m); // constructs the result from the matrix the iterator points into
+}
+
+
+template<typename _Tp, typename T2> static inline // starts a comma initializer on m and stores the first value
+MatCommaInitializer_<_Tp> operator << (const Mat_<_Tp>& m, T2 val)
+{
+    MatCommaInitializer_<_Tp> commaInitializer((Mat_<_Tp>*)&m); // const is cast away because the initializer writes into m
+    return (commaInitializer, val); // this comma is the overloaded MatCommaInitializer_::operator ,
+}
+
+
+
+///////////////////////// Matrix Expressions ////////////////////////
+
+inline // assignment from a matrix expression
+Mat& Mat::operator = (const MatExpr& e)
+{
+    e.op->assign(e, *this); // the expression's operation object writes the result into this matrix; e.op is not null-checked
+    return *this;
+}
+
+template<typename _Tp> inline // construction from a matrix expression
+Mat_<_Tp>::Mat_(const MatExpr& e)
+{
+    e.op->assign(e, *this, traits::Type<_Tp>::value); // passes the type code of _Tp as the third argument of assign
+}
+
+template<typename _Tp> inline // assignment from a matrix expression
+Mat_<_Tp>& Mat_<_Tp>::operator = (const MatExpr& e)
+{
+    e.op->assign(e, *this, traits::Type<_Tp>::value); // passes the type code of _Tp as the third argument of assign
+    return *this;
+}
+
+template<typename _Tp> inline // forwards to Mat::zeros with the type code of _Tp
+MatExpr Mat_<_Tp>::zeros(int rows, int cols)
+{
+    return Mat::zeros(rows, cols, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline // Size overload of the same forwarding call
+MatExpr Mat_<_Tp>::zeros(Size sz)
+{
+    return Mat::zeros(sz, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline // forwards to Mat::ones with the type code of _Tp
+MatExpr Mat_<_Tp>::ones(int rows, int cols)
+{
+    return Mat::ones(rows, cols, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline // Size overload of the same forwarding call
+MatExpr Mat_<_Tp>::ones(Size sz)
+{
+    return Mat::ones(sz, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline // forwards to Mat::eye with the type code of _Tp
+MatExpr Mat_<_Tp>::eye(int rows, int cols)
+{
+    return Mat::eye(rows, cols, traits::Type<_Tp>::value);
+}
+
+template<typename _Tp> inline // Size overload of the same forwarding call
+MatExpr Mat_<_Tp>::eye(Size sz)
+{
+    return Mat::eye(sz, traits::Type<_Tp>::value);
+}
+
+inline // default constructor: null op, zero flags/alpha/beta, default-constructed Mat operands and scalar
+MatExpr::MatExpr()
+    : op(0), flags(0), a(Mat()), b(Mat()), c(Mat()), alpha(0), beta(0), s()
+{}
+
+inline // stores each argument in the member of the same name (without the leading underscore)
+MatExpr::MatExpr(const MatOp* _op, int _flags, const Mat& _a, const Mat& _b,
+                 const Mat& _c, double _alpha, double _beta, const Scalar& _s)
+    : op(_op), flags(_flags), a(_a), b(_b), c(_c), alpha(_alpha), beta(_beta), s(_s)
+{}
+
+inline // conversion to Mat
+MatExpr::operator Mat() const
+{
+    Mat m;
+    op->assign(*this, m); // op fills the fresh matrix from this expression; op is not null-checked
+    return m;
+}
+
+template<typename _Tp> inline // conversion to a typed Mat_<_Tp>
+MatExpr::operator Mat_<_Tp>() const
+{
+    Mat_<_Tp> m;
+    op->assign(*this, m, traits::Type<_Tp>::value); // same as the Mat conversion, with the type code of _Tp passed along
+    return m;
+}
+
+
+template<typename _Tp> static inline // typed matrices: dispatch to the cv::min overload taking two Mat
+MatExpr min(const Mat_<_Tp>& a, const Mat_<_Tp>& b)
+{
+    return cv::min(static_cast<const Mat&>(a), static_cast<const Mat&>(b));
+}
+
+template<typename _Tp> static inline // matrix and scalar
+MatExpr min(const Mat_<_Tp>& a, double s)
+{
+    return cv::min(static_cast<const Mat&>(a), s);
+}
+
+template<typename _Tp> static inline // scalar and matrix: forwarded with the matrix first
+MatExpr min(double s, const Mat_<_Tp>& a)
+{
+    return cv::min(static_cast<const Mat&>(a), s);
+}
+
+template<typename _Tp> static inline // typed matrices: dispatch to the cv::max overload taking two Mat
+MatExpr max(const Mat_<_Tp>& a, const Mat_<_Tp>& b)
+{
+    return cv::max(static_cast<const Mat&>(a), static_cast<const Mat&>(b));
+}
+
+template<typename _Tp> static inline // matrix and scalar
+MatExpr max(const Mat_<_Tp>& a, double s)
+{
+    return cv::max(static_cast<const Mat&>(a), s);
+}
+
+template<typename _Tp> static inline // scalar and matrix: forwarded with the matrix first
+MatExpr max(double s, const Mat_<_Tp>& a)
+{
+    return cv::max(static_cast<const Mat&>(a), s);
+}
+
+template<typename _Tp> static inline // typed matrix: dispatch to the cv::abs overload taking a Mat
+MatExpr abs(const Mat_<_Tp>& m)
+{
+    return cv::abs(static_cast<const Mat&>(m));
+}
+
+
+static inline // a += expression: the expression's operation object updates a
+Mat& operator += (Mat& a, const MatExpr& b)
+{
+    (*b.op).augAssignAdd(b, a);
+    return a;
+}
+
+static inline // const overload: constness is dropped so that a can still be updated
+const Mat& operator += (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, const_cast<Mat&>(a));
+    return a;
+}
+
+template<typename _Tp> static inline // typed-matrix overload
+Mat_<_Tp>& operator += (Mat_<_Tp>& a, const MatExpr& b)
+{
+    (*b.op).augAssignAdd(b, a);
+    return a;
+}
+
+template<typename _Tp> static inline // typed const overload: constness is dropped as above
+const Mat_<_Tp>& operator += (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignAdd(b, const_cast<Mat_<_Tp>&>(a));
+    return a;
+}
+
+static inline // a -= expression: the expression's operation object updates a
+Mat& operator -= (Mat& a, const MatExpr& b)
+{
+    (*b.op).augAssignSubtract(b, a);
+    return a;
+}
+
+static inline // const overload: constness is dropped so that a can still be updated
+const Mat& operator -= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, const_cast<Mat&>(a));
+    return a;
+}
+
+template<typename _Tp> static inline // typed-matrix overload
+Mat_<_Tp>& operator -= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    (*b.op).augAssignSubtract(b, a);
+    return a;
+}
+
+template<typename _Tp> static inline // typed const overload: constness is dropped as above
+const Mat_<_Tp>& operator -= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignSubtract(b, const_cast<Mat_<_Tp>&>(a));
+    return a;
+}
+
+static inline // a *= expression: the expression's operation object updates a
+Mat& operator *= (Mat& a, const MatExpr& b)
+{
+    (*b.op).augAssignMultiply(b, a);
+    return a;
+}
+
+static inline // const overload: constness is dropped so that a can still be updated
+const Mat& operator *= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, const_cast<Mat&>(a));
+    return a;
+}
+
+template<typename _Tp> static inline // typed-matrix overload
+Mat_<_Tp>& operator *= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    (*b.op).augAssignMultiply(b, a);
+    return a;
+}
+
+template<typename _Tp> static inline // typed const overload: constness is dropped as above
+const Mat_<_Tp>& operator *= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignMultiply(b, const_cast<Mat_<_Tp>&>(a));
+    return a;
+}
+
+static inline // a /= expression: the expression's operation object updates a
+Mat& operator /= (Mat& a, const MatExpr& b)
+{
+    (*b.op).augAssignDivide(b, a);
+    return a;
+}
+
+static inline // const overload: constness is dropped so that a can still be updated
+const Mat& operator /= (const Mat& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, const_cast<Mat&>(a));
+    return a;
+}
+
+template<typename _Tp> static inline // typed-matrix overload
+Mat_<_Tp>& operator /= (Mat_<_Tp>& a, const MatExpr& b)
+{
+    (*b.op).augAssignDivide(b, a);
+    return a;
+}
+
+template<typename _Tp> static inline // typed const overload: constness is dropped as above
+const Mat_<_Tp>& operator /= (const Mat_<_Tp>& a, const MatExpr& b)
+{
+    b.op->augAssignDivide(b, const_cast<Mat_<_Tp>&>(a));
+    return a;
+}
+
+
+//////////////////////////////// UMat ////////////////////////////////
+
+template<typename _Tp> inline // builds a vec.size() x 1 UMat header whose type code is traits::Type<_Tp>::value
+UMat::UMat(const std::vector<_Tp>& vec, bool copyData)
+: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
+cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
+{
+    if(vec.empty())
+        return; // empty input: only the header fields initialised above are set
+    if( !copyData )
+    {
+        // !!!TODO!!! constructing without a copy is not implemented; only copyData == true works
+        CV_Error(Error::StsNotImplemented, "");
+    }
+    else
+        Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this); // wraps the vector in a temporary Mat header and copies it in
+}
+
+inline // header for the single row y, all columns
+UMat UMat::row(int y) const
+{
+    return (*this)(Range(y, y + 1), Range::all());
+}
+
+inline // header for the single column x, all rows
+UMat UMat::col(int x) const
+{
+    return (*this)(Range::all(), Range(x, x + 1));
+}
+
+inline // rows given as Range(startrow, endrow), all columns
+UMat UMat::rowRange(int startrow, int endrow) const
+{
+    return (*this)(Range(startrow, endrow), Range::all());
+}
+
+inline // rows given as a ready-made Range, all columns
+UMat UMat::rowRange(const Range& r) const
+{
+    return (*this)(r, Range::all());
+}
+
+inline // columns given as Range(startcol, endcol), all rows
+UMat UMat::colRange(int startcol, int endcol) const
+{
+    return (*this)(Range::all(), Range(startcol, endcol));
+}
+
+inline // columns given as a ready-made Range, all rows
+UMat UMat::colRange(const Range& r) const
+{
+    return (*this)(Range::all(), r);
+}
+
+inline // sub-matrix selected by a row range and a column range
+UMat UMat::operator()( Range _rowRange, Range _colRange ) const
+{
+    return UMat(*this, _rowRange, _colRange);
+}
+
+inline // sub-matrix selected by a rectangle
+UMat UMat::operator()( const Rect& roi ) const
+{
+    return UMat(*this, roi);
+}
+
+inline // sub-matrix selected by an array of ranges; forwarded as-is to the UMat constructor
+UMat UMat::operator()(const Range* ranges) const
+{
+    return UMat(*this, ranges);
+}
+
+inline // sub-matrix selected by a vector of ranges; forwarded as-is to the UMat constructor
+UMat UMat::operator()(const std::vector<Range>& ranges) const
+{
+    return UMat(*this, ranges);
+}
+
+inline // true when CONTINUOUS_FLAG is set in flags
+bool UMat::isContinuous() const
+{
+    return (flags & CONTINUOUS_FLAG) != 0;
+}
+
+inline // true when SUBMATRIX_FLAG is set in flags
+bool UMat::isSubmatrix() const
+{
+    return (flags & SUBMATRIX_FLAG) != 0;
+}
+
+inline // element size, read from the step of the last dimension
+size_t UMat::elemSize() const
+{
+    size_t res = dims > 0 ? step.p[dims - 1] : 0; // 0 when the matrix has no dimensions
+    CV_DbgAssert(res != 0); // debug-only: a zero element size is rejected
+    return res;
+}
+
+inline // CV_ELEM_SIZE1 applied to flags
+size_t UMat::elemSize1() const
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline // CV_MAT_TYPE applied to flags
+int UMat::type() const
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline // CV_MAT_DEPTH applied to flags
+int UMat::depth() const
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline // CV_MAT_CN applied to flags
+int UMat::channels() const
+{
+    return CV_MAT_CN(flags);
+}
+
+inline // step of dimension i expressed in units of elemSize1()
+size_t UMat::step1(int i) const
+{
+    return step.p[i] / elemSize1(); // i is not range-checked here
+}
+
+
+inline bool UMatData::hostCopyObsolete() const { return (flags & HOST_COPY_OBSOLETE) != 0; } // each predicate tests one mask in flags
+inline bool UMatData::deviceCopyObsolete() const { return (flags & DEVICE_COPY_OBSOLETE) != 0; }
+inline bool UMatData::deviceMemMapped() const { return (flags & DEVICE_MEM_MAPPED) != 0; }
+inline bool UMatData::copyOnMap() const { return (flags & COPY_ON_MAP) != 0; }
+inline bool UMatData::tempUMat() const { return (flags & TEMP_UMAT) != 0; }
+inline bool UMatData::tempCopiedUMat() const { return (flags & TEMP_COPIED_UMAT) == TEMP_COPIED_UMAT; } // requires every bit of the mask, unlike the != 0 tests
+
+inline void UMatData::markDeviceMemMapped(bool flag)
+{
+    // set DEVICE_MEM_MAPPED when flag is true, clear it otherwise
+    flags = flag
+        ? (flags | DEVICE_MEM_MAPPED)
+        : (flags & ~DEVICE_MEM_MAPPED);
+}
+
+inline void UMatData::markHostCopyObsolete(bool flag)
+{
+    // set HOST_COPY_OBSOLETE when flag is true, clear it otherwise
+    flags = flag
+        ? (flags | HOST_COPY_OBSOLETE)
+        : (flags & ~HOST_COPY_OBSOLETE);
+}
+inline void UMatData::markDeviceCopyObsolete(bool flag)
+{
+    // set DEVICE_COPY_OBSOLETE when flag is true, clear it otherwise
+    flags = flag
+        ? (flags | DEVICE_COPY_OBSOLETE)
+        : (flags & ~DEVICE_COPY_OBSOLETE);
+}
+
+//! @endcond
+
+static inline // non-member swap for MatExpr; forwards to the member swap
+void swap(MatExpr& a, MatExpr& b) { a.swap(b); }
+
+} //cv
+
+#ifdef _MSC_VER
+#pragma warning( pop )
+#endif
+
+#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
+#undef CV_DISABLE_CLANG_ENUM_WARNINGS
+#pragma clang diagnostic pop
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/matx.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/matx.hpp
new file mode 100644
index 0000000..162ce6e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/matx.hpp
@@ -0,0 +1,1528 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_MATX_HPP
+#define OPENCV_CORE_MATX_HPP
+
+#ifndef __cplusplus
+#  error matx.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/base.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/core/saturate.hpp"
+
+#include <initializer_list>
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+////////////////////////////// Small Matrix ///////////////////////////
+
+//! @cond IGNORED
+// Empty tag types: passed as the last argument of Matx constructors to pick an overload. FIXIT Remove this (especially CV_EXPORTS modifier)
+struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} };
+struct CV_EXPORTS Matx_SubOp { Matx_SubOp() {} Matx_SubOp(const Matx_SubOp&) {} };
+struct CV_EXPORTS Matx_ScaleOp { Matx_ScaleOp() {} Matx_ScaleOp(const Matx_ScaleOp&) {} };
+struct CV_EXPORTS Matx_MulOp { Matx_MulOp() {} Matx_MulOp(const Matx_MulOp&) {} };
+struct CV_EXPORTS Matx_DivOp { Matx_DivOp() {} Matx_DivOp(const Matx_DivOp&) {} };
+struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_MatMulOp&) {} };
+struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} };
+//! @endcond
+
+/** @brief Template class for small matrices whose type and size are known at compilation time
+
+If you need a more flexible type, use Mat . The elements of the matrix M are accessible using the
+M(i,j) notation. Most of the common matrix operations (see also @ref MatrixExpressions ) are
+available. To do an operation on Matx that is not implemented, you can easily convert the matrix to
+Mat and back:
+@code{.cpp}
+    Matx33f m(1, 2, 3,
+              4, 5, 6,
+              7, 8, 9);
+    cout << sum(Mat(m*m.t())) << endl;
+@endcode
+Besides the plain constructor, which takes a list of elements, Matx can be initialized from a C-array:
+@code{.cpp}
+    float values[] = { 1, 2, 3};
+    Matx31f m(values);
+@endcode
+If C++11 features are available, std::initializer_list can also be used to initialize Matx:
+@code{.cpp}
+    Matx31f m = { 1, 2, 3};
+@endcode
+ */
+template<typename _Tp, int m, int n> class Matx
+{
+public:
+    enum {
+           rows     = m,
+           cols     = n,
+           channels = rows*cols,
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth    = traits::Type<_Tp>::value,
+           type     = CV_MAKETYPE(depth, channels),
+#endif
+           shortdim = (m < n ? m : n) // the smaller of the two dimensions; length of diag_type
+         };
+
+    typedef _Tp                           value_type;
+    typedef Matx<_Tp, m, n>               mat_type;
+    typedef Matx<_Tp, shortdim, 1> diag_type;
+
+    //! default constructor
+    Matx();
+
+    explicit Matx(_Tp v0); //!< 1x1 matrix
+    Matx(_Tp v0, _Tp v1); //!< 1x2 or 2x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2); //!< 1x3 or 3x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 1x4, 2x2 or 4x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4); //!< 1x5 or 5x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5); //!< 1x6, 2x3, 3x2 or 6x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6); //!< 1x7 or 7x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7); //!< 1x8, 2x4, 4x2 or 8x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8); //!< 1x9, 3x3 or 9x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9); //!< 1x10, 2x5, 5x2 or 10x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11); //!< 1x12, 2x6, 3x4, 4x3, 6x2 or 12x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11,
+         _Tp v12, _Tp v13); //!< 1x14, 2x7, 7x2 or 14x1 matrix
+    Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3,
+         _Tp v4, _Tp v5, _Tp v6, _Tp v7,
+         _Tp v8, _Tp v9, _Tp v10, _Tp v11,
+         _Tp v12, _Tp v13, _Tp v14, _Tp v15); //!< 1x16, 4x4 or 16x1 matrix
+    explicit Matx(const _Tp* vals); //!< initialize from a plain array
+
+    Matx(std::initializer_list<_Tp>); //!< initialize from an initializer list
+
+    CV_NODISCARD_STD static Matx all(_Tp alpha);
+    CV_NODISCARD_STD static Matx zeros();
+    CV_NODISCARD_STD static Matx ones();
+    CV_NODISCARD_STD static Matx eye();
+    CV_NODISCARD_STD static Matx diag(const diag_type& d);
+    /** @brief Generates uniformly distributed random numbers
+    @param a Range boundary.
+    @param b The other range boundary (boundaries don't have to be ordered, the lower boundary is inclusive,
+    the upper one is exclusive).
+     */
+    CV_NODISCARD_STD static Matx randu(_Tp a, _Tp b);
+    /** @brief Generates normally distributed random numbers
+    @param a Mean value.
+    @param b Standard deviation.
+     */
+    CV_NODISCARD_STD static Matx randn(_Tp a, _Tp b);
+
+    //! dot product computed with the default precision
+    _Tp dot(const Matx<_Tp, m, n>& v) const;
+
+    //! dot product computed in double-precision arithmetic
+    double ddot(const Matx<_Tp, m, n>& v) const;
+
+    //! conversion to another data type
+    template<typename T2> operator Matx<T2, m, n>() const;
+
+    //! change the matrix shape
+    template<int m1, int n1> Matx<_Tp, m1, n1> reshape() const;
+
+    //! extract part of the matrix
+    template<int m1, int n1> Matx<_Tp, m1, n1> get_minor(int base_row, int base_col) const;
+
+    //! extract the matrix row
+    Matx<_Tp, 1, n> row(int i) const;
+
+    //! extract the matrix column
+    Matx<_Tp, m, 1> col(int i) const;
+
+    //! extract the matrix diagonal
+    diag_type diag() const;
+
+    //! transpose the matrix
+    Matx<_Tp, n, m> t() const;
+
+    //! invert the matrix
+    Matx<_Tp, n, m> inv(int method=DECOMP_LU, bool *p_is_ok = NULL) const;
+
+    //! solve linear system
+    template<int l> Matx<_Tp, n, l> solve(const Matx<_Tp, m, l>& rhs, int flags=DECOMP_LU) const;
+    Vec<_Tp, n> solve(const Vec<_Tp, m>& rhs, int method) const;
+
+    //! multiply two matrices element-wise
+    Matx<_Tp, m, n> mul(const Matx<_Tp, m, n>& a) const;
+
+    //! divide two matrices element-wise
+    Matx<_Tp, m, n> div(const Matx<_Tp, m, n>& a) const;
+
+    //! element access
+    const _Tp& operator ()(int row, int col) const;
+    _Tp& operator ()(int row, int col);
+
+    //! 1D element access
+    const _Tp& operator ()(int i) const;
+    _Tp& operator ()(int i);
+
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp); // tag-dispatched constructors: the unnamed last argument picks the overload
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp);
+    template<typename _T2> Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp);
+    Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp);
+    template<int l> Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp);
+    Matx(const Matx<_Tp, n, m>& a, Matx_TOp);
+
+    _Tp val[m*n]; //!< matrix elements
+};
+
+typedef Matx<float, 1, 2> Matx12f;
+typedef Matx<double, 1, 2> Matx12d;
+typedef Matx<float, 1, 3> Matx13f;
+typedef Matx<double, 1, 3> Matx13d;
+typedef Matx<float, 1, 4> Matx14f;
+typedef Matx<double, 1, 4> Matx14d;
+typedef Matx<float, 1, 6> Matx16f;
+typedef Matx<double, 1, 6> Matx16d;
+
+typedef Matx<float, 2, 1> Matx21f;
+typedef Matx<double, 2, 1> Matx21d;
+typedef Matx<float, 3, 1> Matx31f;
+typedef Matx<double, 3, 1> Matx31d;
+typedef Matx<float, 4, 1> Matx41f;
+typedef Matx<double, 4, 1> Matx41d;
+typedef Matx<float, 6, 1> Matx61f;
+typedef Matx<double, 6, 1> Matx61d;
+
+typedef Matx<float, 2, 2> Matx22f;
+typedef Matx<double, 2, 2> Matx22d;
+typedef Matx<float, 2, 3> Matx23f;
+typedef Matx<double, 2, 3> Matx23d;
+typedef Matx<float, 3, 2> Matx32f;
+typedef Matx<double, 3, 2> Matx32d;
+
+typedef Matx<float, 3, 3> Matx33f;
+typedef Matx<double, 3, 3> Matx33d;
+
+typedef Matx<float, 3, 4> Matx34f;
+typedef Matx<double, 3, 4> Matx34d;
+typedef Matx<float, 4, 3> Matx43f;
+typedef Matx<double, 4, 3> Matx43d;
+
+typedef Matx<float, 4, 4> Matx44f;
+typedef Matx<double, 4, 4> Matx44d;
+typedef Matx<float, 6, 6> Matx66f;
+typedef Matx<double, 6, 6> Matx66d;
+
+/*!
+  traits: DataType specialization describing Matx<_Tp, m, n> as an element with m*n channels
+*/
+template<typename _Tp, int m, int n> class DataType< Matx<_Tp, m, n> >
+{
+public:
+    typedef Matx<_Tp, m, n>                               value_type;
+    typedef Matx<typename DataType<_Tp>::work_type, m, n> work_type; // same shape, built on the element type's work_type
+    typedef _Tp                                           channel_type;
+    typedef value_type                                    vec_type;
+
+    enum { generic_type = 0,
+           channels     = m * n,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8) // element fmt plus (channels - 1) shifted left by 8
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+};
+
+namespace traits { // Depth/Type specializations for Matx
+template<typename _Tp, int m, int n>
+struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; }; // depth is that of the element type
+template<typename _Tp, int m, int n>
+struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; }; // element depth combined with n*m channels
+} // namespace
+
+
+/** @brief  Comma-separated Matrix Initializer
+*/
+template<typename _Tp, int m, int n> class MatxCommaInitializer
+{
+public:
+    MatxCommaInitializer(Matx<_Tp, m, n>* _mtx);
+    template<typename T2> MatxCommaInitializer<_Tp, m, n>& operator , (T2 val); // returns the initializer itself so that values can be chained
+    Matx<_Tp, m, n> operator *() const;
+
+    Matx<_Tp, m, n>* dst; // presumably the matrix being filled - TODO confirm against the definitions
+    int idx; // presumably the index of the next element to write - TODO confirm against the definitions
+};
+
+/*
+ Utility methods
+*/
+template<typename _Tp, int m> static double determinant(const Matx<_Tp, m, m>& a);
+template<typename _Tp, int m, int n> static double trace(const Matx<_Tp, m, n>& a);
+template<typename _Tp, int m, int n> static double norm(const Matx<_Tp, m, n>& M);
+template<typename _Tp, int m, int n> static double norm(const Matx<_Tp, m, n>& M, int normType);
+
+
+
+/////////////////////// Vec (used as element of multi-channel images) /////////////////////
+
+/** @brief Template class for short numerical vectors, a partial case of Matx
+
+This template class represents short numerical vectors (of 1, 2, 3, 4 ... elements) on which you
+can perform basic arithmetical operations, access individual elements using the [] operator, etc. The
+vectors are allocated on the stack, as opposed to std::valarray, std::vector, cv::Mat etc., whose
+elements are dynamically allocated on the heap.
+
+The template takes 2 parameters:
+@tparam _Tp element type
+@tparam cn the number of elements
+
+In addition to the universal notation like Vec<float, 3>, you can use shorter aliases
+for the most popular specialized variants of Vec, e.g. Vec3f ~ Vec<float, 3>.
+
+It is possible to convert Vec\<T,2\> to/from Point_, Vec\<T,3\> to/from Point3_ , and Vec\<T,4\>
+to CvScalar or Scalar_. Use operator[] to access the elements of Vec.
+
+All the expected vector operations are also implemented:
+-   v1 = v2 + v3
+-   v1 = v2 - v3
+-   v1 = v2 \* scale
+-   v1 = scale \* v2
+-   v1 = -v2
+-   v1 += v2 and other augmenting operations
+-   v1 == v2, v1 != v2
+-   norm(v1) (euclidean norm)
+The Vec class is commonly used to describe pixel types of multi-channel arrays. See Mat for details.
+*/
+template<typename _Tp, int cn> class Vec : public Matx<_Tp, cn, 1> // a Vec is a cn x 1 Matx
+{
+public:
+    typedef _Tp value_type; //!< element type
+    enum {
+           channels = cn, //!< number of elements
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth    = Matx<_Tp, cn, 1>::depth,
+           type     = CV_MAKETYPE(depth, channels),
+#endif
+           _dummy_enum_finalizer = 0 // last enumerator, written without a trailing comma
+         };
+
+    //! default constructor
+    Vec();
+
+    Vec(_Tp v0); //!< 1-element vector constructor
+    Vec(_Tp v0, _Tp v1); //!< 2-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2); //!< 3-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3); //!< 4-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4); //!< 5-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5); //!< 6-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6); //!< 7-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7); //!< 8-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8); //!< 9-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9); //!< 10-element vector constructor
+    Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13); //!< 14-element vector constructor
+    explicit Vec(const _Tp* values); //!< initializes the elements from the array values
+
+    Vec(std::initializer_list<_Tp>); //!< initializes the elements from a brace-enclosed list
+
+    Vec(const Vec<_Tp, cn>& v); //!< copy constructor
+
+    static Vec all(_Tp alpha); //!< vector with every element equal to alpha
+    static Vec ones(); //!< vector with every element equal to 1
+    static Vec randn(_Tp a, _Tp b);
+    static Vec randu(_Tp a, _Tp b);
+    static Vec zeros(); //!< vector with every element equal to 0
+#ifdef CV_CXX11
+    static Vec diag(_Tp alpha) = delete; //!< explicitly deleted for Vec
+    static Vec eye() = delete; //!< explicitly deleted for Vec
+#endif
+
+    //! per-element multiplication
+    Vec mul(const Vec<_Tp, cn>& v) const;
+
+    //! conjugation (makes sense for complex numbers and quaternions)
+    Vec conj() const;
+
+    /*!
+      cross product of the two 3D vectors.
+
+      For other dimensionalities the exception is raised
+    */
+    Vec cross(const Vec& v) const;
+    //! conversion to another data type
+    template<typename T2> operator Vec<T2, cn>() const;
+
+    /*! element access */
+    const _Tp& operator [](int i) const;
+    _Tp& operator[](int i);
+    const _Tp& operator ()(int i) const; //!< same element access as operator[]
+    _Tp& operator ()(int i); //!< same element access as operator[]
+
+#ifdef CV_CXX11
+    Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
+#endif
+
+    Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp); //!< element-wise a + b
+    Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp); //!< element-wise a - b
+    template<typename _T2> Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp); //!< element-wise a * alpha
+};
+
+/** @name Shorter aliases for the most popular specializations of Vec<T,n>
+  @{
+*/
+typedef Vec<uchar, 2> Vec2b; // suffix b: uchar elements
+typedef Vec<uchar, 3> Vec3b;
+typedef Vec<uchar, 4> Vec4b;
+
+typedef Vec<short, 2> Vec2s; // suffix s: short elements
+typedef Vec<short, 3> Vec3s;
+typedef Vec<short, 4> Vec4s;
+
+typedef Vec<ushort, 2> Vec2w; // suffix w: ushort elements
+typedef Vec<ushort, 3> Vec3w;
+typedef Vec<ushort, 4> Vec4w;
+
+typedef Vec<int, 2> Vec2i; // suffix i: int elements
+typedef Vec<int, 3> Vec3i;
+typedef Vec<int, 4> Vec4i;
+typedef Vec<int, 6> Vec6i;
+typedef Vec<int, 8> Vec8i;
+
+typedef Vec<float, 2> Vec2f; // suffix f: float elements
+typedef Vec<float, 3> Vec3f;
+typedef Vec<float, 4> Vec4f;
+typedef Vec<float, 6> Vec6f;
+
+typedef Vec<double, 2> Vec2d; // suffix d: double elements
+typedef Vec<double, 3> Vec3d;
+typedef Vec<double, 4> Vec4d;
+typedef Vec<double, 6> Vec6d;
+/** @} */
+
+/*!
+  traits
+*/
+template<typename _Tp, int cn> class DataType< Vec<_Tp, cn> > // DataType traits specialization for Vec
+{
+public:
+    typedef Vec<_Tp, cn>                               value_type;
+    typedef Vec<typename DataType<_Tp>::work_type, cn> work_type; // Vec over the element's work_type
+    typedef _Tp                                        channel_type; // type of a single element
+    typedef value_type                                 vec_type;
+
+    enum { generic_type = 0,
+           channels     = cn,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8), // element fmt plus (channels - 1) shifted left by 8 bits
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           depth        = DataType<channel_type>::depth,
+           type         = CV_MAKETYPE(depth, channels),
+#endif
+           _dummy_enum_finalizer = 0 // last enumerator, written without a trailing comma
+         };
+};
+
+namespace traits {
+template<typename _Tp, int cn>
+struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; }; // depth of a Vec is the depth of its element type
+template<typename _Tp, int cn>
+struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; }; // element depth combined with cn channels
+} // namespace
+
+
+/** @brief  Comma-separated Vec Initializer
+*/
+template<typename _Tp, int m> class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1>
+{
+public:
+    VecCommaInitializer(Vec<_Tp, m>* _vec); //!< binds the initializer to the vector being filled
+    template<typename T2> VecCommaInitializer<_Tp, m>& operator , (T2 val); //!< stores the next element (saturate_cast to _Tp)
+    Vec<_Tp, m> operator *() const; //!< returns the filled vector
+};
+
+template<typename _Tp, int cn> static Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v); //!< v scaled by 1/norm(v); scaled by 0 when norm(v) is 0
+
+//! @} core_basic
+
+//! @cond IGNORED
+
+///////////////////////////////////// helper classes /////////////////////////////////////
+namespace internal
+{
+
+//! Generic determinant: runs LU() on a copy of the matrix and multiplies its result by the diagonal.
+template<typename _Tp, int m> struct Matx_DetOp {
+    double operator ()(const Matx<_Tp, m, m>& a) const
+    {
+        Matx<_Tp, m, m> lu(a);
+        double det = LU(lu.val, m*sizeof(_Tp), m, 0, 0, 0);
+        if( det != 0 )
+        {
+            for( int k = 0; k < m; ++k ) det *= lu(k, k);
+        }
+        return det;
+    }
+};
+
+//! 1x1 specialization: the determinant is the single element.
+template<typename _Tp> struct Matx_DetOp<_Tp, 1> {
+    double operator ()(const Matx<_Tp, 1, 1>& mat) const
+    {
+        return mat(0,0);
+    }
+};
+
+//! 2x2 specialization: closed-form determinant.
+template<typename _Tp> struct Matx_DetOp<_Tp, 2> {
+    double operator ()(const Matx<_Tp, 2, 2>& mat) const
+    {
+        return mat(0,0)*mat(1,1) - mat(0,1)*mat(1,0);
+    }
+};
+
+//! 3x3 specialization: cofactor expansion along the first row.
+template<typename _Tp> struct Matx_DetOp<_Tp, 3> {
+    double operator ()(const Matx<_Tp, 3, 3>& mat) const
+    {
+        return mat(0,0)*(mat(1,1)*mat(2,2) - mat(2,1)*mat(1,2)) -
+            mat(0,1)*(mat(1,0)*mat(2,2) - mat(2,0)*mat(1,2)) +
+            mat(0,2)*(mat(1,0)*mat(2,1) - mat(2,0)*mat(1,1));
+    }
+};
+
+//! Conjugate of a 2-element vector: keeps element 0 and negates element 1.
+template<typename _Tp> inline
+Vec<_Tp, 2> conjugate(const Vec<_Tp, 2>& z)
+{ return Vec<_Tp, 2>(z[0], -z[1]); }
+
+//! Conjugate of a 4-element vector: keeps element 0 and negates elements 1..3.
+template<typename _Tp> inline
+Vec<_Tp, 4> conjugate(const Vec<_Tp, 4>& q)
+{ return Vec<_Tp, 4>(q[0], -q[1], -q[2], -q[3]); }
+
+} // internal
+
+
+
+////////////////////////////////// Matx Implementation ///////////////////////////////////
+
+//! Default constructor: sets every element to _Tp(0).
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx()
+{
+    for( int k = 0; k < channels; ++k ) val[k] = _Tp(0);
+}
+
+//! Stores v0 in the first element and zero-fills all the others.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0)
+{
+    for( int k = 1; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0;
+}
+
+//! Stores the 2 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1)
+{
+    CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements.");
+    for( int k = 2; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1;
+}
+
+//! Stores the 3 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2)
+{
+    CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements.");
+    for( int k = 3; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2;
+}
+
+//! Stores the 4 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+{
+    CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements.");
+    for( int k = 4; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+}
+
+//! Stores the 5 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+{
+    CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements.");
+    for( int k = 5; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4;
+}
+
+//! Stores the 6 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+{
+    CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements.");
+    for( int k = 6; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5;
+}
+
+//! Stores the 7 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+{
+    CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements.");
+    for( int k = 7; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6;
+}
+
+//! Stores the 8 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+{
+    CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements.");
+    for( int k = 8; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+}
+
+//! Stores the 9 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+{
+    CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements.");
+    for( int k = 9; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8;
+}
+
+//! Stores the 10 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+{
+    CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements.");
+    for( int k = 10; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9;
+}
+
+
+//! Stores the 12 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11)
+{
+    CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements.");
+    for( int k = 12; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+}
+
+//! Stores the 14 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+{
+    CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements.");
+    for( int k = 14; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13;
+}
+
+
+//! Stores the 16 given values in order and zero-fills any remaining elements.
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15)
+{
+    CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements.");
+    for( int k = 16; k < channels; ++k ) val[k] = _Tp(0);
+    val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3;
+    val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7;
+    val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11;
+    val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15;
+}
+
+//! Copies `channels` consecutive elements starting at values; no bounds check is performed.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(const _Tp* values)
+{
+    for( int k = 0; k < channels; ++k ) val[k] = *(values + k);
+}
+
+//! Fills the matrix from an initializer list that is expected to hold exactly `channels` values.
+//! Copying is capped at `channels` and any unfilled tail is zeroed, so a wrong-sized list can
+//! neither write past val[] nor leave elements uninitialized where CV_DbgAssert is inactive.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list)
+{
+    CV_DbgAssert(list.size() == channels);
+    int i = 0;
+    for( auto it = list.begin(); it != list.end() && i < channels; ++it ) val[i++] = *it;
+    for( ; i < channels; i++ ) val[i] = _Tp(0);
+}
+
+//! Returns a matrix whose m*n elements all equal alpha.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha)
+{
+    Matx<_Tp, m, n> filled;
+    for( int k = m*n - 1; k >= 0; --k ) filled.val[k] = alpha;
+    return filled;
+}
+
+//! Returns a matrix with every element set to 0 (delegates to all()).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros()
+{
+    return Matx<_Tp,m,n>::all(0);
+}
+
+//! Returns a matrix with every element set to 1 (delegates to all()).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n> Matx<_Tp,m,n>::ones()
+{
+    return Matx<_Tp,m,n>::all(1);
+}
+
+//! Returns a matrix with 1 on the first `shortdim` diagonal positions and 0 elsewhere.
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n> Matx<_Tp,m,n>::eye()
+{
+    Matx<_Tp,m,n> ident;   // the default constructor zero-fills
+    for( int d = 0; d < shortdim; ++d )
+        ident(d, d) = 1;
+    return ident;
+}
+
+//! Dot product with M; the sum is accumulated and returned in the element type _Tp.
+template<typename _Tp, int m, int n> inline _Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const
+{
+    _Tp acc = 0;
+    for( int k = 0; k < channels; ++k ) acc += val[k]*M.val[k];
+    return acc;
+}
+
+//! Dot product with M; each left operand is widened to double and the sum is kept in double.
+template<typename _Tp, int m, int n> inline double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const
+{
+    double acc = 0;
+    for( int k = 0; k < channels; ++k ) acc += (double)val[k]*M.val[k];
+    return acc;
+}
+
+//! Builds a matrix with the elements of d on the diagonal and 0 elsewhere.
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d)
+{
+    Matx<_Tp,m,n> result;   // the default constructor zero-fills
+    for( int k = 0; k < shortdim; ++k )
+        result(k, k) = d(k, 0);
+    return result;
+}
+
+//! Converts to a matrix of element type T2, applying saturate_cast<T2> to every element.
+template<typename _Tp, int m, int n> template<typename T2> inline Matx<_Tp, m, n>::operator Matx<T2, m, n>() const
+{
+    Matx<T2, m, n> converted;
+    for( int k = 0; k < m*n; ++k ) converted.val[k] = saturate_cast<T2>(val[k]);
+    return converted;
+}
+
+//! Returns a copy of this matrix viewed as m1 x n1; the element counts must match (checked at compile time).
+template<typename _Tp, int m, int n> template<int m1, int n1> inline Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const
+{
+    CV_StaticAssert(m1*n1 == m*n, "Input and destination matrices must have the same number of elements");
+    return (const Matx<_Tp, m1, n1>&)*this;
+}
+
+//! Extracts the m1 x n1 sub-matrix whose top-left corner is (base_row, base_col).
+//! The requested window must lie inside this matrix (checked with CV_DbgAssert).
+template<typename _Tp, int m, int n> template<int m1, int n1> inline Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const
+{
+    CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n);
+    Matx<_Tp, m1, n1> sub;
+    for( int r = 0; r < m1; ++r )
+        for( int c = 0; c < n1; ++c )
+            sub(r, c) = (*this)(base_row + r, base_col + c);
+    return sub;
+}
+
+//! Returns row i as a 1 x n matrix.
+template<typename _Tp, int m, int n> inline Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const
+{
+    CV_DbgAssert((unsigned)i < (unsigned)m);
+    return Matx<_Tp, 1, n>(val + i*n);
+}
+
+//! Returns column j as an m x 1 matrix.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const
+{
+    CV_DbgAssert((unsigned)j < (unsigned)n);
+    Matx<_Tp, m, 1> column;
+    for( int r = 0; r < m; ++r )
+        column.val[r] = val[j + r*n];
+    return column;
+}
+
+//! Returns the first `shortdim` diagonal elements packed into a diag_type.
+template<typename _Tp, int m, int n> inline typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const
+{
+    diag_type out;
+    for( int k = 0; k < shortdim; ++k )
+        out.val[k] = val[k*(n + 1)];
+    return out;
+}
+
+//! Read-only access to element (row_idx, col_idx); the indices are validated by CV_DbgAssert only.
+template<typename _Tp, int m, int n> inline const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
+    return val[col_idx + row_idx*n];
+}
+
+//! Writable access to element (row_idx, col_idx); the indices are validated by CV_DbgAssert only.
+template<typename _Tp, int m, int n> inline _Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx)
+{
+    CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n );
+    return this->val[col_idx + row_idx*n];
+}
+
+//! Read-only single-index access; only compiles for row or column matrices.
+template<typename _Tp, int m, int n> inline const _Tp& Matx<_Tp, m, n>::operator ()(int i) const
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
+    return this->val[i];
+}
+
+//! Writable single-index access; only compiles for row or column matrices.
+template<typename _Tp, int m, int n> inline _Tp& Matx<_Tp, m, n>::operator ()(int i)
+{
+    CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row");
+    CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) );
+    return this->val[i];
+}
+
+//! Element-wise sum constructor: val[k] = saturate_cast<_Tp>(a.val[k] + b.val[k]).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp)
+{
+    for( int k = 0; k < channels; ++k )
+        val[k] = saturate_cast<_Tp>(a.val[k] + b.val[k]);
+}
+
+//! Element-wise difference constructor: val[k] = saturate_cast<_Tp>(a.val[k] - b.val[k]).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp)
+{
+    for( int k = 0; k < channels; ++k )
+        val[k] = saturate_cast<_Tp>(a.val[k] - b.val[k]);
+}
+
+//! Scaling constructor: val[k] = saturate_cast<_Tp>(a.val[k] * alpha).
+template<typename _Tp, int m, int n> template<typename _T2> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp)
+{
+    for( int k = 0; k < channels; ++k )
+        val[k] = saturate_cast<_Tp>(a.val[k] * alpha);
+}
+
+//! Element-wise product constructor: val[k] = saturate_cast<_Tp>(a.val[k] * b.val[k]).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp)
+{
+    for( int k = 0; k < channels; ++k )
+        val[k] = saturate_cast<_Tp>(a.val[k] * b.val[k]);
+}
+
+//! Element-wise quotient constructor: val[k] = saturate_cast<_Tp>(a.val[k] / b.val[k]).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp)
+{
+    for( int k = 0; k < channels; ++k )
+        val[k] = saturate_cast<_Tp>(a.val[k] / b.val[k]);
+}
+
+//! Matrix-product constructor: this = a * b, where a is m x l and b is l x n.
+template<typename _Tp, int m, int n> template<int l> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp)
+{
+    for( int r = 0; r < m; ++r )
+        for( int c = 0; c < n; ++c )
+        {
+            _Tp acc = 0;   // accumulated in the element type
+            for( int t = 0; t < l; ++t )
+                acc += a(r, t) * b(t, c);
+            val[r*n + c] = acc;
+        }
+}
+
+//! Transposition constructor: element (r, c) of the result is a(c, r).
+template<typename _Tp, int m, int n> inline Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp)
+{
+    for( int r = 0; r < m; ++r )
+        for( int c = 0; c < n; ++c )
+            val[c + r*n] = a(c, r);
+}
+
+//! Per-element product of this matrix and a (built through the Matx_MulOp constructor).
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_MulOp());
+}
+
+//! Per-element quotient of this matrix and a (built through the Matx_DivOp constructor).
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const
+{
+    return Matx<_Tp, m, n>(*this, a, Matx_DivOp());
+}
+
+//! Returns the n x m transpose of this matrix (built through the Matx_TOp constructor).
+template<typename _Tp, int m, int n> inline Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const
+{
+    return Matx<_Tp, n, m>(*this, Matx_TOp());
+}
+
+//! Vec overload of solve(): forwards rhs to the Matx<_Tp, m, 1> overload and returns the result as a Vec.
+template<typename _Tp, int m, int n> inline Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const
+{
+    Matx<_Tp, n, 1> sol = solve((const Matx<_Tp, m, 1>&)(rhs), method);
+    return (Vec<_Tp, n>&)(sol);
+}
+
+//! Determinant of a square matrix, computed by the cv::internal::Matx_DetOp functor.
+template<typename _Tp, int m> static inline double determinant(const Matx<_Tp, m, m>& a)
+{
+    return cv::internal::Matx_DetOp<_Tp, m>().operator()(a);
+}
+
+//! Sum of the diagonal elements a(i,i); the sum is accumulated in _Tp and returned as double.
+template<typename _Tp, int m, int n> static inline double trace(const Matx<_Tp, m, n>& a)
+{
+    const int diagLen = std::min(m, n);
+    _Tp sum = 0;
+    for( int d = 0; d < diagLen; ++d ) sum += a(d,d);
+    return sum;
+}
+
+//! Default norm: the square root of normL2Sqr<_Tp, double> taken over all m*n elements.
+template<typename _Tp, int m, int n> static inline double norm(const Matx<_Tp, m, n>& M)
+{
+    return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n));
+}
+
+//! Norm of M selected by normType: NORM_INF, NORM_L1 or NORM_L2SQR; any other value
+//! (including NORM_L2) yields the square root of the squared L2 norm.
+template<typename _Tp, int m, int n> static inline double norm(const Matx<_Tp, m, n>& M, int normType)
+{
+    typedef typename DataType<_Tp>::work_type work_type;
+    const int len = m*n;
+    if( normType == NORM_INF )
+        return (double)normInf<_Tp, work_type>(M.val, len);
+    if( normType == NORM_L1 )
+        return (double)normL1<_Tp, work_type>(M.val, len);
+    if( normType == NORM_L2SQR )
+        return (double)normL2Sqr<_Tp, work_type>(M.val, len);
+    // NORM_L2 and every unrecognised normType
+    return std::sqrt((double)normL2Sqr<_Tp, work_type>(M.val, len));
+}
+
+
+
+//////////////////////////////// matx comma initializer //////////////////////////////////
+
+//! Starts comma-initialization of mtx with val as its first element; writes through the const reference.
+template<typename _Tp, typename _T2, int m, int n> static inline MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val)
+{
+    MatxCommaInitializer<_Tp, m, n> init((Matx<_Tp, m, n>*)&mtx);
+    return (init, val);
+}
+
+//! Binds the initializer to _mtx and positions the write index at element 0.
+template<typename _Tp, int m, int n> inline MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx)
+    : dst(_mtx), idx(0)
+{}
+
+//! Stores value (saturate_cast to _Tp) at the current write index, then advances the index.
+template<typename _Tp, int m, int n> template<typename _T2> inline MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value)
+{
+    CV_DbgAssert( idx < m*n );
+    dst->val[idx++] = saturate_cast<_Tp>(value);
+    return *this;
+}
+
+//! Returns a copy of the target matrix; CV_DbgAssert checks that all m*n elements were supplied.
+template<typename _Tp, int m, int n> inline Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const
+{
+    CV_DbgAssert( idx == n*m );
+    return *dst;
+}
+
+
+
+/////////////////////////////////// Vec Implementation ///////////////////////////////////
+
+//! Default constructor; relies on the Matx base default constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec() {}
+
+//! 1-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0)
+    : Matx<_Tp, cn, 1>(v0) {}
+
+//! 2-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1)
+    : Matx<_Tp, cn, 1>(v0, v1) {}
+
+//! 3-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2)
+    : Matx<_Tp, cn, 1>(v0, v1, v2) {}
+
+//! 4-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {}
+
+//! 5-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {}
+
+//! 6-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {}
+
+//! 7-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {}
+
+//! 8-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {}
+
+//! 9-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {}
+
+//! 10-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {}
+
+//! 14-element constructor; forwards to the matching Matx constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13)
+    : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {}
+
+//! Constructs from a raw element array; forwards to the Matx pointer constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(const _Tp* values)
+    : Matx<_Tp, cn, 1>(values) {}
+
+//! Constructs from an initializer list; forwards to the Matx initializer-list constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list)
+    : Matx<_Tp, cn, 1>(list) {}
+
+//! Copy constructor; copies the elements of m through the Matx pointer constructor.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m)
+    : Matx<_Tp, cn, 1>(m.val) {}
+
+//! Element-wise sum a + b; forwards to the Matx_AddOp constructor of Matx.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+//! Element-wise difference a - b; forwards to the Matx_SubOp constructor of Matx.
+template<typename _Tp, int cn> inline Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op)
+    : Matx<_Tp, cn, 1>(a, b, op) {}
+
+//! Scaled copy a * alpha; forwards to the Matx_ScaleOp constructor of Matx.
+template<typename _Tp, int cn> template<typename _T2> inline Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op)
+    : Matx<_Tp, cn, 1>(a, alpha, op) {}
+
+//! Returns a vector whose cn elements all equal alpha.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha)
+{
+    Vec filled;
+    for( int k = cn - 1; k >= 0; --k ) filled.val[k] = alpha;
+    return filled;
+}
+
+//! Returns a vector with every element set to 1 (delegates to all()).
+template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::ones()
+{
+    return Vec<_Tp, cn>::all(1);
+}
+
+//! Returns a vector with every element set to 0 (delegates to all()).
+template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::zeros()
+{
+    return Vec<_Tp, cn>::all(0);
+}
+
+//! Per-element product of this vector and v; each product is saturate_cast to _Tp.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const
+{
+    Vec<_Tp, cn> product;
+    for( int k = 0; k < cn; ++k ) product.val[k] = saturate_cast<_Tp>(this->val[k]*v.val[k]);
+    return product;
+}
+
+//! conj() for Vec<float, 2>: delegates to cv::internal::conjugate.
+template<> inline Vec<float, 2> Vec<float, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+//! conj() for Vec<double, 2>: delegates to cv::internal::conjugate.
+template<> inline Vec<double, 2> Vec<double, 2>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+//! conj() for Vec<float, 4>: delegates to cv::internal::conjugate.
+template<> inline Vec<float, 4> Vec<float, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+//! conj() for Vec<double, 4>: delegates to cv::internal::conjugate.
+template<> inline Vec<double, 4> Vec<double, 4>::conj() const
+{
+    return cv::internal::conjugate(*this);
+}
+
+//! Generic cross(): statically asserts cn == 3 and returns a default-constructed Vec.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>&) const
+{
+    CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined");
+    return Vec<_Tp, cn>();
+}
+
+//! Cross product of two 3-element float vectors (this x v).
+template<> inline Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
+{
+    const float *p = this->val, *q = v.val;
+    return Vec<float,3>(p[1]*q[2] - p[2]*q[1], p[2]*q[0] - p[0]*q[2],
+                     p[0]*q[1] - p[1]*q[0]);
+}
+
+//! Cross product of two 3-element double vectors (this x v).
+template<> inline Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v) const
+{
+    const double *p = this->val, *q = v.val;
+    return Vec<double,3>(p[1]*q[2] - p[2]*q[1], p[2]*q[0] - p[0]*q[2],
+                     p[0]*q[1] - p[1]*q[0]);
+}
+
+//! Converts to a vector of element type T2, applying saturate_cast<T2> to every element.
+template<typename _Tp, int cn> template<typename T2> inline Vec<_Tp, cn>::operator Vec<T2, cn>() const
+{
+    Vec<T2, cn> converted;
+    for( int k = 0; k < cn; ++k ) converted.val[k] = saturate_cast<T2>(this->val[k]);
+    return converted;
+}
+
+//! Read-only access to element i; the index is validated by CV_DbgAssert only.
+template<typename _Tp, int cn> inline const _Tp& Vec<_Tp, cn>::operator [](int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return *(this->val + i);
+}
+
+//! Writable access to element i; the index is validated by CV_DbgAssert only.
+template<typename _Tp, int cn> inline _Tp& Vec<_Tp, cn>::operator [](int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return *(this->val + i);
+}
+
+//! Read-only access to element i (same check and result as operator[]).
+template<typename _Tp, int cn> inline const _Tp& Vec<_Tp, cn>::operator ()(int i) const
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return *(this->val + i);
+}
+
+//! Writable access to element i (same check and result as operator[]).
+template<typename _Tp, int cn> inline _Tp& Vec<_Tp, cn>::operator ()(int i)
+{
+    CV_DbgAssert( (unsigned)i < (unsigned)cn );
+    return *(this->val + i);
+}
+
+//! Returns v scaled by 1/norm(v); when norm(v) is 0 the vector is scaled by 0 instead.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v)
+{
+    const double len = norm(v);
+    return v * (len ? 1./len : 0.);
+}
+
+
+
+//////////////////////////////// vec comma initializer //////////////////////////////////
+
+
+//! Starts comma-initialization of vec with val as its first element; writes through the const reference.
+template<typename _Tp, typename _T2, int cn> static inline VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val)
+{
+    VecCommaInitializer<_Tp, cn> init((Vec<_Tp, cn>*)&vec);
+    return (init, val);
+}
+
+//! Binds the initializer to _vec by forwarding to the MatxCommaInitializer base.
+template<typename _Tp, int cn> inline VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec)
+    : MatxCommaInitializer<_Tp, cn, 1>(_vec)
+{}
+
+//! Stores value (saturate_cast to _Tp) at the current write index, then advances the index.
+template<typename _Tp, int cn> template<typename _T2> inline VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value)
+{
+    CV_DbgAssert( this->idx < cn );
+    this->dst->val[this->idx++] = saturate_cast<_Tp>(value);
+    return *this;
+}
+
+//! Returns the target as a Vec; CV_DbgAssert checks that all cn elements were supplied.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const
+{
+    CV_DbgAssert( this->idx == cn );
+    return *this->dst;
+}
+
+//! @endcond
+
+///////////////////////////// Matx out-of-class operators ////////////////////////////////
+
+//! @relates cv::Matx
+//! @{
+
+//! In-place element-wise addition; each sum is saturate_cast to the left operand's element type.
+template<typename _Tp1, typename _Tp2, int m, int n> static inline Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int k = 0; k < m*n; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] + b.val[k]);
+    return a;
+}
+
+//! In-place element-wise subtraction; each difference is saturate_cast to the left operand's element type.
+template<typename _Tp1, typename _Tp2, int m, int n> static inline Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b)
+{
+    for( int k = 0; k < m*n; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] - b.val[k]);
+    return a;
+}
+
+//! Element-wise sum of two matrices (built through the Matx_AddOp constructor).
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_AddOp());
+}
+
+//! Element-wise difference of two matrices (built through the Matx_SubOp constructor).
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_SubOp());
+}
+
+//! In-place scaling by an int factor; each product is saturate_cast to _Tp.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha)
+{
+    for( int k = 0; k < m*n; ++k )
+        a.val[k] = saturate_cast<_Tp>(a.val[k] * alpha);
+    return a;
+}
+
+//! In-place scaling by a float factor; each product is saturate_cast to _Tp.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int k = 0; k < m*n; ++k )
+        a.val[k] = saturate_cast<_Tp>(a.val[k] * alpha);
+    return a;
+}
+
+//! In-place scaling by a double factor; each product is saturate_cast to _Tp.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int k = 0; k < m*n; ++k )
+        a.val[k] = saturate_cast<_Tp>(a.val[k] * alpha);
+    return a;
+}
+
+//! Matrix scaled by an int factor (built through the Matx_ScaleOp constructor).
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! Matrix scaled by a float factor (built through the Matx_ScaleOp constructor).
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! Matrix scaled by a double factor (built through the Matx_ScaleOp constructor).
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! Int factor times matrix; same result as the matrix * factor overload.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! Float factor times matrix; same result as the matrix * factor overload.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! Double factor times matrix; same result as the matrix * factor overload.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
+}
+
+//! In-place division of every element by a float scalar. The quotient goes through
+//! saturate_cast<_Tp>, as operator*= and the Vec operator/= overloads already do.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
+{
+    for( int i = 0; i < m*n; i++ ) a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+    return a;
+}
+
+//! In-place division of every element by a double scalar. The quotient goes through
+//! saturate_cast<_Tp>, as operator*= and the Vec operator/= overloads already do.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
+{
+    for( int i = 0; i < m*n; i++ ) a.val[i] = saturate_cast<_Tp>(a.val[i] / alpha);
+    return a;
+}
+
+//! Matrix divided by a float scalar, implemented as scaling by 1.f/alpha.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
+{
+    return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());
+}
+
+//! Matrix divided by a double scalar, implemented as scaling by 1./alpha.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
+{
+    return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
+}
+
+//! Unary minus, implemented as scaling by -1.
+template<typename _Tp, int m, int n> static inline Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
+{
+    return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp());
+}
+
+//! Matrix product of an m x l and an l x n matrix (built through the Matx_MatMulOp constructor).
+template<typename _Tp, int m, int n, int l> static inline Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b)
+{
+    return Matx<_Tp, m, n>(a, b, Matx_MatMulOp());
+}
+
+//! Product of an m x n matrix and an n-element vector, returned as an m-element Vec.
+template<typename _Tp, int m, int n> static inline Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b)
+{
+    Matx<_Tp, m, 1> product(a, b, Matx_MatMulOp());
+    return (const Vec<_Tp, m>&)(product);
+}
+
+//! True when no pair of corresponding elements compares unequal with !=.
+template<typename _Tp, int m, int n> static inline bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    int k = 0;
+    while( k < m*n && !(a.val[k] != b.val[k]) ) ++k;   // stop at the first mismatch
+    return k == m*n;
+}
+
+//! Negation of operator== for matrices of the same type and size.
+template<typename _Tp, int m, int n> static inline bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b)
+{
+    return !(a == b);
+}
+
+//! @}
+
+////////////////////////////// Vec out-of-class operators ////////////////////////////////
+
+//! @relates cv::Vec
+//! @{
+
+//! In-place element-wise addition; each sum is saturate_cast to the left operand's element type.
+template<typename _Tp1, typename _Tp2, int cn> static inline Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int k = 0; k < cn; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] + b.val[k]);
+    return a;
+}
+
+//! In-place element-wise subtraction; each difference is saturate_cast to the left operand's element type.
+template<typename _Tp1, typename _Tp2, int cn> static inline Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b)
+{
+    for( int k = 0; k < cn; ++k )
+        a.val[k] = saturate_cast<_Tp1>(a.val[k] - b.val[k]);
+    return a;
+}
+
+//! Element-wise sum of two vectors (built through the Matx_AddOp constructor).
+template<typename _Tp, int cn> static inline Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    return Vec<_Tp, cn>(a, b, Matx_AddOp());
+}
+
+/** @brief Difference of two vectors of equal type and length, delegated to the Matx_SubOp constructor. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type difference(a, b, Matx_SubOp());
+    return difference;
+}
+
+/** @brief Multiplies every element of @p a by the int @p alpha in place, saturating each product to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha)
+{
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*alpha);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Multiplies every element of @p a by the float @p alpha in place, saturating each product to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha)
+{
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*alpha);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Multiplies every element of @p a by the double @p alpha in place, saturating each product to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
+{
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*alpha);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Divides every element of @p a by the int @p alpha in place.
+
+The division is carried out as a multiplication by the double-precision
+reciprocal 1./alpha; each product is saturated to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
+{
+    const double reciprocal = 1./alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*reciprocal);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Divides every element of @p a by the float @p alpha in place.
+
+The division is carried out as a multiplication by the single-precision
+reciprocal 1.f/alpha; each product is saturated to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
+{
+    const float reciprocal = 1.f/alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*reciprocal);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Divides every element of @p a by the double @p alpha in place.
+
+The division is carried out as a multiplication by the double-precision
+reciprocal 1./alpha; each product is saturated to _Tp.
+@return reference to the modified vector
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
+{
+    const double reciprocal = 1./alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        _Tp& elem = a[k];
+        elem = saturate_cast<_Tp>(elem*reciprocal);
+        ++k;
+    }
+    return a;
+}
+
+/** @brief Vector times int scalar; @p a and @p alpha are forwarded to the Matx_ScaleOp constructor. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Int scalar times vector; same construction as the (Vec, int) form with the operands swapped. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Vector times float scalar; @p a and @p alpha are forwarded to the Matx_ScaleOp constructor. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Float scalar times vector; same construction as the (Vec, float) form with the operands swapped. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Vector times double scalar; @p a and @p alpha are forwarded to the Matx_ScaleOp constructor. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Double scalar times vector; same construction as the (Vec, double) form with the operands swapped. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a)
+{
+    typedef Vec<_Tp, cn> vec_type;
+    vec_type scaled(a, alpha, Matx_ScaleOp());
+    return scaled;
+}
+
+/** @brief Divides a vector by an int scalar.
+
+The double-precision reciprocal 1./alpha is forwarded, together with @p a, to
+the Matx_ScaleOp constructor.
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha)
+{
+    const double inv_alpha = 1./alpha;
+    return Vec<_Tp, cn>(a, inv_alpha, Matx_ScaleOp());
+}
+
+/** @brief Divides a vector by a float scalar.
+
+The single-precision reciprocal 1.f/alpha is forwarded, together with @p a, to
+the Matx_ScaleOp constructor.
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha)
+{
+    const float inv_alpha = 1.f/alpha;
+    return Vec<_Tp, cn>(a, inv_alpha, Matx_ScaleOp());
+}
+
+/** @brief Divides a vector by a double scalar.
+
+The double-precision reciprocal 1./alpha is forwarded, together with @p a, to
+the Matx_ScaleOp constructor.
+*/
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha)
+{
+    const double inv_alpha = 1./alpha;
+    return Vec<_Tp, cn>(a, inv_alpha, Matx_ScaleOp());
+}
+
+/** @brief Unary minus: negates each element and saturates the result to _Tp. */
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a)
+{
+    Vec<_Tp,cn> negated;
+    int k = 0;
+    while( k < cn )
+    {
+        negated.val[k] = saturate_cast<_Tp>(-a.val[k]);
+        ++k;
+    }
+    return negated;
+}
+
+/** @brief Product of two 4-element vectors.
+
+The four output components follow the Hamilton product of two quaternions
+stored with the scalar part in element 0; each component is saturated to _Tp.
+*/
+template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    const _Tp a1 = v1[0], b1 = v1[1], c1 = v1[2], d1 = v1[3];
+    const _Tp a2 = v2[0], b2 = v2[1], c2 = v2[2], d2 = v2[3];
+
+    return Vec<_Tp, 4>(saturate_cast<_Tp>(a1*a2 - b1*b2 - c1*c2 - d1*d2),
+                       saturate_cast<_Tp>(a1*b2 + b1*a2 + c1*d2 - d1*c2),
+                       saturate_cast<_Tp>(a1*c2 - b1*d2 + c1*a2 + d1*b2),
+                       saturate_cast<_Tp>(a1*d2 + b1*c2 - c1*b2 + d1*a2));
+}
+
+/** @brief In-place form of the 4-element vector product: stores v1 * v2 back into @p v1 and returns it. */
+template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    const Vec<_Tp, 4> product = v1 * v2;
+    v1 = product;
+    return v1;
+}
+
+//! @}
+
+} // cv
+
+#endif // OPENCV_CORE_MATX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/neon_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/neon_utils.hpp
new file mode 100644
index 0000000..573ba99
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/neon_utils.hpp
@@ -0,0 +1,128 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_NEON_UTILS_HPP
+#define OPENCV_HAL_NEON_UTILS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_neon
+//! @{
+
+#if CV_NEON
+
+/** @brief Converts two floats to signed 32-bit integers after a sign-matched 0.5 bias.
+
+Each lane gets 0.5 carrying that lane's own sign bit added to it, and the sum
+is converted with vcvt_s32_f32. Per the Arm intrinsics reference that
+conversion truncates toward zero, which would make the overall effect a
+round-to-nearest -- confirm against the reference.
+
+@param v two single-precision input lanes
+@return the two converted lanes
+*/
+inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
+{
+    // Plain locals instead of function-local statics: the compiler can
+    // materialise the constants directly, and no guarded one-time
+    // initialisation has to be checked on every call of this inline helper.
+    // The sign mask is spelled as INT_MIN (32-bit int) to avoid shifting a
+    // signed 1 into the sign bit.
+    const int32x2_t v_sign = vdup_n_s32(-2147483647 - 1);
+    const int32x2_t v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));
+
+    // bit pattern of 0.5 OR-ed with the sign bit extracted from v
+    const int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
+    return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
+}
+
+/** @brief Converts four floats to signed 32-bit integers after a sign-matched 0.5 bias.
+
+Each lane gets 0.5 carrying that lane's own sign bit added to it, and the sum
+is converted with vcvtq_s32_f32. Per the Arm intrinsics reference that
+conversion truncates toward zero, which would make the overall effect a
+round-to-nearest -- confirm against the reference.
+
+@param v four single-precision input lanes
+@return the four converted lanes
+*/
+inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
+{
+    // Plain locals instead of function-local statics: the compiler can
+    // materialise the constants directly, and no guarded one-time
+    // initialisation has to be checked on every call of this inline helper.
+    // The sign mask is spelled as INT_MIN (32-bit int) to avoid shifting a
+    // signed 1 into the sign bit.
+    const int32x4_t v_sign = vdupq_n_s32(-2147483647 - 1);
+    const int32x4_t v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
+
+    // bit pattern of 0.5 OR-ed with the sign bit extracted from v
+    const int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
+    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
+}
+
+/** @brief Converts two floats to unsigned 32-bit integers after adding 0.5 to each lane.
+
+@param v two single-precision input lanes
+@return vcvt_u32_f32 applied to v + 0.5
+*/
+inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v)
+{
+    // A plain local avoids the guarded one-time initialisation that a
+    // function-local static vector would need to check on every call.
+    const float32x2_t v_05 = vdup_n_f32(0.5f);
+    return vcvt_u32_f32(vadd_f32(v, v_05));
+}
+
+/** @brief Converts four floats to unsigned 32-bit integers after adding 0.5 to each lane.
+
+@param v four single-precision input lanes
+@return vcvtq_u32_f32 applied to v + 0.5
+*/
+inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
+{
+    // A plain local avoids the guarded one-time initialisation that a
+    // function-local static vector would need to check on every call.
+    const float32x4_t v_05 = vdupq_n_f32(0.5f);
+    return vcvtq_u32_f32(vaddq_f32(v, v_05));
+}
+
+/** @brief Per-lane reciprocal of four floats.
+
+Starts from the vrecpeq_f32 estimate and applies two refinement passes, each
+multiplying the current estimate by vrecpsq_f32(val, estimate).
+*/
+inline float32x4_t cv_vrecpq_f32(float32x4_t val)
+{
+    float32x4_t estimate = vrecpeq_f32(val);
+    for( int step = 0; step < 2; ++step )
+        estimate = vmulq_f32(vrecpsq_f32(val, estimate), estimate);
+    return estimate;
+}
+
+/** @brief Per-lane reciprocal of two floats.
+
+Starts from the vrecpe_f32 estimate and applies two refinement passes, each
+multiplying the current estimate by vrecps_f32(val, estimate).
+*/
+inline float32x2_t cv_vrecp_f32(float32x2_t val)
+{
+    float32x2_t estimate = vrecpe_f32(val);
+    for( int step = 0; step < 2; ++step )
+        estimate = vmul_f32(vrecps_f32(val, estimate), estimate);
+    return estimate;
+}
+
+/** @brief Per-lane reciprocal square root of four floats.
+
+Starts from the vrsqrteq_f32 estimate and applies two refinement passes, each
+multiplying the estimate by vrsqrtsq_f32(estimate*estimate, val).
+*/
+inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
+{
+    float32x4_t estimate = vrsqrteq_f32(val);
+    for( int step = 0; step < 2; ++step )
+    {
+        const float32x4_t squared = vmulq_f32(estimate, estimate);
+        estimate = vmulq_f32(vrsqrtsq_f32(squared, val), estimate);
+    }
+    return estimate;
+}
+
+/** @brief Per-lane reciprocal square root of two floats.
+
+Starts from the vrsqrte_f32 estimate and applies two refinement passes, each
+multiplying the estimate by vrsqrts_f32(estimate*estimate, val).
+*/
+inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
+{
+    float32x2_t estimate = vrsqrte_f32(val);
+    for( int step = 0; step < 2; ++step )
+    {
+        const float32x2_t squared = vmul_f32(estimate, estimate);
+        estimate = vmul_f32(vrsqrts_f32(squared, val), estimate);
+    }
+    return estimate;
+}
+
+/** @brief Per-lane square root of four floats, computed as the reciprocal of the reciprocal square root. */
+inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
+{
+    const float32x4_t inv_sqrt = cv_vrsqrtq_f32(val);
+    return cv_vrecpq_f32(inv_sqrt);
+}
+
+/** @brief Per-lane square root of two floats, computed as the reciprocal of the reciprocal square root. */
+inline float32x2_t cv_vsqrt_f32(float32x2_t val)
+{
+    const float32x2_t inv_sqrt = cv_vrsqrt_f32(val);
+    return cv_vrecp_f32(inv_sqrt);
+}
+
+#endif
+
+//! @}
+
+#endif // OPENCV_HAL_NEON_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl.hpp
new file mode 100644
index 0000000..642b050
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl.hpp
@@ -0,0 +1,902 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPENCL_HPP
+#define OPENCV_OPENCL_HPP
+
+#include "opencv2/core.hpp"
+#include <typeinfo>
+#include <typeindex>
+
+namespace cv { namespace ocl {
+
+//! @addtogroup core_opencl
+//! @{
+
+// Process-level OpenCL queries and switches. Only the declarations are visible
+// in this header, so what each call reports is inferred from its name --
+// confirm the details against the implementation before relying on them.
+CV_EXPORTS_W bool haveOpenCL();
+CV_EXPORTS_W bool useOpenCL();
+CV_EXPORTS_W bool haveAmdBlas();
+CV_EXPORTS_W bool haveAmdFft();
+CV_EXPORTS_W void setUseOpenCL(bool flag);
+CV_EXPORTS_W void finish();
+
+// NOTE(review): exported with plain CV_EXPORTS, unlike the CV_EXPORTS_W functions above.
+CV_EXPORTS bool haveSVM();
+
+// Forward declarations of the OpenCL wrapper classes, so that the class
+// definitions in this header can name each other in member signatures
+// regardless of definition order.
+class CV_EXPORTS Context;
+class CV_EXPORTS_W_SIMPLE Device;
+class CV_EXPORTS Kernel;
+class CV_EXPORTS Program;
+class CV_EXPORTS ProgramSource;
+class CV_EXPORTS Queue;
+class CV_EXPORTS PlatformInfo;
+class CV_EXPORTS Image2D;
+
+/** @brief Handle class describing a single OpenCL device.
+
+The object stores one pointer to an opaque Impl. Almost every member is a
+const query that is only declared here, so the notes below are limited to what
+the declarations themselves show; the meaning of individual queries is
+presumably the OpenCL device property of the same name (TODO confirm against
+the implementation).
+*/
+class CV_EXPORTS_W_SIMPLE Device
+{
+public:
+    CV_WRAP Device() CV_NOEXCEPT;
+    //! Constructs from a raw handle; presumably a cl_device_id as documented for fromHandle() -- TODO confirm
+    explicit Device(void* d);
+    Device(const Device& d);
+    Device& operator = (const Device& d);
+    Device(Device&& d) CV_NOEXCEPT;
+    Device& operator = (Device&& d) CV_NOEXCEPT;
+    CV_WRAP ~Device();
+
+    void set(void* d);
+
+    //! Device type bit flags. TYPE_DGPU and TYPE_IGPU are TYPE_GPU plus one extra
+    //! high bit (bit 16 and bit 17 respectively); TYPE_ALL has all 32 bits set.
+    enum
+    {
+        TYPE_DEFAULT     = (1 << 0),
+        TYPE_CPU         = (1 << 1),
+        TYPE_GPU         = (1 << 2),
+        TYPE_ACCELERATOR = (1 << 3),
+        TYPE_DGPU        = TYPE_GPU + (1 << 16),
+        TYPE_IGPU        = TYPE_GPU + (1 << 17),
+        TYPE_ALL         = 0xFFFFFFFF
+    };
+
+    CV_WRAP String name() const;
+    CV_WRAP String extensions() const;
+    CV_WRAP bool isExtensionSupported(const String& extensionName) const;
+    CV_WRAP String version() const;
+    CV_WRAP String vendorName() const;
+    CV_WRAP String OpenCL_C_Version() const;
+    CV_WRAP String OpenCLVersion() const;
+    CV_WRAP int deviceVersionMajor() const;
+    CV_WRAP int deviceVersionMinor() const;
+    CV_WRAP String driverVersion() const;
+    void* ptr() const;
+
+    //! presumably a combination of the TYPE_* flags above -- TODO confirm
+    CV_WRAP int type() const;
+
+    CV_WRAP int addressBits() const;
+    CV_WRAP bool available() const;
+    CV_WRAP bool compilerAvailable() const;
+    CV_WRAP bool linkerAvailable() const;
+
+    //! Floating-point capability bit flags; presumably the values combined in the
+    //! results of doubleFPConfig() / singleFPConfig() / halfFPConfig() -- TODO confirm
+    enum
+    {
+        FP_DENORM=(1 << 0),
+        FP_INF_NAN=(1 << 1),
+        FP_ROUND_TO_NEAREST=(1 << 2),
+        FP_ROUND_TO_ZERO=(1 << 3),
+        FP_ROUND_TO_INF=(1 << 4),
+        FP_FMA=(1 << 5),
+        FP_SOFT_FLOAT=(1 << 6),
+        FP_CORRECTLY_ROUNDED_DIVIDE_SQRT=(1 << 7)
+    };
+    CV_WRAP int doubleFPConfig() const;
+    CV_WRAP int singleFPConfig() const;
+    CV_WRAP int halfFPConfig() const;
+
+    CV_WRAP bool endianLittle() const;
+    CV_WRAP bool errorCorrectionSupport() const;
+
+    //! Execution capability bit flags; presumably reported by executionCapabilities() -- TODO confirm
+    enum
+    {
+        EXEC_KERNEL=(1 << 0),
+        EXEC_NATIVE_KERNEL=(1 << 1)
+    };
+    CV_WRAP int executionCapabilities() const;
+
+    CV_WRAP size_t globalMemCacheSize() const;
+
+    //! Cache kinds (plain values, not bit flags); presumably reported by globalMemCacheType() -- TODO confirm
+    enum
+    {
+        NO_CACHE=0,
+        READ_ONLY_CACHE=1,
+        READ_WRITE_CACHE=2
+    };
+    CV_WRAP int globalMemCacheType() const;
+    CV_WRAP int globalMemCacheLineSize() const;
+    CV_WRAP size_t globalMemSize() const;
+
+    CV_WRAP size_t localMemSize() const;
+    //! Local memory kinds (plain values, not bit flags); presumably reported by localMemType() -- TODO confirm
+    enum
+    {
+        NO_LOCAL_MEM=0,
+        LOCAL_IS_LOCAL=1,
+        LOCAL_IS_GLOBAL=2
+    };
+    CV_WRAP int localMemType() const;
+    CV_WRAP bool hostUnifiedMemory() const;
+
+    CV_WRAP bool imageSupport() const;
+
+    CV_WRAP bool imageFromBufferSupport() const;
+    // the two alignment queries below are not marked CV_WRAP, unlike their neighbours
+    uint imagePitchAlignment() const;
+    uint imageBaseAddressAlignment() const;
+
+    /// deprecated, use isExtensionSupported() method (probably with "cl_khr_subgroups" value)
+    CV_WRAP bool intelSubgroupsSupport() const;
+
+    CV_WRAP size_t image2DMaxWidth() const;
+    CV_WRAP size_t image2DMaxHeight() const;
+
+    CV_WRAP size_t image3DMaxWidth() const;
+    CV_WRAP size_t image3DMaxHeight() const;
+    CV_WRAP size_t image3DMaxDepth() const;
+
+    CV_WRAP size_t imageMaxBufferSize() const;
+    CV_WRAP size_t imageMaxArraySize() const;
+
+    //! Vendor identifiers compared against vendorID() by isAMD() / isIntel() / isNVidia() below
+    enum
+    {
+        UNKNOWN_VENDOR=0,
+        VENDOR_AMD=1,
+        VENDOR_INTEL=2,
+        VENDOR_NVIDIA=3
+    };
+    CV_WRAP int vendorID() const;
+    // FIXIT
+    // dev.isAMD() doesn't work for OpenCL CPU devices from AMD OpenCL platform.
+    // This method should use platform name instead of vendor name.
+    // After fix restore code in arithm.cpp: ocl_compare()
+    CV_WRAP inline bool isAMD() const { return vendorID() == VENDOR_AMD; }
+    CV_WRAP inline bool isIntel() const { return vendorID() == VENDOR_INTEL; }
+    CV_WRAP inline bool isNVidia() const { return vendorID() == VENDOR_NVIDIA; }
+
+    CV_WRAP int maxClockFrequency() const;
+    CV_WRAP int maxComputeUnits() const;
+    CV_WRAP int maxConstantArgs() const;
+    CV_WRAP size_t maxConstantBufferSize() const;
+
+    CV_WRAP size_t maxMemAllocSize() const;
+    CV_WRAP size_t maxParameterSize() const;
+
+    CV_WRAP int maxReadImageArgs() const;
+    CV_WRAP int maxWriteImageArgs() const;
+    CV_WRAP int maxSamplers() const;
+
+    CV_WRAP size_t maxWorkGroupSize() const;
+    CV_WRAP int maxWorkItemDims() const;
+    //! Writes into a caller-provided array; the required length is not stated here
+    //! (presumably maxWorkItemDims() entries -- TODO confirm)
+    void maxWorkItemSizes(size_t*) const;
+
+    CV_WRAP int memBaseAddrAlign() const;
+
+    CV_WRAP int nativeVectorWidthChar() const;
+    CV_WRAP int nativeVectorWidthShort() const;
+    CV_WRAP int nativeVectorWidthInt() const;
+    CV_WRAP int nativeVectorWidthLong() const;
+    CV_WRAP int nativeVectorWidthFloat() const;
+    CV_WRAP int nativeVectorWidthDouble() const;
+    CV_WRAP int nativeVectorWidthHalf() const;
+
+    CV_WRAP int preferredVectorWidthChar() const;
+    CV_WRAP int preferredVectorWidthShort() const;
+    CV_WRAP int preferredVectorWidthInt() const;
+    CV_WRAP int preferredVectorWidthLong() const;
+    CV_WRAP int preferredVectorWidthFloat() const;
+    CV_WRAP int preferredVectorWidthDouble() const;
+    CV_WRAP int preferredVectorWidthHalf() const;
+
+    CV_WRAP size_t printfBufferSize() const;
+    CV_WRAP size_t profilingTimerResolution() const;
+
+    CV_WRAP static const Device& getDefault();
+
+    /**
+     * @param d OpenCL handle (cl_device_id). clRetainDevice() is called on success.
+     *
+     * @note Ownership of the passed device is passed to OpenCV on success.
+     * The caller should additionally call `clRetainDevice` on it if it intends
+     * to continue using the device.
+     *
+     * NOTE(review): the two statements above appear to conflict (retaining the
+     * handle versus taking over ownership) -- confirm against the implementation.
+     */
+    static Device fromHandle(void* d);
+
+    struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return (Impl*)p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+};
+
+
+/** @brief Handle class for an OpenCL context.
+
+Stores a single pointer to an opaque Impl; all operations are declared here
+and implemented elsewhere. Instances are obtained through the static factories
+(fromHandle, fromDevice, create(configuration)) or through getDefault().
+*/
+class CV_EXPORTS Context
+{
+public:
+    Context() CV_NOEXCEPT;
+    explicit Context(int dtype);  //!< @deprecated
+    ~Context();
+    Context(const Context& c);
+    Context& operator= (const Context& c);
+    Context(Context&& c) CV_NOEXCEPT;
+    Context& operator = (Context&& c) CV_NOEXCEPT;
+
+    /** @deprecated */
+    bool create();
+    /** @deprecated */
+    bool create(int dtype);
+
+    size_t ndevices() const;
+    //! presumably @p idx must be below ndevices() -- TODO confirm
+    Device& device(size_t idx) const;
+    Program getProg(const ProgramSource& prog,
+                    const String& buildopt, String& errmsg);
+    void unloadProg(Program& prog);
+
+
+    /** Get thread-local OpenCL context (initialize if necessary) */
+    // The parameterless signature is compiled out; the active declaration takes
+    // an optional flag that defaults to true.
+#if 0  // OpenCV 5.0
+    static Context& getDefault();
+#else
+    static Context& getDefault(bool initialize = true);
+#endif
+
+    /** @returns cl_context value */
+    void* ptr() const;
+
+    /**
+     * @brief Get OpenCL context property specified on context creation
+     * @param propertyId Property id (CL_CONTEXT_* as defined in cl_context_properties type)
+     * @returns Property value if property was specified on clCreateContext, or NULL if context created without the property
+     */
+    void* getOpenCLContextProperty(int propertyId) const;
+
+    bool useSVM() const;
+    void setUseSVM(bool enabled);
+
+    /**
+     * @param context OpenCL handle (cl_context). clRetainContext() is called on success
+     */
+    static Context fromHandle(void* context);
+    static Context fromDevice(const ocl::Device& device);
+    //! Static factory taking a configuration string; not to be confused with the
+    //! deprecated non-static create() overloads above. The string format is not described here.
+    static Context create(const std::string& configuration);
+
+    void release();
+
+    struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return (Impl*)p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+// TODO OpenCV 5.0
+//protected:
+    // currently public: the access specifier above is commented out
+    Impl* p;
+};
+
+/** @deprecated
+Handle class for an OpenCL platform. Stores a single pointer to an opaque
+Impl; all operations are declared here and implemented elsewhere.
+*/
+class CV_EXPORTS Platform
+{
+public:
+    Platform() CV_NOEXCEPT;
+    ~Platform();
+    Platform(const Platform& p);
+    Platform& operator = (const Platform& p);
+    Platform(Platform&& p) CV_NOEXCEPT;
+    Platform& operator = (Platform&& p) CV_NOEXCEPT;
+
+    //! presumably the underlying platform handle -- TODO confirm
+    void* ptr() const;
+
+    /** @deprecated */
+    static Platform& getDefault();
+
+    struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return (Impl*)p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+};
+
+/** @brief Attaches an OpenCL context to OpenCV
+@note
+  OpenCV checks whether an available OpenCL platform is named platformName, then assigns the
+  context to OpenCV and calls the `clRetainContext` function. The deviceID device is used as the
+  target device and a new command queue is created.
+@param platformName name of the OpenCL platform to attach; this string is used to check whether the platform is available to OpenCV at runtime
+@param platformID ID of the platform the attached context was created for
+@param context OpenCL context to be attached to OpenCV
+@param deviceID ID of the device; it must be created from the attached context
+*/
+CV_EXPORTS void attachContext(const String& platformName, void* platformID, void* context, void* deviceID);
+
+/** @brief Converts an OpenCL buffer to a UMat
+@note
+  The OpenCL buffer (cl_mem_buffer) should contain 2D image data compatible with OpenCV. Memory
+  content is not copied from `clBuffer` to the UMat. Instead, the buffer handle is assigned to the
+  UMat and `clRetainMemObject` is called.
+@param cl_mem_buffer source clBuffer handle
+@param step number of bytes in a single row
+@param rows number of rows
+@param cols number of columns
+@param type OpenCV type of the image
+@param dst destination UMat
+*/
+CV_EXPORTS void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst);
+
+/** @brief Converts an OpenCL image2d_t to a UMat
+@note
+  The OpenCL `image2d_t` (cl_mem_image) should be compatible with OpenCV UMat formats. Memory
+  content is copied from the image to the UMat with the `clEnqueueCopyImageToBuffer` function.
+@param cl_mem_image source image2d_t handle
+@param dst destination UMat
+*/
+CV_EXPORTS void convertFromImage(void* cl_mem_image, UMat& dst);
+
+// TODO Move to internal header
+// Declared without CV_EXPORTS, unlike the other free functions in this header.
+/// @deprecated
+void initializeContextFromHandle(Context& ctx, void* platform, void* context, void* device);
+
+/** @brief Handle class for an OpenCL command queue.
+
+Stores a single pointer to an opaque Impl; all operations are declared here
+and implemented elsewhere.
+*/
+class CV_EXPORTS Queue
+{
+public:
+    Queue() CV_NOEXCEPT;
+    //! @param c context for the queue
+    //! @param d device for the queue; defaults to a default-constructed Device
+    explicit Queue(const Context& c, const Device& d=Device());
+    ~Queue();
+    Queue(const Queue& q);
+    Queue& operator = (const Queue& q);
+    Queue(Queue&& q) CV_NOEXCEPT;
+    Queue& operator = (Queue&& q) CV_NOEXCEPT;
+
+    //! Both arguments default to default-constructed objects
+    bool create(const Context& c=Context(), const Device& d=Device());
+    void finish();
+    //! presumably the underlying command-queue handle -- TODO confirm
+    void* ptr() const;
+    static Queue& getDefault();
+
+    /// @brief Returns OpenCL command queue with enable profiling mode support
+    const Queue& getProfilingQueue() const;
+
+    struct Impl; friend struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+};
+
+
+/** @brief Plain descriptor of one kernel argument.
+
+Bundles access flags, an optional UMat pointer, an optional raw object pointer
+with its size, and two scale factors. The static helpers below only fill in
+these fields; how they are consumed is defined by Kernel::set(), which is not
+visible in this header.
+*/
+class CV_EXPORTS KernelArg
+{
+public:
+    //! Flag values combined with '+' by the helpers below. READ_WRITE (6) equals
+    //! READ_ONLY (2) + WRITE_ONLY (4).
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 };
+    KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0);
+    KernelArg() CV_NOEXCEPT;
+
+    //! LOCAL argument without a matrix or object; @p localMemSize is passed as the size
+    static KernelArg Local(size_t localMemSize)
+    { return KernelArg(LOCAL, 0, 1, 1, 0, localMemSize); }
+    // The UMat helpers below take a const reference but store a non-const
+    // pointer: constness is cast away with (UMat*)&m.
+    static KernelArg PtrWriteOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+WRITE_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadOnly(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_ONLY, (UMat*)&m); }
+    static KernelArg PtrReadWrite(const UMat& m)
+    { return KernelArg(PTR_ONLY+READ_WRITE, (UMat*)&m); }
+    static KernelArg ReadWrite(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_WRITE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadOnly(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_ONLY, (UMat*)&m, wscale, iwscale); }
+    static KernelArg WriteOnly(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale, iwscale); }
+    static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1, int iwscale=1)
+    { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale, iwscale); }
+    static KernelArg Constant(const Mat& m);
+    //! CONSTANT argument referring to @p arr; @p n is forwarded unchanged as the size.
+    //! NOTE(review): confirm whether callers are expected to pass a byte count or an element count.
+    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
+    { return KernelArg(CONSTANT, 0, 1, 1, (void*)arr, n); }
+
+    int flags;            // combination of the enum values above
+    UMat* m;              // matrix argument, or 0 for Local()/Constant(arr, n)
+    const void* obj;      // raw object pointer (set by Constant(arr, n)), otherwise defaults to 0
+    size_t sz;            // size associated with obj or with local memory
+    int wscale, iwscale;  // scale factors, default 1; their exact use is not visible here
+};
+
+
+/** @brief Handle class for an OpenCL kernel.
+
+Stores a single pointer to an opaque Impl. Arguments are bound with args()
+(or the lower-level set() overloads) and the kernel is launched with run(),
+run_(), runTask() or runProfiling(); all of these are implemented elsewhere.
+*/
+class CV_EXPORTS Kernel
+{
+public:
+    Kernel() CV_NOEXCEPT;
+    Kernel(const char* kname, const Program& prog);
+    Kernel(const char* kname, const ProgramSource& prog,
+           const String& buildopts = String(), String* errmsg=0);
+    ~Kernel();
+    Kernel(const Kernel& k);
+    Kernel& operator = (const Kernel& k);
+    Kernel(Kernel&& k) CV_NOEXCEPT;
+    Kernel& operator = (Kernel&& k) CV_NOEXCEPT;
+
+    bool empty() const;
+    bool create(const char* kname, const Program& prog);
+    bool create(const char* kname, const ProgramSource& prog,
+                const String& buildopts, String* errmsg=0);
+
+    // Low-level argument setters. set_args_() below feeds the returned value
+    // back in as the index of the next argument, so the return value acts as
+    // the next argument index (error signalling is not visible here).
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const Image2D& image2D);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    //! Generic fallback: passes the address and sizeof of @p value to the raw setter
+    template<typename _Tp> int set(int i, const _Tp& value)
+    { return set(i, &value, sizeof(value)); }
+
+
+protected:
+    // Recursion over the argument pack: the single-argument overload ends it,
+    // the variadic overload sets one argument and continues with the index
+    // returned by set().
+    template<typename _Tp0> inline
+    int set_args_(int i, const _Tp0& a0) { return set(i, a0); }
+    template<typename _Tp0, typename... _Tps> inline
+    int set_args_(int i, const _Tp0& a0, const _Tps&... rest_args) { i = set(i, a0); return set_args_(i, rest_args...); }
+public:
+    /** @brief Setup OpenCL Kernel arguments.
+    Avoid calling the set(i, ...) methods directly. Binding starts at index 0.
+    @code
+    bool ok = kernel
+        .args(
+            srcUMat, dstUMat,
+            (float)some_float_param
+        ).run(ndims, globalSize, localSize);
+    if (!ok) return false;
+    @endcode
+    */
+    template<typename... _Tps> inline
+    Kernel& args(const _Tps&... kernel_args) { set_args_(0, kernel_args...); return *this; }
+
+    /** @brief Run the OpenCL kernel (globalsize value may be adjusted)
+
+    @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+    @param globalsize work items for each dimension. It is not the final globalsize passed to
+      OpenCL. Each dimension will be adjusted to the nearest integer divisible by the corresponding
+      value in localsize. If localsize is NULL, it will still be adjusted depending on dims. The
+      adjusted values are greater than or equal to the original values.
+    @param localsize work-group size for each dimension.
+    @param sync specify whether to wait for OpenCL computation to finish before return.
+    @param q command queue
+
+    @note Use run_() if your kernel code doesn't support adjusted globalsize.
+    */
+    bool run(int dims, size_t globalsize[],
+             size_t localsize[], bool sync, const Queue& q=Queue());
+
+    /** @brief Run the OpenCL kernel
+     *
+     * @param dims the work problem dimensions. It is the length of globalsize and localsize. It can be either 1, 2 or 3.
+     * @param globalsize work items for each dimension. This value is passed to OpenCL without changes.
+     * @param localsize work-group size for each dimension.
+     * @param sync specify whether to wait for OpenCL computation to finish before return.
+     * @param q command queue
+     */
+    bool run_(int dims, size_t globalsize[], size_t localsize[], bool sync, const Queue& q=Queue());
+
+    //! Takes no size arguments, unlike run() / run_(); @p sync and @p q presumably as for run() -- TODO confirm
+    bool runTask(bool sync, const Queue& q=Queue());
+
+    /** @brief Similar to synchronized run_() call with returning of kernel execution time
+     *
+     * Separate OpenCL command queue may be used (with CL_QUEUE_PROFILING_ENABLE)
+     * @return Execution time in nanoseconds or negative number on error
+     */
+    int64 runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q=Queue());
+
+    size_t workGroupSize() const;
+    size_t preferedWorkGroupSizeMultiple() const;
+    //! Fills the caller-provided array @p wsz; the required length is not stated here
+    bool compileWorkGroupSize(size_t wsz[]) const;
+    size_t localMemSize() const;
+
+    void* ptr() const;
+    struct Impl;
+
+protected:
+    Impl* p;
+};
+
+/** @brief Handle class for an OpenCL program.
+
+Stores a single pointer to an opaque Impl. A program is created from a
+ProgramSource plus build flags; the build error text is returned through the
+errmsg reference. All operations are implemented elsewhere.
+*/
+class CV_EXPORTS Program
+{
+public:
+    Program() CV_NOEXCEPT;
+    Program(const ProgramSource& src,
+            const String& buildflags, String& errmsg);
+    Program(const Program& prog);
+    Program& operator = (const Program& prog);
+    Program(Program&& prog) CV_NOEXCEPT;
+    Program& operator = (Program&& prog) CV_NOEXCEPT;
+    ~Program();
+
+    bool create(const ProgramSource& src,
+                const String& buildflags, String& errmsg);
+
+    void* ptr() const;
+
+    /**
+     * @brief Query device-specific program binary.
+     *
+     * Returns RAW OpenCL executable binary without additional attachments.
+     *
+     * @sa ProgramSource::fromBinary
+     *
+     * @param[out] binary output buffer
+     */
+    void getBinary(std::vector<char>& binary) const;
+
+    struct Impl; friend struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return (Impl*)p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+public:
+    // Legacy entry points, compiled out when OPENCV_REMOVE_DEPRECATED_API is defined
+#ifndef OPENCV_REMOVE_DEPRECATED_API
+    // TODO Remove this
+    CV_DEPRECATED bool read(const String& buf, const String& buildflags); // removed, use ProgramSource instead
+    CV_DEPRECATED bool write(String& buf) const; // removed, use getBinary() method instead (RAW OpenCL binary)
+    CV_DEPRECATED const ProgramSource& source() const; // implementation removed
+    CV_DEPRECATED String getPrefix() const; // deprecated, implementation replaced
+    CV_DEPRECATED static String getPrefix(const String& buildflags); // deprecated, implementation replaced
+#endif
+};
+
+
+/** @brief Handle class describing the source of an OpenCL program.
+
+Stores a single pointer to an opaque Impl. A source can be given as code text
+(module, name, code string, code hash) or, through the static factories
+fromBinary() and fromSPIR(), as a caller-owned binary buffer.
+*/
+class CV_EXPORTS ProgramSource
+{
+public:
+    typedef uint64 hash_t; // deprecated
+
+    ProgramSource() CV_NOEXCEPT;
+    explicit ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash);
+    explicit ProgramSource(const String& prog); // deprecated
+    explicit ProgramSource(const char* prog); // deprecated
+    ~ProgramSource();
+    ProgramSource(const ProgramSource& prog);
+    ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource(ProgramSource&& prog) CV_NOEXCEPT;
+    ProgramSource& operator = (ProgramSource&& prog) CV_NOEXCEPT;
+
+    const String& source() const; // deprecated
+    hash_t hash() const; // deprecated
+
+
+    /** @brief Describe OpenCL program binary.
+     * Do not call clCreateProgramWithBinary() and/or clBuildProgram().
+     *
+     * Caller should guarantee binary buffer lifetime greater than ProgramSource object (and any of its copies).
+     *
+     * This kind of binary is not portable between platforms in general - it is specific to OpenCL vendor / device / driver version.
+     *
+     * @param module name of program owner module
+     * @param name unique name of program (module+name is used as key for OpenCL program caching)
+     * @param binary buffer address. See buffer lifetime requirement in description.
+     * @param size buffer size
+     * @param buildOptions additional program-related build options passed to clBuildProgram()
+     * @return created ProgramSource object
+     */
+    static ProgramSource fromBinary(const String& module, const String& name,
+            const unsigned char* binary, const size_t size,
+            const cv::String& buildOptions = cv::String());
+
+    /** @brief Describe OpenCL program in SPIR format.
+     * Do not call clCreateProgramWithBinary() and/or clBuildProgram().
+     *
+     * Supports SPIR 1.2 by default (pass '-spir-std=X.Y' in buildOptions to override this behavior)
+     *
+     * Caller should guarantee binary buffer lifetime greater than ProgramSource object (and any of its copies).
+     *
+     * Programs in this format are portable between OpenCL implementations with 'khr_spir' extension:
+     * https://www.khronos.org/registry/OpenCL/sdk/2.0/docs/man/xhtml/cl_khr_spir.html
+     * (but they are not portable between different platforms: 32-bit / 64-bit)
+     *
+     * Note: these programs can't support vendor specific extensions, like 'cl_intel_subgroups'.
+     *
+     * @param module name of program owner module
+     * @param name unique name of program (module+name is used as key for OpenCL program caching)
+     * @param binary buffer address. See buffer lifetime requirement in description.
+     * @param size buffer size
+     * @param buildOptions additional program-related build options passed to clBuildProgram()
+     *        (these options are added automatically: '-x spir' and '-spir-std=1.2')
+     * @return created ProgramSource object.
+     */
+    static ProgramSource fromSPIR(const String& module, const String& name,
+            const unsigned char* binary, const size_t size,
+            const cv::String& buildOptions = cv::String());
+
+    // Placeholder for a SPIR-V factory, kept commented out.
+    //OpenCL 2.1+ only
+    //static Program fromSPIRV(const String& module, const String& name,
+    //        const unsigned char* binary, const size_t size,
+    //        const cv::String& buildOptions = cv::String());
+
+    struct Impl; friend struct Impl;
+    //! Returns the stored implementation pointer
+    inline Impl* getImpl() const { return (Impl*)p; }
+    //! True when no implementation is attached (p is null)
+    inline bool empty() const { return !p; }
+protected:
+    Impl* p;
+};
+
+class CV_EXPORTS PlatformInfo // handle class: its only data member is the opaque Impl pointer p
+{
+public:
+    PlatformInfo() CV_NOEXCEPT;
+    /**
+     * @param id pointer to a cl_platform_id (i.e. a cl_platform_id*), passed as void*
+     */
+    explicit PlatformInfo(void* id);
+    ~PlatformInfo();
+    // Copy and move operations.
+    PlatformInfo(const PlatformInfo& i);
+    PlatformInfo& operator =(const PlatformInfo& i);
+    PlatformInfo(PlatformInfo&& i) CV_NOEXCEPT;
+    PlatformInfo& operator = (PlatformInfo&& i) CV_NOEXCEPT;
+
+    String name() const;
+    String vendor() const;
+
+    /// See CL_PLATFORM_VERSION
+    String version() const;
+    int versionMajor() const;
+    int versionMinor() const;
+
+    int deviceNumber() const;
+    void getDevice(Device& device, int d) const; // d is a device index; NOTE(review): presumably 0 <= d < deviceNumber() -- confirm
+
+    struct Impl;
+    bool empty() const { return !p; } // true when no Impl is attached (p is null)
+protected:
+    Impl* p;
+};
+
+CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
+CV_EXPORTS const char* typeToStr(int t);
+CV_EXPORTS const char* memopTypeToStr(int t);
+CV_EXPORTS const char* vecopTypeToStr(int t);
+CV_EXPORTS const char* getOpenCLErrorString(int errorCode);
+CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
+CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
+
+
+enum OclVectorStrategy
+{
+    // each matrix has its own vector width
+    OCL_VECTOR_OWN = 0,
+    // all matrices use the maximal vector width found among them
+    // (useful for cases when matrices have different data types)
+    OCL_VECTOR_MAX = 1,
+
+    // default strategy (alias of OCL_VECTOR_OWN)
+    OCL_VECTOR_DEFAULT = OCL_VECTOR_OWN
+};
+
+CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                         InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                         InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
+                                         OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
+
+CV_EXPORTS int checkOptimalVectorWidth(const int *vectorWidths,
+                                       InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                       InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                       InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
+                                       OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
+
+// with OCL_VECTOR_MAX strategy
+CV_EXPORTS int predictOptimalVectorWidthMax(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                            InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                            InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
+
+CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m);
+
+class CV_EXPORTS Image2D // handle class: its only data member is the opaque Impl pointer p
+{
+public:
+    Image2D() CV_NOEXCEPT;
+
+    /** Creates an image from a UMat.
+    @param src UMat object from which to get image properties and data
+    @param norm flag to enable the use of normalized channel data types
+    @param alias flag indicating that the image should alias the src UMat. If true, changes to the
+        image or src will be reflected in both objects.
+    */
+    explicit Image2D(const UMat &src, bool norm = false, bool alias = false);
+    Image2D(const Image2D & i);
+    ~Image2D();
+
+    Image2D & operator = (const Image2D & i);
+    Image2D(Image2D &&) CV_NOEXCEPT;
+    Image2D &operator=(Image2D &&) CV_NOEXCEPT;
+
+    /** Indicates if creating an aliased image should succeed.
+    Depends on the underlying platform and the dimensions of the UMat.
+    */
+    static bool canCreateAlias(const UMat &u);
+
+    /** Indicates if the image format given by (depth, cn, norm) is supported.
+    */
+    static bool isFormatSupported(int depth, int cn, bool norm);
+
+    void* ptr() const; // NOTE(review): presumably the native OpenCL image handle -- confirm in implementation
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+class CV_EXPORTS Timer // non-copyable timer constructed from an ocl::Queue
+{
+public:
+    Timer(const Queue& q);
+    ~Timer();
+    void start();
+    void stop();
+
+    uint64 durationNS() const; //!< duration in nanoseconds
+
+protected:
+    struct Impl;
+    Impl* const p; // const pointer: cannot be re-seated after construction
+
+private:
+    Timer(const Timer&); // copying disabled: declared private
+    Timer& operator=(const Timer&); // copy assignment disabled: declared private
+};
+
+CV_EXPORTS MatAllocator* getOpenCLAllocator();
+
+
+class CV_EXPORTS_W OpenCLExecutionContext
+{
+public:
+    OpenCLExecutionContext() = default;
+    ~OpenCLExecutionContext() = default;
+    // Copy/move are defaulted: a copy shares the same Impl through the shared_ptr member p.
+    OpenCLExecutionContext(const OpenCLExecutionContext&) = default;
+    OpenCLExecutionContext(OpenCLExecutionContext&&) = default;
+
+    OpenCLExecutionContext& operator=(const OpenCLExecutionContext&) = default;
+    OpenCLExecutionContext& operator=(OpenCLExecutionContext&&) = default;
+
+    /** Get associated ocl::Context */
+    Context& getContext() const;
+    /** Get the single default associated ocl::Device */
+    Device& getDevice() const;
+    /** Get the single ocl::Queue that is associated with the ocl::Context and
+     *  the single default ocl::Device
+     */
+    Queue& getQueue() const;
+    // NOTE(review): presumably a per-context switch for OpenCL usage (getter / setter) -- confirm in implementation
+    bool useOpenCL() const;
+    void setUseOpenCL(bool flag);
+
+    /** Get OpenCL execution context of current thread.
+     *
+     * Initialize OpenCL execution context if it is empty
+     * - create new
+     * - reuse context of the main thread (threadID = 0)
+     */
+    static OpenCLExecutionContext& getCurrent();
+
+    /** Get OpenCL execution context of current thread (can be empty) */
+    static OpenCLExecutionContext& getCurrentRef();
+
+    /** Bind this OpenCL execution context to current thread.
+     *
+     * Context can't be empty.
+     *
+     * @note clFinish is not called for queue of previous execution context
+     */
+    void bind() const;
+
+    /** Creates new execution context with same OpenCV context and device
+     *
+     * @param q OpenCL queue
+     */
+    OpenCLExecutionContext cloneWithNewQueue(const ocl::Queue& q) const;
+    /** @overload */
+    OpenCLExecutionContext cloneWithNewQueue() const;
+
+    /** @brief Creates OpenCL execution context
+     * OpenCV will check if available OpenCL platform has platformName name,
+     * then assign context to OpenCV.
+     * The deviceID device will be used as target device and a new command queue will be created.
+     *
+     * @note On success, ownership of one reference of the context and device is taken.
+     * The caller should additionally call `clRetainContext` and/or `clRetainDevice`
+     * to increase the reference count if it wishes to continue using them.
+     *
+     * @param platformName name of OpenCL platform to attach, this string is used to check if platform is available to OpenCV at runtime
+     * @param platformID ID of platform attached context was created for (cl_platform_id)
+     * @param context OpenCL context to be attached to OpenCV (cl_context)
+     * @param deviceID OpenCL device (cl_device_id)
+     */
+    static OpenCLExecutionContext create(const std::string& platformName, void* platformID, void* context, void* deviceID);
+
+    /** @brief Creates OpenCL execution context
+     *
+     * @param context non-empty OpenCL context
+     * @param device non-empty OpenCL device (must be a part of context)
+     * @param queue non-empty OpenCL queue for provided context and device
+     */
+    static OpenCLExecutionContext create(const Context& context, const Device& device, const ocl::Queue& queue);
+    /** @overload */
+    static OpenCLExecutionContext create(const Context& context, const Device& device);
+
+    struct Impl;
+    inline bool empty() const { return !p; } // true when no Impl is attached
+    void release(); // NOTE(review): presumably drops this object's reference to the Impl -- confirm in implementation
+protected:
+    std::shared_ptr<Impl> p;
+};
+
+/** Scope guard: binds ctx to the current thread and re-binds the previously current context on exit. */
+class OpenCLExecutionContextScope
+{
+public:
+    inline OpenCLExecutionContextScope(const OpenCLExecutionContext& ctx)
+    {
+        CV_Assert(!ctx.empty());
+        previous_ = OpenCLExecutionContext::getCurrentRef(); // may be empty
+        ctx.bind();
+    }
+    inline ~OpenCLExecutionContextScope()
+    {
+        if (previous_.empty())
+            return; // nothing was bound before: leave the current binding alone
+        previous_.bind();
+    }
+private:
+    OpenCLExecutionContext previous_;
+};
+
+#ifdef __OPENCV_BUILD
+namespace internal {
+// OCL_FORCE_CHECK(c): true when isOpenCLForced() returns true, otherwise the value of c.
+CV_EXPORTS bool isOpenCLForced();
+#define OCL_FORCE_CHECK(condition) (cv::ocl::internal::isOpenCLForced() || (condition))
+// OCL_PERFORMANCE_CHECK(c): true when isPerformanceCheckBypassed() returns true, otherwise the value of c.
+CV_EXPORTS bool isPerformanceCheckBypassed();
+#define OCL_PERFORMANCE_CHECK(condition) (cv::ocl::internal::isPerformanceCheckBypassed() || (condition))
+
+CV_EXPORTS bool isCLBuffer(UMat& u);
+
+} // namespace internal
+#endif // __OPENCV_BUILD
+
+//! @}
+
+}}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl_genbase.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl_genbase.hpp
new file mode 100644
index 0000000..5334cf1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ocl_genbase.hpp
@@ -0,0 +1,69 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPENCL_GENBASE_HPP
+#define OPENCV_OPENCL_GENBASE_HPP
+
+//! @cond IGNORED
+
+namespace cv {
+namespace ocl {
+
+class ProgramSource;
+
+namespace internal {
+
+struct CV_EXPORTS ProgramEntry // plain record of C strings plus a ProgramSource pointer
+{
+    const char* module;
+    const char* name;
+    const char* programCode;
+    const char* programHash;
+    ProgramSource* pProgramSource; // NOTE(review): presumably the ProgramSource built from the fields above -- confirm
+    // Implicit conversion so an entry can be passed where a ProgramSource& is expected.
+    operator ProgramSource& () const;
+};
+
+} } } // namespace
+
+//! @endcond
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/ocl_defs.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/ocl_defs.hpp
new file mode 100644
index 0000000..14df750
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/ocl_defs.hpp
@@ -0,0 +1,82 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_CORE_OPENCL_DEFS_HPP
+#define OPENCV_CORE_OPENCL_DEFS_HPP
+
+#include "opencv2/core/utility.hpp"
+#include "cvconfig.h"
+
+namespace cv { namespace ocl {
+#ifdef HAVE_OPENCL
+/// Call is similar to useOpenCL() but doesn't try to load OpenCL runtime or create OpenCL context
+CV_EXPORTS bool isOpenCLActivated();
+#else // !HAVE_OPENCL: inline stub that always reports OpenCL as inactive
+static inline bool isOpenCLActivated() { return false; }
+#endif // HAVE_OPENCL
+}} // namespace cv::ocl
+
+
+//#define CV_OPENCL_RUN_ASSERT
+
+#ifdef HAVE_OPENCL
+
+#ifdef CV_OPENCL_RUN_VERBOSE // verbose variant: prints to stdout which implementation is running
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::isOpenCLActivated() && (condition) && func)            \
+        {                                                                   \
+            printf("%s: OpenCL implementation is running\n", CV_Func);      \
+            fflush(stdout);                                                 \
+            CV_IMPL_ADD(CV_IMPL_OCL);                                       \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+        else                                                                \
+        {                                                                   \
+            printf("%s: Plain implementation is running\n", CV_Func);       \
+            fflush(stdout);                                                 \
+        }                                                                   \
+    }
+#elif defined CV_OPENCL_RUN_ASSERT // assert variant: a failing func raises CV_Error instead of falling through
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+    {                                                                       \
+        if (cv::ocl::isOpenCLActivated() && (condition))                    \
+        {                                                                   \
+            if(func)                                                        \
+            {                                                               \
+                CV_IMPL_ADD(CV_IMPL_OCL);                                   \
+            }                                                               \
+            else                                                            \
+            {                                                               \
+                CV_Error(cv::Error::StsAssert, #func);                      \
+            }                                                               \
+            return __VA_ARGS__;                                             \
+        }                                                                   \
+    }
+#else // default variant: a cv::Exception thrown inside the try is swallowed and execution continues after the macro
+#define CV_OCL_RUN_(condition, func, ...)                                   \
+try \
+{ \
+    if (cv::ocl::isOpenCLActivated() && (condition) && func)                \
+    {                                                                       \
+        CV_IMPL_ADD(CV_IMPL_OCL);                                           \
+        return __VA_ARGS__;                                                 \
+    } \
+} \
+catch (const cv::Exception& e) \
+{ \
+    CV_UNUSED(e); /* TODO: Add some logging here */ \
+}
+#endif // CV_OPENCL_RUN_VERBOSE / CV_OPENCL_RUN_ASSERT
+
+#else // !HAVE_OPENCL: CV_OCL_RUN_ expands to nothing
+#define CV_OCL_RUN_(condition, func, ...)
+#endif // HAVE_OPENCL
+
+#define CV_OCL_RUN(condition, func) CV_OCL_RUN_(condition, func)
+
+#endif // OPENCV_CORE_OPENCL_DEFS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_info.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_info.hpp
new file mode 100644
index 0000000..3ead76e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_info.hpp
@@ -0,0 +1,212 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <iostream>
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/ocl.hpp>
+
+#ifndef DUMP_CONFIG_PROPERTY
+#define DUMP_CONFIG_PROPERTY(...)
+#endif
+
+#ifndef DUMP_MESSAGE_STDOUT
+#define DUMP_MESSAGE_STDOUT(...) do { std::cout << __VA_ARGS__ << std::endl; } while (false)
+#endif
+
+namespace cv {
+
+namespace {
+// Formats a byte count as e.g. "1 GB 2 MB 3 KB 4 B" (1024-based units, zero-valued parts omitted).
+// Zero is special-cased to "0 B": no part is printed for it, and trimming an empty string would index out of range.
+static std::string bytesToStringRepr(size_t value)
+{
+    // Peel off base-1024 digits, least significant first; whatever remains is the GB count.
+    size_t b = value % 1024;
+    value /= 1024;
+    size_t kb = value % 1024;
+    value /= 1024;
+    size_t mb = value % 1024;
+    value /= 1024;
+    size_t gb = value;
+
+    std::ostringstream stream;
+    if (gb > 0)
+        stream << gb << " GB ";
+    if (mb > 0)
+        stream << mb << " MB ";
+    if (kb > 0)
+        stream << kb << " KB ";
+    if (b > 0)
+        stream << b << " B";
+    std::string s = stream.str();
+    if (s.empty())
+        return "0 B";
+    if (s[s.size() - 1] == ' ') // drop the separator left behind when the byte part is zero
+        s.erase(s.size() - 1);
+    return s;
+}
+
+// Short label for the kind of an OpenCL device:
+//   "CPU"     - TYPE_CPU
+//   "iGPU"    - TYPE_GPU reporting host-unified memory
+//   "dGPU"    - any other TYPE_GPU
+//   "unknown" - every other device type
+static String getDeviceTypeString(const cv::ocl::Device& device)
+{
+    if (device.type() == cv::ocl::Device::TYPE_CPU)
+        return "CPU";
+
+    if (device.type() != cv::ocl::Device::TYPE_GPU)
+        return "unknown";
+
+    // GPU: integrated vs. discrete is inferred from the unified-memory flag.
+    return device.hostUnifiedMemory() ? "iGPU" : "dGPU";
+}
+} // namespace
+
+static void dumpOpenCLInformation()
+{
+    using namespace cv::ocl;
+    // Best-effort report of OpenCL platforms, their devices and the default device's properties.
+    try
+    {
+        if (!haveOpenCL() || !useOpenCL())
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL is disabled");
+            DUMP_CONFIG_PROPERTY("cv_ocl", "disabled");
+            return;
+        }
+        // Collect the platform descriptions; an empty list is reported as "not available".
+        std::vector<PlatformInfo> platforms;
+        cv::ocl::getPlatfomsInfo(platforms);
+        if (platforms.empty())
+        {
+            DUMP_MESSAGE_STDOUT("OpenCL is not available");
+            DUMP_CONFIG_PROPERTY("cv_ocl", "not available");
+            return;
+        }
+
+        DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
+        for (size_t i = 0; i < platforms.size(); i++)
+        {
+            const PlatformInfo* platform = &platforms[i];
+            DUMP_MESSAGE_STDOUT("    " << platform->name());
+            Device current_device;
+            for (int j = 0; j < platform->deviceNumber(); j++)
+            {
+                platform->getDevice(current_device, j);
+                String deviceTypeStr = getDeviceTypeString(current_device);
+                DUMP_MESSAGE_STDOUT( "        " << deviceTypeStr << ": " << current_device.name() << " (" << current_device.version() << ")");
+                DUMP_CONFIG_PROPERTY( cv::format("cv_ocl_platform_%d_device_%d", (int)i, j ),
+                    cv::format("(Platform=%s)(Type=%s)(Name=%s)(Version=%s)",
+                    platform->name().c_str(), deviceTypeStr.c_str(), current_device.name().c_str(), current_device.version().c_str()) );
+            }
+        }
+        const Device& device = Device::getDefault();
+        if (!device.available())
+            CV_Error(Error::OpenCLInitError, "OpenCL device is not available"); // handled by the catch (...) below
+
+        DUMP_MESSAGE_STDOUT("Current OpenCL device: ");
+
+        String deviceTypeStr = getDeviceTypeString(device);
+        DUMP_MESSAGE_STDOUT("    Type = " << deviceTypeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceType", deviceTypeStr);
+
+        DUMP_MESSAGE_STDOUT("    Name = " << device.name());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceName", device.name());
+
+        DUMP_MESSAGE_STDOUT("    Version = " << device.version());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceVersion", device.version());
+
+        DUMP_MESSAGE_STDOUT("    Driver version = " << device.driverVersion());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_driverVersion", device.driverVersion());
+
+        DUMP_MESSAGE_STDOUT("    Address bits = " << device.addressBits());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_addressBits", device.addressBits());
+
+        DUMP_MESSAGE_STDOUT("    Compute units = " << device.maxComputeUnits());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxComputeUnits", device.maxComputeUnits());
+
+        DUMP_MESSAGE_STDOUT("    Max work group size = " << device.maxWorkGroupSize());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxWorkGroupSize", device.maxWorkGroupSize());
+
+        std::string localMemorySizeStr = bytesToStringRepr(device.localMemSize());
+        DUMP_MESSAGE_STDOUT("    Local memory size = " << localMemorySizeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_localMemSize", device.localMemSize());
+
+        std::string maxMemAllocSizeStr = bytesToStringRepr(device.maxMemAllocSize());
+        DUMP_MESSAGE_STDOUT("    Max memory allocation size = " << maxMemAllocSizeStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize());
+
+        const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
+
+        const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
+
+        const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_hostUnifiedMemory", device.hostUnifiedMemory());
+
+        DUMP_MESSAGE_STDOUT("    Device extensions:");
+        String extensionsStr = device.extensions();
+        size_t pos = 0; // walk the space-separated extension list, one token per iteration
+        while (pos < extensionsStr.size())
+        {
+            size_t pos2 = extensionsStr.find(' ', pos);
+            if (pos2 == String::npos)
+                pos2 = extensionsStr.size();
+            if (pos2 > pos) // skip empty tokens (leading or consecutive spaces)
+            {
+                String extensionName = extensionsStr.substr(pos, pos2 - pos);
+                DUMP_MESSAGE_STDOUT("        " << extensionName);
+            }
+            pos = pos2 + 1;
+        }
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_extensions", extensionsStr);
+
+        const char* haveAmdBlasStr = haveAmdBlas() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Blas = " << haveAmdBlasStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_AmdBlas", haveAmdBlas());
+
+        const char* haveAmdFftStr = haveAmdFft() ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Has AMD Fft = " << haveAmdFftStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_AmdFft", haveAmdFft());
+
+        // Preferred vector width for each element type.
+        DUMP_MESSAGE_STDOUT("    Preferred vector width char = " << device.preferredVectorWidthChar());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthChar", device.preferredVectorWidthChar());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width short = " << device.preferredVectorWidthShort());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthShort", device.preferredVectorWidthShort());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width int = " << device.preferredVectorWidthInt());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthInt", device.preferredVectorWidthInt());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width long = " << device.preferredVectorWidthLong());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthLong", device.preferredVectorWidthLong());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width float = " << device.preferredVectorWidthFloat());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthFloat", device.preferredVectorWidthFloat());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width double = " << device.preferredVectorWidthDouble());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthDouble", device.preferredVectorWidthDouble());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width half = " << device.preferredVectorWidthHalf());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthHalf", device.preferredVectorWidthHalf());
+    }
+    catch (...) // any failure above is reported here and swallowed; nothing propagates to the caller
+    {
+        DUMP_MESSAGE_STDOUT("Exception. Can't dump OpenCL info");
+        DUMP_MESSAGE_STDOUT("OpenCL device not available");
+        DUMP_CONFIG_PROPERTY("cv_ocl", "not available");
+    }
+}
+#undef DUMP_MESSAGE_STDOUT
+#undef DUMP_CONFIG_PROPERTY
+
+} // namespace
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_svm.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_svm.hpp
new file mode 100644
index 0000000..7453082
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/opencl_svm.hpp
@@ -0,0 +1,81 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OPENCL_SVM_HPP
+#define OPENCV_CORE_OPENCL_SVM_HPP
+
+//
+// Internal usage only (binary compatibility is not guaranteed)
+//
+#ifndef __OPENCV_BUILD
+#error Internal header file
+#endif
+
+#if defined(HAVE_OPENCL) && defined(HAVE_OPENCL_SVM)
+#include "runtime/opencl_core.hpp"
+#include "runtime/opencl_svm_20.hpp"
+#include "runtime/opencl_svm_hsa_extension.hpp"
+
+namespace cv { namespace ocl { namespace svm {
+
+/** Bit mask of SVM feature flags (see Value); a value of 0 means no SVM support. */
+struct SVMCapabilities
+{
+    enum Value
+    {
+        SVM_COARSE_GRAIN_BUFFER = 0x1,
+        SVM_FINE_GRAIN_BUFFER = 0x2,
+        SVM_FINE_GRAIN_SYSTEM = 0x4,
+        SVM_ATOMICS = 0x8,
+    };
+    int value_;
+
+    SVMCapabilities(int capabilities = 0) : value_(capabilities) { }
+    operator int() const { return value_; }
+    bool isNoSVMSupport() const { return !value_; }
+    bool isSupportCoarseGrainBuffer() const { return (value_ & SVM_COARSE_GRAIN_BUFFER) == SVM_COARSE_GRAIN_BUFFER; }
+    bool isSupportFineGrainBuffer() const { return (value_ & SVM_FINE_GRAIN_BUFFER) == SVM_FINE_GRAIN_BUFFER; }
+    bool isSupportFineGrainSystem() const { return (value_ & SVM_FINE_GRAIN_SYSTEM) == SVM_FINE_GRAIN_SYSTEM; }
+    bool isSupportAtomics() const { return (value_ & SVM_ATOMICS) == SVM_ATOMICS; }
+};
+
+CV_EXPORTS const SVMCapabilities getSVMCapabilitites(const ocl::Context& context);
+
+/** Table of SVM entry points; all start out NULL and isValid() reports whether every one is set. */
+struct SVMFunctions
+{
+    clSVMAllocAMD_fn fn_clSVMAlloc;
+    clSVMFreeAMD_fn fn_clSVMFree;
+    clSetKernelArgSVMPointerAMD_fn fn_clSetKernelArgSVMPointer;
+    // Left out of the table (commented out upstream): fn_clSetKernelExecInfo, fn_clEnqueueSVMFree.
+    clEnqueueSVMMemcpyAMD_fn fn_clEnqueueSVMMemcpy;
+    clEnqueueSVMMemFillAMD_fn fn_clEnqueueSVMMemFill;
+    clEnqueueSVMMapAMD_fn fn_clEnqueueSVMMap;
+    clEnqueueSVMUnmapAMD_fn fn_clEnqueueSVMUnmap;
+
+    SVMFunctions()
+        : fn_clSVMAlloc(NULL),
+          fn_clSVMFree(NULL),
+          fn_clSetKernelArgSVMPointer(NULL),
+          fn_clEnqueueSVMMemcpy(NULL),
+          fn_clEnqueueSVMMemFill(NULL),
+          fn_clEnqueueSVMMap(NULL),
+          fn_clEnqueueSVMUnmap(NULL)
+    {}
+
+    bool isValid() const
+    {
+        return fn_clSVMAlloc && fn_clSVMFree && fn_clSetKernelArgSVMPointer && fn_clEnqueueSVMMemcpy &&
+               fn_clEnqueueSVMMemFill && fn_clEnqueueSVMMap && fn_clEnqueueSVMUnmap;
+    }
+};
+
+// We should guarantee that SVMFunctions lifetime is not less than context's lifetime
+CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context);
+
+CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags);
+
+}}} //namespace cv::ocl::svm
+#endif
+
+#endif // OPENCV_CORE_OPENCL_SVM_HPP
+/* End of file. */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp
new file mode 100644
index 0000000..2749927
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clblas.hpp
@@ -0,0 +1,602 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP  // must already be defined by the including file (presumably the runtime wrapper header - confirm)
+#error "Invalid usage"
+#endif  // OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+
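+// generated by parser_clblas.py: alias each clBLAS name to "<name>_" so that any use of it inside <clBLAS.h> (included below) is renamed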
+#define clblasCaxpy clblasCaxpy_
+#define clblasCcopy clblasCcopy_
+#define clblasCdotc clblasCdotc_
+#define clblasCdotu clblasCdotu_
+#define clblasCgbmv clblasCgbmv_
+#define clblasCgemm clblasCgemm_
+#define clblasCgemv clblasCgemv_
+#define clblasCgerc clblasCgerc_
+#define clblasCgeru clblasCgeru_
+#define clblasChbmv clblasChbmv_
+#define clblasChemm clblasChemm_
+#define clblasChemv clblasChemv_
+#define clblasCher clblasCher_
+#define clblasCher2 clblasCher2_
+#define clblasCher2k clblasCher2k_
+#define clblasCherk clblasCherk_
+#define clblasChpmv clblasChpmv_
+#define clblasChpr clblasChpr_
+#define clblasChpr2 clblasChpr2_
+#define clblasCrotg clblasCrotg_
+#define clblasCscal clblasCscal_
+#define clblasCsrot clblasCsrot_
+#define clblasCsscal clblasCsscal_
+#define clblasCswap clblasCswap_
+#define clblasCsymm clblasCsymm_
+#define clblasCsyr2k clblasCsyr2k_
+#define clblasCsyrk clblasCsyrk_
+#define clblasCtbmv clblasCtbmv_
+#define clblasCtbsv clblasCtbsv_
+#define clblasCtpmv clblasCtpmv_
+#define clblasCtpsv clblasCtpsv_
+#define clblasCtrmm clblasCtrmm_
+#define clblasCtrmv clblasCtrmv_
+#define clblasCtrsm clblasCtrsm_
+#define clblasCtrsv clblasCtrsv_
+#define clblasDasum clblasDasum_
+#define clblasDaxpy clblasDaxpy_
+#define clblasDcopy clblasDcopy_
+#define clblasDdot clblasDdot_
+#define clblasDgbmv clblasDgbmv_
+#define clblasDgemm clblasDgemm_
+#define clblasDgemv clblasDgemv_
+#define clblasDger clblasDger_
+#define clblasDnrm2 clblasDnrm2_
+#define clblasDrot clblasDrot_
+#define clblasDrotg clblasDrotg_
+#define clblasDrotm clblasDrotm_
+#define clblasDrotmg clblasDrotmg_
+#define clblasDsbmv clblasDsbmv_
+#define clblasDscal clblasDscal_
+#define clblasDspmv clblasDspmv_
+#define clblasDspr clblasDspr_
+#define clblasDspr2 clblasDspr2_
+#define clblasDswap clblasDswap_
+#define clblasDsymm clblasDsymm_
+#define clblasDsymv clblasDsymv_
+#define clblasDsyr clblasDsyr_
+#define clblasDsyr2 clblasDsyr2_
+#define clblasDsyr2k clblasDsyr2k_
+#define clblasDsyrk clblasDsyrk_
+#define clblasDtbmv clblasDtbmv_
+#define clblasDtbsv clblasDtbsv_
+#define clblasDtpmv clblasDtpmv_
+#define clblasDtpsv clblasDtpsv_
+#define clblasDtrmm clblasDtrmm_
+#define clblasDtrmv clblasDtrmv_
+#define clblasDtrsm clblasDtrsm_
+#define clblasDtrsv clblasDtrsv_
+#define clblasDzasum clblasDzasum_
+#define clblasDznrm2 clblasDznrm2_
+#define clblasGetVersion clblasGetVersion_
+#define clblasSasum clblasSasum_
+#define clblasSaxpy clblasSaxpy_
+#define clblasScasum clblasScasum_
+#define clblasScnrm2 clblasScnrm2_
+#define clblasScopy clblasScopy_
+#define clblasSdot clblasSdot_
+#define clblasSetup clblasSetup_
+#define clblasSgbmv clblasSgbmv_
+#define clblasSgemm clblasSgemm_
+#define clblasSgemv clblasSgemv_
+#define clblasSger clblasSger_
+#define clblasSnrm2 clblasSnrm2_
+#define clblasSrot clblasSrot_
+#define clblasSrotg clblasSrotg_
+#define clblasSrotm clblasSrotm_
+#define clblasSrotmg clblasSrotmg_
+#define clblasSsbmv clblasSsbmv_
+#define clblasSscal clblasSscal_
+#define clblasSspmv clblasSspmv_
+#define clblasSspr clblasSspr_
+#define clblasSspr2 clblasSspr2_
+#define clblasSswap clblasSswap_
+#define clblasSsymm clblasSsymm_
+#define clblasSsymv clblasSsymv_
+#define clblasSsyr clblasSsyr_
+#define clblasSsyr2 clblasSsyr2_
+#define clblasSsyr2k clblasSsyr2k_
+#define clblasSsyrk clblasSsyrk_
+#define clblasStbmv clblasStbmv_
+#define clblasStbsv clblasStbsv_
+#define clblasStpmv clblasStpmv_
+#define clblasStpsv clblasStpsv_
+#define clblasStrmm clblasStrmm_
+#define clblasStrmv clblasStrmv_
+#define clblasStrsm clblasStrsm_
+#define clblasStrsv clblasStrsv_
+#define clblasTeardown clblasTeardown_
+#define clblasZaxpy clblasZaxpy_
+#define clblasZcopy clblasZcopy_
+#define clblasZdotc clblasZdotc_
+#define clblasZdotu clblasZdotu_
+#define clblasZdrot clblasZdrot_
+#define clblasZdscal clblasZdscal_
+#define clblasZgbmv clblasZgbmv_
+#define clblasZgemm clblasZgemm_
+#define clblasZgemv clblasZgemv_
+#define clblasZgerc clblasZgerc_
+#define clblasZgeru clblasZgeru_
+#define clblasZhbmv clblasZhbmv_
+#define clblasZhemm clblasZhemm_
+#define clblasZhemv clblasZhemv_
+#define clblasZher clblasZher_
+#define clblasZher2 clblasZher2_
+#define clblasZher2k clblasZher2k_
+#define clblasZherk clblasZherk_
+#define clblasZhpmv clblasZhpmv_
+#define clblasZhpr clblasZhpr_
+#define clblasZhpr2 clblasZhpr2_
+#define clblasZrotg clblasZrotg_
+#define clblasZscal clblasZscal_
+#define clblasZswap clblasZswap_
+#define clblasZsymm clblasZsymm_
+#define clblasZsyr2k clblasZsyr2k_
+#define clblasZsyrk clblasZsyrk_
+#define clblasZtbmv clblasZtbmv_
+#define clblasZtbsv clblasZtbsv_
+#define clblasZtpmv clblasZtpmv_
+#define clblasZtpsv clblasZtpsv_
+#define clblasZtrmm clblasZtrmm_
+#define clblasZtrmv clblasZtrmv_
+#define clblasZtrsm clblasZtrsm_
+#define clblasZtrsv clblasZtrsv_
+#define clblasiCamax clblasiCamax_
+#define clblasiDamax clblasiDamax_
+#define clblasiSamax clblasiSamax_
+#define clblasiZamax clblasiZamax_
+
+#include <clBLAS.h>
+
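+// generated by parser_clblas.py: remove the "<name>_" aliases; only the enabled entry points (C/D/S/Z gemm, clblasSetup, clblasTeardown) are remapped to "<name>_pfn"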
+#undef clblasCaxpy
+//#define clblasCaxpy clblasCaxpy_pfn
+#undef clblasCcopy
+//#define clblasCcopy clblasCcopy_pfn
+#undef clblasCdotc
+//#define clblasCdotc clblasCdotc_pfn
+#undef clblasCdotu
+//#define clblasCdotu clblasCdotu_pfn
+#undef clblasCgbmv
+//#define clblasCgbmv clblasCgbmv_pfn
+#undef clblasCgemm
+#define clblasCgemm clblasCgemm_pfn
+#undef clblasCgemv
+//#define clblasCgemv clblasCgemv_pfn
+#undef clblasCgerc
+//#define clblasCgerc clblasCgerc_pfn
+#undef clblasCgeru
+//#define clblasCgeru clblasCgeru_pfn
+#undef clblasChbmv
+//#define clblasChbmv clblasChbmv_pfn
+#undef clblasChemm
+//#define clblasChemm clblasChemm_pfn
+#undef clblasChemv
+//#define clblasChemv clblasChemv_pfn
+#undef clblasCher
+//#define clblasCher clblasCher_pfn
+#undef clblasCher2
+//#define clblasCher2 clblasCher2_pfn
+#undef clblasCher2k
+//#define clblasCher2k clblasCher2k_pfn
+#undef clblasCherk
+//#define clblasCherk clblasCherk_pfn
+#undef clblasChpmv
+//#define clblasChpmv clblasChpmv_pfn
+#undef clblasChpr
+//#define clblasChpr clblasChpr_pfn
+#undef clblasChpr2
+//#define clblasChpr2 clblasChpr2_pfn
+#undef clblasCrotg
+//#define clblasCrotg clblasCrotg_pfn
+#undef clblasCscal
+//#define clblasCscal clblasCscal_pfn
+#undef clblasCsrot
+//#define clblasCsrot clblasCsrot_pfn
+#undef clblasCsscal
+//#define clblasCsscal clblasCsscal_pfn
+#undef clblasCswap
+//#define clblasCswap clblasCswap_pfn
+#undef clblasCsymm
+//#define clblasCsymm clblasCsymm_pfn
+#undef clblasCsyr2k
+//#define clblasCsyr2k clblasCsyr2k_pfn
+#undef clblasCsyrk
+//#define clblasCsyrk clblasCsyrk_pfn
+#undef clblasCtbmv
+//#define clblasCtbmv clblasCtbmv_pfn
+#undef clblasCtbsv
+//#define clblasCtbsv clblasCtbsv_pfn
+#undef clblasCtpmv
+//#define clblasCtpmv clblasCtpmv_pfn
+#undef clblasCtpsv
+//#define clblasCtpsv clblasCtpsv_pfn
+#undef clblasCtrmm
+//#define clblasCtrmm clblasCtrmm_pfn
+#undef clblasCtrmv
+//#define clblasCtrmv clblasCtrmv_pfn
+#undef clblasCtrsm
+//#define clblasCtrsm clblasCtrsm_pfn
+#undef clblasCtrsv
+//#define clblasCtrsv clblasCtrsv_pfn
+#undef clblasDasum
+//#define clblasDasum clblasDasum_pfn
+#undef clblasDaxpy
+//#define clblasDaxpy clblasDaxpy_pfn
+#undef clblasDcopy
+//#define clblasDcopy clblasDcopy_pfn
+#undef clblasDdot
+//#define clblasDdot clblasDdot_pfn
+#undef clblasDgbmv
+//#define clblasDgbmv clblasDgbmv_pfn
+#undef clblasDgemm
+#define clblasDgemm clblasDgemm_pfn
+#undef clblasDgemv
+//#define clblasDgemv clblasDgemv_pfn
+#undef clblasDger
+//#define clblasDger clblasDger_pfn
+#undef clblasDnrm2
+//#define clblasDnrm2 clblasDnrm2_pfn
+#undef clblasDrot
+//#define clblasDrot clblasDrot_pfn
+#undef clblasDrotg
+//#define clblasDrotg clblasDrotg_pfn
+#undef clblasDrotm
+//#define clblasDrotm clblasDrotm_pfn
+#undef clblasDrotmg
+//#define clblasDrotmg clblasDrotmg_pfn
+#undef clblasDsbmv
+//#define clblasDsbmv clblasDsbmv_pfn
+#undef clblasDscal
+//#define clblasDscal clblasDscal_pfn
+#undef clblasDspmv
+//#define clblasDspmv clblasDspmv_pfn
+#undef clblasDspr
+//#define clblasDspr clblasDspr_pfn
+#undef clblasDspr2
+//#define clblasDspr2 clblasDspr2_pfn
+#undef clblasDswap
+//#define clblasDswap clblasDswap_pfn
+#undef clblasDsymm
+//#define clblasDsymm clblasDsymm_pfn
+#undef clblasDsymv
+//#define clblasDsymv clblasDsymv_pfn
+#undef clblasDsyr
+//#define clblasDsyr clblasDsyr_pfn
+#undef clblasDsyr2
+//#define clblasDsyr2 clblasDsyr2_pfn
+#undef clblasDsyr2k
+//#define clblasDsyr2k clblasDsyr2k_pfn
+#undef clblasDsyrk
+//#define clblasDsyrk clblasDsyrk_pfn
+#undef clblasDtbmv
+//#define clblasDtbmv clblasDtbmv_pfn
+#undef clblasDtbsv
+//#define clblasDtbsv clblasDtbsv_pfn
+#undef clblasDtpmv
+//#define clblasDtpmv clblasDtpmv_pfn
+#undef clblasDtpsv
+//#define clblasDtpsv clblasDtpsv_pfn
+#undef clblasDtrmm
+//#define clblasDtrmm clblasDtrmm_pfn
+#undef clblasDtrmv
+//#define clblasDtrmv clblasDtrmv_pfn
+#undef clblasDtrsm
+//#define clblasDtrsm clblasDtrsm_pfn
+#undef clblasDtrsv
+//#define clblasDtrsv clblasDtrsv_pfn
+#undef clblasDzasum
+//#define clblasDzasum clblasDzasum_pfn
+#undef clblasDznrm2
+//#define clblasDznrm2 clblasDznrm2_pfn
+#undef clblasGetVersion
+//#define clblasGetVersion clblasGetVersion_pfn
+#undef clblasSasum
+//#define clblasSasum clblasSasum_pfn
+#undef clblasSaxpy
+//#define clblasSaxpy clblasSaxpy_pfn
+#undef clblasScasum
+//#define clblasScasum clblasScasum_pfn
+#undef clblasScnrm2
+//#define clblasScnrm2 clblasScnrm2_pfn
+#undef clblasScopy
+//#define clblasScopy clblasScopy_pfn
+#undef clblasSdot
+//#define clblasSdot clblasSdot_pfn
+#undef clblasSetup
+#define clblasSetup clblasSetup_pfn
+#undef clblasSgbmv
+//#define clblasSgbmv clblasSgbmv_pfn
+#undef clblasSgemm
+#define clblasSgemm clblasSgemm_pfn
+#undef clblasSgemv
+//#define clblasSgemv clblasSgemv_pfn
+#undef clblasSger
+//#define clblasSger clblasSger_pfn
+#undef clblasSnrm2
+//#define clblasSnrm2 clblasSnrm2_pfn
+#undef clblasSrot
+//#define clblasSrot clblasSrot_pfn
+#undef clblasSrotg
+//#define clblasSrotg clblasSrotg_pfn
+#undef clblasSrotm
+//#define clblasSrotm clblasSrotm_pfn
+#undef clblasSrotmg
+//#define clblasSrotmg clblasSrotmg_pfn
+#undef clblasSsbmv
+//#define clblasSsbmv clblasSsbmv_pfn
+#undef clblasSscal
+//#define clblasSscal clblasSscal_pfn
+#undef clblasSspmv
+//#define clblasSspmv clblasSspmv_pfn
+#undef clblasSspr
+//#define clblasSspr clblasSspr_pfn
+#undef clblasSspr2
+//#define clblasSspr2 clblasSspr2_pfn
+#undef clblasSswap
+//#define clblasSswap clblasSswap_pfn
+#undef clblasSsymm
+//#define clblasSsymm clblasSsymm_pfn
+#undef clblasSsymv
+//#define clblasSsymv clblasSsymv_pfn
+#undef clblasSsyr
+//#define clblasSsyr clblasSsyr_pfn
+#undef clblasSsyr2
+//#define clblasSsyr2 clblasSsyr2_pfn
+#undef clblasSsyr2k
+//#define clblasSsyr2k clblasSsyr2k_pfn
+#undef clblasSsyrk
+//#define clblasSsyrk clblasSsyrk_pfn
+#undef clblasStbmv
+//#define clblasStbmv clblasStbmv_pfn
+#undef clblasStbsv
+//#define clblasStbsv clblasStbsv_pfn
+#undef clblasStpmv
+//#define clblasStpmv clblasStpmv_pfn
+#undef clblasStpsv
+//#define clblasStpsv clblasStpsv_pfn
+#undef clblasStrmm
+//#define clblasStrmm clblasStrmm_pfn
+#undef clblasStrmv
+//#define clblasStrmv clblasStrmv_pfn
+#undef clblasStrsm
+//#define clblasStrsm clblasStrsm_pfn
+#undef clblasStrsv
+//#define clblasStrsv clblasStrsv_pfn
+#undef clblasTeardown
+#define clblasTeardown clblasTeardown_pfn
+#undef clblasZaxpy
+//#define clblasZaxpy clblasZaxpy_pfn
+#undef clblasZcopy
+//#define clblasZcopy clblasZcopy_pfn
+#undef clblasZdotc
+//#define clblasZdotc clblasZdotc_pfn
+#undef clblasZdotu
+//#define clblasZdotu clblasZdotu_pfn
+#undef clblasZdrot
+//#define clblasZdrot clblasZdrot_pfn
+#undef clblasZdscal
+//#define clblasZdscal clblasZdscal_pfn
+#undef clblasZgbmv
+//#define clblasZgbmv clblasZgbmv_pfn
+#undef clblasZgemm
+#define clblasZgemm clblasZgemm_pfn
+#undef clblasZgemv
+//#define clblasZgemv clblasZgemv_pfn
+#undef clblasZgerc
+//#define clblasZgerc clblasZgerc_pfn
+#undef clblasZgeru
+//#define clblasZgeru clblasZgeru_pfn
+#undef clblasZhbmv
+//#define clblasZhbmv clblasZhbmv_pfn
+#undef clblasZhemm
+//#define clblasZhemm clblasZhemm_pfn
+#undef clblasZhemv
+//#define clblasZhemv clblasZhemv_pfn
+#undef clblasZher
+//#define clblasZher clblasZher_pfn
+#undef clblasZher2
+//#define clblasZher2 clblasZher2_pfn
+#undef clblasZher2k
+//#define clblasZher2k clblasZher2k_pfn
+#undef clblasZherk
+//#define clblasZherk clblasZherk_pfn
+#undef clblasZhpmv
+//#define clblasZhpmv clblasZhpmv_pfn
+#undef clblasZhpr
+//#define clblasZhpr clblasZhpr_pfn
+#undef clblasZhpr2
+//#define clblasZhpr2 clblasZhpr2_pfn
+#undef clblasZrotg
+//#define clblasZrotg clblasZrotg_pfn
+#undef clblasZscal
+//#define clblasZscal clblasZscal_pfn
+#undef clblasZswap
+//#define clblasZswap clblasZswap_pfn
+#undef clblasZsymm
+//#define clblasZsymm clblasZsymm_pfn
+#undef clblasZsyr2k
+//#define clblasZsyr2k clblasZsyr2k_pfn
+#undef clblasZsyrk
+//#define clblasZsyrk clblasZsyrk_pfn
+#undef clblasZtbmv
+//#define clblasZtbmv clblasZtbmv_pfn
+#undef clblasZtbsv
+//#define clblasZtbsv clblasZtbsv_pfn
+#undef clblasZtpmv
+//#define clblasZtpmv clblasZtpmv_pfn
+#undef clblasZtpsv
+//#define clblasZtpsv clblasZtpsv_pfn
+#undef clblasZtrmm
+//#define clblasZtrmm clblasZtrmm_pfn
+#undef clblasZtrmv
+//#define clblasZtrmv clblasZtrmv_pfn
+#undef clblasZtrsm
+//#define clblasZtrsm clblasZtrsm_pfn
+#undef clblasZtrsv
+//#define clblasZtrsv clblasZtrsv_pfn
+#undef clblasiCamax
+//#define clblasiCamax clblasiCamax_pfn
+#undef clblasiDamax
+//#define clblasiDamax clblasiDamax_pfn
+#undef clblasiSamax
+//#define clblasiSamax clblasiSamax_pfn
+#undef clblasiZamax
+//#define clblasiZamax clblasiZamax_pfn
+
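+// generated by parser_clblas.py: extern function-pointer declarations, one per clBLAS entry point; disabled entries are kept as comments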
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCaxpy)(size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCdotc)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCdotu)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgerc)(clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCgeru)(clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChemm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChemv)(clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCher2k)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCherk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offa, size_t lda, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasChpr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCrotg)(cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCscal)(size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsscal)(size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasCtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDaxpy)(size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDdot)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDger)(clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotg)(cl_mem DA, size_t offDA, cl_mem DB, size_t offDB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotm)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDrotmg)(cl_mem DD1, size_t offDD1, cl_mem DD2, size_t offDD2, cl_mem DX1, size_t offDX1, const cl_mem DY1, size_t offDY1, cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDscal)(size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDspr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsymv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDzasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasDznrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasGetVersion)(cl_uint* major, cl_uint* minor, cl_uint* patch);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSaxpy)(size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScasum)(size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasScopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSdot)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasSetup)();
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSger)(clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSnrm2)(size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotg)(cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotm)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSrotmg)(cl_mem SD1, size_t offSD1, cl_mem SD2, size_t offSD2, cl_mem SX1, size_t offSX1, const cl_mem SY1, size_t offSY1, cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSscal)(size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSspr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsymv)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasSsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasStrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT void (*clblasTeardown)();
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZaxpy)(size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZcopy)(size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdotc)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdotu)(size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdrot)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZdscal)(size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgbmv)(clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgemm)(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgemv)(clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgerc)(clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZgeru)(clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhbmv)(clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhemm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhemv)(clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZher2k)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZherk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offa, size_t lda, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpmv)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpr)(clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZhpr2)(clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZrotg)(cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZscal)(size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZswap)(size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsymm)(clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsyr2k)(clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZsyrk)(clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtbmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtbsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtpmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtpsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrmm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrmv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrsm)(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasZtrsv)(clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiCamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiDamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiSamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
+//extern CL_RUNTIME_EXPORT clblasStatus (*clblasiZamax)(size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events);
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp
new file mode 100644
index 0000000..dff3b40
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_clfft.hpp
@@ -0,0 +1,146 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_clfft.py: alias every clFFT API name to <name>_ ahead of the <clFFT.h> include, so any use of these names inside that header expands to the suffixed form
+#define clfftBakePlan clfftBakePlan_
+#define clfftCopyPlan clfftCopyPlan_
+#define clfftCreateDefaultPlan clfftCreateDefaultPlan_
+#define clfftDestroyPlan clfftDestroyPlan_
+#define clfftEnqueueTransform clfftEnqueueTransform_
+#define clfftGetLayout clfftGetLayout_
+#define clfftGetPlanBatchSize clfftGetPlanBatchSize_
+#define clfftGetPlanContext clfftGetPlanContext_
+#define clfftGetPlanDim clfftGetPlanDim_
+#define clfftGetPlanDistance clfftGetPlanDistance_
+#define clfftGetPlanInStride clfftGetPlanInStride_
+#define clfftGetPlanLength clfftGetPlanLength_
+#define clfftGetPlanOutStride clfftGetPlanOutStride_
+#define clfftGetPlanPrecision clfftGetPlanPrecision_
+#define clfftGetPlanScale clfftGetPlanScale_
+#define clfftGetPlanTransposeResult clfftGetPlanTransposeResult_
+#define clfftGetResultLocation clfftGetResultLocation_
+#define clfftGetTmpBufSize clfftGetTmpBufSize_
+#define clfftGetVersion clfftGetVersion_
+#define clfftSetLayout clfftSetLayout_
+#define clfftSetPlanBatchSize clfftSetPlanBatchSize_
+#define clfftSetPlanCallback clfftSetPlanCallback_
+#define clfftSetPlanDim clfftSetPlanDim_
+#define clfftSetPlanDistance clfftSetPlanDistance_
+#define clfftSetPlanInStride clfftSetPlanInStride_
+#define clfftSetPlanLength clfftSetPlanLength_
+#define clfftSetPlanOutStride clfftSetPlanOutStride_
+#define clfftSetPlanPrecision clfftSetPlanPrecision_
+#define clfftSetPlanScale clfftSetPlanScale_
+#define clfftSetPlanTransposeResult clfftSetPlanTransposeResult_
+#define clfftSetResultLocation clfftSetResultLocation_
+#define clfftSetup clfftSetup_
+#define clfftTeardown clfftTeardown_
+
+#include <clFFT.h>
+
+// generated by parser_clfft.py: drop the temporary <name>_ aliases and re-point each clFFT API name at <name>_pfn; names whose #define is commented out are only #undef'ed and stay unmapped
+#undef clfftBakePlan
+#define clfftBakePlan clfftBakePlan_pfn
+#undef clfftCopyPlan
+//#define clfftCopyPlan clfftCopyPlan_pfn
+#undef clfftCreateDefaultPlan
+#define clfftCreateDefaultPlan clfftCreateDefaultPlan_pfn
+#undef clfftDestroyPlan
+#define clfftDestroyPlan clfftDestroyPlan_pfn
+#undef clfftEnqueueTransform
+#define clfftEnqueueTransform clfftEnqueueTransform_pfn
+#undef clfftGetLayout
+//#define clfftGetLayout clfftGetLayout_pfn
+#undef clfftGetPlanBatchSize
+//#define clfftGetPlanBatchSize clfftGetPlanBatchSize_pfn
+#undef clfftGetPlanContext
+//#define clfftGetPlanContext clfftGetPlanContext_pfn
+#undef clfftGetPlanDim
+//#define clfftGetPlanDim clfftGetPlanDim_pfn
+#undef clfftGetPlanDistance
+//#define clfftGetPlanDistance clfftGetPlanDistance_pfn
+#undef clfftGetPlanInStride
+//#define clfftGetPlanInStride clfftGetPlanInStride_pfn
+#undef clfftGetPlanLength
+//#define clfftGetPlanLength clfftGetPlanLength_pfn
+#undef clfftGetPlanOutStride
+//#define clfftGetPlanOutStride clfftGetPlanOutStride_pfn
+#undef clfftGetPlanPrecision
+//#define clfftGetPlanPrecision clfftGetPlanPrecision_pfn
+#undef clfftGetPlanScale
+//#define clfftGetPlanScale clfftGetPlanScale_pfn
+#undef clfftGetPlanTransposeResult
+//#define clfftGetPlanTransposeResult clfftGetPlanTransposeResult_pfn
+#undef clfftGetResultLocation
+//#define clfftGetResultLocation clfftGetResultLocation_pfn
+#undef clfftGetTmpBufSize
+#define clfftGetTmpBufSize clfftGetTmpBufSize_pfn
+#undef clfftGetVersion
+#define clfftGetVersion clfftGetVersion_pfn
+#undef clfftSetLayout
+#define clfftSetLayout clfftSetLayout_pfn
+#undef clfftSetPlanBatchSize
+#define clfftSetPlanBatchSize clfftSetPlanBatchSize_pfn
+#undef clfftSetPlanCallback
+//#define clfftSetPlanCallback clfftSetPlanCallback_pfn
+#undef clfftSetPlanDim
+//#define clfftSetPlanDim clfftSetPlanDim_pfn
+#undef clfftSetPlanDistance
+#define clfftSetPlanDistance clfftSetPlanDistance_pfn
+#undef clfftSetPlanInStride
+#define clfftSetPlanInStride clfftSetPlanInStride_pfn
+#undef clfftSetPlanLength
+//#define clfftSetPlanLength clfftSetPlanLength_pfn
+#undef clfftSetPlanOutStride
+#define clfftSetPlanOutStride clfftSetPlanOutStride_pfn
+#undef clfftSetPlanPrecision
+#define clfftSetPlanPrecision clfftSetPlanPrecision_pfn
+#undef clfftSetPlanScale
+#define clfftSetPlanScale clfftSetPlanScale_pfn
+#undef clfftSetPlanTransposeResult
+//#define clfftSetPlanTransposeResult clfftSetPlanTransposeResult_pfn
+#undef clfftSetResultLocation
+#define clfftSetResultLocation clfftSetResultLocation_pfn
+#undef clfftSetup
+#define clfftSetup clfftSetup_pfn
+#undef clfftTeardown
+#define clfftTeardown clfftTeardown_pfn
+
+// generated by parser_clfft.py: extern function-pointer declarations; because of the #defines above, each name written here expands to <name>_pfn, and commented-out entries declare nothing
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftBakePlan)(clfftPlanHandle plHandle, cl_uint numQueues, cl_command_queue* commQueueFFT, void (CL_CALLBACK* pfn_notify) (clfftPlanHandle plHandle, void* user_data), void* user_data);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftCopyPlan)(clfftPlanHandle* out_plHandle, cl_context new_context, clfftPlanHandle in_plHandle);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftCreateDefaultPlan)(clfftPlanHandle* plHandle, cl_context context, const clfftDim dim, const size_t* clLengths);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftDestroyPlan)(clfftPlanHandle* plHandle);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftEnqueueTransform)(clfftPlanHandle plHandle, clfftDirection dir, cl_uint numQueuesAndEvents, cl_command_queue* commQueues, cl_uint numWaitEvents, const cl_event* waitEvents, cl_event* outEvents, cl_mem* inputBuffers, cl_mem* outputBuffers, cl_mem tmpBuffer);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetLayout)(const clfftPlanHandle plHandle, clfftLayout* iLayout, clfftLayout* oLayout);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanBatchSize)(const clfftPlanHandle plHandle, size_t* batchSize);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanContext)(const clfftPlanHandle plHandle, cl_context* context);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanDim)(const clfftPlanHandle plHandle, clfftDim* dim, cl_uint* size);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanDistance)(const clfftPlanHandle plHandle, size_t* iDist, size_t* oDist);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanInStride)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanLength)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clLengths);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanOutStride)(const clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanPrecision)(const clfftPlanHandle plHandle, clfftPrecision* precision);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanScale)(const clfftPlanHandle plHandle, clfftDirection dir, cl_float* scale);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetPlanTransposeResult)(const clfftPlanHandle plHandle, clfftResultTransposed* transposed);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetResultLocation)(const clfftPlanHandle plHandle, clfftResultLocation* placeness);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetTmpBufSize)(const clfftPlanHandle plHandle, size_t* buffersize);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftGetVersion)(cl_uint* major, cl_uint* minor, cl_uint* patch);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetLayout)(clfftPlanHandle plHandle, clfftLayout iLayout, clfftLayout oLayout);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanBatchSize)(clfftPlanHandle plHandle, size_t batchSize);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanCallback)(clfftPlanHandle plHandle, const char* funcName, const char* funcString, int localMemSize, clfftCallbackType callbackType, cl_mem* userdata, int numUserdataBuffers);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanDim)(clfftPlanHandle plHandle, const clfftDim dim);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanDistance)(clfftPlanHandle plHandle, size_t iDist, size_t oDist);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanInStride)(clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanLength)(clfftPlanHandle plHandle, const clfftDim dim, const size_t* clLengths);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanOutStride)(clfftPlanHandle plHandle, const clfftDim dim, size_t* clStrides);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanPrecision)(clfftPlanHandle plHandle, clfftPrecision precision);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanScale)(clfftPlanHandle plHandle, clfftDirection dir, cl_float scale);
+//extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetPlanTransposeResult)(clfftPlanHandle plHandle, clfftResultTransposed transposed);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetResultLocation)(clfftPlanHandle plHandle, clfftResultLocation placeness);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftSetup)(const clfftSetupData* setupData);
+extern CL_RUNTIME_EXPORT clfftStatus (*clfftTeardown)();
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
new file mode 100644
index 0000000..28618a1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core.hpp
@@ -0,0 +1,371 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+#error "Invalid usage"
+#endif
+
+// generated by parser_cl.py: alias every OpenCL API name to <name>_ ahead of the <OpenCL/cl.h> / <CL/cl.h> include, so any use of these names inside that header expands to the suffixed form
+#define clBuildProgram clBuildProgram_
+#define clCompileProgram clCompileProgram_
+#define clCreateBuffer clCreateBuffer_
+#define clCreateCommandQueue clCreateCommandQueue_
+#define clCreateContext clCreateContext_
+#define clCreateContextFromType clCreateContextFromType_
+#define clCreateImage clCreateImage_
+#define clCreateImage2D clCreateImage2D_
+#define clCreateImage3D clCreateImage3D_
+#define clCreateKernel clCreateKernel_
+#define clCreateKernelsInProgram clCreateKernelsInProgram_
+#define clCreateProgramWithBinary clCreateProgramWithBinary_
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_
+#define clCreateProgramWithSource clCreateProgramWithSource_
+#define clCreateSampler clCreateSampler_
+#define clCreateSubBuffer clCreateSubBuffer_
+#define clCreateSubDevices clCreateSubDevices_
+#define clCreateUserEvent clCreateUserEvent_
+#define clEnqueueBarrier clEnqueueBarrier_
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_
+#define clEnqueueCopyImage clEnqueueCopyImage_
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_
+#define clEnqueueFillBuffer clEnqueueFillBuffer_
+#define clEnqueueFillImage clEnqueueFillImage_
+#define clEnqueueMapBuffer clEnqueueMapBuffer_
+#define clEnqueueMapImage clEnqueueMapImage_
+#define clEnqueueMarker clEnqueueMarker_
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_
+#define clEnqueueNativeKernel clEnqueueNativeKernel_
+#define clEnqueueReadBuffer clEnqueueReadBuffer_
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_
+#define clEnqueueReadImage clEnqueueReadImage_
+#define clEnqueueTask clEnqueueTask_
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_
+#define clEnqueueWriteImage clEnqueueWriteImage_
+#define clFinish clFinish_
+#define clFlush clFlush_
+#define clGetCommandQueueInfo clGetCommandQueueInfo_
+#define clGetContextInfo clGetContextInfo_
+#define clGetDeviceIDs clGetDeviceIDs_
+#define clGetDeviceInfo clGetDeviceInfo_
+#define clGetEventInfo clGetEventInfo_
+#define clGetEventProfilingInfo clGetEventProfilingInfo_
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_
+#define clGetImageInfo clGetImageInfo_
+#define clGetKernelArgInfo clGetKernelArgInfo_
+#define clGetKernelInfo clGetKernelInfo_
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_
+#define clGetMemObjectInfo clGetMemObjectInfo_
+#define clGetPlatformIDs clGetPlatformIDs_
+#define clGetPlatformInfo clGetPlatformInfo_
+#define clGetProgramBuildInfo clGetProgramBuildInfo_
+#define clGetProgramInfo clGetProgramInfo_
+#define clGetSamplerInfo clGetSamplerInfo_
+#define clGetSupportedImageFormats clGetSupportedImageFormats_
+#define clLinkProgram clLinkProgram_
+#define clReleaseCommandQueue clReleaseCommandQueue_
+#define clReleaseContext clReleaseContext_
+#define clReleaseDevice clReleaseDevice_
+#define clReleaseEvent clReleaseEvent_
+#define clReleaseKernel clReleaseKernel_
+#define clReleaseMemObject clReleaseMemObject_
+#define clReleaseProgram clReleaseProgram_
+#define clReleaseSampler clReleaseSampler_
+#define clRetainCommandQueue clRetainCommandQueue_
+#define clRetainContext clRetainContext_
+#define clRetainDevice clRetainDevice_
+#define clRetainEvent clRetainEvent_
+#define clRetainKernel clRetainKernel_
+#define clRetainMemObject clRetainMemObject_
+#define clRetainProgram clRetainProgram_
+#define clRetainSampler clRetainSampler_
+#define clSetEventCallback clSetEventCallback_
+#define clSetKernelArg clSetKernelArg_
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_
+#define clSetUserEventStatus clSetUserEventStatus_
+#define clUnloadCompiler clUnloadCompiler_
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_
+#define clWaitForEvents clWaitForEvents_
+
+#if defined __APPLE__
+#define CL_SILENCE_DEPRECATION
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// generated by parser_cl.py: drop the temporary <name>_ aliases and re-point each OpenCL API name at <name>_pfn
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_pfn
+#undef clCompileProgram
+#define clCompileProgram clCompileProgram_pfn
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_pfn
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_pfn
+#undef clCreateContext
+#define clCreateContext clCreateContext_pfn
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_pfn
+#undef clCreateImage
+#define clCreateImage clCreateImage_pfn
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_pfn
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_pfn
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_pfn
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_pfn
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_pfn
+#undef clCreateProgramWithBuiltInKernels
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_pfn
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_pfn
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_pfn
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_pfn
+#undef clCreateSubDevices
+#define clCreateSubDevices clCreateSubDevices_pfn
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_pfn
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_pfn
+#undef clEnqueueBarrierWithWaitList
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_pfn
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_pfn
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_pfn
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_pfn
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_pfn
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_pfn
+#undef clEnqueueFillBuffer
+#define clEnqueueFillBuffer clEnqueueFillBuffer_pfn
+#undef clEnqueueFillImage
+#define clEnqueueFillImage clEnqueueFillImage_pfn
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_pfn
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_pfn
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_pfn
+#undef clEnqueueMarkerWithWaitList
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_pfn
+#undef clEnqueueMigrateMemObjects
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_pfn
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_pfn
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_pfn
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_pfn
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_pfn
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_pfn
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_pfn
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_pfn
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_pfn
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_pfn
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_pfn
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_pfn
+#undef clFinish
+#define clFinish clFinish_pfn
+#undef clFlush
+#define clFlush clFlush_pfn
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_pfn
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_pfn
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_pfn
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_pfn
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_pfn
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_pfn
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_pfn
+#undef clGetExtensionFunctionAddressForPlatform
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_pfn
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_pfn
+#undef clGetKernelArgInfo
+#define clGetKernelArgInfo clGetKernelArgInfo_pfn
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_pfn
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_pfn
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_pfn
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_pfn
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_pfn
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_pfn
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_pfn
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_pfn
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_pfn
+#undef clLinkProgram
+#define clLinkProgram clLinkProgram_pfn
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_pfn
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_pfn
+#undef clReleaseDevice
+#define clReleaseDevice clReleaseDevice_pfn
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_pfn
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_pfn
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_pfn
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_pfn
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_pfn
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_pfn
+#undef clRetainContext
+#define clRetainContext clRetainContext_pfn
+#undef clRetainDevice
+#define clRetainDevice clRetainDevice_pfn
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_pfn
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_pfn
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_pfn
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_pfn
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_pfn
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_pfn
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_pfn
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_pfn
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_pfn
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_pfn
+#undef clUnloadPlatformCompiler
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_pfn
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_pfn
+
+// generated by parser_cl.py: extern function-pointer declarations, one per OpenCL API name; because of the #defines above, each name written here expands to <name>_pfn
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clBuildProgram)(cl_program, cl_uint, const cl_device_id*, const char*, void (CL_CALLBACK*) (cl_program, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCompileProgram)(cl_program, cl_uint, const cl_device_id*, const char*, cl_uint, const cl_program*, const char**, void (CL_CALLBACK*) (cl_program, void*), void*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateBuffer)(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_command_queue (CL_API_CALL*clCreateCommandQueue)(cl_context, cl_device_id, cl_command_queue_properties, cl_int*);
+extern CL_RUNTIME_EXPORT cl_context (CL_API_CALL*clCreateContext)(const cl_context_properties*, cl_uint, const cl_device_id*, void (CL_CALLBACK*) (const char*, const void*, size_t, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_context (CL_API_CALL*clCreateContextFromType)(const cl_context_properties*, cl_device_type, void (CL_CALLBACK*) (const char*, const void*, size_t, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage)(cl_context, cl_mem_flags, const cl_image_format*, const cl_image_desc*, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage2D)(cl_context, cl_mem_flags, const cl_image_format*, size_t, size_t, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateImage3D)(cl_context, cl_mem_flags, const cl_image_format*, size_t, size_t, size_t, size_t, size_t, void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_kernel (CL_API_CALL*clCreateKernel)(cl_program, const char*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCreateKernelsInProgram)(cl_program, cl_uint, cl_kernel*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithBinary)(cl_context, cl_uint, const cl_device_id*, const size_t*, const unsigned char**, cl_int*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithBuiltInKernels)(cl_context, cl_uint, const cl_device_id*, const char*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clCreateProgramWithSource)(cl_context, cl_uint, const char**, const size_t*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_sampler (CL_API_CALL*clCreateSampler)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateSubBuffer)(cl_mem, cl_mem_flags, cl_buffer_create_type, const void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clCreateSubDevices)(cl_device_id, const cl_device_partition_property*, cl_uint, cl_device_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_event (CL_API_CALL*clCreateUserEvent)(cl_context, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueBarrier)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueBarrierWithWaitList)(cl_command_queue, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBuffer)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBufferRect)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyBufferToImage)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyImage)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueCopyImageToBuffer)(cl_command_queue, cl_mem, cl_mem, const size_t*, const size_t*, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueFillBuffer)(cl_command_queue, cl_mem, const void*, size_t, size_t, size_t, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueFillImage)(cl_command_queue, cl_mem, const void*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clEnqueueMapBuffer)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event*, cl_event*, cl_int*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clEnqueueMapImage)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t*, const size_t*, size_t*, size_t*, cl_uint, const cl_event*, cl_event*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMarker)(cl_command_queue, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMarkerWithWaitList)(cl_command_queue, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueMigrateMemObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_mem_migration_flags, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueNativeKernel)(cl_command_queue, void (CL_CALLBACK*) (void*), void*, size_t, cl_uint, const cl_mem*, const void**, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadBufferRect)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReadImage)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, size_t, size_t, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueTask)(cl_command_queue, cl_kernel, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueUnmapMemObject)(cl_command_queue, cl_mem, void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWaitForEvents)(cl_command_queue, cl_uint, const cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteBuffer)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteBufferRect)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, const size_t*, size_t, size_t, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueWriteImage)(cl_command_queue, cl_mem, cl_bool, const size_t*, const size_t*, size_t, size_t, const void*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clFinish)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clFlush)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetCommandQueueInfo)(cl_command_queue, cl_command_queue_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetContextInfo)(cl_context, cl_context_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetDeviceIDs)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetDeviceInfo)(cl_device_id, cl_device_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetEventInfo)(cl_event, cl_event_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetEventProfilingInfo)(cl_event, cl_profiling_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clGetExtensionFunctionAddress)(const char*);
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL*clGetExtensionFunctionAddressForPlatform)(cl_platform_id, const char*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetImageInfo)(cl_mem, cl_image_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelArgInfo)(cl_kernel, cl_uint, cl_kernel_arg_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelInfo)(cl_kernel, cl_kernel_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetKernelWorkGroupInfo)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetMemObjectInfo)(cl_mem, cl_mem_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetPlatformIDs)(cl_uint, cl_platform_id*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetPlatformInfo)(cl_platform_id, cl_platform_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetProgramBuildInfo)(cl_program, cl_device_id, cl_program_build_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetProgramInfo)(cl_program, cl_program_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetSamplerInfo)(cl_sampler, cl_sampler_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetSupportedImageFormats)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format*, cl_uint*);
+extern CL_RUNTIME_EXPORT cl_program (CL_API_CALL*clLinkProgram)(cl_context, cl_uint, const cl_device_id*, const char*, cl_uint, const cl_program*, void (CL_CALLBACK*) (cl_program, void*), void*, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseCommandQueue)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseContext)(cl_context);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseDevice)(cl_device_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseEvent)(cl_event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseKernel)(cl_kernel);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseMemObject)(cl_mem);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseProgram)(cl_program);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clReleaseSampler)(cl_sampler);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainCommandQueue)(cl_command_queue);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainContext)(cl_context);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainDevice)(cl_device_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainEvent)(cl_event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainKernel)(cl_kernel);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainMemObject)(cl_mem);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainProgram)(cl_program);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clRetainSampler)(cl_sampler);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetEventCallback)(cl_event, cl_int, void (CL_CALLBACK*) (cl_event, cl_int, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetKernelArg)(cl_kernel, cl_uint, size_t, const void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetMemObjectDestructorCallback)(cl_mem, void (CL_CALLBACK*) (cl_mem, void*), void*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clSetUserEventStatus)(cl_event, cl_int);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clUnloadCompiler)();
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clUnloadPlatformCompiler)(cl_platform_id);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clWaitForEvents)(cl_uint, const cl_event*);
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp
new file mode 100644
index 0000000..216b22b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_core_wrappers.hpp
@@ -0,0 +1,272 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+#error "Invalid usage"
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP must be defined before this file is included, otherwise the #error above fires
+
+// generated by parser_cl.py -- per entry point: <name> is re-#defined to <name>_fn, and that inline function forwards every argument unchanged to <name>_pfn
+#undef clBuildProgram
+#define clBuildProgram clBuildProgram_fn
+inline cl_int clBuildProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, void (CL_CALLBACK*p4) (cl_program, void*), void* p5) { return clBuildProgram_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCompileProgram
+#define clCompileProgram clCompileProgram_fn
+inline cl_int clCompileProgram(cl_program p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, const char** p6, void (CL_CALLBACK*p7) (cl_program, void*), void* p8) { return clCompileProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clCreateBuffer
+#define clCreateBuffer clCreateBuffer_fn
+inline cl_mem clCreateBuffer(cl_context p0, cl_mem_flags p1, size_t p2, void* p3, cl_int* p4) { return clCreateBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateCommandQueue
+#define clCreateCommandQueue clCreateCommandQueue_fn
+inline cl_command_queue clCreateCommandQueue(cl_context p0, cl_device_id p1, cl_command_queue_properties p2, cl_int* p3) { return clCreateCommandQueue_pfn(p0, p1, p2, p3); }
+#undef clCreateContext
+#define clCreateContext clCreateContext_fn
+inline cl_context clCreateContext(const cl_context_properties* p0, cl_uint p1, const cl_device_id* p2, void (CL_CALLBACK*p3) (const char*, const void*, size_t, void*), void* p4, cl_int* p5) { return clCreateContext_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateContextFromType
+#define clCreateContextFromType clCreateContextFromType_fn
+inline cl_context clCreateContextFromType(const cl_context_properties* p0, cl_device_type p1, void (CL_CALLBACK*p2) (const char*, const void*, size_t, void*), void* p3, cl_int* p4) { return clCreateContextFromType_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateImage
+#define clCreateImage clCreateImage_fn
+inline cl_mem clCreateImage(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, const cl_image_desc* p3, void* p4, cl_int* p5) { return clCreateImage_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateImage2D
+#define clCreateImage2D clCreateImage2D_fn
+inline cl_mem clCreateImage2D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, void* p6, cl_int* p7) { return clCreateImage2D_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clCreateImage3D
+#define clCreateImage3D clCreateImage3D_fn
+inline cl_mem clCreateImage3D(cl_context p0, cl_mem_flags p1, const cl_image_format* p2, size_t p3, size_t p4, size_t p5, size_t p6, size_t p7, void* p8, cl_int* p9) { return clCreateImage3D_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clCreateKernel
+#define clCreateKernel clCreateKernel_fn
+inline cl_kernel clCreateKernel(cl_program p0, const char* p1, cl_int* p2) { return clCreateKernel_pfn(p0, p1, p2); }
+#undef clCreateKernelsInProgram
+#define clCreateKernelsInProgram clCreateKernelsInProgram_fn
+inline cl_int clCreateKernelsInProgram(cl_program p0, cl_uint p1, cl_kernel* p2, cl_uint* p3) { return clCreateKernelsInProgram_pfn(p0, p1, p2, p3); }
+#undef clCreateProgramWithBinary
+#define clCreateProgramWithBinary clCreateProgramWithBinary_fn
+inline cl_program clCreateProgramWithBinary(cl_context p0, cl_uint p1, const cl_device_id* p2, const size_t* p3, const unsigned char** p4, cl_int* p5, cl_int* p6) { return clCreateProgramWithBinary_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clCreateProgramWithBuiltInKernels
+#define clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels_fn
+inline cl_program clCreateProgramWithBuiltInKernels(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_int* p4) { return clCreateProgramWithBuiltInKernels_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateProgramWithSource
+#define clCreateProgramWithSource clCreateProgramWithSource_fn
+inline cl_program clCreateProgramWithSource(cl_context p0, cl_uint p1, const char** p2, const size_t* p3, cl_int* p4) { return clCreateProgramWithSource_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSampler
+#define clCreateSampler clCreateSampler_fn
+inline cl_sampler clCreateSampler(cl_context p0, cl_bool p1, cl_addressing_mode p2, cl_filter_mode p3, cl_int* p4) { return clCreateSampler_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubBuffer
+#define clCreateSubBuffer clCreateSubBuffer_fn
+inline cl_mem clCreateSubBuffer(cl_mem p0, cl_mem_flags p1, cl_buffer_create_type p2, const void* p3, cl_int* p4) { return clCreateSubBuffer_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateSubDevices
+#define clCreateSubDevices clCreateSubDevices_fn
+inline cl_int clCreateSubDevices(cl_device_id p0, const cl_device_partition_property* p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clCreateSubDevices_pfn(p0, p1, p2, p3, p4); }
+#undef clCreateUserEvent
+#define clCreateUserEvent clCreateUserEvent_fn
+inline cl_event clCreateUserEvent(cl_context p0, cl_int* p1) { return clCreateUserEvent_pfn(p0, p1); }
+#undef clEnqueueBarrier
+#define clEnqueueBarrier clEnqueueBarrier_fn
+inline cl_int clEnqueueBarrier(cl_command_queue p0) { return clEnqueueBarrier_pfn(p0); }
+#undef clEnqueueBarrierWithWaitList
+#define clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList_fn
+inline cl_int clEnqueueBarrierWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueBarrierWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clEnqueueCopyBuffer
+#define clEnqueueCopyBuffer clEnqueueCopyBuffer_fn
+inline cl_int clEnqueueCopyBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyBufferRect
+#define clEnqueueCopyBufferRect clEnqueueCopyBufferRect_fn
+inline cl_int clEnqueueCopyBufferRect(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, cl_uint p10, const cl_event* p11, cl_event* p12) { return clEnqueueCopyBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12); }
+#undef clEnqueueCopyBufferToImage
+#define clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage_fn
+inline cl_int clEnqueueCopyBufferToImage(cl_command_queue p0, cl_mem p1, cl_mem p2, size_t p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyBufferToImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImage
+#define clEnqueueCopyImage clEnqueueCopyImage_fn
+inline cl_int clEnqueueCopyImage(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueCopyImageToBuffer
+#define clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer_fn
+inline cl_int clEnqueueCopyImageToBuffer(cl_command_queue p0, cl_mem p1, cl_mem p2, const size_t* p3, const size_t* p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueCopyImageToBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueFillBuffer
+#define clEnqueueFillBuffer clEnqueueFillBuffer_fn
+inline cl_int clEnqueueFillBuffer(cl_command_queue p0, cl_mem p1, const void* p2, size_t p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueFillBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueFillImage
+#define clEnqueueFillImage clEnqueueFillImage_fn
+inline cl_int clEnqueueFillImage(cl_command_queue p0, cl_mem p1, const void* p2, const size_t* p3, const size_t* p4, cl_uint p5, const cl_event* p6, cl_event* p7) { return clEnqueueFillImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7); }
+#undef clEnqueueMapBuffer
+#define clEnqueueMapBuffer clEnqueueMapBuffer_fn
+inline void* clEnqueueMapBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, size_t p4, size_t p5, cl_uint p6, const cl_event* p7, cl_event* p8, cl_int* p9) { return clEnqueueMapBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueMapImage
+#define clEnqueueMapImage clEnqueueMapImage_fn
+inline void* clEnqueueMapImage(cl_command_queue p0, cl_mem p1, cl_bool p2, cl_map_flags p3, const size_t* p4, const size_t* p5, size_t* p6, size_t* p7, cl_uint p8, const cl_event* p9, cl_event* p10, cl_int* p11) { return clEnqueueMapImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); }
+#undef clEnqueueMarker
+#define clEnqueueMarker clEnqueueMarker_fn
+inline cl_int clEnqueueMarker(cl_command_queue p0, cl_event* p1) { return clEnqueueMarker_pfn(p0, p1); }
+#undef clEnqueueMarkerWithWaitList
+#define clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList_fn
+inline cl_int clEnqueueMarkerWithWaitList(cl_command_queue p0, cl_uint p1, const cl_event* p2, cl_event* p3) { return clEnqueueMarkerWithWaitList_pfn(p0, p1, p2, p3); }
+#undef clEnqueueMigrateMemObjects
+#define clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects_fn
+inline cl_int clEnqueueMigrateMemObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_mem_migration_flags p3, cl_uint p4, const cl_event* p5, cl_event* p6) { return clEnqueueMigrateMemObjects_pfn(p0, p1, p2, p3, p4, p5, p6); }
+#undef clEnqueueNDRangeKernel
+#define clEnqueueNDRangeKernel clEnqueueNDRangeKernel_fn
+inline cl_int clEnqueueNDRangeKernel(cl_command_queue p0, cl_kernel p1, cl_uint p2, const size_t* p3, const size_t* p4, const size_t* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueNDRangeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueNativeKernel
+#define clEnqueueNativeKernel clEnqueueNativeKernel_fn
+inline cl_int clEnqueueNativeKernel(cl_command_queue p0, void (CL_CALLBACK*p1) (void*), void* p2, size_t p3, cl_uint p4, const cl_mem* p5, const void** p6, cl_uint p7, const cl_event* p8, cl_event* p9) { return clEnqueueNativeKernel_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9); }
+#undef clEnqueueReadBuffer
+#define clEnqueueReadBuffer clEnqueueReadBuffer_fn
+inline cl_int clEnqueueReadBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueReadBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueReadBufferRect
+#define clEnqueueReadBufferRect clEnqueueReadBufferRect_fn
+inline cl_int clEnqueueReadBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueReadBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueReadImage
+#define clEnqueueReadImage clEnqueueReadImage_fn
+inline cl_int clEnqueueReadImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueReadImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clEnqueueTask
+#define clEnqueueTask clEnqueueTask_fn
+inline cl_int clEnqueueTask(cl_command_queue p0, cl_kernel p1, cl_uint p2, const cl_event* p3, cl_event* p4) { return clEnqueueTask_pfn(p0, p1, p2, p3, p4); }
+#undef clEnqueueUnmapMemObject
+#define clEnqueueUnmapMemObject clEnqueueUnmapMemObject_fn
+inline cl_int clEnqueueUnmapMemObject(cl_command_queue p0, cl_mem p1, void* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueUnmapMemObject_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueWaitForEvents
+#define clEnqueueWaitForEvents clEnqueueWaitForEvents_fn
+inline cl_int clEnqueueWaitForEvents(cl_command_queue p0, cl_uint p1, const cl_event* p2) { return clEnqueueWaitForEvents_pfn(p0, p1, p2); }
+#undef clEnqueueWriteBuffer
+#define clEnqueueWriteBuffer clEnqueueWriteBuffer_fn
+inline cl_int clEnqueueWriteBuffer(cl_command_queue p0, cl_mem p1, cl_bool p2, size_t p3, size_t p4, const void* p5, cl_uint p6, const cl_event* p7, cl_event* p8) { return clEnqueueWriteBuffer_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clEnqueueWriteBufferRect
+#define clEnqueueWriteBufferRect clEnqueueWriteBufferRect_fn
+inline cl_int clEnqueueWriteBufferRect(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, const size_t* p5, size_t p6, size_t p7, size_t p8, size_t p9, const void* p10, cl_uint p11, const cl_event* p12, cl_event* p13) { return clEnqueueWriteBufferRect_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); }
+#undef clEnqueueWriteImage
+#define clEnqueueWriteImage clEnqueueWriteImage_fn
+inline cl_int clEnqueueWriteImage(cl_command_queue p0, cl_mem p1, cl_bool p2, const size_t* p3, const size_t* p4, size_t p5, size_t p6, const void* p7, cl_uint p8, const cl_event* p9, cl_event* p10) { return clEnqueueWriteImage_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); }
+#undef clFinish
+#define clFinish clFinish_fn
+inline cl_int clFinish(cl_command_queue p0) { return clFinish_pfn(p0); }
+#undef clFlush
+#define clFlush clFlush_fn
+inline cl_int clFlush(cl_command_queue p0) { return clFlush_pfn(p0); }
+#undef clGetCommandQueueInfo
+#define clGetCommandQueueInfo clGetCommandQueueInfo_fn
+inline cl_int clGetCommandQueueInfo(cl_command_queue p0, cl_command_queue_info p1, size_t p2, void* p3, size_t* p4) { return clGetCommandQueueInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetContextInfo
+#define clGetContextInfo clGetContextInfo_fn
+inline cl_int clGetContextInfo(cl_context p0, cl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetContextInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceIDs
+#define clGetDeviceIDs clGetDeviceIDs_fn
+inline cl_int clGetDeviceIDs(cl_platform_id p0, cl_device_type p1, cl_uint p2, cl_device_id* p3, cl_uint* p4) { return clGetDeviceIDs_pfn(p0, p1, p2, p3, p4); }
+#undef clGetDeviceInfo
+#define clGetDeviceInfo clGetDeviceInfo_fn
+inline cl_int clGetDeviceInfo(cl_device_id p0, cl_device_info p1, size_t p2, void* p3, size_t* p4) { return clGetDeviceInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetEventInfo
+#define clGetEventInfo clGetEventInfo_fn
+inline cl_int clGetEventInfo(cl_event p0, cl_event_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetEventProfilingInfo
+#define clGetEventProfilingInfo clGetEventProfilingInfo_fn
+inline cl_int clGetEventProfilingInfo(cl_event p0, cl_profiling_info p1, size_t p2, void* p3, size_t* p4) { return clGetEventProfilingInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetExtensionFunctionAddress
+#define clGetExtensionFunctionAddress clGetExtensionFunctionAddress_fn
+inline void* clGetExtensionFunctionAddress(const char* p0) { return clGetExtensionFunctionAddress_pfn(p0); }
+#undef clGetExtensionFunctionAddressForPlatform
+#define clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform_fn
+inline void* clGetExtensionFunctionAddressForPlatform(cl_platform_id p0, const char* p1) { return clGetExtensionFunctionAddressForPlatform_pfn(p0, p1); }
+#undef clGetImageInfo
+#define clGetImageInfo clGetImageInfo_fn
+inline cl_int clGetImageInfo(cl_mem p0, cl_image_info p1, size_t p2, void* p3, size_t* p4) { return clGetImageInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelArgInfo
+#define clGetKernelArgInfo clGetKernelArgInfo_fn
+inline cl_int clGetKernelArgInfo(cl_kernel p0, cl_uint p1, cl_kernel_arg_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelArgInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetKernelInfo
+#define clGetKernelInfo clGetKernelInfo_fn
+inline cl_int clGetKernelInfo(cl_kernel p0, cl_kernel_info p1, size_t p2, void* p3, size_t* p4) { return clGetKernelInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetKernelWorkGroupInfo
+#define clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo_fn
+inline cl_int clGetKernelWorkGroupInfo(cl_kernel p0, cl_device_id p1, cl_kernel_work_group_info p2, size_t p3, void* p4, size_t* p5) { return clGetKernelWorkGroupInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetMemObjectInfo
+#define clGetMemObjectInfo clGetMemObjectInfo_fn
+inline cl_int clGetMemObjectInfo(cl_mem p0, cl_mem_info p1, size_t p2, void* p3, size_t* p4) { return clGetMemObjectInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetPlatformIDs
+#define clGetPlatformIDs clGetPlatformIDs_fn
+inline cl_int clGetPlatformIDs(cl_uint p0, cl_platform_id* p1, cl_uint* p2) { return clGetPlatformIDs_pfn(p0, p1, p2); }
+#undef clGetPlatformInfo
+#define clGetPlatformInfo clGetPlatformInfo_fn
+inline cl_int clGetPlatformInfo(cl_platform_id p0, cl_platform_info p1, size_t p2, void* p3, size_t* p4) { return clGetPlatformInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetProgramBuildInfo
+#define clGetProgramBuildInfo clGetProgramBuildInfo_fn
+inline cl_int clGetProgramBuildInfo(cl_program p0, cl_device_id p1, cl_program_build_info p2, size_t p3, void* p4, size_t* p5) { return clGetProgramBuildInfo_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetProgramInfo
+#define clGetProgramInfo clGetProgramInfo_fn
+inline cl_int clGetProgramInfo(cl_program p0, cl_program_info p1, size_t p2, void* p3, size_t* p4) { return clGetProgramInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetSamplerInfo
+#define clGetSamplerInfo clGetSamplerInfo_fn
+inline cl_int clGetSamplerInfo(cl_sampler p0, cl_sampler_info p1, size_t p2, void* p3, size_t* p4) { return clGetSamplerInfo_pfn(p0, p1, p2, p3, p4); }
+#undef clGetSupportedImageFormats
+#define clGetSupportedImageFormats clGetSupportedImageFormats_fn
+inline cl_int clGetSupportedImageFormats(cl_context p0, cl_mem_flags p1, cl_mem_object_type p2, cl_uint p3, cl_image_format* p4, cl_uint* p5) { return clGetSupportedImageFormats_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clLinkProgram
+#define clLinkProgram clLinkProgram_fn
+inline cl_program clLinkProgram(cl_context p0, cl_uint p1, const cl_device_id* p2, const char* p3, cl_uint p4, const cl_program* p5, void (CL_CALLBACK*p6) (cl_program, void*), void* p7, cl_int* p8) { return clLinkProgram_pfn(p0, p1, p2, p3, p4, p5, p6, p7, p8); }
+#undef clReleaseCommandQueue
+#define clReleaseCommandQueue clReleaseCommandQueue_fn
+inline cl_int clReleaseCommandQueue(cl_command_queue p0) { return clReleaseCommandQueue_pfn(p0); }
+#undef clReleaseContext
+#define clReleaseContext clReleaseContext_fn
+inline cl_int clReleaseContext(cl_context p0) { return clReleaseContext_pfn(p0); }
+#undef clReleaseDevice
+#define clReleaseDevice clReleaseDevice_fn
+inline cl_int clReleaseDevice(cl_device_id p0) { return clReleaseDevice_pfn(p0); }
+#undef clReleaseEvent
+#define clReleaseEvent clReleaseEvent_fn
+inline cl_int clReleaseEvent(cl_event p0) { return clReleaseEvent_pfn(p0); }
+#undef clReleaseKernel
+#define clReleaseKernel clReleaseKernel_fn
+inline cl_int clReleaseKernel(cl_kernel p0) { return clReleaseKernel_pfn(p0); }
+#undef clReleaseMemObject
+#define clReleaseMemObject clReleaseMemObject_fn
+inline cl_int clReleaseMemObject(cl_mem p0) { return clReleaseMemObject_pfn(p0); }
+#undef clReleaseProgram
+#define clReleaseProgram clReleaseProgram_fn
+inline cl_int clReleaseProgram(cl_program p0) { return clReleaseProgram_pfn(p0); }
+#undef clReleaseSampler
+#define clReleaseSampler clReleaseSampler_fn
+inline cl_int clReleaseSampler(cl_sampler p0) { return clReleaseSampler_pfn(p0); }
+#undef clRetainCommandQueue
+#define clRetainCommandQueue clRetainCommandQueue_fn
+inline cl_int clRetainCommandQueue(cl_command_queue p0) { return clRetainCommandQueue_pfn(p0); }
+#undef clRetainContext
+#define clRetainContext clRetainContext_fn
+inline cl_int clRetainContext(cl_context p0) { return clRetainContext_pfn(p0); }
+#undef clRetainDevice
+#define clRetainDevice clRetainDevice_fn
+inline cl_int clRetainDevice(cl_device_id p0) { return clRetainDevice_pfn(p0); }
+#undef clRetainEvent
+#define clRetainEvent clRetainEvent_fn
+inline cl_int clRetainEvent(cl_event p0) { return clRetainEvent_pfn(p0); }
+#undef clRetainKernel
+#define clRetainKernel clRetainKernel_fn
+inline cl_int clRetainKernel(cl_kernel p0) { return clRetainKernel_pfn(p0); }
+#undef clRetainMemObject
+#define clRetainMemObject clRetainMemObject_fn
+inline cl_int clRetainMemObject(cl_mem p0) { return clRetainMemObject_pfn(p0); }
+#undef clRetainProgram
+#define clRetainProgram clRetainProgram_fn
+inline cl_int clRetainProgram(cl_program p0) { return clRetainProgram_pfn(p0); }
+#undef clRetainSampler
+#define clRetainSampler clRetainSampler_fn
+inline cl_int clRetainSampler(cl_sampler p0) { return clRetainSampler_pfn(p0); }
+#undef clSetEventCallback
+#define clSetEventCallback clSetEventCallback_fn
+inline cl_int clSetEventCallback(cl_event p0, cl_int p1, void (CL_CALLBACK*p2) (cl_event, cl_int, void*), void* p3) { return clSetEventCallback_pfn(p0, p1, p2, p3); }
+#undef clSetKernelArg
+#define clSetKernelArg clSetKernelArg_fn
+inline cl_int clSetKernelArg(cl_kernel p0, cl_uint p1, size_t p2, const void* p3) { return clSetKernelArg_pfn(p0, p1, p2, p3); }
+#undef clSetMemObjectDestructorCallback
+#define clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback_fn
+inline cl_int clSetMemObjectDestructorCallback(cl_mem p0, void (CL_CALLBACK*p1) (cl_mem, void*), void* p2) { return clSetMemObjectDestructorCallback_pfn(p0, p1, p2); }
+#undef clSetUserEventStatus
+#define clSetUserEventStatus clSetUserEventStatus_fn
+inline cl_int clSetUserEventStatus(cl_event p0, cl_int p1) { return clSetUserEventStatus_pfn(p0, p1); }
+#undef clUnloadCompiler
+#define clUnloadCompiler clUnloadCompiler_fn
+inline cl_int clUnloadCompiler() { return clUnloadCompiler_pfn(); }
+#undef clUnloadPlatformCompiler
+#define clUnloadPlatformCompiler clUnloadPlatformCompiler_fn
+inline cl_int clUnloadPlatformCompiler(cl_platform_id p0) { return clUnloadPlatformCompiler_pfn(p0); }
+#undef clWaitForEvents
+#define clWaitForEvents clWaitForEvents_fn
+inline cl_int clWaitForEvents(cl_uint p0, const cl_event* p1) { return clWaitForEvents_pfn(p0, p1); }
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp
new file mode 100644
index 0000000..0b12aed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl.hpp
@@ -0,0 +1,62 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+#error "Invalid usage"
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP must be defined before this file is included, otherwise the #error above fires
+
+// generated by parser_cl.py -- step 1: while cl_gl.h is included below, every use of these names is renamed to <name>_
+#define clCreateFromGLBuffer clCreateFromGLBuffer_
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_
+#define clCreateFromGLTexture clCreateFromGLTexture_
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_
+#define clGetGLObjectInfo clGetGLObjectInfo_
+#define clGetGLTextureInfo clGetGLTextureInfo_
+
+#if defined __APPLE__
+#include <OpenCL/cl_gl.h>
+#else
+#include <CL/cl_gl.h>
+#endif // __APPLE__
+
+// generated by parser_cl.py -- step 2: drop the temporary <name>_ mapping; from here on <name> expands to <name>_pfn
+#undef clCreateFromGLBuffer
+#define clCreateFromGLBuffer clCreateFromGLBuffer_pfn
+#undef clCreateFromGLRenderbuffer
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_pfn
+#undef clCreateFromGLTexture
+#define clCreateFromGLTexture clCreateFromGLTexture_pfn
+#undef clCreateFromGLTexture2D
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_pfn
+#undef clCreateFromGLTexture3D
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_pfn
+#undef clEnqueueAcquireGLObjects
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_pfn
+#undef clEnqueueReleaseGLObjects
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_pfn
+#undef clGetGLContextInfoKHR
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_pfn
+#undef clGetGLObjectInfo
+#define clGetGLObjectInfo clGetGLObjectInfo_pfn
+#undef clGetGLTextureInfo
+#define clGetGLTextureInfo clGetGLTextureInfo_pfn
+
+#ifdef cl_khr_gl_sharing // NOTE(review): presumably defined by the cl_gl.h included above -- confirm
+
+// generated by parser_cl.py -- step 3: extern function-pointer declarations; via the macros above each variable is really named <name>_pfn
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLBuffer)(cl_context, cl_mem_flags, cl_GLuint, int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLRenderbuffer)(cl_context, cl_mem_flags, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture2D)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_mem (CL_API_CALL*clCreateFromGLTexture3D)(cl_context, cl_mem_flags, cl_GLenum, cl_GLint, cl_GLuint, cl_int*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueAcquireGLObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clEnqueueReleaseGLObjects)(cl_command_queue, cl_uint, const cl_mem*, cl_uint, const cl_event*, cl_event*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLContextInfoKHR)(const cl_context_properties*, cl_gl_context_info, size_t, void*, size_t*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLObjectInfo)(cl_mem, cl_gl_object_type*, cl_GLuint*);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL*clGetGLTextureInfo)(cl_mem, cl_gl_texture_info, size_t, void*, size_t*);
+
+#endif // cl_khr_gl_sharing
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp
new file mode 100644
index 0000000..12f342b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/autogenerated/opencl_gl_wrappers.hpp
@@ -0,0 +1,42 @@
+//
+// AUTOGENERATED, DO NOT EDIT
+//
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+#error "Invalid usage"
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP must be defined before this file is included, otherwise the #error above fires
+
+#ifdef cl_khr_gl_sharing
+
+// generated by parser_cl.py -- per entry point: <name> is re-#defined to <name>_fn, and that inline function forwards every argument unchanged to <name>_pfn
+#undef clCreateFromGLBuffer
+#define clCreateFromGLBuffer clCreateFromGLBuffer_fn
+inline cl_mem clCreateFromGLBuffer(cl_context p0, cl_mem_flags p1, cl_GLuint p2, int* p3) { return clCreateFromGLBuffer_pfn(p0, p1, p2, p3); }
+#undef clCreateFromGLRenderbuffer
+#define clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer_fn
+inline cl_mem clCreateFromGLRenderbuffer(cl_context p0, cl_mem_flags p1, cl_GLuint p2, cl_int* p3) { return clCreateFromGLRenderbuffer_pfn(p0, p1, p2, p3); }
+#undef clCreateFromGLTexture
+#define clCreateFromGLTexture clCreateFromGLTexture_fn
+inline cl_mem clCreateFromGLTexture(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateFromGLTexture2D
+#define clCreateFromGLTexture2D clCreateFromGLTexture2D_fn
+inline cl_mem clCreateFromGLTexture2D(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture2D_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clCreateFromGLTexture3D
+#define clCreateFromGLTexture3D clCreateFromGLTexture3D_fn
+inline cl_mem clCreateFromGLTexture3D(cl_context p0, cl_mem_flags p1, cl_GLenum p2, cl_GLint p3, cl_GLuint p4, cl_int* p5) { return clCreateFromGLTexture3D_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueAcquireGLObjects
+#define clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects_fn
+inline cl_int clEnqueueAcquireGLObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueAcquireGLObjects_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clEnqueueReleaseGLObjects
+#define clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects_fn
+inline cl_int clEnqueueReleaseGLObjects(cl_command_queue p0, cl_uint p1, const cl_mem* p2, cl_uint p3, const cl_event* p4, cl_event* p5) { return clEnqueueReleaseGLObjects_pfn(p0, p1, p2, p3, p4, p5); }
+#undef clGetGLContextInfoKHR
+#define clGetGLContextInfoKHR clGetGLContextInfoKHR_fn
+inline cl_int clGetGLContextInfoKHR(const cl_context_properties* p0, cl_gl_context_info p1, size_t p2, void* p3, size_t* p4) { return clGetGLContextInfoKHR_pfn(p0, p1, p2, p3, p4); }
+#undef clGetGLObjectInfo
+#define clGetGLObjectInfo clGetGLObjectInfo_fn
+inline cl_int clGetGLObjectInfo(cl_mem p0, cl_gl_object_type* p1, cl_GLuint* p2) { return clGetGLObjectInfo_pfn(p0, p1, p2); }
+#undef clGetGLTextureInfo
+#define clGetGLTextureInfo clGetGLTextureInfo_fn
+inline cl_int clGetGLTextureInfo(cl_mem p0, cl_gl_texture_info p1, size_t p2, void* p3, size_t* p4) { return clGetGLTextureInfo_pfn(p0, p1, p2, p3, p4); }
+
+#endif // cl_khr_gl_sharing
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clblas.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clblas.hpp
new file mode 100644
index 0000000..ccddf8f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clblas.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
+
+#ifdef HAVE_CLAMDBLAS // without HAVE_CLAMDBLAS this header only defines its include guard
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_clblas.hpp"
+
+#endif // HAVE_CLAMDBLAS
+
+#endif // OPENCV_CORE_OCL_RUNTIME_CLAMDBLAS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clfft.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clfft.hpp
new file mode 100644
index 0000000..7f4af5e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_clfft.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+#define OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
+// Body is compiled only when HAVE_CLAMDFFT is defined; otherwise this header declares nothing.
+#ifdef HAVE_CLAMDFFT
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_clfft.hpp"
+
+#endif // HAVE_CLAMDFFT
+
+#endif // OPENCV_CORE_OCL_RUNTIME_CLAMDFFT_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core.hpp
new file mode 100644
index 0000000..0404b31
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core.hpp
@@ -0,0 +1,84 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
+// Everything below is compiled only when HAVE_OPENCL is defined.
+#ifdef HAVE_OPENCL
+// CL_RUNTIME_EXPORT (kept as-is if already defined): __declspec(dllimport) when a shared-build macro and a Windows macro are set and the __OPENCV_BUILD + OPENCV_MODULE_IS_PART_OF_WORLD pair is not; empty otherwise.
+#ifndef CL_RUNTIME_EXPORT
+#if (defined(BUILD_SHARED_LIBS) || defined(OPENCV_CORE_SHARED)) && (defined _WIN32 || defined WINCE) && \
+    !(defined(__OPENCV_BUILD) && defined(OPENCV_MODULE_IS_PART_OF_WORLD))
+#define CL_RUNTIME_EXPORT __declspec(dllimport)
+#else
+#define CL_RUNTIME_EXPORT
+#endif // dllimport condition
+#endif // CL_RUNTIME_EXPORT
+// With HAVE_OPENCL_SVM, each SVM API name is mapped to an underscore-suffixed name before the autogenerated header is included.
+#ifdef HAVE_OPENCL_SVM
+#define clSVMAlloc clSVMAlloc_
+#define clSVMFree clSVMFree_
+#define clSetKernelArgSVMPointer clSetKernelArgSVMPointer_
+#define clSetKernelExecInfo clSetKernelExecInfo_
+#define clEnqueueSVMFree clEnqueueSVMFree_
+#define clEnqueueSVMMemcpy clEnqueueSVMMemcpy_
+#define clEnqueueSVMMemFill clEnqueueSVMMemFill_
+#define clEnqueueSVMMap clEnqueueSVMMap_
+#define clEnqueueSVMUnmap clEnqueueSVMUnmap_
+#endif // HAVE_OPENCL_SVM
+
+#include "autogenerated/opencl_core.hpp"
+// The two constants below are defined only if nothing earlier has already defined them.
+#ifndef CL_DEVICE_DOUBLE_FP_CONFIG
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+#endif // CL_DEVICE_DOUBLE_FP_CONFIG
+
+#ifndef CL_DEVICE_HALF_FP_CONFIG
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+#endif // CL_DEVICE_HALF_FP_CONFIG
+// Without CL_VERSION_1_2, CV_REQUIRE_OPENCL_1_2_ERROR expands to a CV_Error call; this header does not define the macro otherwise.
+#ifndef CL_VERSION_1_2
+#define CV_REQUIRE_OPENCL_1_2_ERROR CV_Error(cv::Error::OpenCLApiCallError, "OpenCV compiled without OpenCL v1.2 support, so we can't use functionality from OpenCL v1.2")
+#endif // CL_VERSION_1_2
+
+#endif // HAVE_OPENCL
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_CORE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp
new file mode 100644
index 0000000..38fcae9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_core_wrappers.hpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
+// Forwarding header: its only content is the include of autogenerated/opencl_core_wrappers.hpp.
+#include "autogenerated/opencl_core_wrappers.hpp"
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_WRAPPERS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl.hpp
new file mode 100644
index 0000000..659c7d8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl.hpp
@@ -0,0 +1,53 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
+// Body is compiled only when both HAVE_OPENCL and HAVE_OPENGL are defined; otherwise this header declares nothing.
+#if defined HAVE_OPENCL && defined HAVE_OPENGL
+
+#include "opencl_core.hpp"
+
+#include "autogenerated/opencl_gl.hpp"
+
+#endif // defined HAVE_OPENCL && defined HAVE_OPENGL
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp
new file mode 100644
index 0000000..9700004
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_gl_wrappers.hpp
@@ -0,0 +1,47 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
+// Forwarding header: its only content is the include of autogenerated/opencl_gl_wrappers.hpp.
+#include "autogenerated/opencl_gl_wrappers.hpp"
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_GL_WRAPPERS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp
new file mode 100644
index 0000000..9636b19
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_20.hpp
@@ -0,0 +1,48 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
+// Body is compiled only when HAVE_OPENCL_SVM is defined.
+#if defined(HAVE_OPENCL_SVM)
+#include "opencl_core.hpp"
+
+#include "opencl_svm_definitions.hpp"
+// Re-point each SVM API name at a *_pfn identifier; clSetKernelExecInfo and clEnqueueSVMFree are only #undef'd (their redirects are commented out).
+#undef clSVMAlloc
+#define clSVMAlloc clSVMAlloc_pfn
+#undef clSVMFree
+#define clSVMFree clSVMFree_pfn
+#undef clSetKernelArgSVMPointer
+#define clSetKernelArgSVMPointer clSetKernelArgSVMPointer_pfn
+#undef clSetKernelExecInfo
+//#define clSetKernelExecInfo clSetKernelExecInfo_pfn
+#undef clEnqueueSVMFree
+//#define clEnqueueSVMFree clEnqueueSVMFree_pfn
+#undef clEnqueueSVMMemcpy
+#define clEnqueueSVMMemcpy clEnqueueSVMMemcpy_pfn
+#undef clEnqueueSVMMemFill
+#define clEnqueueSVMMemFill clEnqueueSVMMemFill_pfn
+#undef clEnqueueSVMMap
+#define clEnqueueSVMMap clEnqueueSVMMap_pfn
+#undef clEnqueueSVMUnmap
+#define clEnqueueSVMUnmap clEnqueueSVMUnmap_pfn
+// Extern function-pointer declarations; because of the macros above, each one actually declares the corresponding *_pfn symbol.
+extern CL_RUNTIME_EXPORT void* (CL_API_CALL *clSVMAlloc)(cl_context context, cl_svm_mem_flags flags, size_t size, unsigned int alignment);
+extern CL_RUNTIME_EXPORT void (CL_API_CALL *clSVMFree)(cl_context context, void* svm_pointer);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clSetKernelArgSVMPointer)(cl_kernel kernel, cl_uint arg_index, const void* arg_value);
+//extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clSetKernelExecInfo)(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void* param_value);
+//extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMFree)(cl_command_queue command_queue, cl_uint num_svm_pointers, void* svm_pointers[],
+//        void (CL_CALLBACK *pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void* svm_pointers[], void* user_data), void* user_data,
+//        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMemcpy)(cl_command_queue command_queue, cl_bool blocking_copy, void* dst_ptr, const void* src_ptr, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMemFill)(cl_command_queue command_queue, void* svm_ptr, const void* pattern, size_t pattern_size, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMMap)(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags map_flags, void* svm_ptr, size_t size,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+extern CL_RUNTIME_EXPORT cl_int (CL_API_CALL *clEnqueueSVMUnmap)(cl_command_queue command_queue, void* svm_ptr,
+        cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event);
+
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_2_0_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp
new file mode 100644
index 0000000..97c927b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_definitions.hpp
@@ -0,0 +1,42 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
+// Active only with HAVE_OPENCL_SVM; supplies SVM typedefs and constants when CL_VERSION_2_0 is not defined.
+#if defined(HAVE_OPENCL_SVM)
+#if defined(CL_VERSION_2_0)
+
+// OpenCL 2.0 contains the SVM definitions, so nothing is declared in this branch
+
+#else
+
+typedef cl_bitfield cl_device_svm_capabilities;
+typedef cl_bitfield cl_svm_mem_flags;
+typedef cl_uint     cl_kernel_exec_info;
+
+// NOTE(review): the upstream "TODO Add real values after OpenCL 2.0 release" looks stale;
+// the fallback values below appear to match the released OpenCL 2.0 cl.h - confirm.
+//
+// The four capability bit flags are guarded together with CL_DEVICE_SVM_CAPABILITIES.
+#ifndef CL_DEVICE_SVM_CAPABILITIES
+#define CL_DEVICE_SVM_CAPABILITIES 0x1053
+
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER             (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER               (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM               (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                         (1 << 3)
+#endif // CL_DEVICE_SVM_CAPABILITIES
+
+#ifndef CL_MEM_SVM_FINE_GRAIN_BUFFER
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10)
+#endif // CL_MEM_SVM_FINE_GRAIN_BUFFER
+
+#ifndef CL_MEM_SVM_ATOMICS
+#define CL_MEM_SVM_ATOMICS (1 << 11)
+#endif // CL_MEM_SVM_ATOMICS
+
+
+#endif // CL_VERSION_2_0
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_DEFINITIONS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp
new file mode 100644
index 0000000..497bc3d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp
@@ -0,0 +1,166 @@
+/* See LICENSE file in the root OpenCV directory */
+
+#ifndef OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
+#define OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
+// Body is compiled only when HAVE_OPENCL_SVM is defined.
+#if defined(HAVE_OPENCL_SVM)
+#include "opencl_core.hpp"
+// The AMD-suffixed SVM declarations below are skipped when CL_DEVICE_SVM_CAPABILITIES_AMD is already defined.
+#ifndef CL_DEVICE_SVM_CAPABILITIES_AMD
+//
+//  Part of the file is an extract from the cl_ext.h file from AMD APP SDK package.
+//  Below is the original copyright.
+//
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*******************************************
+ * Shared Virtual Memory (SVM) extension
+ *******************************************/
+typedef cl_bitfield                      cl_device_svm_capabilities_amd;
+typedef cl_bitfield                      cl_svm_mem_flags_amd;
+typedef cl_uint                          cl_kernel_exec_info_amd;
+
+/* cl_device_info */
+#define CL_DEVICE_SVM_CAPABILITIES_AMD                     0x1053
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT_AMD  0x1054
+
+/* cl_device_svm_capabilities_amd */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD             (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_AMD               (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_AMD               (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_AMD                         (1 << 3)
+
+/* cl_svm_mem_flags_amd */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_AMD                  (1 << 10)
+#define CL_MEM_SVM_ATOMICS_AMD                            (1 << 11)
+
+/* cl_mem_info */
+#define CL_MEM_USES_SVM_POINTER_AMD                       0x1109
+
+/* cl_kernel_exec_info_amd */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_AMD                  0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_AMD     0x11B7
+
+/* cl_command_type */
+#define CL_COMMAND_SVM_FREE_AMD                           0x1209
+#define CL_COMMAND_SVM_MEMCPY_AMD                         0x120A
+#define CL_COMMAND_SVM_MEMFILL_AMD                        0x120B
+#define CL_COMMAND_SVM_MAP_AMD                            0x120C
+#define CL_COMMAND_SVM_UNMAP_AMD                          0x120D
+// Function-pointer typedefs (*_fn) for the AMD-suffixed SVM entry points; this header only names the types.
+typedef CL_API_ENTRY void*
+(CL_API_CALL * clSVMAllocAMD_fn)(
+    cl_context            /* context */,
+    cl_svm_mem_flags_amd  /* flags */,
+    size_t                /* size */,
+    unsigned int          /* alignment */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY void
+(CL_API_CALL * clSVMFreeAMD_fn)(
+    cl_context  /* context */,
+    void*       /* svm_pointer */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMFreeAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_uint          /* num_svm_pointers */,
+    void**           /* svm_pointers */,
+    void (CL_CALLBACK *)( /*pfn_free_func*/
+        cl_command_queue /* queue */,
+        cl_uint          /* num_svm_pointers */,
+        void**           /* svm_pointers */,
+        void*            /* user_data */),
+    void*             /* user_data */,
+    cl_uint           /* num_events_in_wait_list */,
+    const cl_event*   /* event_wait_list */,
+    cl_event*         /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMemcpyAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_bool          /* blocking_copy */,
+    void*            /* dst_ptr */,
+    const void*      /* src_ptr */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMemFillAMD_fn)(
+    cl_command_queue /* command_queue */,
+    void*            /* svm_ptr */,
+    const void*      /* pattern */,
+    size_t           /* pattern_size */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMMapAMD_fn)(
+    cl_command_queue /* command_queue */,
+    cl_bool          /* blocking_map */,
+    cl_map_flags     /* map_flags */,
+    void*            /* svm_ptr */,
+    size_t           /* size */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clEnqueueSVMUnmapAMD_fn)(
+    cl_command_queue /* command_queue */,
+    void*            /* svm_ptr */,
+    cl_uint          /* num_events_in_wait_list */,
+    const cl_event*  /* event_wait_list */,
+    cl_event*        /* event */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clSetKernelArgSVMPointerAMD_fn)(
+    cl_kernel     /* kernel */,
+    cl_uint       /* arg_index */,
+    const void *  /* arg_value */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clSetKernelExecInfoAMD_fn)(
+     cl_kernel                /* kernel */,
+     cl_kernel_exec_info_amd  /* param_name */,
+     size_t                   /* param_value_size */,
+     const void *             /* param_value */
+) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif // CL_DEVICE_SVM_CAPABILITIES_AMD
+
+#endif // HAVE_OPENCL_SVM
+
+#endif // OPENCV_CORE_OCL_RUNTIME_OPENCL_SVM_HSA_EXTENSION_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opengl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opengl.hpp
new file mode 100644
index 0000000..a311ce2
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/opengl.hpp
@@ -0,0 +1,725 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OPENGL_HPP
+#define OPENCV_CORE_OPENGL_HPP
+
+#ifndef __cplusplus
+#  error opengl.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "ocl.hpp"
+
+namespace cv { namespace ogl {
+
+/** @addtogroup core_opengl
+This section describes OpenGL interoperability.
+
+To enable OpenGL support, configure OpenCV using CMake with WITH_OPENGL=ON . Currently OpenGL is
+supported only with WIN32, GTK and Qt backends on Windows and Linux (MacOS and Android are not
+supported). For GTK backend gtkglext-1.0 library is required.
+
+To use OpenGL functionality you should first create OpenGL context (window or frame buffer). You can
+do this with namedWindow function or with other OpenGL toolkit (GLUT, for example).
+*/
+//! @{
+
+/////////////////// OpenGL Objects ///////////////////
+
+/** @brief Smart pointer for OpenGL buffer object with reference counting.
+
+Buffer Objects are OpenGL objects that store an array of unformatted memory allocated by the OpenGL
+context. These can be used to store vertex data, pixel data retrieved from images or the
+framebuffer, and a variety of other things.
+
+ogl::Buffer has interface similar with Mat interface and represents 2D array memory.
+
+ogl::Buffer supports memory transfers between host and device and also can be mapped to CUDA memory.
+ */
+class CV_EXPORTS Buffer
+{
+public:
+    /** @brief The target defines how you intend to use the buffer object.
+    */
+    enum Target
+    {
+        ARRAY_BUFFER         = 0x8892, //!< The buffer will be used as a source for vertex data
+        ELEMENT_ARRAY_BUFFER = 0x8893, //!< The buffer will be used for indices (in glDrawElements, for example)
+        PIXEL_PACK_BUFFER    = 0x88EB, //!< The buffer will be used for reading from OpenGL textures
+        PIXEL_UNPACK_BUFFER  = 0x88EC  //!< The buffer will be used for writing to OpenGL textures
+    };
+
+    enum Access
+    {
+        READ_ONLY  = 0x88B8, //!< The mapped data store may only be read from (see mapHost)
+        WRITE_ONLY = 0x88B9, //!< The mapped data store may only be written to (see mapHost)
+        READ_WRITE = 0x88BA  //!< The mapped data store may be both read from and written to (see mapHost)
+    };
+
+    /** @brief The constructors.
+
+    Creates empty ogl::Buffer object, creates ogl::Buffer object from an existing buffer ( abufId
+    parameter), allocates memory for ogl::Buffer object or copies from host/device memory.
+     */
+    Buffer();
+
+    /** @overload
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param abufId Buffer object name.
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param abufId Buffer object name.
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(Size asize, int atype, unsigned int abufId, bool autoRelease = false);
+
+    /** @overload
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Buffer(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or std::vector ).
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    explicit Buffer(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Allocates memory for ogl::Buffer object.
+
+    @param arows Number of rows in a 2D array.
+    @param acols Number of columns in a 2D array.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param atype Array type ( CV_8UC1, ..., CV_64FC4 ). See Mat for details.
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Decrements the reference counter and destroys the buffer object if needed.
+
+    The function will call setAutoRelease(true) .
+     */
+    void release();
+
+    /** @brief Sets auto release mode.
+
+    The lifetime of the OpenGL object is tied to the lifetime of the context. If OpenGL context was
+    bound to a window it could be released at any time (user can close a window). If object's destructor
+    is called after destruction of the context it will cause an error. Thus ogl::Buffer doesn't destroy
+    OpenGL object in destructor by default (all OpenGL resources will be released with OpenGL context).
+    This function can force ogl::Buffer destructor to destroy OpenGL object.
+    @param flag Auto release mode (if true, release will be called in object's destructor).
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Copies from host/device memory to OpenGL buffer.
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or std::vector ).
+    @param target Buffer usage. See cv::ogl::Buffer::Target .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @overload */
+    void copyFrom(InputArray arr, cuda::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);
+
+    /** @brief Copies from OpenGL buffer to host/device memory or another OpenGL buffer object.
+
+    @param arr Destination array (host or device memory, can be Mat , cuda::GpuMat , std::vector or
+    ogl::Buffer ).
+     */
+    void copyTo(OutputArray arr) const;
+
+    /** @overload */
+    void copyTo(OutputArray arr, cuda::Stream& stream) const;
+
+    /** @brief Creates a full copy of the buffer object and the underlying data.
+
+    @param target Buffer usage for destination buffer.
+    @param autoRelease Auto release mode for destination buffer.
+     */
+    Buffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+
+    /** @brief Binds OpenGL buffer to the specified buffer binding point.
+
+    @param target Binding point. See cv::ogl::Buffer::Target .
+     */
+    void bind(Target target) const;
+
+    /** @brief Unbind any buffers from the specified binding point.
+
+    @param target Binding point. See cv::ogl::Buffer::Target .
+     */
+    static void unbind(Target target);
+
+    /** @brief Maps OpenGL buffer to host memory.
+
+    mapHost maps to the client's address space the entire data store of the buffer object. The data can
+    then be directly read and/or written relative to the returned pointer, depending on the specified
+    access policy.
+
+    A mapped data store must be unmapped with ogl::Buffer::unmapHost before its buffer object is used.
+
+    This operation can lead to memory transfers between host and device.
+
+    Only one buffer object can be mapped at a time.
+    @param access Access policy, indicating whether it will be possible to read from, write to, or both
+    read from and write to the buffer object's mapped data store. The symbolic constant must be
+    ogl::Buffer::READ_ONLY , ogl::Buffer::WRITE_ONLY or ogl::Buffer::READ_WRITE .
+     */
+    Mat mapHost(Access access);
+
+    /** @brief Unmaps OpenGL buffer.
+    */
+    void unmapHost();
+
+    //! map to device memory (blocking)
+    cuda::GpuMat mapDevice();
+    void unmapDevice(); //!< unmap from device memory; stream-less counterpart of mapDevice()
+
+    /** @brief Maps OpenGL buffer to CUDA device memory.
+
+    This operation doesn't copy data. Several buffer objects can be mapped to CUDA memory at a time.
+
+    A mapped data store must be unmapped with ogl::Buffer::unmapDevice before its buffer object is used.
+     */
+    cuda::GpuMat mapDevice(cuda::Stream& stream);
+
+    /** @brief Unmaps OpenGL buffer.
+    */
+    void unmapDevice(cuda::Stream& stream);
+
+    int rows() const;   //!< returns the stored row count (rows_)
+    int cols() const;   //!< returns the stored column count (cols_)
+    Size size() const;  //!< returns Size(cols_, rows_), i.e. width x height
+    bool empty() const; //!< true when the row count or the column count is zero
+
+    int type() const;      //!< returns the stored array type (type_)
+    int depth() const;     //!< returns CV_MAT_DEPTH of the stored type
+    int channels() const;  //!< returns CV_MAT_CN of the stored type
+    int elemSize() const;  //!< returns CV_ELEM_SIZE of the stored type
+    int elemSize1() const; //!< returns CV_ELEM_SIZE1 of the stored type
+
+    //! get OpenGL object id
+    unsigned int bufId() const;
+
+    class Impl; //!< forward declaration of the type held through impl_
+
+private:
+    Ptr<Impl> impl_;
+    int rows_; //!< value reported by rows()
+    int cols_; //!< value reported by cols()
+    int type_; //!< value reported by type()
+};
+
+/** @brief Smart pointer for OpenGL 2D texture memory with reference counting.
+ */
+class CV_EXPORTS Texture2D
+{
+public:
+    /** @brief An Image Format describes the way that the images in Textures store their data.
+    */
+    enum Format
+    {
+        NONE            = 0,
+        DEPTH_COMPONENT = 0x1902, //!< Depth
+        RGB             = 0x1907, //!< Red, Green, Blue
+        RGBA            = 0x1908  //!< Red, Green, Blue, Alpha
+    };
+
+    /** @brief The constructors.
+
+    Creates empty ogl::Texture2D object, allocates memory for ogl::Texture2D object or copies from
+    host/device memory.
+     */
+    Texture2D();
+
+    /** @overload */
+    Texture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease = false);
+
+    /** @overload */
+    Texture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease = false);
+
+    /** @overload
+    @param arows Number of rows.
+    @param acols Number of columns.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Texture2D(int arows, int acols, Format aformat, bool autoRelease = false);
+
+    /** @overload
+    @param asize 2D array size.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    Texture2D(Size asize, Format aformat, bool autoRelease = false);
+
+    /** @overload
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or ogl::Buffer ).
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    explicit Texture2D(InputArray arr, bool autoRelease = false);
+
+    /** @brief Allocates memory for ogl::Texture2D object.
+
+    @param arows Number of rows.
+    @param acols Number of columns.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void create(int arows, int acols, Format aformat, bool autoRelease = false);
+    /** @overload
+    @param asize 2D array size.
+    @param aformat Image format. See cv::ogl::Texture2D::Format .
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+    */
+    void create(Size asize, Format aformat, bool autoRelease = false);
+
+    /** @brief Decrements the reference counter and destroys the texture object if needed.
+
+    The function will call setAutoRelease(true) .
+     */
+    void release();
+
+    /** @brief Sets auto release mode.
+
+    @param flag Auto release mode (if true, release will be called in object's destructor).
+
+    The lifetime of the OpenGL object is tied to the lifetime of the context. If OpenGL context was
+    bound to a window it could be released at any time (user can close a window). If object's destructor
+    is called after destruction of the context it will cause an error. Thus ogl::Texture2D doesn't
+    destroy OpenGL object in destructor by default (all OpenGL resources will be released with OpenGL
+    context). This function can force ogl::Texture2D destructor to destroy OpenGL object.
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Copies from host/device memory to OpenGL texture.
+
+    @param arr Input array (host or device memory, it can be Mat , cuda::GpuMat or ogl::Buffer ).
+    @param autoRelease Auto release mode (if true, release will be called in object's destructor).
+     */
+    void copyFrom(InputArray arr, bool autoRelease = false);
+
+    /** @brief Copies from OpenGL texture to host/device memory or another OpenGL texture object.
+
+    @param arr Destination array (host or device memory, can be Mat , cuda::GpuMat , ogl::Buffer or
+    ogl::Texture2D ).
+    @param ddepth Destination depth.
+    @param autoRelease Auto release mode for destination buffer (if arr is OpenGL buffer or texture).
+     */
+    void copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const;
+
+    /** @brief Binds texture to current active texture unit for GL_TEXTURE_2D target.
+    */
+    void bind() const;
+
+    int rows() const;   //!< returns the stored row count (rows_)
+    int cols() const;   //!< returns the stored column count (cols_)
+    Size size() const;  //!< returns Size(cols_, rows_), i.e. width x height
+    bool empty() const; //!< true when the row count or the column count is zero
+
+    Format format() const; //!< returns the stored image format (format_)
+
+    //! get OpenGL object id
+    unsigned int texId() const;
+
+    class Impl; //!< forward declaration of the type held through impl_
+
+private:
+    Ptr<Impl> impl_;
+    int rows_;      //!< value reported by rows()
+    int cols_;      //!< value reported by cols()
+    Format format_; //!< value reported by format()
+};
+
+/** @brief Wrapper for OpenGL Client-Side Vertex arrays.
+
+ogl::Arrays stores vertex data in ogl::Buffer objects.
+ */
+class CV_EXPORTS Arrays
+{
+public:
+    /** @brief Default constructor
+     */
+    Arrays();
+
+    /** @brief Sets an array of vertex coordinates.
+    @param vertex array with vertex coordinates, can be both host and device memory.
+    */
+    void setVertexArray(InputArray vertex);
+
+    /** @brief Resets vertex coordinates.
+    */
+    void resetVertexArray();
+
+    /** @brief Sets an array of vertex colors.
+    @param color array with vertex colors, can be both host and device memory.
+     */
+    void setColorArray(InputArray color);
+
+    /** @brief Resets vertex colors.
+    */
+    void resetColorArray();
+
+    /** @brief Sets an array of vertex normals.
+    @param normal array with vertex normals, can be both host and device memory.
+     */
+    void setNormalArray(InputArray normal);
+
+    /** @brief Resets vertex normals.
+    */
+    void resetNormalArray();
+
+    /** @brief Sets an array of vertex texture coordinates.
+    @param texCoord array with vertex texture coordinates, can be both host and device memory.
+     */
+    void setTexCoordArray(InputArray texCoord);
+
+    /** @brief Resets vertex texture coordinates.
+    */
+    void resetTexCoordArray();
+
+    /** @brief Releases all inner buffers.
+    */
+    void release();
+
+    /** @brief Sets auto release mode for all inner buffers.
+    @param flag Auto release mode.
+     */
+    void setAutoRelease(bool flag);
+
+    /** @brief Binds all vertex arrays.
+    */
+    void bind() const;
+
+    /** @brief Returns the vertex count.
+    */
+    int size() const;
+    bool empty() const; //!< true when the vertex count is zero
+
+private:
+    int size_;        //!< vertex count reported by size(); the default constructor sets it to 0
+    Buffer vertex_;
+    Buffer color_;
+    Buffer normal_;
+    Buffer texCoord_;
+};
+
+/////////////////// Render Functions ///////////////////
+
+//! render mode for the mode argument of cv::ogl::render(); NOTE(review): values appear to mirror the OpenGL primitive-type constants (GL_POINTS .. GL_POLYGON) -- confirm against the GL headers
+enum RenderModes {
+    POINTS         = 0x0000,
+    LINES          = 0x0001,
+    LINE_LOOP      = 0x0002,
+    LINE_STRIP     = 0x0003,
+    TRIANGLES      = 0x0004,
+    TRIANGLE_STRIP = 0x0005,
+    TRIANGLE_FAN   = 0x0006,
+    QUADS          = 0x0007,
+    QUAD_STRIP     = 0x0008,
+    POLYGON        = 0x0009
+};
+
+/** @brief Render OpenGL texture or primitives.
+@param tex Texture to draw.
+@param wndRect Region of the window in which to draw the texture (normalized coordinates; the default is the rectangle (0, 0, 1, 1)).
+@param texRect Region of the texture to draw (normalized coordinates; the default is the rectangle (0, 0, 1, 1)).
+ */
+CV_EXPORTS void render(const Texture2D& tex,
+    Rect_<double> wndRect = Rect_<double>(0.0, 0.0, 1.0, 1.0),
+    Rect_<double> texRect = Rect_<double>(0.0, 0.0, 1.0, 1.0));
+
+/** @overload
+@param arr Arrays object holding the vertices of the primitives.
+@param mode Render mode. One of cv::ogl::RenderModes (default POINTS).
+@param color Color for all vertices. Used if arr doesn't contain a color array (default Scalar::all(255)).
+*/
+CV_EXPORTS void render(const Arrays& arr, int mode = POINTS, Scalar color = Scalar::all(255));
+
+/** @overload
+@param arr Arrays object holding the vertices of the primitives.
+@param indices Array of vertex indices (host or device memory).
+@param mode Render mode. One of cv::ogl::RenderModes (default POINTS).
+@param color Color for all vertices. Used if arr doesn't contain a color array (default Scalar::all(255)).
+*/
+CV_EXPORTS void render(const Arrays& arr, InputArray indices, int mode = POINTS, Scalar color = Scalar::all(255));
+
+/////////////////// CL-GL Interoperability Functions ///////////////////
+
+namespace ocl {
+using namespace cv::ocl; // lets the declarations below name cv::ocl types (e.g. Context) unqualified
+
+// TODO static functions in the Context class
+/** @brief Creates OpenCL context from GL.
+@return Returns a reference to the OpenCL Context
+ */
+CV_EXPORTS Context& initializeContextFromGL();
+
+} // namespace cv::ogl::ocl
+
+/** @brief Converts InputArray to Texture2D object.
+@param src     - source InputArray.
+@param texture - destination Texture2D object (taken by non-const reference).
+ */
+CV_EXPORTS void convertToGLTexture2D(InputArray src, Texture2D& texture);
+
+/** @brief Converts Texture2D object to OutputArray.
+@param texture - source Texture2D object (taken by const reference).
+@param dst     - destination OutputArray.
+ */
+CV_EXPORTS void convertFromGLTexture2D(const Texture2D& texture, OutputArray dst);
+
+/** @brief Maps Buffer object to process on CL side (convert to UMat).
+
+The function creates a CL buffer from the GL one, and then constructs a UMat that can be used
+to process the buffer data with OpenCV functions. Note that in the current implementation a
+UMat constructed this way doesn't own the corresponding GL buffer object, so it is
+the user's responsibility to close down the CL/GL buffer relationship by explicitly
+calling the unmapGLBuffer() function.
+@param buffer      - source Buffer object.
+@param accessFlags - data access flags (ACCESS_READ|ACCESS_WRITE, which is also the default).
+@return Returns UMat object
+ */
+CV_EXPORTS UMat mapGLBuffer(const Buffer& buffer, AccessFlag accessFlags = ACCESS_READ | ACCESS_WRITE);
+
+/** @brief Unmaps Buffer object (releases UMat, previously mapped from Buffer).
+
+This function must be called explicitly by the user for each UMat previously constructed
+by a call to the mapGLBuffer() function.
+@param u           - source UMat, created by mapGLBuffer().
+ */
+CV_EXPORTS void unmapGLBuffer(UMat& u);
+
+//! @}
+}} // namespace cv::ogl
+
+namespace cv { namespace cuda {
+
+/** @brief Sets a CUDA device and initializes it for the current thread with OpenGL interoperability.
+
+This function should be explicitly called after OpenGL context creation and before any CUDA calls.
+@param device System index of the CUDA device, counted from 0 (the default is 0).
+@ingroup core_opengl
+ */
+CV_EXPORTS void setGlDevice(int device = 0);
+
+}}
+
+//! @cond IGNORED
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+// Allocating constructor: starts from zeroed rows_/cols_/type_ and hands the request to create().
+inline cv::ogl::Buffer::Buffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    this->create(arows, acols, atype, target, autoRelease);
+}
+
+// Size-based allocating constructor: zeroed rows_/cols_/type_, then create(Size, ...) does the work.
+inline cv::ogl::Buffer::Buffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)
+{
+    this->create(asize, atype, target, autoRelease);
+}
+
+inline void cv::ogl::Buffer::create(Size asize, int atype, Target target, bool autoRelease)
+{
+    // Unpack the Size for the (rows, cols) overload: height supplies rows, width supplies cols.
+    create(asize.height, asize.width, atype, target, autoRelease);
+}
+
+// Accessor for the stored row count.
+inline int cv::ogl::Buffer::rows() const
+{
+    return this->rows_;
+}
+
+// Accessor for the stored column count.
+inline int cv::ogl::Buffer::cols() const
+{
+    return this->cols_;
+}
+
+inline cv::Size cv::ogl::Buffer::size() const
+{
+    Size dims(cols_, rows_);  // columns first, rows second
+    return dims;
+}
+
+// A buffer counts as empty as soon as either dimension is zero.
+inline bool cv::ogl::Buffer::empty() const
+{
+    return !(rows_ != 0 && cols_ != 0);
+}
+
+// Accessor for the stored array type.
+inline int cv::ogl::Buffer::type() const
+{
+    return this->type_;
+}
+
+inline int cv::ogl::Buffer::depth() const
+{
+    const int storedType = type_;
+    return CV_MAT_DEPTH(storedType);  // depth part of the stored type
+}
+
+inline int cv::ogl::Buffer::channels() const
+{
+    const int storedType = type_;
+    return CV_MAT_CN(storedType);  // channel count of the stored type
+}
+
+inline int cv::ogl::Buffer::elemSize() const
+{
+    const int storedType = type_;
+    return CV_ELEM_SIZE(storedType);  // CV_ELEM_SIZE applied to the stored type
+}
+
+inline int cv::ogl::Buffer::elemSize1() const
+{
+    const int storedType = type_;
+    return CV_ELEM_SIZE1(storedType);  // CV_ELEM_SIZE1 applied to the stored type
+}
+
+///////
+
+// Allocating constructor: starts from zeroed rows_/cols_ and format NONE, then defers to create().
+inline cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    this->create(arows, acols, aformat, autoRelease);
+}
+
+// Size-based allocating constructor: zeroed rows_/cols_, format NONE, then create(Size, ...).
+inline cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)
+{
+    this->create(asize, aformat, autoRelease);
+}
+
+inline void cv::ogl::Texture2D::create(Size asize, Format aformat, bool autoRelease)
+{
+    // Unpack the Size for the (rows, cols) overload: height supplies rows, width supplies cols.
+    create(asize.height, asize.width, aformat, autoRelease);
+}
+
+// Accessor for the stored row count.
+inline int cv::ogl::Texture2D::rows() const
+{
+    return this->rows_;
+}
+
+// Accessor for the stored column count.
+inline int cv::ogl::Texture2D::cols() const
+{
+    return this->cols_;
+}
+
+inline cv::Size cv::ogl::Texture2D::size() const
+{
+    Size dims(cols_, rows_);  // columns first, rows second
+    return dims;
+}
+
+// A texture counts as empty as soon as either dimension is zero.
+inline bool cv::ogl::Texture2D::empty() const
+{
+    return !(rows_ != 0 && cols_ != 0);
+}
+
+// Accessor for the stored image format.
+inline cv::ogl::Texture2D::Format cv::ogl::Texture2D::format() const
+{
+    return this->format_;
+}
+
+///////
+
+// Default constructor: the vertex count starts at zero.
+inline cv::ogl::Arrays::Arrays()
+    : size_(0)
+{}
+
+// Accessor for the stored vertex count.
+inline int cv::ogl::Arrays::size() const
+{
+    return this->size_;
+}
+
+// Empty means the stored vertex count is zero.
+inline bool cv::ogl::Arrays::empty() const
+{
+    return 0 == this->size_;
+}
+
+//! @endcond
+
+#endif /* OPENCV_CORE_OPENGL_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/operations.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/operations.hpp
new file mode 100644
index 0000000..43a9eb8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/operations.hpp
@@ -0,0 +1,610 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_OPERATIONS_HPP
+#define OPENCV_CORE_OPERATIONS_HPP
+
+#ifndef __cplusplus
+#  error operations.hpp header must be compiled as C++
+#endif
+
+#include <cstdio>
+
+#if defined(__GNUC__) || defined(__clang__) // at least GCC 3.1+, clang 3.5+
+#  if defined(__MINGW_PRINTF_FORMAT)  // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#    define CV_FORMAT_PRINTF(string_idx, first_to_check) __attribute__ ((format (__MINGW_PRINTF_FORMAT, string_idx, first_to_check)))
+#  else
+#    define CV_FORMAT_PRINTF(string_idx, first_to_check) __attribute__ ((format (printf, string_idx, first_to_check)))
+#  endif
+#else
+#  define CV_FORMAT_PRINTF(A, B)
+#endif
+
+//! @cond IGNORED
+
+namespace cv
+{
+
+////////////////////////////// Matx methods depending on core API /////////////////////////////
+
+namespace internal
+{
+
+template<typename _Tp, int m, int n> struct Matx_FastInvOp
+{
+    // Primary template: no special-case code here, the arguments are forwarded to
+    // invert() and a non-zero result from it is reported as true.
+    bool operator()(const Matx<_Tp, m, n>& a, Matx<_Tp, n, m>& b, int method) const
+    { return 0 != invert(a, b, method); }
+};
+
+template<typename _Tp, int m> struct Matx_FastInvOp<_Tp, m, m>
+{
+    // Square specialisation: DECOMP_LU / DECOMP_CHOLESKY call LU() / Cholesky() directly on
+    // the Matx storage; every other method is forwarded to invert(). Returns the solver's verdict.
+    bool operator()(const Matx<_Tp, m, m>& a, Matx<_Tp, m, m>& b, int method) const
+    {
+        if (method == DECOMP_LU || method == DECOMP_CHOLESKY)
+        {
+            Matx<_Tp, m, m> temp = a;  // a is const, the solver receives a mutable copy
+            // The solver's right-hand side must be the identity. Set it explicitly rather than
+            // writing only the diagonal and relying on the caller having passed an all-zero b.
+            b = Matx<_Tp, m, m>::eye();
+
+            if (method == DECOMP_CHOLESKY)
+                return Cholesky(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m);
+            return LU(temp.val, m*sizeof(_Tp), m, b.val, m*sizeof(_Tp), m) != 0;
+        }
+        else
+        {
+            return invert(a, b, method) != 0;
+        }
+    }
+};
+
+template<typename _Tp> struct Matx_FastInvOp<_Tp, 2, 2>
+{
+    // Closed-form 2x2 inverse built from the determinant; the method argument is ignored.
+    bool operator()(const Matx<_Tp, 2, 2>& a, Matx<_Tp, 2, 2>& b, int /*method*/) const
+    {
+        const _Tp det = (_Tp)determinant(a);
+        if (det == 0) return false;  // zero determinant: b is not written
+        const _Tp invDet = 1/det;
+        b(1,1) = a(0,0)*invDet;
+        b(0,0) = a(1,1)*invDet;
+        b(0,1) = -a(0,1)*invDet;
+        b(1,0) = -a(1,0)*invDet;
+        return true;
+    }
+};
+
+template<typename _Tp> struct Matx_FastInvOp<_Tp, 3, 3>
+{
+    // Closed-form 3x3 inverse: every entry is a 2x2 minor of a scaled by 1/det; method is ignored.
+    bool operator()(const Matx<_Tp, 3, 3>& a, Matx<_Tp, 3, 3>& b, int /*method*/) const
+    {
+        const _Tp det = (_Tp)determinant(a);
+        if (det == 0) return false;  // zero determinant: b is not written
+        const _Tp invDet = 1/det;
+        b(0,0) = (a(1,1) * a(2,2) - a(1,2) * a(2,1)) * invDet;
+        b(0,1) = (a(0,2) * a(2,1) - a(0,1) * a(2,2)) * invDet;
+        b(0,2) = (a(0,1) * a(1,2) - a(0,2) * a(1,1)) * invDet;
+
+        b(1,0) = (a(1,2) * a(2,0) - a(1,0) * a(2,2)) * invDet;
+        b(1,1) = (a(0,0) * a(2,2) - a(0,2) * a(2,0)) * invDet;
+        b(1,2) = (a(0,2) * a(1,0) - a(0,0) * a(1,2)) * invDet;
+
+        b(2,0) = (a(1,0) * a(2,1) - a(1,1) * a(2,0)) * invDet;
+        b(2,1) = (a(0,1) * a(2,0) - a(0,0) * a(2,1)) * invDet;
+        b(2,2) = (a(0,0) * a(1,1) - a(0,1) * a(1,0)) * invDet;
+        return true;
+    }
+};
+
+
+template<typename _Tp, int m, int l, int n> struct Matx_FastSolveOp
+{
+    // Primary template: no special-case code here; a, b, x and method are handed to
+    // cv::solve() unchanged and its result is returned.
+    bool operator()(const Matx<_Tp, m, l>& a, const Matx<_Tp, m, n>& b,
+                    Matx<_Tp, l, n>& x, int method) const
+    { return cv::solve(a, b, x, method); }
+};
+
+template<typename _Tp, int m, int n> struct Matx_FastSolveOp<_Tp, m, m, n>
+{
+    bool operator()(const Matx<_Tp, m, m>& a, const Matx<_Tp, m, n>& b,
+                    Matx<_Tp, m, n>& x, int method) const
+    {
+        // Anything other than LU / Cholesky takes the general cv::solve() route.
+        if (method != DECOMP_LU && method != DECOMP_CHOLESKY)
+            return cv::solve(a, b, x, method);
+
+        // Direct path: the solver gets a scratch copy of a, and x starts out as a copy of b.
+        Matx<_Tp, m, m> scratch = a;
+        x = b;
+
+        const bool useCholesky = (method == DECOMP_CHOLESKY);
+        if (useCholesky)
+            return Cholesky(scratch.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n);
+
+        return LU(scratch.val, m*sizeof(_Tp), m, x.val, n*sizeof(_Tp), n) != 0;
+    }
+};
+
+template<typename _Tp> struct Matx_FastSolveOp<_Tp, 2, 2, 1>
+{
+    // 2x2 system with a single right-hand-side column, solved with explicit determinant formulas.
+    bool operator()(const Matx<_Tp, 2, 2>& a, const Matx<_Tp, 2, 1>& b,
+                    Matx<_Tp, 2, 1>& x, int) const
+    {
+        const _Tp det = (_Tp)determinant(a);
+        if (det == 0) return false;  // zero determinant: x is not written
+        const _Tp invDet = 1/det;
+        x(0) = (b(0)*a(1,1) - b(1)*a(0,1))*invDet;
+        x(1) = (b(1)*a(0,0) - b(0)*a(1,0))*invDet;
+        return true;
+    }
+};
+
+template<typename _Tp> struct Matx_FastSolveOp<_Tp, 3, 3, 1>
+{
+    // 3x3 system with a single right-hand-side column, solved with explicit determinant formulas.
+    bool operator()(const Matx<_Tp, 3, 3>& a, const Matx<_Tp, 3, 1>& b,
+                    Matx<_Tp, 3, 1>& x, int) const
+    {
+        const _Tp det = (_Tp)determinant(a);
+        if (det == 0) return false;  // zero determinant: x is not written
+        const _Tp invDet = 1/det;
+        x(0) = invDet*(b(0)*(a(1,1)*a(2,2) - a(1,2)*a(2,1)) -
+                a(0,1)*(b(1)*a(2,2) - a(1,2)*b(2)) +
+                a(0,2)*(b(1)*a(2,1) - a(1,1)*b(2)));
+
+        x(1) = invDet*(a(0,0)*(b(1)*a(2,2) - a(1,2)*b(2)) -
+                b(0)*(a(1,0)*a(2,2) - a(1,2)*a(2,0)) +
+                a(0,2)*(a(1,0)*b(2) - b(1)*a(2,0)));
+
+        x(2) = invDet*(a(0,0)*(a(1,1)*b(2) - b(1)*a(2,1)) -
+                a(0,1)*(a(1,0)*b(2) - b(1)*a(2,0)) +
+                b(0)*(a(1,0)*a(2,1) - a(1,1)*a(2,0)));
+        return true;
+    }
+};
+
+} // internal
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::randu(_Tp a, _Tp b)
+{
+    Matx<_Tp,m,n> M;
+    cv::randu(M, Scalar(a), Scalar(b));  // a and b are wrapped in Scalar and forwarded to cv::randu
+    return M;  // the filled matrix is returned by value
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp,m,n> Matx<_Tp,m,n>::randn(_Tp a, _Tp b)
+{
+    Matx<_Tp,m,n> M;
+    cv::randn(M, Scalar(a), Scalar(b));  // a and b are wrapped in Scalar and forwarded to cv::randn
+    return M;  // the filled matrix is returned by value
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::randu(_Tp a, _Tp b)
+{
+    Vec<_Tp,cn> V;
+    cv::randu(V, Scalar(a), Scalar(b));  // a and b are wrapped in Scalar and forwarded to cv::randu
+    return V;  // the filled vector is returned by value
+}
+
+template<typename _Tp, int cn> inline
+Vec<_Tp, cn> Vec<_Tp, cn>::randn(_Tp a, _Tp b)
+{
+    Vec<_Tp,cn> V;
+    cv::randn(V, Scalar(a), Scalar(b));  // a and b are wrapped in Scalar and forwarded to cv::randn
+    return V;  // the filled vector is returned by value
+}
+
+template<typename _Tp, int m, int n> inline
+Matx<_Tp, n, m> Matx<_Tp, m, n>::inv(int method, bool *p_is_ok /*= NULL*/) const
+{
+    Matx<_Tp, n, m> inverse;  // written by the inversion functor below
+    const bool inverted = cv::internal::Matx_FastInvOp<_Tp, m, n>()(*this, inverse, method);
+    if (p_is_ok) *p_is_ok = inverted;  // the status is only reported when a pointer was supplied
+    return inverted ? inverse : Matx<_Tp, n, m>::zeros();  // all zeros when inversion failed
+}
+
+template<typename _Tp, int m, int n> template<int l> inline
+Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) const
+{
+    Matx<_Tp, n, l> solution;  // written by the solver functor below
+    const bool solved = cv::internal::Matx_FastSolveOp<_Tp, m, n, l>()(*this, rhs, solution, method);
+    return solved ? solution : Matx<_Tp, n, l>::zeros();  // all zeros when the solver reports failure
+}
+
+
+
+////////////////////////// Augmenting algebraic & logical operations //////////////////////////
+// CV_MAT_AUG_OPERATOR1: defines one "a op b" operator that evaluates cvop and returns a.
+#define CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
+    static inline A& operator op (A& a, const B& b) { cvop; return a; }
+// CV_MAT_AUG_OPERATOR: emits the operator twice, for A and for const A on the left-hand side.
+#define CV_MAT_AUG_OPERATOR(op, cvop, A, B)   \
+    CV_MAT_AUG_OPERATOR1(op, cvop, A, B)      \
+    CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
+// CV_MAT_AUG_OPERATOR_T: the same pair of overloads, each templated on _Tp.
+#define CV_MAT_AUG_OPERATOR_T(op, cvop, A, B)                   \
+    template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, A, B) \
+    template<typename _Tp> CV_MAT_AUG_OPERATOR1(op, cvop, const A, B)
+// CV_MAT_AUG_OPERATOR_TN: the same pair of overloads with a Matx<_Tp,m,n> right-hand side.
+#define CV_MAT_AUG_OPERATOR_TN(op, cvop, A)                                \
+    template<typename _Tp, int m, int n> static inline A& operator op (A& a, const Matx<_Tp,m,n>& b) { cvop; return a; } \
+    template<typename _Tp, int m, int n> static inline const A& operator op (const A& a, const Matx<_Tp,m,n>& b) { cvop; return a; }
+// operator += : cv::add() with a passed as both first input and output
+CV_MAT_AUG_OPERATOR  (+=, cv::add(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (+=, cv::add(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(+=, cv::add(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(+=, cv::add(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// operator -= : cv::subtract() with a passed as both first input and output
+CV_MAT_AUG_OPERATOR  (-=, cv::subtract(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (-=, cv::subtract(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(-=, cv::subtract(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(-=, cv::subtract(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// operator *= : cv::gemm() for matrix operands, a.convertTo(a, -1, b) for a double operand
+CV_MAT_AUG_OPERATOR  (*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat, Mat)
+CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(*=, cv::gemm(a, b, 1, Mat(), 0, a, 0), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR  (*=, a.convertTo(a, -1, b), Mat, double)
+CV_MAT_AUG_OPERATOR_T(*=, a.convertTo(a, -1, b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat)
+CV_MAT_AUG_OPERATOR_TN(*=, cv::gemm(a, Mat(b), 1, Mat(), 0, a, 0), Mat_<_Tp>)
+// operator /= : cv::divide() for matrix operands, a.convertTo(a, -1, 1./b) for a double operand
+CV_MAT_AUG_OPERATOR  (/=, cv::divide(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(/=, cv::divide(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR  (/=, a.convertTo((Mat&)a, -1, 1./b), Mat, double)
+CV_MAT_AUG_OPERATOR_T(/=, a.convertTo((Mat&)a, -1, 1./b), Mat_<_Tp>, double)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(/=, cv::divide(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// operator &= : cv::bitwise_and() with a passed as both first input and output
+CV_MAT_AUG_OPERATOR  (&=, cv::bitwise_and(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (&=, cv::bitwise_and(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(&=, cv::bitwise_and(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(&=, cv::bitwise_and(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// operator |= : cv::bitwise_or() with a passed as both first input and output
+CV_MAT_AUG_OPERATOR  (|=, cv::bitwise_or(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (|=, cv::bitwise_or(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(|=, cv::bitwise_or(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(|=, cv::bitwise_or(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// operator ^= : cv::bitwise_xor() with a passed as both first input and output
+CV_MAT_AUG_OPERATOR  (^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat, Mat)
+CV_MAT_AUG_OPERATOR  (^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat, Scalar)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Mat)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Scalar)
+CV_MAT_AUG_OPERATOR_T(^=, cv::bitwise_xor(a, b, (const Mat&)a), Mat_<_Tp>, Mat_<_Tp>)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), (const Mat&)a), Mat)
+CV_MAT_AUG_OPERATOR_TN(^=, cv::bitwise_xor(a, Mat(b), (const Mat&)a), Mat_<_Tp>)
+// The helper macros are undefined again so they do not remain visible after this point.
+#undef CV_MAT_AUG_OPERATOR_TN
+#undef CV_MAT_AUG_OPERATOR_T
+#undef CV_MAT_AUG_OPERATOR
+#undef CV_MAT_AUG_OPERATOR1
+
+
+
+///////////////////////////////////////////// SVD /////////////////////////////////////////////
+
+inline SVD::SVD() {}  // default constructor: empty body
+inline SVD::SVD( InputArray m, int flags ) { operator ()(m, flags); }  // delegates to operator()(m, flags)
+inline void SVD::solveZ( InputArray m, OutputArray _dst )
+{
+    Mat src = m.getMat();
+    SVD decomp(src, src.rows < src.cols ? SVD::FULL_UV : 0);  // FULL_UV only when rows < cols
+    _dst.create(decomp.vt.cols, 1, decomp.vt.type());
+    Mat out = _dst.getMat();
+    decomp.vt.row(decomp.vt.rows - 1).reshape(1, decomp.vt.cols).copyTo(out);  // last row of vt, as a column
+}
+
+template<typename _Tp, int m, int n, int nm> inline void
+    SVD::compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w, Matx<_Tp, m, nm>& u, Matx<_Tp, n, nm>& vt )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector.");  // compile-time check on nm
+    Mat _a(a, false), _u(u, false), _w(w, false), _vt(vt, false);  // Mat objects built from the Matx arguments with second ctor argument false
+    SVD::compute(_a, _w, _u, _vt);  // note the argument order: w before u
+    CV_Assert(_w.data == (uchar*)&w.val[0] && _u.data == (uchar*)&u.val[0] && _vt.data == (uchar*)&vt.val[0]);  // outputs must still point at the caller's Matx storage
+}
+
+template<typename _Tp, int m, int n, int nm> inline void
+SVD::compute( const Matx<_Tp, m, n>& a, Matx<_Tp, nm, 1>& w )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector.");  // compile-time check on nm
+    Mat _a(a, false), _w(w, false);  // Mat objects built from the Matx arguments with second ctor argument false
+    SVD::compute(_a, _w);  // only w is requested in this overload
+    CV_Assert(_w.data == (uchar*)&w.val[0]);  // the output must still point at the caller's Matx storage
+}
+
+template<typename _Tp, int m, int n, int nm, int nb> inline void
+SVD::backSubst( const Matx<_Tp, nm, 1>& w, const Matx<_Tp, m, nm>& u,
+                const Matx<_Tp, n, nm>& vt, const Matx<_Tp, m, nb>& rhs,
+                Matx<_Tp, n, nb>& dst )
+{
+    CV_StaticAssert( nm == MIN(m, n), "Invalid size of output vector.");  // compile-time check on nm
+    Mat _u(u, false), _w(w, false), _vt(vt, false), _rhs(rhs, false), _dst(dst, false);  // Mat objects built with second ctor argument false
+    SVD::backSubst(_w, _u, _vt, _rhs, _dst);  // forwards to the Mat-based overload
+    CV_Assert(_dst.data == (uchar*)&dst.val[0]);  // the result must still point at the caller's dst storage
+}
+
+
+
+/////////////////////////////////// Multiply-with-Carry RNG ///////////////////////////////////
+
+inline RNG::RNG()              { state = 0xffffffff; }
+inline RNG::RNG(uint64 _state) { state = _state ? _state : 0xffffffff; }
+// Conversions consume one next() value (two for double); float scales by 2^-32, double by 2^-64.
+inline RNG::operator uchar()    { return (uchar)next(); }
+inline RNG::operator schar()    { return (schar)next(); }
+inline RNG::operator ushort()   { return (ushort)next(); }
+inline RNG::operator short()    { return (short)next(); }
+inline RNG::operator int()      { return (int)next(); }
+inline RNG::operator unsigned() { return next(); }
+inline RNG::operator float()    { return next()*2.3283064365386962890625e-10f; }
+inline RNG::operator double()   { unsigned t = next(); return (((uint64)t << 32) | next()) * 5.4210108624275221700372640043497e-20; }
+// operator()(N) forwards to uniform(0, N); operator()() returns next() unchanged.
+inline unsigned RNG::operator ()(unsigned N) { return (unsigned)uniform(0,N); }
+inline unsigned RNG::operator ()()           { return next(); }
+// uniform(int): a when a == b, otherwise next() % (b - a) + a; float/double: a + u*(b - a).
+inline int    RNG::uniform(int a, int b)       { return a == b ? a : (int)(next() % (b - a) + a); }
+inline float  RNG::uniform(float a, float b)   { return ((float)*this)*(b - a) + a; }
+inline double RNG::uniform(double a, double b) { return ((double)*this)*(b - a) + a; }
+// Two generators compare equal exactly when their state members are equal.
+inline bool RNG::operator ==(const RNG& other) const { return state == other.state; }
+
+inline unsigned RNG::next()
+{
+    state = (uint64)(unsigned)state* /*CV_RNG_COEFF*/ 4164903690U + (unsigned)(state >> 32);  // low 32 bits * coefficient + high 32 bits
+    return (unsigned)state;  // the low 32 bits of the updated state are the output
+}
+
+//! draws one value of type _Tp by converting the generator returned by theRNG()
+template<typename _Tp> static inline _Tp randu()
+{
+  return static_cast<_Tp>(theRNG());
+}
+
+///////////////////////////////// Formatted string generation /////////////////////////////////
+
+/** @brief Returns a text string formatted using the printf-like expression.
+
+The function acts like sprintf but forms and returns an STL string. It can be used to form an error
+message in the Exception constructor.
+@param fmt printf-compatible formatting specifiers.
+
+**Note**:
+|Type|Specifier|
+|-|-|
+|`const char*`|`%s`|
+|`char`|`%c`|
+|`float` / `double`|`%f`,`%g`|
+|`int`, `long`, `long long`|`%d`, `%ld`, `%lld`|
+|`unsigned`, `unsigned long`, `unsigned long long`|`%u`, `%lu`, `%llu`|
+|`uint64` -> `uintmax_t`, `int64` -> `intmax_t`|`%ju`, `%jd`|
+|`size_t`|`%zu`|
+ */
+CV_EXPORTS String format( const char* fmt, ... ) CV_FORMAT_PRINTF(1, 2);
+
+///////////////////////////////// Formatted output of cv::Mat /////////////////////////////////
+
+static inline Ptr<Formatted> format(InputArray mtx, Formatter::FormatType fmt)
+{
+    const Ptr<Formatter> formatter = Formatter::get(fmt);  // formatter selected by fmt
+    return formatter->format(mtx.getMat());
+}
+
+static inline int print(Ptr<Formatted> fmtd, FILE* stream = stdout)
+{
+    // Note: the result accumulates fputs() return codes; it is not a character count.
+    int total = 0;
+    fmtd->reset();
+    const char* chunk;
+    while( (chunk = fmtd->next()) != 0 )
+        total += fputs(chunk, stream);
+    return total;
+}
+
+static inline int print(const Mat& mtx, FILE* stream = stdout)
+{
+    Ptr<Formatted> formatted = Formatter::get()->format(mtx);  // formatter from the no-argument Formatter::get()
+    return print(formatted, stream);
+}
+
+static inline int print(const UMat& mtx, FILE* stream = stdout)
+{
+    const Mat mapped = mtx.getMat(ACCESS_READ);  // Mat obtained from the UMat with read access
+    return print(Formatter::get()->format(mapped), stream);
+}
+
+template<typename _Tp> static inline int print(const std::vector<Point_<_Tp> >& vec, FILE* stream = stdout)
+{
+    const Mat points(vec);  // Mat constructed from the vector of 2D points
+    return print(Formatter::get()->format(points), stream);
+}
+
+template<typename _Tp> static inline int print(const std::vector<Point3_<_Tp> >& vec, FILE* stream = stdout)
+{
+    const Mat points(vec);  // Mat constructed from the vector of 3D points
+    return print(Formatter::get()->format(points), stream);
+}
+
+template<typename _Tp, int m, int n> static inline int print(const Matx<_Tp, m, n>& matx, FILE* stream = stdout)
+{
+    const cv::Mat asMat(matx);  // Mat constructed from the fixed-size matrix
+    return print(Formatter::get()->format(asMat), stream);
+}
+
+//! @endcond
+
+/****************************************************************************************\
+*                                  Auxiliary algorithms                                  *
+\****************************************************************************************/
+
+/** @brief Splits an element set into equivalency classes.
+
+The generic function partition implements an \f$O(N^2)\f$ algorithm for splitting a set of \f$N\f$ elements
+into one or more equivalency classes, as described in
+<http://en.wikipedia.org/wiki/Disjoint-set_data_structure> . The function returns the number of
+equivalency classes.
+@param _vec Set of elements stored as a vector.
+@param labels Output vector of labels. It contains as many elements as vec. Each label labels[i] is
+a 0-based cluster index of `vec[i]`.
+@param predicate Equivalence predicate (pointer to a boolean function of two arguments or an
+instance of the class that has the method bool operator()(const _Tp& a, const _Tp& b) ). The
+predicate returns true when the elements are certainly in the same class, and returns false if they
+may or may not be in the same class.
+@ingroup core_cluster
+*/
+template<typename _Tp, class _EqPredicate> int
+partition( const std::vector<_Tp>& _vec, std::vector<int>& labels,
+          _EqPredicate predicate=_EqPredicate())
+{
+    int i, j, N = (int)_vec.size();
+    const _Tp* vec = N > 0 ? &_vec[0] : 0; // an empty vector has no element 0 to take the address of
+
+    const int PARENT=0;
+    const int RANK=1;
+
+    std::vector<int> _nodes(N*2); // forest storage: one {PARENT, RANK} pair per element
+    int (*nodes)[2] = N > 0 ? (int(*)[2])&_nodes[0] : 0; // same guard: _nodes is empty when N == 0
+
+    // The first O(N) pass: create N single-vertex trees
+    for(i = 0; i < N; i++)
+    {
+        nodes[i][PARENT]=-1; // a negative parent marks a root
+        nodes[i][RANK] = 0;
+    }
+
+    // The main O(N^2) pass: merge connected components
+    for( i = 0; i < N; i++ )
+    {
+        int root = i;
+
+        // find the root of the tree that currently contains element i
+        while( nodes[root][PARENT] >= 0 )
+            root = nodes[root][PARENT];
+
+        for( j = 0; j < N; j++ )
+        {
+            if( i == j || !predicate(vec[i], vec[j])) // only pairs the predicate accepts are merged
+                continue;
+            int root2 = j;
+
+            while( nodes[root2][PARENT] >= 0 ) // find the root of the tree that contains element j
+                root2 = nodes[root2][PARENT];
+
+            if( root2 != root )
+            {
+                // unite both trees: the root with the smaller rank is attached under the other one
+                int rank = nodes[root][RANK], rank2 = nodes[root2][RANK];
+                if( rank > rank2 )
+                    nodes[root2][PARENT] = root;
+                else
+                {
+                    nodes[root][PARENT] = root2;
+                    nodes[root2][RANK] += rank == rank2; // rank grows only when both ranks were equal
+                    root = root2;
+                }
+                CV_Assert( nodes[root][PARENT] < 0 );
+
+                int k = j, parent;
+
+                // compress the path from element j to the merged root
+                while( (parent = nodes[k][PARENT]) >= 0 )
+                {
+                    nodes[k][PARENT] = root;
+                    k = parent;
+                }
+
+                // compress the path from element i to the merged root
+                k = i;
+                while( (parent = nodes[k][PARENT]) >= 0 )
+                {
+                    nodes[k][PARENT] = root;
+                    k = parent;
+                }
+            }
+        }
+    }
+
+    // Final O(N) pass: enumerate classes
+    labels.resize(N); // for N == 0 this leaves labels empty and 0 is returned
+    int nclasses = 0;
+
+    for( i = 0; i < N; i++ )
+    {
+        int root = i;
+        while( nodes[root][PARENT] >= 0 )
+            root = nodes[root][PARENT];
+        // re-use the rank as the class label: it is stored bit-inverted (negative) once assigned
+        if( nodes[root][RANK] >= 0 )
+            nodes[root][RANK] = ~nclasses++;
+        labels[i] = ~nodes[root][RANK]; // invert again to recover the 0-based class index
+    }
+
+    return nclasses;
+}
+
+} // cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/optim.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/optim.hpp
new file mode 100644
index 0000000..f61a2b9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/optim.hpp
@@ -0,0 +1,302 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_OPTIM_HPP
+#define OPENCV_OPTIM_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv
+{
+
+/** @addtogroup core_optim
+The algorithms in this section minimize or maximize function value within specified constraints or
+without any constraints.
+@{
+*/
+
+/** @brief Basic interface for all solvers
+ */
+class CV_EXPORTS MinProblemSolver : public Algorithm
+{
+public:
+    /** @brief Represents the function being optimized
+     */
+    class CV_EXPORTS Function
+    {
+    public:
+        virtual ~Function() {}
+        virtual int getDims() const = 0;  // pure virtual: must be provided by every concrete function
+        virtual double getGradientEps() const;  // not pure: an implementation exists outside this header
+        virtual double calc(const double* x) const = 0;  // pure virtual: must be provided by every concrete function
+        virtual void getGradient(const double* x,double* grad);  // not pure: an implementation exists outside this header
+    };
+
+    /** @brief Getter for the optimized function.
+
+    The optimized function is represented by the Function interface, which requires derived classes to
+    implement the calc(const double*) and getDims() methods to evaluate the function.
+
+    @return Smart-pointer to an object that implements Function interface - it represents the
+    function that is being optimized. It can be empty, if no function was given so far.
+     */
+    virtual Ptr<Function> getFunction() const = 0;
+
+    /** @brief Setter for the optimized function.
+
+    *It should be called at least once before the call to* minimize(), as default value is not usable.
+
+    @param f The new function to optimize.
+     */
+    virtual void setFunction(const Ptr<Function>& f) = 0;
+
+    /** @brief Getter for the previously set terminal criteria for this algorithm.
+
+    @return Deep copy of the terminal criteria used at the moment.
+     */
+    virtual TermCriteria getTermCriteria() const = 0;
+
+    /** @brief Set terminal criteria for solver.
+
+    It is *not necessary* to call this method before the first call to minimize(), as the default
+    value is sensible.
+
+    The algorithm stops when the number of function evaluations done exceeds termcrit.maxCount, when
+    the function values at the vertices of the simplex are within termcrit.epsilon range or the simplex
+    becomes so small that it can be enclosed in a box with termcrit.epsilon sides, whichever comes
+    first.
+    @param termcrit Terminal criteria to be used, represented as cv::TermCriteria structure.
+     */
+    virtual void setTermCriteria(const TermCriteria& termcrit) = 0;
+
+    /** @brief actually runs the algorithm and performs the minimization.
+
+    The sole input parameter determines the centroid of the starting simplex (roughly, it tells
+    where to start), all the others (terminal criteria, initial step, function to be minimized) are
+    supposed to be set via the setters before the call to this method or the default values (not
+    always sensible) will be used.
+
+    @param x The initial point, that will become a centroid of an initial simplex. After the algorithm
+    terminates, it will be set to the point where the algorithm stopped, the point of possible
+    minimum.
+    @return The value of a function at the point found.
+     */
+    virtual double minimize(InputOutputArray x) = 0;
+};
+
+/** @brief This class is used to perform the non-linear non-constrained minimization of a function,
+
+defined on an `n`-dimensional Euclidean space, using the **Nelder-Mead method**, also known as
+**downhill simplex method**. The basic idea about the method can be obtained from
+<http://en.wikipedia.org/wiki/Nelder-Mead_method>.
+
+It should be noted that this method, although deterministic, is rather a heuristic and therefore
+may converge to a local minimum, not necessarily a global one. It is an iterative optimization
+technique which at each step uses information about the values of the function evaluated only at
+`n+1` points, arranged as a *simplex* in `n`-dimensional space (hence the second name of the
+method). At each step a new point is chosen at which to evaluate the function, the obtained value is
+compared with the previous ones and, based on this information, the simplex changes its shape,
+slowly moving towards the local minimum. Thus this method uses *only* function values to make
+decisions, in contrast to, say, the Nonlinear Conjugate Gradient method (also implemented in optim).
+
+The algorithm stops when the number of function evaluations exceeds termcrit.maxCount, when the
+function values at the vertices of the simplex are within termcrit.epsilon range, or when the simplex
+becomes so small that it can be enclosed in a box with termcrit.epsilon sides, whichever comes first,
+for a user-defined positive integer termcrit.maxCount and positive real termcrit.epsilon.
+
+@note DownhillSolver is a derivative of the abstract interface
+cv::MinProblemSolver, which in turn is derived from the Algorithm interface and is used to
+encapsulate the functionality, common to all non-linear optimization algorithms in the optim
+module.
+
+@note term criteria should meet following condition:
+@code
+    termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && termcrit.epsilon > 0 && termcrit.maxCount > 0
+@endcode
+ */
+class CV_EXPORTS DownhillSolver : public MinProblemSolver
+{
+public:
+    /** @brief Returns the initial step that will be used in downhill simplex algorithm.
+
+    @param step Initial step that will be used in algorithm. Note, that although corresponding setter
+    accepts column-vectors as well as row-vectors, this method will return a row-vector.
+    @see DownhillSolver::setInitStep
+     */
+    virtual void getInitStep(OutputArray step) const=0;
+
+    /** @brief Sets the initial step that will be used in downhill simplex algorithm.
+
+    Step, together with initial point (given in DownhillSolver::minimize) are two `n`-dimensional
+    vectors that are used to determine the shape of initial simplex. Roughly said, initial point
+    determines the position of a simplex (it will become simplex's centroid), while step determines the
+    spread (size in each dimension) of a simplex. To be more precise, if \f$s,x_0\in\mathbb{R}^n\f$ are
+    the initial step and initial point respectively, the vertices of a simplex will be:
+    \f$v_0:=x_0-\frac{1}{2} s\f$ and \f$v_i:=x_0+s_i\f$ for \f$i=1,2,\dots,n\f$ where \f$s_i\f$ denotes
+    the projection of the initial step onto the *i*-th coordinate (the result of projection is treated
+    as the vector given by \f$s_i:=e_i\cdot\left<e_i\cdot s\right>\f$, where \f$e_i\f$ form canonical basis)
+
+    @param step Initial step that will be used in algorithm. Roughly said, it determines the spread
+    (size in each dimension) of an initial simplex.
+     */
+    virtual void setInitStep(InputArray step)=0;
+
+    /** @brief This function returns a smart pointer to the ready-to-use DownhillSolver object.
+
+    All the parameters are optional, so this procedure can be called even without parameters at
+    all. In this case, the default values will be used. As the default values for terminal criteria
+    are the only sensible ones, MinProblemSolver::setFunction() and DownhillSolver::setInitStep()
+    should be called upon the obtained object, if the respective parameters were not given to
+    create(). Otherwise, the two ways (give parameters to create() or leave them out
+    and call the MinProblemSolver::setFunction() and DownhillSolver::setInitStep()) are absolutely
+    equivalent (and will raise the same errors in the same way, should invalid input be detected).
+    @param f Pointer to the function that will be minimized, similarly to the one you submit via
+    MinProblemSolver::setFunction.
+    @param initStep Initial step, that will be used to construct the initial simplex, similarly to the one
+    you submit via DownhillSolver::setInitStep.
+    @param termcrit Terminal criteria to the algorithm, similarly to the one you submit via
+    MinProblemSolver::setTermCriteria.
+     */
+    static Ptr<DownhillSolver> create(const Ptr<MinProblemSolver::Function>& f=Ptr<MinProblemSolver::Function>(),
+                                      InputArray initStep=Mat_<double>(1,1,0.0),
+                                      TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5000,0.000001));
+};
+
+/** @brief This class is used to perform the non-linear non-constrained minimization of a function
+with known gradient,
+
+defined on an *n*-dimensional Euclidean space, using the **Nonlinear Conjugate Gradient method**.
+The implementation was done based on the beautifully clear explanatory article [An Introduction to
+the Conjugate Gradient Method Without the Agonizing
+Pain](http://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf) by Jonathan Richard
+Shewchuk. The method can be seen as an adaptation of a standard Conjugate Gradient method (see, for
+example <http://en.wikipedia.org/wiki/Conjugate_gradient_method>) for numerically solving the
+systems of linear equations.
+
+It should be noted that this method, although deterministic, is rather a heuristic method and
+therefore may converge to a local minimum, not necessarily a global one. What is even worse,
+most of its behaviour is ruled by the gradient, therefore it essentially cannot distinguish between
+local minima and maxima. Therefore, if it starts sufficiently near to a local maximum, it may
+converge to it. Another obvious restriction is that it should be possible to compute the gradient of
+the function at any point, thus it is preferable to have an analytic expression for the gradient;
+the computational burden should be borne by the user.
+
+The latter responsibility is accomplished via the getGradient method of the
+MinProblemSolver::Function interface (which represents the function being optimized). This method
+takes a point in *n*-dimensional space (the first argument is the array of coordinates of that
+point) and computes the gradient there (it should be stored in the second argument as an array).
+
+@note class ConjGradSolver thus does not add any new methods to the basic MinProblemSolver interface.
+
+@note term criteria should meet following condition:
+@code
+    termcrit.type == (TermCriteria::MAX_ITER + TermCriteria::EPS) && termcrit.epsilon > 0 && termcrit.maxCount > 0
+    // or
+    termcrit.type == TermCriteria::MAX_ITER && termcrit.maxCount > 0
+@endcode
+ */
+class CV_EXPORTS ConjGradSolver : public MinProblemSolver
+{
+public:
+    /** @brief This function returns a smart pointer to the ready-to-use ConjGradSolver object.
+
+    All the parameters are optional, so this procedure can be called even without parameters at
+    all. In this case, the default values will be used. As the default values for terminal criteria
+    are the only sensible ones, MinProblemSolver::setFunction() should be called upon the obtained
+    object, if the function was not given to create(). Otherwise, the two ways (submit it to
+    create() or leave it out and call the MinProblemSolver::setFunction()) are absolutely equivalent
+    (and will raise the same errors in the same way, should invalid input be detected).
+    @param f Pointer to the function that will be minimized, similarly to the one you submit via
+    MinProblemSolver::setFunction.
+    @param termcrit Terminal criteria to the algorithm, similarly to the one you submit via
+    MinProblemSolver::setTermCriteria.
+    */
+    static Ptr<ConjGradSolver> create(const Ptr<MinProblemSolver::Function>& f=Ptr<ConjGradSolver::Function>(),
+                                      TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5000,0.000001));
+};
+
+//! return codes for cv::solveLP() function
+enum SolveLPResult
+{
+    SOLVELP_UNBOUNDED    = -2, //!< problem is unbounded (target function can achieve arbitrarily high values)
+    SOLVELP_UNFEASIBLE    = -1, //!< problem is unfeasible (no point satisfies all the imposed constraints)
+    SOLVELP_SINGLE    = 0, //!< the target function has exactly one maximum
+    SOLVELP_MULTI    = 1 //!< the target function has multiple maxima - an arbitrary one is returned
+};
+
+/** @brief Solve given (non-integer) linear programming problem using the Simplex Algorithm (Simplex Method).
+
+What we mean here by "linear programming problem" (or LP problem, for short) can be formulated as:
+
+\f[\mbox{Maximize } c\cdot x\\
+ \mbox{Subject to:}\\
+ Ax\leq b\\
+ x\geq 0\f]
+
+Where \f$c\f$ is fixed `1`-by-`n` row-vector, \f$A\f$ is fixed `m`-by-`n` matrix, \f$b\f$ is fixed `m`-by-`1`
+column vector and \f$x\f$ is an arbitrary `n`-by-`1` column vector, which satisfies the constraints.
+
+Simplex algorithm is one of many algorithms that are designed to handle this sort of problems
+efficiently. Although it is not optimal in theoretical sense (there exist algorithms that can solve
+any problem written as above in polynomial time, while simplex method degenerates to exponential
+time for some special cases), it is well-studied, easy to implement and is shown to work well for
+real-life purposes.
+
+The particular implementation is taken almost verbatim from **Introduction to Algorithms, third
+edition** by T. H. Cormen, C. E. Leiserson, R. L. Rivest and Clifford Stein. In particular, the
+Bland's rule <http://en.wikipedia.org/wiki/Bland%27s_rule> is used to prevent cycling.
+
+@param Func This row-vector corresponds to \f$c\f$ in the LP problem formulation (see above). It should
+contain 32- or 64-bit floating point numbers. As a convenience, column-vector may be also submitted,
+in the latter case it is understood to correspond to \f$c^T\f$.
+@param Constr `m`-by-`n+1` matrix, whose rightmost column corresponds to \f$b\f$ in formulation above
+and the remaining to \f$A\f$. It should contain 32- or 64-bit floating point numbers.
+@param z The solution will be returned here as a column-vector - it corresponds to \f$x\f$ in the
+formulation above. It will contain 64-bit floating point numbers.
+@return One of cv::SolveLPResult
+ */
+CV_EXPORTS_W int solveLP(InputArray Func, InputArray Constr, OutputArray z);
+
+//! @}
+
+}// cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ovx.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ovx.hpp
new file mode 100644
index 0000000..8bb7d54
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/ovx.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2016, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+// OpenVX related definitions and declarations
+
+#pragma once
+#ifndef OPENCV_OVX_HPP
+#define OPENCV_OVX_HPP
+
+#include "cvdef.h"
+
+namespace cv
+{
+/// Check if use of OpenVX is possible
+CV_EXPORTS_W bool haveOpenVX();
+
+/// Check if use of OpenVX is enabled
+CV_EXPORTS_W bool useOpenVX();
+
+/// Enable/disable use of OpenVX
+CV_EXPORTS_W void setUseOpenVX(bool flag);
+} // namespace cv
+
+#endif // OPENCV_OVX_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
new file mode 100644
index 0000000..b172cac
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.openmp.hpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+#define OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+
+#if !defined(_OPENMP) && !defined(OPENCV_SKIP_OPENMP_PRESENSE_CHECK)
+#error "This file must be compiled with enabled OpenMP"
+#endif
+
+#include <omp.h>
+
+namespace cv { namespace parallel { namespace openmp {
+
+/** OpenMP parallel_for API implementation
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;  // requested thread count; values <= 0 mean "fall back to numThreadsMax"
+    int numThreadsMax;  // omp_get_max_threads() as sampled once in the constructor
+public:
+    ParallelForBackend()
+    {
+        numThreads = 0;  // no explicit request yet
+        numThreadsMax = omp_get_max_threads();
+    }
+
+    virtual ~ParallelForBackend() {}
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+#pragma omp parallel for schedule(dynamic) num_threads(numThreads > 0 ? numThreads : numThreadsMax)
+        for (int i = 0; i < tasks; ++i)
+            body_callback(i, i + 1, callback_data);  // each iteration is passed as the one-element range [i, i + 1)
+    }
+
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+        return omp_get_thread_num();
+    }
+
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+        return numThreads > 0  // same fallback expression as the num_threads() clause in parallel_for()
+               ? numThreads
+               : numThreadsMax;
+    }
+
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        int oldNumThreads = numThreads;
+        numThreads = nThreads;
+        // no OpenMP call is needed here: numThreads is read directly by the num_threads() clause in parallel_for()
+        return oldNumThreads;  // callers get the previous setting back
+    }
+
+    const char* getName() const CV_OVERRIDE
+    {
+        return "openmp";
+    }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_OPENMP_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
new file mode 100644
index 0000000..04b0c4c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+#define OPENCV_CORE_PARALLEL_FOR_TBB_HPP
+
+#include "opencv2/core/parallel/parallel_backend.hpp"
+#include <opencv2/core/utils/logger.hpp>
+
+#ifndef TBB_SUPPRESS_DEPRECATED_MESSAGES  // suppress warning
+#define TBB_SUPPRESS_DEPRECATED_MESSAGES 1
+#endif
+#include "tbb/tbb.h"
+#if !defined(TBB_INTERFACE_VERSION)
+#error "Unknows/unsupported TBB version"
+#endif
+
+#if TBB_INTERFACE_VERSION >= 8000
+#include "tbb/task_arena.h"
+#endif
+
+namespace cv { namespace parallel { namespace tbb {
+
+using namespace ::tbb;
+
+#if TBB_INTERFACE_VERSION >= 8000
+static tbb::task_arena& getArena()  // accessor for the single task_arena used by this backend
+{
+    static tbb::task_arena tbbArena(tbb::task_arena::automatic);  // function-local static: constructed on the first call
+    return tbbArena;
+}
+#else
+static tbb::task_scheduler_init& getScheduler()  // accessor for the single scheduler object (TBB interface < 8000 only)
+{
+    static tbb::task_scheduler_init tbbScheduler(tbb::task_scheduler_init::deferred);  // function-local static, created with the `deferred` tag; setNumThreads() calls initialize() on it
+    return tbbScheduler;
+}
+#endif
+
+/** TBB parallel_for API implementation
+ *
+ * @sa setParallelForBackend
+ * @ingroup core_parallel_backend
+ */
+class ParallelForBackend : public ParallelForAPI
+{
+protected:
+    int numThreads;  // last value passed to setNumThreads(); 0 until then
+    int numThreadsMax;  // NOTE(review): never assigned or read anywhere in this class - looks unused
+public:
+    ParallelForBackend()
+    {
+        CV_LOG_INFO(NULL, "Initializing TBB parallel backend: TBB_INTERFACE_VERSION=" << TBB_INTERFACE_VERSION);
+        numThreads = 0;
+#if TBB_INTERFACE_VERSION >= 8000
+        (void)getArena();  // touch the accessor so its function-local static is constructed now
+#else
+        (void)getScheduler();  // touch the accessor so its function-local static is constructed now
+#endif
+    }
+
+    virtual ~ParallelForBackend() {}
+
+    class CallbackProxy  // adapts the C-style body callback to the functor shapes used with TBB below
+    {
+        const FN_parallel_for_body_cb_t& callback;  // held by reference: the referenced callback must outlive the proxy
+        void* const callback_data;
+        const int tasks;
+    public:
+        inline CallbackProxy(int tasks_, FN_parallel_for_body_cb_t& callback_, void* callback_data_)
+            : callback(callback_), callback_data(callback_data_), tasks(tasks_)
+        {
+            // nothing to do: every member is set in the initializer list
+        }
+
+        void operator()(const tbb::blocked_range<int>& range) const
+        {
+            this->callback(range.begin(), range.end(), callback_data);  // forward the sub-range to the user callback
+        }
+
+        void operator()() const
+        {
+            tbb::parallel_for(tbb::blocked_range<int>(0, tasks), *this);  // split [0, tasks), using this object as the body
+        }
+    };
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) CV_OVERRIDE
+    {
+        CallbackProxy task(tasks, body_callback, callback_data);  // refers to the by-value parameter, which lives until this function returns
+#if TBB_INTERFACE_VERSION >= 8000
+        getArena().execute(task);  // run the proxy's nullary operator() through the shared arena
+#else
+        task();  // no arena is used for older TBB: call the proxy directly
+#endif
+    }
+
+    virtual int getThreadNum() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+        return tbb::this_task_arena::current_thread_index();
+#elif TBB_INTERFACE_VERSION >= 8000
+        return tbb::task_arena::current_thread_index();
+#else
+        return 0;  // no thread-index query is made for TBB interface versions below 8000
+#endif
+    }
+
+    virtual int getNumThreads() const CV_OVERRIDE
+    {
+#if TBB_INTERFACE_VERSION >= 9100
+    return getArena().max_concurrency();
+#elif TBB_INTERFACE_VERSION >= 8000
+    return numThreads > 0  // explicit request wins; otherwise report the TBB default
+        ? numThreads
+        : tbb::task_scheduler_init::default_num_threads();
+#else
+    return getScheduler().is_active()  // the scheduler is only initialized by setNumThreads() with a positive count
+           ? numThreads
+           : tbb::task_scheduler_init::default_num_threads();
+#endif
+    }
+
+    virtual int setNumThreads(int nThreads) CV_OVERRIDE
+    {
+        int oldNumThreads = numThreads;
+        numThreads = nThreads;
+
+#if TBB_INTERFACE_VERSION >= 8000
+        auto& tbbArena = getArena();
+        if (tbbArena.is_active())
+            tbbArena.terminate();  // an active arena is terminated before it is re-initialized
+        if (numThreads > 0)  // non-positive values: no explicit initialize() call is made here
+            tbbArena.initialize(numThreads);
+#else
+        auto& tbbScheduler = getScheduler();
+        if (tbbScheduler.is_active())
+            tbbScheduler.terminate();  // an active scheduler is terminated before it is re-initialized
+        if (numThreads > 0)  // non-positive values: no explicit initialize() call is made here
+            tbbScheduler.initialize(numThreads);
+#endif
+        return oldNumThreads;  // callers get the previous setting back
+    }
+
+    const char* getName() const CV_OVERRIDE
+    {
+        return "tbb";
+    }
+};
+
+}}}  // namespace
+
+#endif  // OPENCV_CORE_PARALLEL_FOR_TBB_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/parallel_backend.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/parallel_backend.hpp
new file mode 100644
index 0000000..c3e8333
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/parallel/parallel_backend.hpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_PARALLEL_BACKEND_HPP
+#define OPENCV_CORE_PARALLEL_BACKEND_HPP
+
+#include "opencv2/core/cvdef.h"
+#include <memory>
+
+namespace cv { namespace parallel {
+#ifndef CV_API_CALL
+#define CV_API_CALL
+#endif
+
+/** @addtogroup core_parallel_backend
+ * @{
+ * API below is provided to resolve the problem of CPU resource over-subscription by multiple thread pools from different multi-threading frameworks.
+ * This is a common problem when the threading framework OpenCV was compiled with differs from the one used by the user's application.
+ *
+ * Applications can replace OpenCV `parallel_for()` backend with own implementation (to reuse Application's thread pool).
+ *
+ *
+ * ### Backend API usage examples
+ *
+ * #### Intel TBB
+ *
+ * - include header with simple implementation of TBB backend:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-tbb.cpp tbb_backend
+ * - configuration of compiler/linker options is responsibility of Application's scripts
+ *
+ * #### OpenMP
+ *
+ * - include header with simple implementation of OpenMP backend:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_include
+ * - execute backend replacement code:
+ *   @snippet parallel_backend/example-openmp.cpp openmp_backend
+ * - Configuration of compiler/linker options is responsibility of Application's scripts
+ *
+ *
+ * ### Plugins support
+ *
+ * Runtime configuration options:
+ * - change backend priority: `OPENCV_PARALLEL_PRIORITY_<backend>=9999`
+ * - disable backend: `OPENCV_PARALLEL_PRIORITY_<backend>=0`
+ * - specify list of backends with high priority (>100000): `OPENCV_PARALLEL_PRIORITY_LIST=TBB,OPENMP`. Unknown backends are registered as new plugins.
+ *
+ */
+
+/** Interface for parallel_for backends implementations
+ *
+ * @sa setParallelForBackend
+ */
+class CV_EXPORTS ParallelForAPI
+{
+public:
+    virtual ~ParallelForAPI();
+
+    typedef void (CV_API_CALL *FN_parallel_for_body_cb_t)(int start, int end, void* data);  // body callback: a task range plus an opaque data pointer
+
+    virtual void parallel_for(int tasks, FN_parallel_for_body_cb_t body_callback, void* callback_data) = 0;  // the bundled OpenMP/TBB backends call body_callback on sub-ranges of [0, tasks), passing callback_data through
+
+    virtual int getThreadNum() const = 0;  // the bundled backends return the calling thread's index as reported by OpenMP/TBB
+
+    virtual int getNumThreads() const = 0;
+
+    virtual int setNumThreads(int nThreads) = 0;  // NOTE(review): the bundled OpenMP/TBB backends return the previous value - presumably the intended contract; confirm
+
+    virtual const char* getName() const = 0;  // e.g. "openmp" or "tbb" in the bundled backends
+};
+
+/** @brief Replace OpenCV parallel_for backend
+ *
+ * Application can replace OpenCV `parallel_for()` backend with own implementation.
+ *
+ * @note This call is not thread-safe. Consider calling this function from the `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS void setParallelForBackend(const std::shared_ptr<ParallelForAPI>& api, bool propagateNumThreads = true);
+
+/** @brief Change OpenCV parallel_for backend
+ *
+ * @note This call is not thread-safe. Consider calling this function from the `main()` before any other OpenCV processing functions (and without any other created threads).
+ */
+CV_EXPORTS_W bool setParallelForBackend(const std::string& backendName, bool propagateNumThreads = true);
+
+//! @}
+}}  // namespace
+#endif  // OPENCV_CORE_PARALLEL_BACKEND_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/persistence.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/persistence.hpp
new file mode 100644
index 0000000..8e135d1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/persistence.hpp
@@ -0,0 +1,1350 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_PERSISTENCE_HPP
+#define OPENCV_CORE_PERSISTENCE_HPP
+
+#ifndef CV_DOXYGEN
+/// Define to support persistence legacy formats
+#define CV__LEGACY_PERSISTENCE
+#endif
+
+#ifndef __cplusplus
+#  error persistence.hpp header must be compiled as C++
+#endif
+
+//! @addtogroup core_c
+//! @{
+
+/** @brief "black box" representation of the file storage associated with a file on disk.
+
+Several functions that are described below take CvFileStorage\* as inputs and allow the user to
+save or to load hierarchical collections that consist of scalar values, standard CXCore objects
+(such as matrices, sequences, graphs), and user-defined objects.
+
+OpenCV can read and write data in XML (<http://www.w3c.org/XML>), YAML (<http://www.yaml.org>) or
+JSON (<http://www.json.org/>) formats. Below is an example of 3x3 floating-point identity matrix A,
+stored in XML and YAML files
+using CXCore functions:
+XML:
+@code{.xml}
+    <?xml version="1.0"?>
+    <opencv_storage>
+    <A type_id="opencv-matrix">
+      <rows>3</rows>
+      <cols>3</cols>
+      <dt>f</dt>
+      <data>1. 0. 0. 0. 1. 0. 0. 0. 1.</data>
+    </A>
+    </opencv_storage>
+@endcode
+YAML:
+@code{.yaml}
+    %YAML:1.0
+    A: !!opencv-matrix
+      rows: 3
+      cols: 3
+      dt: f
+      data: [ 1., 0., 0., 0., 1., 0., 0., 0., 1.]
+@endcode
+As it can be seen from the examples, XML uses nested tags to represent hierarchy, while YAML uses
+indentation for that purpose (similar to the Python programming language).
+
+The same functions can read and write data in both formats; the particular format is determined by
+the extension of the opened file, ".xml" for XML files, ".yml" or ".yaml" for YAML and ".json" for
+JSON.
+ */
+
+//! @} core_c
+
+#include "opencv2/core/types.hpp"
+#include "opencv2/core/mat.hpp"
+
+namespace cv {
+
+/** @addtogroup core_xml
+
+XML/YAML/JSON file storages.     {#xml_storage}
+=======================
+Writing to a file storage.
+--------------------------
+You can store and then restore various OpenCV data structures to/from XML (<http://www.w3c.org/XML>),
+YAML (<http://www.yaml.org>) or JSON (<http://www.json.org/>) formats. Also, it is possible to store
+and load arbitrarily complex data structures, which include OpenCV data structures, as well as
+primitive data types (integer and floating-point numbers and text strings) as their elements.
+
+Use the following procedure to write something to XML, YAML or JSON:
+-# Create new FileStorage and open it for writing. It can be done with a single call to
+FileStorage::FileStorage constructor that takes a filename, or you can use the default constructor
+and then call FileStorage::open. Format of the file (XML, YAML or JSON) is determined from the filename
+extension (".xml", ".yml"/".yaml" and ".json", respectively)
+-# Write all the data you want using the streaming operator `<<`, just like in the case of STL
+streams.
+-# Close the file using FileStorage::release. FileStorage destructor also closes the file.
+
+Here is an example:
+@code
+    #include "opencv2/core.hpp"
+    #include <time.h>
+
+    using namespace cv;
+
+    int main(int, char** argv)
+    {
+        FileStorage fs("test.yml", FileStorage::WRITE);
+
+        fs << "frameCount" << 5;
+        time_t rawtime; time(&rawtime);
+        fs << "calibrationDate" << asctime(localtime(&rawtime));
+        Mat cameraMatrix = (Mat_<double>(3,3) << 1000, 0, 320, 0, 1000, 240, 0, 0, 1);
+        Mat distCoeffs = (Mat_<double>(5,1) << 0.1, 0.01, -0.001, 0, 0);
+        fs << "cameraMatrix" << cameraMatrix << "distCoeffs" << distCoeffs;
+        fs << "features" << "[";
+        for( int i = 0; i < 3; i++ )
+        {
+            int x = rand() % 640;
+            int y = rand() % 480;
+            uchar lbp = rand() % 256;
+
+            fs << "{:" << "x" << x << "y" << y << "lbp" << "[:";
+            for( int j = 0; j < 8; j++ )
+                fs << ((lbp >> j) & 1);
+            fs << "]" << "}";
+        }
+        fs << "]";
+        fs.release();
+        return 0;
+    }
+@endcode
+The sample above stores to YML an integer, a text string (calibration date), 2 matrices, and a custom
+structure "feature", which includes feature coordinates and LBP (local binary pattern) value. Here
+is output of the sample:
+@code{.yaml}
+%YAML:1.0
+frameCount: 5
+calibrationDate: "Fri Jun 17 14:09:29 2011\n"
+cameraMatrix: !!opencv-matrix
+   rows: 3
+   cols: 3
+   dt: d
+   data: [ 1000., 0., 320., 0., 1000., 240., 0., 0., 1. ]
+distCoeffs: !!opencv-matrix
+   rows: 5
+   cols: 1
+   dt: d
+   data: [ 1.0000000000000001e-01, 1.0000000000000000e-02,
+       -1.0000000000000000e-03, 0., 0. ]
+features:
+   - { x:167, y:49, lbp:[ 1, 0, 0, 1, 1, 0, 1, 1 ] }
+   - { x:298, y:130, lbp:[ 0, 0, 0, 1, 0, 0, 1, 1 ] }
+   - { x:344, y:158, lbp:[ 1, 1, 0, 0, 0, 0, 1, 0 ] }
+@endcode
+
+As an exercise, you can replace ".yml" with ".xml" or ".json" in the sample above and see what the
+corresponding XML or JSON file looks like.
+
+Several things can be noted by looking at the sample code and the output:
+
+-   The produced YAML (and XML/JSON) consists of heterogeneous collections that can be nested. There are
+    2 types of collections: named collections (mappings) and unnamed collections (sequences). In mappings
+    each element has a name and is accessed by name. This is similar to structures and std::map in
+    C/C++ and dictionaries in Python. In sequences elements do not have names, they are accessed by
+    indices. This is similar to arrays and std::vector in C/C++ and lists, tuples in Python.
+    "Heterogeneous" means that elements of each single collection can have different types.
+
+    Top-level collection in YAML/XML/JSON is a mapping. Each matrix is stored as a mapping, and the matrix
+    elements are stored as a sequence. Then, there is a sequence of features, where each feature is
+    represented as a mapping, with its lbp value stored in a nested sequence.
+
+-   When you write to a mapping (a structure), you write element name followed by its value. When you
+    write to a sequence, you simply write the elements one by one. OpenCV data structures (such as
+    cv::Mat) are written in absolutely the same way as simple C data structures - using `<<`
+    operator.
+
+-   To write a mapping, you first write the special string `{` to the storage, then write the
+    elements as pairs (`fs << <element_name> << <element_value>`) and then write the closing
+    `}`.
+
+-   To write a sequence, you first write the special string `[`, then write the elements, then
+    write the closing `]`.
+
+-   In YAML/JSON (but not XML), mappings and sequences can be written in a compact Python-like inline
+    form. In the sample above matrix elements, as well as each feature, including its lbp value, are
+    stored in such inline form. To store a mapping/sequence in a compact form, put `:` after the
+    opening character, e.g. use `{:` instead of `{` and `[:` instead of `[`. When the
+    data is written to XML, those extra `:` are ignored.
+
+Reading data from a file storage.
+---------------------------------
+To read the previously written XML, YAML or JSON file, do the following:
+-#  Open the file storage using FileStorage::FileStorage constructor or FileStorage::open method.
+    In the current implementation the whole file is parsed and the whole representation of file
+    storage is built in memory as a hierarchy of file nodes (see FileNode)
+
+-#  Read the data you are interested in. Use FileStorage::operator [], FileNode::operator []
+    and/or FileNodeIterator.
+
+-#  Close the storage using FileStorage::release.
+
+Here is how to read the file created by the code sample above:
+@code
+    FileStorage fs2("test.yml", FileStorage::READ);
+
+    // first method: use (type) operator on FileNode.
+    int frameCount = (int)fs2["frameCount"];
+
+    String date;
+    // second method: use FileNode::operator >>
+    fs2["calibrationDate"] >> date;
+
+    Mat cameraMatrix2, distCoeffs2;
+    fs2["cameraMatrix"] >> cameraMatrix2;
+    fs2["distCoeffs"] >> distCoeffs2;
+
+    cout << "frameCount: " << frameCount << endl
+         << "calibration date: " << date << endl
+         << "camera matrix: " << cameraMatrix2 << endl
+         << "distortion coeffs: " << distCoeffs2 << endl;
+
+    FileNode features = fs2["features"];
+    FileNodeIterator it = features.begin(), it_end = features.end();
+    int idx = 0;
+    std::vector<uchar> lbpval;
+
+    // iterate through a sequence using FileNodeIterator
+    for( ; it != it_end; ++it, idx++ )
+    {
+        cout << "feature #" << idx << ": ";
+        cout << "x=" << (int)(*it)["x"] << ", y=" << (int)(*it)["y"] << ", lbp: (";
+        // you can also easily read numerical arrays using FileNode >> std::vector operator.
+        (*it)["lbp"] >> lbpval;
+        for( int i = 0; i < (int)lbpval.size(); i++ )
+            cout << " " << (int)lbpval[i];
+        cout << ")" << endl;
+    }
+    fs2.release();
+@endcode
+
+Format specification    {#format_spec}
+--------------------
+`([count]{u|c|w|s|i|f|d})`... where the characters correspond to fundamental C++ types:
+-   `u` 8-bit unsigned number
+-   `c` 8-bit signed number
+-   `w` 16-bit unsigned number
+-   `s` 16-bit signed number
+-   `i` 32-bit signed number
+-   `f` single precision floating-point number
+-   `d` double precision floating-point number
+-   `r` pointer, 32 lower bits of which are written as a signed integer. The type can be used to
+    store structures with links between the elements.
+
+`count` is the optional counter of values of a given type. For example, `2if` means that each array
+element is a structure of 2 integers, followed by a single-precision floating-point number. The
+equivalent notations of the above specification are `iif`, `2i1f` and so forth. Other examples: `u`
+means that the array consists of bytes, and `2d` means the array consists of pairs of doubles.
+
+@see @ref samples/cpp/filestorage.cpp
+*/
+
+//! @{
+
+/** @example samples/cpp/filestorage.cpp
+A complete example using the FileStorage interface
+*/
+
+////////////////////////// XML & YAML I/O //////////////////////////
+
+class CV_EXPORTS FileNode;
+class CV_EXPORTS FileNodeIterator;
+
+/** @brief XML/YAML/JSON file storage class that encapsulates all the information necessary for writing or
+reading data to/from a file.
+ */
+class CV_EXPORTS_W FileStorage
+{
+public:
+    //! file storage mode
+    enum Mode
+    {
+        READ        = 0, //!< value, open the file for reading
+        WRITE       = 1, //!< value, open the file for writing
+        APPEND      = 2, //!< value, open the file for appending
+        MEMORY      = 4, /**< flag, read data from source or write data to the internal buffer (which is
+                              returned by FileStorage::releaseAndGetString) */
+        FORMAT_MASK = (7<<3), //!< mask for format flags
+        FORMAT_AUTO = 0,      //!< flag, auto format
+        FORMAT_XML  = (1<<3), //!< flag, XML format
+        FORMAT_YAML = (2<<3), //!< flag, YAML format
+        FORMAT_JSON = (3<<3), //!< flag, JSON format
+
+        BASE64      = 64,     //!< flag, write rawdata in Base64 by default. (consider using WRITE_BASE64)
+        WRITE_BASE64 = BASE64 | WRITE, //!< flag, enable both WRITE and BASE64
+    };
+    enum State  // NOTE(review): undocumented here; presumably the values kept in the `state` member below - confirm
+    {
+        UNDEFINED      = 0,
+        VALUE_EXPECTED = 1,
+        NAME_EXPECTED  = 2,
+        INSIDE_MAP     = 4
+    };
+
+    /** @brief The constructors.
+
+     The full constructor opens the file. Alternatively you can use the default constructor and then
+     call FileStorage::open.
+     */
+    CV_WRAP FileStorage();
+
+    /** @overload
+     @copydoc open()
+     */
+    CV_WRAP FileStorage(const String& filename, int flags, const String& encoding=String());
+
+    //! the destructor. calls release()
+    virtual ~FileStorage();
+
+    /** @brief Opens a file.
+
+     See description of parameters in FileStorage::FileStorage. The method calls FileStorage::release
+     before opening the file.
+     @param filename Name of the file to open or the text string to read the data from.
+     Extension of the file (.xml, .yml/.yaml or .json) determines its format (XML, YAML or JSON
+     respectively). Also you can append .gz to work with compressed files, for example myHugeMatrix.xml.gz. If both
+     FileStorage::WRITE and FileStorage::MEMORY flags are specified, source is used just to specify
+     the output file format (e.g. mydata.xml, .yml etc.). A file name can also contain parameters.
+     You can use this format, "*?base64" (e.g. "file.json?base64" (case sensitive)), as an alternative to
+     FileStorage::BASE64 flag.
+     @param flags Mode of operation. A FileStorage::Mode value, optionally combined with its flag members (e.g. WRITE | MEMORY).
+     @param encoding Encoding of the file. Note that UTF-16 XML encoding is not supported currently and
+     you should use 8-bit encoding instead of it.
+     */
+    CV_WRAP virtual bool open(const String& filename, int flags, const String& encoding=String());
+
+    /** @brief Checks whether the file is opened.
+
+     @returns true if the object is associated with the current file and false otherwise. It is a
+     good practice to call this method after you tried to open a file.
+     */
+    CV_WRAP virtual bool isOpened() const;
+
+    /** @brief Closes the file and releases all the memory buffers.
+
+     Call this method after all I/O operations with the storage are finished.
+     */
+    CV_WRAP virtual void release();
+
+    /** @brief Closes the file and releases all the memory buffers.
+
+     Call this method after all I/O operations with the storage are finished. If the storage was
+     opened for writing with FileStorage::MEMORY, the internal buffer is returned (see FileStorage::MEMORY).
+     */
+    CV_WRAP virtual String releaseAndGetString();
+
+    /** @brief Returns the first element of the top-level mapping.
+     @returns The first element of the top-level mapping.
+     */
+    CV_WRAP FileNode getFirstTopLevelNode() const;
+
+    /** @brief Returns the top-level mapping
+     @param streamidx Zero-based index of the stream. In most cases there is only one stream in the file.
+     However, YAML supports multiple streams and so there can be several.
+     @returns The top-level mapping.
+     */
+    CV_WRAP FileNode root(int streamidx=0) const;
+
+    /** @brief Returns the specified element of the top-level mapping.
+     @param nodename Name of the file node.
+     @returns Node with the given name.
+     */
+    FileNode operator[](const String& nodename) const;
+
+    /** @overload */
+    CV_WRAP_AS(getNode) FileNode operator[](const char* nodename) const;
+
+    /**
+     * @brief Simplified writing API to use with bindings.
+     * @param name Name of the written object. When writing to sequences (a.k.a. "arrays"), pass an empty string.
+     * @param val Value of the written object.
+     */
+    CV_WRAP void write(const String& name, int val);
+    /// @overload
+    CV_WRAP void write(const String& name, double val);
+    /// @overload
+    CV_WRAP void write(const String& name, const String& val);
+    /// @overload
+    CV_WRAP void write(const String& name, const Mat& val);
+    /// @overload
+    CV_WRAP void write(const String& name, const std::vector<String>& val);
+
+    /** @brief Writes multiple numbers.
+
+     Writes one or more numbers of the specified format to the currently written structure. Usually it is
+     more convenient to use operator `<<` instead of this method.
+     @param fmt Specification of each array element, see @ref format_spec "format specification"
+     @param vec Pointer to the written array.
+     @param len Number of the uchar elements to write.
+     */
+    void writeRaw( const String& fmt, const void* vec, size_t len );
+
+    /** @brief Writes a comment.
+
+     The function writes a comment into file storage. The comments are skipped when the storage is read.
+     @param comment The written comment, single-line or multi-line
+     @param append If true, the function tries to put the comment at the end of current line.
+     Else if the comment is multi-line, or if it does not fit at the end of the current
+     line, the comment starts a new line.
+     */
+    CV_WRAP void writeComment(const String& comment, bool append = false);
+
+    /** @brief Starts to write a nested structure (sequence or a mapping).
+    @param name name of the structure. When writing to sequences (a.k.a. "arrays"), pass an empty string.
+    @param flags type of the structure (FileNode::MAP or FileNode::SEQ (both with optional FileNode::FLOW)).
+    @param typeName optional name of the type you store. The effect of setting this depends on the storage format.
+    I.e. if the format has a specification for storing type information, this parameter is used.
+    */
+    CV_WRAP void startWriteStruct(const String& name, int flags, const String& typeName=String());
+
+    /** @brief Finishes writing nested structure (should pair startWriteStruct())
+    */
+    CV_WRAP void endWriteStruct();
+
+    /** @brief Returns the normalized object name for the specified name of a file.
+    @param filename Name of a file
+    @returns The normalized object name.
+     */
+    static String getDefaultObjectName(const String& filename);
+
+    /** @brief Returns the current format.
+     * @returns The current format, see FileStorage::Mode
+     */
+    CV_WRAP int getFormat() const;
+
+    int state;  // NOTE(review): presumably holds FileStorage::State flags - confirm
+    std::string elname;  // NOTE(review): undocumented; purpose is not visible in this part of the header
+
+    class Impl;  // forward declaration only; the implementation type is not defined in this class body
+    Ptr<Impl> p;  // handle to the implementation object
+};
+
+/** @brief File Storage Node class.
+
+The node is used to store each and every element of the file storage opened for reading. When
+XML/YAML file is read, it is first parsed and stored in the memory as a hierarchical collection of
+nodes. Each node can be a "leaf" that contains a single number or a string, or be a collection of
+other nodes. There can be named collections (mappings) where each element has a name and is
+accessed by name, and ordered collections (sequences) where elements do not have names but are
+accessed by index. The type of a file node can be determined using the FileNode::type method.
+
+Note that file nodes are only used for navigating file storages opened for reading. When a file
+storage is opened for writing, no data is stored in memory after it is written.
+ */
+class CV_EXPORTS_W_SIMPLE FileNode
+{
+public:
+    //! type of the file storage node
+    enum
+    {
+        NONE      = 0, //!< empty node
+        INT       = 1, //!< an integer
+        REAL      = 2, //!< floating-point number
+        FLOAT     = REAL, //!< synonym for REAL
+        STR       = 3, //!< text string in UTF-8 encoding
+        STRING    = STR, //!< synonym for STR
+        SEQ       = 4, //!< sequence
+        MAP       = 5, //!< mapping
+        TYPE_MASK = 7, //!< mask of the low three bits, wide enough for the type values above
+
+        FLOW      = 8,  //!< compact representation of a sequence or mapping. Used only by YAML writer
+        UNIFORM   = 8,  //!< if set, means that all the collection elements are numbers of the same type (real's or int's).
+        //!< UNIFORM is used only when reading FileStorage; FLOW is used only when writing. So they share the same bit
+        EMPTY     = 16, //!< empty structure (sequence or mapping)
+        NAMED     = 32  //!< the node has a name (i.e. it is element of a mapping).
+    };
+    /** @brief The constructors.
+
+     These constructors are used to create a default file node, construct it from obsolete structures or
+     from another file node.
+     */
+    CV_WRAP FileNode();
+
+    /** @overload
+     @param fs Pointer to the file storage structure.
+     @param blockIdx Index of the memory block where the file node is stored
+     @param ofs Offset in bytes from the beginning of the serialized storage
+
+     @deprecated
+     */
+    FileNode(const FileStorage* fs, size_t blockIdx, size_t ofs);
+
+    /** @overload
+     @param node File node to be used as initialization for the created file node.
+     */
+    FileNode(const FileNode& node);
+
+    FileNode& operator=(const FileNode& node);
+
+    /** @brief Returns element of a mapping node or a sequence node.
+     @param nodename Name of an element in the mapping node.
+     @returns Returns the element with the given identifier.
+     */
+    FileNode operator[](const String& nodename) const;
+
+    /** @overload
+     @param nodename Name of an element in the mapping node.
+     */
+    CV_WRAP_AS(getNode) FileNode operator[](const char* nodename) const;
+
+    /** @overload
+     @param i Index of an element in the sequence node.
+     */
+    CV_WRAP_AS(at) FileNode operator[](int i) const;
+
+    /** @brief Returns keys of a mapping node.
+     @returns Keys of a mapping node.
+     */
+    CV_WRAP std::vector<String> keys() const;
+
+    /** @brief Returns type of the node.
+     @returns Type of the node. See FileNode::Type
+     */
+    CV_WRAP int type() const;
+
+    //! returns true if the node is empty
+    CV_WRAP bool empty() const;
+    //! returns true if the node is a "none" object
+    CV_WRAP bool isNone() const;
+    //! returns true if the node is a sequence
+    CV_WRAP bool isSeq() const;
+    //! returns true if the node is a mapping
+    CV_WRAP bool isMap() const;
+    //! returns true if the node is an integer
+    CV_WRAP bool isInt() const;
+    //! returns true if the node is a floating-point number
+    CV_WRAP bool isReal() const;
+    //! returns true if the node is a text string
+    CV_WRAP bool isString() const;
+    //! returns true if the node has a name
+    CV_WRAP bool isNamed() const;
+    //! returns the node name or an empty string if the node is nameless
+    CV_WRAP std::string name() const;
+    //! returns the number of elements in the node, if it is a sequence or mapping, or 1 otherwise.
+    CV_WRAP size_t size() const;
+    //! returns raw size of the FileNode in bytes
+    CV_WRAP size_t rawSize() const;
+    //! returns the node content as an integer. If the node stores floating-point number, it is rounded.
+    operator int() const;
+    //! returns the node content as float
+    operator float() const;
+    //! returns the node content as double
+    operator double() const;
+    //! returns the node content as text string
+    inline operator std::string() const { return this->string(); }
+
+    static bool isMap(int flags);
+    static bool isSeq(int flags);
+    static bool isCollection(int flags);
+    static bool isEmptyCollection(int flags);
+    static bool isFlow(int flags);
+
+    uchar* ptr();
+    const uchar* ptr() const;
+
+    //! returns iterator pointing to the first node element
+    FileNodeIterator begin() const;
+    //! returns iterator pointing to the element following the last node element
+    FileNodeIterator end() const;
+
+    /** @brief Reads node elements to the buffer with the specified format.
+
+    Usually it is more convenient to use operator `>>` instead of this method.
+    @param fmt Specification of each array element. See @ref format_spec "format specification"
+    @param vec Pointer to the destination array.
+    @param len Number of bytes to read (buffer size limit). If it is greater than number of
+               remaining elements then all of them will be read.
+     */
+    void readRaw( const String& fmt, void* vec, size_t len ) const;
+
+    /** Internal method used when reading FileStorage.
+     Sets the type (int, real or string) and value of the previously created node.
+     */
+    void setValue( int type, const void* value, int len=-1 );
+
+    //! Simplified reading API to use with bindings.
+    CV_WRAP double real() const;
+    //! Simplified reading API to use with bindings.
+    CV_WRAP std::string string() const;
+    //! Simplified reading API to use with bindings.
+    CV_WRAP Mat mat() const;
+
+    //protected:
+    FileNode(FileStorage::Impl* fs, size_t blockIdx, size_t ofs);
+
+    FileStorage::Impl* fs; //!< presumably the storage implementation that owns this node -- confirm against the .cpp
+    size_t blockIdx;       //!< presumably the index of the memory block where the node is stored (cf. the deprecated constructor's docs)
+    size_t ofs;            //!< presumably the byte offset from the beginning of the serialized storage (cf. the deprecated constructor's docs)
+};
+
+
+/** @brief used to iterate through sequences and mappings.
+
+ A standard STL notation, with node.begin(), node.end() denoting the beginning and the end of a
+ sequence, stored in node. See the data reading sample in the beginning of the section.
+ */
+class CV_EXPORTS FileNodeIterator
+{
+public:
+    /** @brief The constructors.
+
+     These constructors are used to create a default iterator, set it to specific element in a file node
+     or construct it from another iterator.
+     */
+    FileNodeIterator();
+
+    /** @overload
+     @param node File node - the collection to iterate over;
+        it can be a scalar (equivalent to 1-element collection) or "none" (equivalent to empty collection).
+     @param seekEnd - true if iterator needs to be set after the last element of the node;
+        that is:
+            * node.begin() => FileNodeIterator(node, false)
+            * node.end() => FileNodeIterator(node, true)
+     */
+    FileNodeIterator(const FileNode& node, bool seekEnd);
+
+    /** @overload
+     @param it Iterator to be used as initialization for the created iterator.
+     */
+    FileNodeIterator(const FileNodeIterator& it);
+
+    FileNodeIterator& operator=(const FileNodeIterator& it);
+
+    //! returns the currently observed element
+    FileNode operator *() const;
+
+    //! moves iterator to the next node
+    FileNodeIterator& operator ++ ();
+    //! moves iterator to the next node (postfix form; returns an iterator by value)
+    FileNodeIterator operator ++ (int);
+    //! moves iterator forward by the specified offset (possibly negative)
+    FileNodeIterator& operator += (int ofs);
+
+    /** @brief Reads node elements to the buffer with the specified format.
+
+    Usually it is more convenient to use operator `>>` instead of this method.
+    @param fmt Specification of each array element. See @ref format_spec "format specification"
+    @param vec Pointer to the destination array.
+    @param len Number of bytes to read (buffer size limit). If it is greater than number of
+               remaining elements then all of them will be read.
+     */
+    FileNodeIterator& readRaw( const String& fmt, void* vec,
+                               size_t len=(size_t)INT_MAX );
+
+    //! returns the number of remaining (not read yet) elements
+    size_t remaining() const;
+
+    bool equalTo(const FileNodeIterator& it) const;
+
+protected:
+    FileStorage::Impl* fs;
+    size_t blockIdx;
+    size_t ofs;
+    size_t blockSize;
+    size_t nodeNElems;
+    size_t idx;
+};
+
+//! @} core_xml
+
+/////////////////// XML & YAML I/O implementation //////////////////
+
+//! @relates cv::FileStorage
+//! @{
+
+CV_EXPORTS void write( FileStorage& fs, const String& name, int value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, float value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, double value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const String& value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const Mat& value );
+CV_EXPORTS void write( FileStorage& fs, const String& name, const SparseMat& value );
+#ifdef CV__LEGACY_PERSISTENCE
+CV_EXPORTS void write( FileStorage& fs, const String& name, const std::vector<KeyPoint>& value);
+CV_EXPORTS void write( FileStorage& fs, const String& name, const std::vector<DMatch>& value);
+#endif
+
+CV_EXPORTS void writeScalar( FileStorage& fs, int value );
+CV_EXPORTS void writeScalar( FileStorage& fs, float value );
+CV_EXPORTS void writeScalar( FileStorage& fs, double value );
+CV_EXPORTS void writeScalar( FileStorage& fs, const String& value );
+
+//! @}
+
+//! @relates cv::FileNode
+//! @{
+
+CV_EXPORTS void read(const FileNode& node, int& value, int default_value);
+CV_EXPORTS void read(const FileNode& node, float& value, float default_value);
+CV_EXPORTS void read(const FileNode& node, double& value, double default_value);
+CV_EXPORTS void read(const FileNode& node, std::string& value, const std::string& default_value);
+CV_EXPORTS void read(const FileNode& node, Mat& mat, const Mat& default_mat = Mat() );
+CV_EXPORTS void read(const FileNode& node, SparseMat& mat, const SparseMat& default_mat = SparseMat() );
+#ifdef CV__LEGACY_PERSISTENCE
+CV_EXPORTS void read(const FileNode& node, std::vector<KeyPoint>& keypoints);
+CV_EXPORTS void read(const FileNode& node, std::vector<DMatch>& matches);
+#endif
+CV_EXPORTS void read(const FileNode& node, KeyPoint& value, const KeyPoint& default_value);
+CV_EXPORTS void read(const FileNode& node, DMatch& value, const DMatch& default_value);
+
+template<typename _Tp> static inline void read(const FileNode& node, Point_<_Tp>& value, const Point_<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 2 ? default_value : Point_<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]));
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Point3_<_Tp>& value, const Point3_<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 3 ? default_value : Point3_<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]),
+                                                            saturate_cast<_Tp>(temp[2]));
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Size_<_Tp>& value, const Size_<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 2 ? default_value : Size_<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]));
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Complex<_Tp>& value, const Complex<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 2 ? default_value : Complex<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]));
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Rect_<_Tp>& value, const Rect_<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 4 ? default_value : Rect_<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]),
+                                                          saturate_cast<_Tp>(temp[2]), saturate_cast<_Tp>(temp[3]));
+}
+
+template<typename _Tp, int cn> static inline void read(const FileNode& node, Vec<_Tp, cn>& value, const Vec<_Tp, cn>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != cn ? default_value : Vec<_Tp, cn>(&temp[0]);
+}
+
+template<typename _Tp, int m, int n> static inline void read(const FileNode& node, Matx<_Tp, m, n>& value, const Matx<_Tp, m, n>& default_matx = Matx<_Tp, m, n>())
+{
+    // Deserialize through the Mat reader first, then convert to the fixed-size type.
+    Mat parsed;
+    read(node, parsed);
+
+    // An empty Mat selects the caller-supplied default.
+    value = parsed.empty() ? default_matx
+                           : Matx<_Tp, m, n>(parsed);
+}
+
+template<typename _Tp> static inline void read(const FileNode& node, Scalar_<_Tp>& value, const Scalar_<_Tp>& default_value)
+{
+    std::vector<_Tp> temp; FileNodeIterator it = node.begin(); it >> temp;
+    value = temp.size() != 4 ? default_value : Scalar_<_Tp>(saturate_cast<_Tp>(temp[0]), saturate_cast<_Tp>(temp[1]),
+                                                            saturate_cast<_Tp>(temp[2]), saturate_cast<_Tp>(temp[3]));
+}
+
+static inline void read(const FileNode& node, Range& value, const Range& default_value)
+{   // Go through the Point2i reader: start maps to x, end maps to y.
+    const Point2i fallback(default_value.start, default_value.end);
+    Point2i bounds(value.start, value.end); read(node, bounds, fallback);
+    value.start = bounds.x; value.end = bounds.y;
+}
+
+//! @}
+
+/** @brief Writes string to a file storage.
+@relates cv::FileStorage
+ */
+CV_EXPORTS FileStorage& operator << (FileStorage& fs, const String& str);
+
+//! @cond IGNORED
+
+namespace internal
+{
+    class CV_EXPORTS WriteStructContext
+    {
+    public:
+        WriteStructContext(FileStorage& _fs, const String& name, int flags, const String& typeName = String());
+        ~WriteStructContext();
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp, int numflag> class VecWriterProxy
+    {
+    public:
+        VecWriterProxy( FileStorage* _fs ) : fs(_fs) {}
+        void operator()(const std::vector<_Tp>& vec) const
+        {
+            size_t count = vec.size();
+            for (size_t i = 0; i < count; i++)
+                write(*fs, vec[i]);
+        }
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp> class VecWriterProxy<_Tp, 1>
+    {
+    public:
+        VecWriterProxy( FileStorage* _fs ) : fs(_fs) {}
+        void operator()(const std::vector<_Tp>& vec) const
+        {
+            int _fmt = traits::SafeFmt<_Tp>::fmt;
+            char fmt[] = { (char)((_fmt >> 8) + '1'), (char)_fmt, '\0' };
+            fs->writeRaw(fmt, !vec.empty() ? (uchar*)&vec[0] : 0, vec.size() * sizeof(_Tp));
+        }
+    private:
+        FileStorage* fs;
+    };
+
+    template<typename _Tp, int numflag> class VecReaderProxy
+    {
+    public:
+        VecReaderProxy( FileNodeIterator* _it ) : it(_it) {}
+        void operator()(std::vector<_Tp>& vec, size_t count) const
+        {
+            count = std::min(count, it->remaining());
+            vec.resize(count);
+            for (size_t i = 0; i < count; i++, ++(*it))
+                read(**it, vec[i], _Tp());
+        }
+    private:
+        FileNodeIterator* it;
+    };
+
+    template<typename _Tp> class VecReaderProxy<_Tp, 1>
+    {
+    public:
+        VecReaderProxy( FileNodeIterator* _it ) : it(_it) {}
+        void operator()(std::vector<_Tp>& vec, size_t count) const
+        {
+            size_t remaining = it->remaining();
+            size_t cn = DataType<_Tp>::channels;
+            int _fmt = traits::SafeFmt<_Tp>::fmt;
+            CV_Assert((_fmt >> 8) < 9);
+            char fmt[] = { (char)((_fmt >> 8)+'1'), (char)_fmt, '\0' };
+            CV_Assert((remaining % cn) == 0);
+            size_t remaining1 = remaining / cn;
+            count = count > remaining1 ? remaining1 : count;
+            vec.resize(count);
+            it->readRaw(fmt, !vec.empty() ? (uchar*)&vec[0] : 0, count*sizeof(_Tp));
+        }
+    private:
+        FileNodeIterator* it;
+    };
+
+} // internal
+
+//! @endcond
+
+//! @relates cv::FileStorage
+//! @{
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const _Tp& value)
+{
+    write(fs, String(), value);
+}
+
+template<> inline
+void write( FileStorage& fs, const int& value )
+{
+    writeScalar(fs, value);
+}
+
+template<> inline
+void write( FileStorage& fs, const float& value )
+{
+    writeScalar(fs, value);
+}
+
+template<> inline
+void write( FileStorage& fs, const double& value )
+{
+    writeScalar(fs, value);
+}
+
+template<> inline
+void write( FileStorage& fs, const String& value )
+{
+    writeScalar(fs, value);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Point_<_Tp>& pt )
+{
+    write(fs, pt.x);
+    write(fs, pt.y);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Point3_<_Tp>& pt )
+{
+    write(fs, pt.x);
+    write(fs, pt.y);
+    write(fs, pt.z);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Size_<_Tp>& sz )
+{
+    write(fs, sz.width);
+    write(fs, sz.height);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Complex<_Tp>& c )
+{
+    write(fs, c.re);
+    write(fs, c.im);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Rect_<_Tp>& r )
+{
+    write(fs, r.x);
+    write(fs, r.y);
+    write(fs, r.width);
+    write(fs, r.height);
+}
+
+template<typename _Tp, int cn> static inline
+void write(FileStorage& fs, const Vec<_Tp, cn>& v )
+{
+    for(int i = 0; i < cn; i++)
+        write(fs, v.val[i]);
+}
+
+template<typename _Tp, int m, int n> static inline
+void write(FileStorage& fs, const Matx<_Tp, m, n>& x )
+{
+    write(fs, Mat(x)); // write as a Mat class
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const Scalar_<_Tp>& s )
+{
+    write(fs, s.val[0]);
+    write(fs, s.val[1]);
+    write(fs, s.val[2]);
+    write(fs, s.val[3]);
+}
+
+static inline
+void write(FileStorage& fs, const Range& r )
+{
+    write(fs, r.start);
+    write(fs, r.end);
+}
+
+template<typename _Tp> static inline
+void write( FileStorage& fs, const std::vector<_Tp>& vec )
+{
+    cv::internal::VecWriterProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0> w(&fs);
+    w(vec);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Point_<_Tp>& pt )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, pt);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Point3_<_Tp>& pt )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, pt);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Size_<_Tp>& sz )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, sz);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Complex<_Tp>& c )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, c);
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Rect_<_Tp>& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, r);
+}
+
+template<typename _Tp, int cn> static inline
+void write(FileStorage& fs, const String& name, const Vec<_Tp, cn>& v )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, v);
+}
+
+template<typename _Tp, int m, int n> static inline
+void write(FileStorage& fs, const String& name, const Matx<_Tp, m, n>& x )
+{
+    write(fs, name, Mat(x)); // write as a Mat class
+}
+
+template<typename _Tp> static inline
+void write(FileStorage& fs, const String& name, const Scalar_<_Tp>& s )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, s);
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const Range& r )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, r);
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const KeyPoint& kpt)
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, kpt.pt.x);
+    write(fs, kpt.pt.y);
+    write(fs, kpt.size);
+    write(fs, kpt.angle);
+    write(fs, kpt.response);
+    write(fs, kpt.octave);
+    write(fs, kpt.class_id);
+}
+
+static inline
+void write(FileStorage& fs, const String& name, const DMatch& m)
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+FileNode::FLOW);
+    write(fs, m.queryIdx);
+    write(fs, m.trainIdx);
+    write(fs, m.imgIdx);
+    write(fs, m.distance);
+}
+
+template<typename _Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type* = nullptr>
+static inline void write( FileStorage& fs, const String& name, const _Tp& val )
+{
+    write(fs, name, static_cast<int>(val));
+}
+
+template<typename _Tp> static inline
+void write( FileStorage& fs, const String& name, const std::vector<_Tp>& vec )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ+(traits::SafeFmt<_Tp>::fmt != 0 ? FileNode::FLOW : 0));
+    write(fs, vec);
+}
+
+template<typename _Tp> static inline // writes a vector of vectors as one named sequence whose elements are nested sequences
+void write( FileStorage& fs, const String& name, const std::vector< std::vector<_Tp> >& vec )
+{
+    cv::internal::WriteStructContext ws(fs, name, FileNode::SEQ); // only the outer sequence carries the element name
+    for(size_t i = 0; i < vec.size(); i++)
+    {   // inner vectors are elements of a sequence: startWriteStruct() documents that an empty name must be passed there
+        cv::internal::WriteStructContext ws_(fs, String(), FileNode::SEQ+(traits::SafeFmt<_Tp>::fmt != 0 ? FileNode::FLOW : 0));
+        write(fs, vec[i]);
+    }
+}
+
+#ifdef CV__LEGACY_PERSISTENCE
+// This code is not needed anymore, but it is preserved here to keep source compatibility
+// Implementation is similar to the template instantiations
+static inline void write(FileStorage& fs, const KeyPoint& kpt) { write(fs, String(), kpt); }
+static inline void write(FileStorage& fs, const DMatch& m) { write(fs, String(), m); }
+static inline void write(FileStorage& fs, const std::vector<KeyPoint>& vec)
+{
+    cv::internal::VecWriterProxy<KeyPoint, 0> w(&fs);
+    w(vec);
+}
+static inline void write(FileStorage& fs, const std::vector<DMatch>& vec)
+{
+    cv::internal::VecWriterProxy<DMatch, 0> w(&fs);
+    w(vec);
+
+}
+#endif
+
+//! @} FileStorage
+
+//! @relates cv::FileNode
+//! @{
+
+static inline
+void read(const FileNode& node, bool& value, bool default_value)
+{
+    int temp;
+    read(node, temp, (int)default_value);
+    value = temp != 0;
+}
+
+static inline
+void read(const FileNode& node, uchar& value, uchar default_value)
+{
+    int temp;
+    read(node, temp, (int)default_value);
+    value = saturate_cast<uchar>(temp);
+}
+
+static inline
+void read(const FileNode& node, schar& value, schar default_value)
+{
+    int temp;
+    read(node, temp, (int)default_value);
+    value = saturate_cast<schar>(temp);
+}
+
+static inline
+void read(const FileNode& node, ushort& value, ushort default_value)
+{
+    int temp;
+    read(node, temp, (int)default_value);
+    value = saturate_cast<ushort>(temp);
+}
+
+static inline
+void read(const FileNode& node, short& value, short default_value)
+{
+    int temp;
+    read(node, temp, (int)default_value);
+    value = saturate_cast<short>(temp);
+}
+
+template<typename _Tp> static inline
+void read( FileNodeIterator& it, std::vector<_Tp>& vec, size_t maxCount = (size_t)INT_MAX )
+{
+    cv::internal::VecReaderProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0> r(&it);
+    r(vec, maxCount);
+}
+
+template<typename _Tp, typename std::enable_if< std::is_enum<_Tp>::value >::type* = nullptr>
+static inline void read(const FileNode& node, _Tp& value, const _Tp& default_value = static_cast<_Tp>(0))
+{
+    int temp;
+    read(node, temp, static_cast<int>(default_value));
+    value = static_cast<_Tp>(temp);
+}
+
+template<typename _Tp> static inline
+void read( const FileNode& node, std::vector<_Tp>& vec, const std::vector<_Tp>& default_value = std::vector<_Tp>() )
+{   // A non-empty node is read from its first element on; an empty node yields the default.
+    if(!node.empty())
+    {
+        FileNodeIterator first = node.begin();
+        read( first, vec );
+        return;
+    }
+    vec = default_value;
+}
+
+static inline
+void read( const FileNode& node, std::vector<KeyPoint>& vec, const std::vector<KeyPoint>& default_value )
+{
+    if(node.empty())
+        vec = default_value;
+    else
+        read(node, vec);
+}
+
+static inline
+void read( const FileNode& node, std::vector<DMatch>& vec, const std::vector<DMatch>& default_value )
+{
+    if(node.empty())
+        vec = default_value;
+    else
+        read(node, vec);
+}
+
+//! @} FileNode
+
+//! @relates cv::FileStorage
+//! @{
+
+/** @brief Writes data to a file storage.
+ */
+template<typename _Tp> static inline
+FileStorage& operator << (FileStorage& fs, const _Tp& value)
+{
+    if( fs.isOpened() ) // a storage that is not opened is returned untouched
+    {
+        if( fs.state == FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP )
+            CV_Error( Error::StsError, "No element name has been given" );
+        write( fs, fs.elname, value ); // fs.elname is passed as the name to write the value under
+        if( fs.state & FileStorage::INSIDE_MAP ) fs.state = FileStorage::NAME_EXPECTED + FileStorage::INSIDE_MAP;
+    }
+    return fs;
+}
+
+/** @brief Writes data to a file storage.
+ */
+static inline
+FileStorage& operator << (FileStorage& fs, const char* str)
+{
+    return (fs << String(str));
+}
+
+/** @brief Writes data to a file storage.
+ */
+static inline
+FileStorage& operator << (FileStorage& fs, char* value)
+{
+    return (fs << String(value));
+}
+
+//! @} FileStorage
+
+//! @relates cv::FileNodeIterator
+//! @{
+
+/** @brief Reads data from a file storage.
+ */
+template<typename _Tp> static inline
+FileNodeIterator& operator >> (FileNodeIterator& it, _Tp& value)
+{
+    read( *it, value, _Tp());
+    return ++it;
+}
+
+/** @brief Reads data from a file storage.
+ */
+template<typename _Tp> static inline
+FileNodeIterator& operator >> (FileNodeIterator& it, std::vector<_Tp>& vec)
+{
+    cv::internal::VecReaderProxy<_Tp, traits::SafeFmt<_Tp>::fmt != 0> r(&it);
+    r(vec, (size_t)INT_MAX);
+    return it;
+}
+
+//! @} FileNodeIterator
+
+//! @relates cv::FileNode
+//! @{
+
+/** @brief Reads data from a file storage.
+ */
+template<typename _Tp> static inline
+void operator >> (const FileNode& n, _Tp& value)
+{
+    read( n, value, _Tp());
+}
+
+/** @brief Reads data from a file storage.
+ */
+template<typename _Tp> static inline
+void operator >> (const FileNode& n, std::vector<_Tp>& vec)
+{
+    FileNodeIterator it = n.begin();
+    it >> vec;
+}
+
+/** @brief Reads KeyPoint from a file storage.
+*/
+//It needs special handling because it contains two types of fields, int & float.
+static inline
+void operator >> (const FileNode& n, KeyPoint& kpt)
+{
+    FileNodeIterator it = n.begin();
+    it >> kpt.pt.x >> kpt.pt.y >> kpt.size >> kpt.angle >> kpt.response >> kpt.octave >> kpt.class_id;
+}
+
+#ifdef CV__LEGACY_PERSISTENCE
+static inline
+void operator >> (const FileNode& n, std::vector<KeyPoint>& vec)
+{
+    read(n, vec);
+}
+static inline
+void operator >> (const FileNode& n, std::vector<DMatch>& vec)
+{
+    read(n, vec);
+}
+#endif
+
+/** @brief Reads DMatch from a file storage.
+*/
+//It needs special handling because it contains two types of fields, int & float.
+static inline
+void operator >> (const FileNode& n, DMatch& m)
+{
+    FileNodeIterator it = n.begin();
+    it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance;
+}
+
+//! @} FileNode
+
+//! @relates cv::FileNodeIterator
+//! @{
+
+CV_EXPORTS bool operator == (const FileNodeIterator& it1, const FileNodeIterator& it2);
+CV_EXPORTS bool operator != (const FileNodeIterator& it1, const FileNodeIterator& it2);
+
+static inline
+ptrdiff_t operator - (const FileNodeIterator& it1, const FileNodeIterator& it2)
+{
+    return it2.remaining() - it1.remaining();
+}
+
+static inline
+bool operator < (const FileNodeIterator& it1, const FileNodeIterator& it2)
+{
+    return it1.remaining() > it2.remaining();
+}
+
+//! @} FileNodeIterator
+
+} // cv
+
+#endif // OPENCV_CORE_PERSISTENCE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.hpp
new file mode 100644
index 0000000..8c21501
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.hpp
@@ -0,0 +1,1696 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <chargerKong@126.com>
+//         Longbu Wang <riskiest@gmail.com>
+#ifndef OPENCV_CORE_QUATERNION_HPP
+#define OPENCV_CORE_QUATERNION_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+#include <iostream>
+namespace cv
+{
+//! @addtogroup core
+//! @{
+
+//! Flag stating whether input quaternions may be assumed to be unit quaternions
+enum QuatAssumeType
+{
+    /**
+     * This flag is specified by default.
+     * If this flag is specified, the input quaternions are not assumed to be unit quaternions.
+     * This guarantees the correctness of the calculations,
+     * although the calculation will be slower than with the flag QUAT_ASSUME_UNIT.
+     */
+    QUAT_ASSUME_NOT_UNIT,
+    /**
+     * If this flag is specified, the input quaternions are assumed to be unit quaternions, which
+     * saves some computations. However, if this flag is specified for a non-unit quaternion,
+     * the correctness of the result is not guaranteed.
+     */
+    QUAT_ASSUME_UNIT
+};
+
+class QuatEnum
+{
+public:
+    /** @brief Enum of Euler angles type.
+     *
+     * Without considering the possibility of using two different conventions for the definition of the rotation axes,
+     * there exist twelve possible sequences of rotation axes, divided into two groups:
+     * - Proper Euler angles (Z-X-Z, X-Y-X, Y-Z-Y, Z-Y-Z, X-Z-X, Y-X-Y)
+     * - Tait–Bryan angles (X-Y-Z, Y-Z-X, Z-X-Y, X-Z-Y, Z-Y-X, Y-X-Z).
+     *
+     * The three elemental rotations may be [extrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_extrinsic_rotations)
+     * (rotations about the axes *xyz* of the original coordinate system, which is assumed to remain motionless),
+     * or [intrinsic](https://en.wikipedia.org/wiki/Euler_angles#Definition_by_intrinsic_rotations) (rotations about the axes of the rotating coordinate system *XYZ*, solidary with the moving body, which changes its orientation after each elemental rotation).
+     *
+     *
+     * Extrinsic and intrinsic rotations are related (compare the two X-Y-Z formulas below).
+     *
+     * The Euler angles are defined as follows:
+     * - \f$\theta_1 \f$ represents the first rotation angle,
+     * - \f$\theta_2 \f$ represents the second rotation angle,
+     * - \f$\theta_3 \f$ represents the third rotation angle.
+     *
+     * For intrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =X(\theta_1) Y(\theta_2) Z(\theta_3) \f]
+     * For extrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =Z({\theta_3}) Y({\theta_2}) X({\theta_1})\f]
+     * where
+     * \f[X({\theta_1})={\begin{bmatrix}1&0&0\\0&\cos {\theta_1} &-\sin {\theta_1} \\0&\sin {\theta_1} &\cos {\theta_1} \\\end{bmatrix}},
+     * Y({\theta_2})={\begin{bmatrix}\cos \theta_{2}&0&\sin \theta_{2}\\0&1 &0 \\ -\sin \theta_2& 0&\cos \theta_{2} \\\end{bmatrix}},
+     * Z({\theta_3})={\begin{bmatrix}\cos\theta_{3} &-\sin \theta_3&0\\\sin \theta_3 &\cos \theta_3 &0\\0&0&1\\\end{bmatrix}}.
+     * \f]
+     *
+     * The function is designed according to this set of conventions:
+     * - [Right handed](https://en.wikipedia.org/wiki/Right_hand_rule) reference frames are adopted, and the [right hand rule](https://en.wikipedia.org/wiki/Right_hand_rule) is used to determine the sign of angles.
+     * - Each matrix is meant to represent an [active rotation](https://en.wikipedia.org/wiki/Active_and_passive_transformation) (the composing and composed matrices
+     * are supposed to act on the coordinates of vectors defined in the initial fixed reference frame and give as a result the coordinates of a rotated vector defined in the same reference frame).
+     * - For \f$\theta_1\f$ and \f$\theta_3\f$, the valid range is (−π, π].
+     *
+     *   For \f$\theta_2\f$, the valid range is [−π/2, π/2] or [0, π].
+     *
+     *   For Tait–Bryan angles, the valid range of \f$\theta_2\f$ is [−π/2, π/2]. When transforming a quaternion to Euler angles, the solution is unique under the condition \f$ \theta_2 \in (-\pi/2, \pi/2)\f$.
+     *   If \f$\theta_2 = -\pi/2 \f$ or \f$ \theta_2 = \pi/2\f$, there are infinitely many solutions. The common name for this situation is gimbal lock.
+     *   For Proper Euler angles, the valid range of \f$\theta_2\f$ is [0, π]. The solution is unique under the condition \f$ \theta_2 \in (0, \pi)\f$. If \f$\theta_2 = 0 \f$ or \f$\theta_2 = \pi \f$,
+     *   there are infinitely many solutions and gimbal lock will occur.
+     */
+    enum EulerAnglesType
+    {
+        INT_XYZ, ///< Intrinsic rotations with the Euler angles type X-Y-Z
+        INT_XZY, ///< Intrinsic rotations with the Euler angles type X-Z-Y
+        INT_YXZ, ///< Intrinsic rotations with the Euler angles type Y-X-Z
+        INT_YZX, ///< Intrinsic rotations with the Euler angles type Y-Z-X
+        INT_ZXY, ///< Intrinsic rotations with the Euler angles type Z-X-Y
+        INT_ZYX, ///< Intrinsic rotations with the Euler angles type Z-Y-X
+        INT_XYX, ///< Intrinsic rotations with the Euler angles type X-Y-X
+        INT_XZX, ///< Intrinsic rotations with the Euler angles type X-Z-X
+        INT_YXY, ///< Intrinsic rotations with the Euler angles type Y-X-Y
+        INT_YZY, ///< Intrinsic rotations with the Euler angles type Y-Z-Y
+        INT_ZXZ, ///< Intrinsic rotations with the Euler angles type Z-X-Z
+        INT_ZYZ, ///< Intrinsic rotations with the Euler angles type Z-Y-Z
+
+        EXT_XYZ, ///< Extrinsic rotations with the Euler angles type X-Y-Z
+        EXT_XZY, ///< Extrinsic rotations with the Euler angles type X-Z-Y
+        EXT_YXZ, ///< Extrinsic rotations with the Euler angles type Y-X-Z
+        EXT_YZX, ///< Extrinsic rotations with the Euler angles type Y-Z-X
+        EXT_ZXY, ///< Extrinsic rotations with the Euler angles type Z-X-Y
+        EXT_ZYX, ///< Extrinsic rotations with the Euler angles type Z-Y-X
+        EXT_XYX, ///< Extrinsic rotations with the Euler angles type X-Y-X
+        EXT_XZX, ///< Extrinsic rotations with the Euler angles type X-Z-X
+        EXT_YXY, ///< Extrinsic rotations with the Euler angles type Y-X-Y
+        EXT_YZY,  ///< Extrinsic rotations with the Euler angles type Y-Z-Y
+        EXT_ZXZ, ///< Extrinsic rotations with the Euler angles type Z-X-Z
+        EXT_ZYZ, ///< Extrinsic rotations with the Euler angles type Z-Y-Z
+        #ifndef CV_DOXYGEN
+            EULER_ANGLES_MAX_VALUE
+        #endif
+    };
+
+};
+
+template <typename _Tp> class Quat; // forward declaration; the class template is defined further down in this header
+template <typename _Tp> std::ostream& operator<<(std::ostream&, const Quat<_Tp>&); // stream-output operator for Quat, declared ahead of the class
+
+/**
+ * Quaternion is a number system that extends the complex numbers. It can be expressed as a
+ * rotation in three-dimensional space.
+ * A quaternion is generally represented in the form:
+ *      \f[q = w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\f]
+ *      \f[q = [w, x, y, z]\f]
+ *      \f[q = [w, \boldsymbol{v}] \f]
+ *      \f[q = ||q||[\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+ *      \f[q = ||q||[\cos\psi, \boldsymbol{u}\sin\psi]\f]
+ * where \f$\psi = \frac{\theta}{2}\f$, \f$\theta\f$ represents rotation angle,
+ * \f$\boldsymbol{u} = [u_x, u_y, u_z]\f$ represents normalized rotation axis,
+ * and \f$||q||\f$ represents the norm of \f$q\f$.
+ *
+ * A unit quaternion usually represents a rotation and has the form:
+ *      \f[q = [\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+ *
+ * To create a quaternion representing the rotation around the axis \f$\boldsymbol{u}\f$
+ * with angle \f$\theta\f$, you can use
+ * ```
+ * using namespace cv;
+ * double angle = CV_PI;
+ * Vec3d axis = {0, 0, 1};
+ * Quatd q = Quatd::createFromAngleAxis(angle, axis);
+ * ```
+ *
+ * You can simply use four same type number to create a quaternion
+ * ```
+ * Quatd q(1, 2, 3, 4);
+ * ```
+ * Or use a Vec4d or Vec4f vector.
+ * ```
+ * Vec4d vec{1, 2, 3, 4};
+ * Quatd q(vec);
+ * ```
+ *
+ * ```
+ * Vec4f vec{1, 2, 3, 4};
+ * Quatf q(vec);
+ * ```
+ *
+ * If you already have a 3x3 rotation matrix R, then you can use
+ * ```
+ * Quatd q = Quatd::createFromRotMat(R);
+ * ```
+ *
+ * If you already have a rotation vector rvec which has the form of `angle * axis`, then you can use
+ * ```
+ * Quatd q = Quatd::createFromRvec(rvec);
+ * ```
+ *
+ * To extract the rotation matrix from quaternion, see toRotMat3x3()
+ *
+ * To extract the Vec4d or Vec4f, see toVec()
+ *
+ * To extract the rotation vector, see toRotVec()
+ *
+ * If two quaternions \f$q_0, q_1\f$ need to be interpolated, you can use nlerp(), slerp() or spline()
+ * ```
+ * Quatd::nlerp(q0, q1, t)
+ *
+ * Quatd::slerp(q0, q1, t)
+ *
+ * Quatd::spline(q0, q0, q1, q1, t)
+ * ```
+ * spline can smoothly connect rotations of  multiple quaternions
+ *
+ * Three ways to get an element in Quaternion
+ * ```
+ * Quatf q(1,2,3,4);
+ * std::cout << q.w << std::endl; // w=1, x=2, y=3, z=4
+ * std::cout << q[0] << std::endl; // q[0]=1, q[1]=2, q[2]=3, q[3]=4
+ * std::cout << q.at(0) << std::endl;
+ * ```
+ */
+template <typename _Tp>
+class Quat
+{
+    static_assert(std::is_floating_point<_Tp>::value, "Quaternion only make sense with type of float or double");
+    using value_type = _Tp;
+public:
+    static constexpr _Tp CV_QUAT_EPS = (_Tp)1.e-6;
+    static constexpr _Tp CV_QUAT_CONVERT_THRESHOLD = (_Tp)1.e-6;
+
+    Quat();
+
+    /**
+     * @brief From Vec4d or Vec4f.
+     */
+    explicit Quat(const Vec<_Tp, 4> &coeff);
+
+    /**
+     * @brief from four numbers.
+     */
+    Quat(_Tp w, _Tp x, _Tp y, _Tp z);
+
+    /**
+     * @brief from an angle, axis. Axis will be normalized in this function. And
+     * it generates
+     * \f[q = [\cos\psi, u_x\sin\psi,u_y\sin\psi,  u_z\sin\psi].\f]
+     * where \f$\psi = \frac{\theta}{2}\f$, \f$\theta\f$ is the rotation angle.
+     */
+    static Quat<_Tp> createFromAngleAxis(const _Tp angle, const Vec<_Tp, 3> &axis);
+
+    /**
+     * @brief from a 3x3 rotation matrix.
+     */
+    static Quat<_Tp> createFromRotMat(InputArray R);
+
+    /**
+     * @brief from a rotation vector
+     * \f$r\f$ has the form \f$\theta \cdot \boldsymbol{u}\f$, where \f$\theta\f$
+     * represents rotation angle and \f$\boldsymbol{u}\f$ represents normalized rotation axis.
+     *
+     * Angle and axis could be easily derived as:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * \theta &= ||r||\\
+     * \boldsymbol{u} &= \frac{r}{\theta}
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * Then a quaternion can be calculated by
+     *  \f[q = [\cos\psi, \boldsymbol{u}\sin\psi]\f]
+     *  where \f$\psi = \theta / 2 \f$
+     */
+    static Quat<_Tp> createFromRvec(InputArray rvec);
+
+     /**
+     * @brief
+     * from Euler angles
+     *
+     * A quaternion can be generated from Euler angles by combining the quaternion representations of the Euler rotations.
+     *
+     * For example, if we use intrinsic rotations in the order of X-Y-Z,\f$\theta_1 \f$ is rotation around the X-axis, \f$\theta_2 \f$ is rotation around the Y-axis,
+     * \f$\theta_3 \f$ is rotation around the Z-axis. The final quaternion q can be calculated by
+     *
+     * \f[ {q} = q_{X, \theta_1}  q_{Y, \theta_2} q_{Z, \theta_3}\f]
+     * where \f$ q_{X, \theta_1} \f$ is created from @ref createFromXRot,  \f$ q_{Y, \theta_2} \f$ is created from @ref createFromYRot,
+     *  \f$ q_{Z, \theta_3} \f$ is created from @ref createFromZRot.
+     * @param angles the Euler angles in a vector of length 3
+     * @param eulerAnglesType the Euler angles type used for the conversion
+     */
+    static Quat<_Tp> createFromEulerAngles(const Vec<_Tp, 3> &angles, QuatEnum::EulerAnglesType eulerAnglesType);
+
+    /**
+     * @brief get a quaternion from a rotation about the Y-axis by \f$\theta\f$ .
+     * \f[q = \cos(\theta/2)+0 i+ sin(\theta/2) j +0k \f]
+     */
+    static Quat<_Tp> createFromYRot(const _Tp theta);
+
+    /**
+     * @brief get a quaternion from a rotation about the X-axis by \f$\theta\f$ .
+     * \f[q = \cos(\theta/2)+sin(\theta/2) i +0 j +0 k \f]
+     */
+    static Quat<_Tp> createFromXRot(const _Tp theta);
+
+    /**
+     * @brief get a quaternion from a rotation about the Z-axis by \f$\theta\f$.
+     * \f[q = \cos(\theta/2)+0 i +0 j +sin(\theta/2) k \f]
+     */
+    static Quat<_Tp> createFromZRot(const _Tp theta);
+
+    /**
+     * @brief a way to get element.
+     * @param index over a range [0, 3].
+     *
+     * A quaternion q
+     *
+     * q.at(0) is equivalent to q.w,
+     *
+     * q.at(1) is equivalent to q.x,
+     *
+     * q.at(2) is equivalent to q.y,
+     *
+     * q.at(3) is equivalent to q.z.
+     */
+    _Tp at(size_t index) const;
+
+    /**
+     * @brief return the conjugate of this quaternion.
+     * \f[q.conjugate() = (w, -x, -y, -z).\f]
+     */
+    Quat<_Tp> conjugate() const;
+
+    /**
+     *
+     * @brief return the value of exponential value.
+     * \f[\exp(q) = e^w (\cos||\boldsymbol{v}||+ \frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example:
+     * ```
+     * Quatd q{1,2,3,4};
+     * cout << exp(q) << endl;
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> exp(const Quat<T> &q);
+
+    /**
+     * @brief return the value of exponential value.
+     * \f[\exp(q) = e^w (\cos||\boldsymbol{v}||+ \frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * cout << q.exp() << endl;
+     * ```
+     */
+    Quat<_Tp> exp() const;
+
+    /**
+     * @brief return the value of logarithm function.
+     * \f[\ln(q) = \ln||q|| + \frac{\boldsymbol{v}}{||\boldsymbol{v}||}\arccos\frac{w}{||q||}.\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q1{1,2,3,4};
+     * cout << log(q1) << endl;
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of logarithm function.
+     *  \f[\ln(q) = \ln||q|| + \frac{\boldsymbol{v}}{||\boldsymbol{v}||}\arccos\frac{w}{||q||}\f].
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.log();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * Quatd q1(1,2,3,4);
+     * q1.normalize().log(assumeUnit);
+     * ```
+     */
+    Quat<_Tp> log(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of power function with index \f$x\f$.
+     * \f[q^x = ||q||^x(\cos(x\theta) + \boldsymbol{u}\sin(x\theta)).\f]
+     * @param q a quaternion.
+     * @param x index of exponentiation.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * power(q, 2.0);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * double angle = CV_PI;
+     * Vec3d axis{0, 0, 1};
+     * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
+     * power(q1, 2.0, assumeUnit);//This assumeUnit means q1 is a unit quaternion.
+     * ```
+     * @note the type of the index should be the same as the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> power(const Quat<T> &q, const T x, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of power function with index \f$x\f$.
+     * \f[q^x = ||q||^x(\cos(x\theta) + \boldsymbol{u}\sin(x\theta)).\f]
+     * @param x index of exponentiation.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.power(2.0);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * double angle = CV_PI;
+     * Vec3d axis{0, 0, 1};
+     * Quatd q1 = Quatd::createFromAngleAxis(angle, axis); //generate a unit quat by axis and angle
+     * q1.power(2.0, assumeUnit); //This assumeUnit means q1 is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> power(const _Tp x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return \f$\sqrt{q}\f$.
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatf q(1,2,3,4);
+     * sqrt(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = {1,0,0,0};
+     * sqrt(q, assumeUnit); //This assumeUnit means q is a unit quaternion.
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sqrt(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return \f$\sqrt{q}\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatf q(1,2,3,4);
+     * q.sqrt();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = {1,0,0,0};
+     * q.sqrt(assumeUnit); //This assumeUnit means q is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> sqrt(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the value of power function with quaternion \f$q\f$.
+     * \f[p^q = e^{q\ln(p)}.\f]
+     * @param p base quaternion of power function.
+     * @param q index quaternion of power function.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion \f$p\f$ assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd p(1,2,3,4);
+     * Quatd q(5,6,7,8);
+     * power(p, q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * p = p.normalize();
+     * power(p, q, assumeUnit); //This assumeUnit means p is a unit quaternion
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> power(const Quat<T> &p, const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return the value of power function with quaternion \f$q\f$.
+     * \f[p^q = e^{q\ln(p)}.\f]
+     * @param q index quaternion of power function.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd p(1,2,3,4);
+     * Quatd q(5,6,7,8);
+     * p.power(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * p = p.normalize();
+     * p.power(q, assumeUnit); //This assumeUnit means p is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> power(const Quat<_Tp> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the crossProduct between \f$p = (a, b, c, d) = (a, \boldsymbol{u})\f$ and \f$q = (w, x, y, z) = (w, \boldsymbol{v})\f$.
+     * \f[p \times q = \frac{pq- qp}{2}\f]
+     * \f[p \times q = \boldsymbol{u} \times \boldsymbol{v}\f]
+     * \f[p \times q = (cz-dy)i + (dx-bz)j + (by-xc)k \f]
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * Quatd p{5,6,7,8};
+     * crossProduct(p, q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q);
+
+    /**
+     * @brief return the crossProduct between \f$p = (a, b, c, d) = (a, \boldsymbol{u})\f$ and \f$q = (w, x, y, z) = (w, \boldsymbol{v})\f$.
+     * \f[p \times q = \frac{pq- qp}{2}.\f]
+     * \f[p \times q = \boldsymbol{u} \times \boldsymbol{v}.\f]
+     * \f[p \times q = (cz-dy)i + (dx-bz)j + (by-xc)k. \f]
+     *
+     * For example
+     * ```
+     * Quatd q{1,2,3,4};
+     * Quatd p{5,6,7,8};
+     * p.crossProduct(q)
+     * ```
+     */
+    Quat<_Tp> crossProduct(const Quat<_Tp> &q) const;
+
+    /**
+     * @brief return the norm of quaternion.
+     * \f[||q|| = \sqrt{w^2 + x^2 + y^2 + z^2}.\f]
+     */
+    _Tp norm() const;
+
+    /**
+     * @brief return a normalized \f$p\f$.
+     * \f[p = \frac{q}{||q||}\f]
+     * where \f$p\f$ satisfies \f$(p.x)^2 + (p.y)^2 + (p.z)^2 + (p.w)^2 = 1.\f$
+     */
+    Quat<_Tp> normalize() const;
+
+    /**
+     * @brief return \f$q^{-1}\f$ which is an inverse of \f$q\f$
+     * which satisfies \f$q * q^{-1} = 1\f$.
+     * @param q a quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * inv(q);
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = q.normalize();
+     * inv(q, assumeUnit);//This assumeUnit means q is a unit quaternion
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit);
+
+    /**
+     * @brief return \f$q^{-1}\f$ which is an inverse of \f$q\f$
+     * satisfying \f$q * q^{-1} = 1\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, quaternion q assume to be a unit quaternion and this function will save some computations.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.inv();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q = q.normalize();
+     * q.inv(assumeUnit);  //assumeUnit means q is a unit quaternion
+     * ```
+     */
+    Quat<_Tp> inv(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return sinh value of quaternion q, sinh could be calculated as:
+     * \f[\sinh(p) = \sinh(w)\cos(||\boldsymbol{v}||) + \cosh(w)\frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * sinh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sinh(const Quat<T> &q);
+
+    /**
+     * @brief return sinh value of this quaternion, sinh could be calculated as:
+     * \f$\sinh(p) = \sinh(w)\cos(||\boldsymbol{v}||) + \cosh(w)\frac{v}{||\boldsymbol{v}||}\sin||\boldsymbol{v}||\f$
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.sinh();
+     * ```
+     */
+    Quat<_Tp> sinh() const;
+
+    /**
+     * @brief return cosh value of quaternion q, cosh could be calculated as:
+     * \f[\cosh(p) = \cosh(w) * \cos(||\boldsymbol{v}||) + \sinh(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sin(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * cosh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> cosh(const Quat<T> &q);
+
+    /**
+     * @brief return cosh value of this quaternion, cosh could be calculated as:
+     * \f[\cosh(p) = \cosh(w) * \cos(||\boldsymbol{v}||) + \sinh(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}sin(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.cosh();
+     * ```
+     */
+    Quat<_Tp> cosh() const;
+
+    /**
+     * @brief return tanh value of quaternion q, tanh could be calculated as:
+     * \f[ \tanh(q) = \frac{\sinh(q)}{\cosh(q)}.\f]
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * tanh(q);
+     * ```
+     * @sa sinh, cosh
+     */
+    template <typename T>
+    friend Quat<T> tanh(const Quat<T> &q);
+
+    /**
+     * @brief return tanh value of this quaternion, tanh could be calculated as:
+     * \f[ \tanh(q) = \frac{\sinh(q)}{\cosh(q)}.\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.tanh();
+     * ```
+     * @sa sinh, cosh
+     */
+    Quat<_Tp> tanh() const;
+
+    /**
+     * @brief return sin value of quaternion q, sin could be calculated as:
+     * \f[\sin(p) = \sin(w) * \cosh(||\boldsymbol{v}||) + \cos(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * sin(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> sin(const Quat<T> &q);
+
+    /**
+     * @brief return sin value of this quaternion, sin could be calculated as:
+     * \f[\sin(p) = \sin(w) * \cosh(||\boldsymbol{v}||) + \cos(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.sin();
+     * ```
+     */
+    Quat<_Tp> sin() const;
+
+    /**
+     * @brief return cos value of quaternion q, cos could be calculated as:
+     * \f[\cos(p) = \cos(w) * \cosh(||\boldsymbol{v}||) - \sin(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * cos(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> cos(const Quat<T> &q);
+
+    /**
+     * @brief return cos value of this quaternion, cos could be calculated as:
+     * \f[\cos(p) = \cos(w) * \cosh(||\boldsymbol{v}||) - \sin(w)\frac{\boldsymbol{v}}{||\boldsymbol{v}||}\sinh(||\boldsymbol{v}||)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.cos();
+     * ```
+     */
+    Quat<_Tp> cos() const;
+
+    /**
+     * @brief return tan value of quaternion q, tan could be calculated as:
+     * \f[\tan(q) = \frac{\sin(q)}{\cos(q)}.\f]
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * tan(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> tan(const Quat<T> &q);
+
+    /**
+     * @brief return tan value of this quaternion, tan could be calculated as:
+     * \f[\tan(q) = \frac{\sin(q)}{\cos(q)}.\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.tan();
+     * ```
+     */
+    Quat<_Tp> tan() const;
+
+    /**
+     * @brief return arcsin value of quaternion q, arcsin could be calculated as:
+     * \f[\arcsin(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arcsinh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * asin(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> asin(const Quat<T> &q);
+
+    /**
+     * @brief return arcsin value of this quaternion, arcsin could be calculated as:
+     * \f[\arcsin(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arcsinh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.asin();
+     * ```
+     */
+    Quat<_Tp> asin() const;
+
+    /**
+     * @brief return arccos value of quaternion q, arccos could be calculated as:
+     * \f[\arccos(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arccosh(q)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * acos(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> acos(const Quat<T> &q);
+
+    /**
+     * @brief return arccos value of this quaternion, arccos could be calculated as:
+     * \f[\arccos(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arccosh(q)\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.acos();
+     * ```
+     */
+    Quat<_Tp> acos() const;
+
+    /**
+     * @brief return arctan value of quaternion q, arctan could be calculated as:
+     * \f[\arctan(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arctanh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * atan(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> atan(const Quat<T> &q);
+
+    /**
+     * @brief return arctan value of this quaternion, arctan could be calculated as:
+     * \f[\arctan(q) = -\frac{\boldsymbol{v}}{||\boldsymbol{v}||}arctanh(q\frac{\boldsymbol{v}}{||\boldsymbol{v}||})\f]
+     * where \f$\boldsymbol{v} = [x, y, z].\f$
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.atan();
+     * ```
+     */
+    Quat<_Tp> atan() const;
+
+    /**
+     * @brief return arcsinh value of quaternion q, arcsinh could be calculated as:
+     * \f[arcsinh(q) = \ln(q + \sqrt{q^2 + 1})\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * asinh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> asinh(const Quat<T> &q);
+
+    /**
+     * @brief return arcsinh value of this quaternion, arcsinh could be calculated as:
+     * \f[arcsinh(q) = \ln(q + \sqrt{q^2 + 1})\f].
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.asinh();
+     * ```
+     */
+    Quat<_Tp> asinh() const;
+
+    /**
+     * @brief return arccosh value of quaternion q, arccosh could be calculated as:
+     * \f[arccosh(q) = \ln(q + \sqrt{q^2 - 1})\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * acosh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> acosh(const Quat<T> &q);
+
+    /**
+     * @brief return arccosh value of this quaternion, arccosh could be calculated as:
+     * \f[arccosh(q) = \ln(q + \sqrt{q^2 - 1})\f].
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.acosh();
+     * ```
+     */
+    Quat<_Tp> acosh() const;
+
+    /**
+     * @brief return arctanh value of quaternion q, arctanh could be calculated as:
+     * \f[arctanh(q) = \frac{\ln(q + 1) - \ln(1 - q)}{2}\f].
+     * @param q a quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * atanh(q);
+     * ```
+     */
+    template <typename T>
+    friend Quat<T> atanh(const Quat<T> &q);
+
+    /**
+     * @brief return arctanh value of this quaternion, arctanh could be calculated as:
+     * \f[arctanh(q) = \frac{\ln(q + 1) - \ln(1 - q)}{2}\f].
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.atanh();
+     * ```
+     */
+    Quat<_Tp> atanh() const;
+
+    /**
+     * @brief return true if this quaternion is a unit quaternion.
+     * @param eps tolerance scope of normalization. The eps could be defined as
+     *
+     * \f[eps = |1 - dotValue|\f] where \f[dotValue = (this.w^2 + this.x^2 + this.y^2 + this.z^2).\f]
+     * And this function will consider it normalized when the dotValue is within the range \f$[1-eps, 1+eps]\f$.
+     */
+    bool isNormal(_Tp eps=CV_QUAT_EPS) const;
+
+    /**
+     * @brief to throw an error if this quaternion is not a unit quaternion.
+     * @param eps tolerance scope of normalization.
+     * @sa isNormal
+     */
+    void assertNormal(_Tp eps=CV_QUAT_EPS) const;
+
+    /**
+     * @brief transform a quaternion to a 3x3 rotation matrix.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion at first then do the transformation.
+     *
+     * @note Matrix A which is to be rotated should have the form
+     * \f[\begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n
+     * \end{bmatrix}\f]
+     * where the same subscript represents a point. The shape of A is assumed to be [3, n].
+     * The points matrix A can be rotated by toRotMat3x3() * A.
+     * The result has 3 rows and n columns too.
+
+     * For example
+     * ```
+     * double angle = CV_PI;
+     * Vec3d axis{0,0,1};
+     * Quatd q_unit = Quatd::createFromAngleAxis(angle, axis); //a quaternion could also be obtained by interpolating two or more quaternions.
+     *
+     * //assume there are two points (1,0,0) and (1,0,1) to be rotated
+     * Mat pointsA = (Mat_<double>(2, 3) << 1,0,0,1,0,1);
+     * //change the shape
+     * pointsA = pointsA.t();
+     * // rotate 180 degrees around the z axis
+     * Mat new_point = q_unit.toRotMat3x3() * pointsA;
+     * // print two points
+     * cout << new_point << endl;
+     * ```
+     */
+    Matx<_Tp, 3, 3> toRotMat3x3(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief transform a quaternion to a 4x4 rotation matrix.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations. Otherwise, this function will normalize this
+     * quaternion at first then do the transformation.
+     *
+     * The operation is similar to toRotMat3x3
+     * except that the points matrix should have the form
+     * \f[\begin{bmatrix}
+     * x_0& x_1& x_2&...&x_n\\
+     * y_0& y_1& y_2&...&y_n\\
+     * z_0& z_1& z_2&...&z_n\\
+     * 0&0&0&...&0
+     * \end{bmatrix}\f]
+     *
+     * @sa toRotMat3x3
+     */
+
+    Matx<_Tp, 4, 4> toRotMat4x4(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief transform this quaternion to a Vec<T, 4>.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.toVec();
+     * ```
+     */
+    Vec<_Tp, 4> toVec() const;
+
+    /**
+     * @brief transform this quaternion to a Rotation vector.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     * Rotation vector rVec is defined as:
+     * \f[ rVec = [\theta v_x, \theta v_y, \theta v_z]\f]
+     * where \f$\theta\f$ represents rotation angle, and \f$\boldsymbol{v}\f$ represents the normalized rotation axis.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.toRotVec();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().toRotVec(assumeUnit); //answer is same as q.toRotVec().
+     * ```
+     */
+    Vec<_Tp, 3> toRotVec(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief get the angle of quaternion, it returns the rotation angle.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     * \f[\psi = 2 *arccos(\frac{w}{||q||})\f]
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.getAngle();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().getAngle(assumeUnit);//same as q.getAngle().
+     * ```
+     * @note It always returns a value in the range \f$[0, 2\pi]\f$.
+     */
+    _Tp getAngle(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief get the axis of quaternion, it returns a vector of length 3.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, this quaternion is assumed to be a unit quaternion and
+     * this function will save some computations.
+     *
+     * the unit axis \f$\boldsymbol{u}\f$ is defined by
+     * \f[\begin{equation}
+     *    \begin{split}
+     *      \boldsymbol{v}
+     *      &= \boldsymbol{u} ||\boldsymbol{v}||\\
+     *      &= \boldsymbol{u}||q||sin(\frac{\theta}{2})
+     *    \end{split}
+     *    \end{equation}\f]
+     *  where \f$v=[x, y ,z]\f$ and \f$\theta\f$ represents rotation angle.
+     *
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * q.getAxis();
+     *
+     * QuatAssumeType assumeUnit = QUAT_ASSUME_UNIT;
+     * q.normalize().getAxis(assumeUnit);//same as q.getAxis()
+     * ```
+     */
+    Vec<_Tp, 3> getAxis(QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT) const;
+
+    /**
+     * @brief return the dot product of quaternion \f$q\f$ and this quaternion.
+     *
+     * dot(p, q) is a good metric of how close the quaternions are.
+     * Indeed, consider the unit quaternion difference \f$p^{-1} * q\f$, its real part is dot(p, q).
+     * At the same time its real part is equal to \f$\cos(\beta/2)\f$ where \f$\beta\f$ is
+     * the angle of rotation between p and q.
+     * Therefore, the closer dot(p, q) is to 1,
+     * the smaller the rotation between them.
+     * \f[p \cdot q = p.w \cdot q.w + p.x \cdot q.x + p.y \cdot q.y + p.z \cdot q.z\f]
+     * @param q the other quaternion.
+     *
+     * For example
+     * ```
+     * Quatd q(1,2,3,4);
+     * Quatd p(5,6,7,8);
+     * p.dot(q);
+     * ```
+     */
+    _Tp dot(Quat<_Tp> q) const;
+
+    /**
+     * @brief To calculate the interpolation from \f$q_0\f$ to \f$q_1\f$ by Linear Interpolation(Lerp).
+     * For two quaternions, this interpolation curve can be displayed as:
+     * \f[Lerp(q_0, q_1, t) = (1 - t)q_0 + tq_1.\f]
+     * Obviously, the lerp will interpolate along a straight line if we think of \f$q_0\f$ and \f$q_1\f$ as a vector
+     * in a two-dimensional space. When \f$t = 0\f$, it returns \f$q_0\f$ and when \f$t= 1\f$, it returns \f$q_1\f$.
+     * \f$t\f$ should normally be in the range \f$[0, 1]\f$.
+     * @param q0 a quaternion used in linear interpolation.
+     * @param q1 a quaternion used in linear interpolation.
+     * @param t percent of vector \f$\overrightarrow{q_0q_1}\f$ over a range [0, 1].
+     * @note it returns a non-unit quaternion.
+     */
+    static Quat<_Tp> lerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t);
+
+    /**
+     * @brief To calculate the interpolation from \f$q_0\f$ to \f$q_1\f$ by Normalized Linear Interpolation(Nlerp).
+     * it returns a normalized quaternion of Linear Interpolation(Lerp).
+     * \f[ Nlerp(q_0, q_1, t) = \frac{(1 - t)q_0 + tq_1}{||(1 - t)q_0 + tq_1||}.\f]
+     * The interpolation will always choose the shortest path but the constant speed is not guaranteed.
+     * @param q0 a quaternion used in normalized linear interpolation.
+     * @param q1 a quaternion used in normalized linear interpolation.
+     * @param t percent of vector \f$\overrightarrow{q_0q_1}\f$ over a range [0, 1].
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @sa lerp
+     */
+    static Quat<_Tp> nlerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     @brief To calculate the interpolation between \f$q_0\f$ and \f$q_1\f$ by Spherical Linear
+     Interpolation(Slerp), which can be defined as:
+    \f[ Slerp(q_0, q_1, t) = \frac{\sin((1-t)\theta)}{\sin(\theta)}q_0 + \frac{\sin(t\theta)}{\sin(\theta)}q_1\f]
+    where \f$\theta\f$ can be calculated as:
+    \f[\theta=cos^{-1}(q_0\cdot q_1)\f]
+    since both quaternions have unit norm.
+    @param q0 a quaternion used in Slerp.
+    @param q1 a quaternion used in Slerp.
+    @param t percent of angle between \f$q_0\f$ and \f$q_1\f$ over a range [0, 1].
+    @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+    quaternions will be normalized inside the function.
+    @param directChange if true, the interpolation will choose the nearest path.
+    @note If the interpolation angle is small, the error between Nlerp and Slerp is not so large. To improve efficiency and
+    avoid zero division error, we use Nlerp instead of Slerp in that case.
+    */
+    static Quat<_Tp> slerp(const Quat<_Tp> &q0, const Quat &q1, const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT, bool directChange=true);
+
+    /**
+     * @brief To calculate the interpolation between \f$q_0\f$,\f$q_1\f$,\f$q_2\f$,\f$q_3\f$  by Spherical and quadrangle(Squad). This could be defined as:
+     * \f[Squad(q_i, s_i, s_{i+1}, q_{i+1}, t) = Slerp(Slerp(q_i, q_{i+1}, t), Slerp(s_i, s_{i+1}, t), 2t(1-t))\f]
+     * where
+     * \f[s_i = q_i\exp(-\frac{\log(q^*_iq_{i+1}) + \log(q^*_iq_{i-1})}{4})\f]
+     *
+     * The Squad expression is analogous to the \f$B\acute{e}zier\f$ curve, but involves spherical linear
+     * interpolation instead of simple linear interpolation. Each \f$s_i\f$ needs to be calculated by three
+     * quaternions.
+     *
+     * @param q0 the first quaternion.
+     * @param s0 the second quaternion.
+     * @param s1 the third quaternion.
+     * @param q1 the fourth quaternion.
+     * @param t interpolation parameter of quadratic and linear interpolation over a range \f$[0, 1]\f$.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @param directChange if true, squad will find the nearest path to interpolate.
+     * @sa interPoint, spline
+     */
+    static Quat<_Tp> squad(const Quat<_Tp> &q0, const Quat<_Tp> &s0,
+                            const Quat<_Tp> &s1, const Quat<_Tp> &q1,
+                            const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT,
+                            bool directChange=true);
+
+    /**
+     * @brief This is a part of the squad calculation.
+     * It calculates the intermediate quaternion \f$s_i\f$ from each group of three quaternions
+     * \f[s_i = q_i\exp(-\frac{\log(q^*_iq_{i+1}) + \log(q^*_iq_{i-1})}{4}).\f]
+     * @param q0 the first quaternion.
+     * @param q1 the second quaternion.
+     * @param q2 the third quaternion.
+     * @param assumeUnit if QUAT_ASSUME_UNIT, all input quaternions are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     * @sa squad
+     */
+    static Quat<_Tp> interPoint(const Quat<_Tp> &q0, const Quat<_Tp> &q1,
+                                 const Quat<_Tp> &q2, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief to calculate a quaternion which is the result of a \f$C^1\f$ continuous
+     * spline curve constructed by squad at the ratio t. Here, the interpolation values are
+     * between \f$q_1\f$ and \f$q_2\f$. \f$q_0\f$ and \f$q_3\f$ are used to ensure the \f$C^1\f$
+     * continuity. if t = 0, it returns \f$q_1\f$, if t = 1, it returns \f$q_2\f$.
+     * @param q0 the first input quaternion to ensure \f$C^1\f$ continuity.
+     * @param q1 the second input quaternion.
+     * @param q2 the third input quaternion.
+     * @param q3 the fourth input quaternion, used in the same way as \f$q_0\f$.
+     * @param t ratio over a range [0, 1].
+     * @param assumeUnit if QUAT_ASSUME_UNIT, \f$q_0, q_1, q_2, q_3\f$ are assumed to be unit quaternions. Otherwise, all input
+     * quaternions will be normalized inside the function.
+     *
+     * For example:
+     *
+     * If there are three double quaternions \f$v_0, v_1, v_2\f$ waiting to be interpolated.
+     *
+     * Interpolation between \f$v_0\f$ and \f$v_1\f$ with a ratio \f$t_0\f$ could be calculated as
+     * ```
+     * Quatd::spline(v0, v0, v1, v2, t0);
+     * ```
+     * Interpolation between \f$v_1\f$ and \f$v_2\f$ with a ratio \f$t_0\f$ could be calculated as
+     * ```
+     * Quatd::spline(v0, v1, v2, v2, t0);
+     * ```
+     * @sa squad, slerp
+     */
+    static Quat<_Tp> spline(const Quat<_Tp> &q0, const Quat<_Tp> &q1,
+                            const Quat<_Tp> &q2, const Quat<_Tp> &q3,
+                            const _Tp t, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+    /**
+     * @brief Return opposite quaternion \f$-p\f$
+     * which satisfies \f$p + (-p) = 0.\f$
+     *
+     * For example
+     * ```
+     * Quatd q{1, 2, 3, 4};
+     * std::cout << -q << std::endl; // [-1, -2, -3, -4]
+     * ```
+     */
+    Quat<_Tp> operator-() const;
+
+    /**
+     * @brief return true if two quaternions p and q are nearly equal, i.e. when the absolute
+     * value of each difference \f$p_i - q_i\f$ is less than CV_QUAT_EPS.
+     */
+    bool operator==(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Addition operator of two quaternions p and q.
+     * It returns a new quaternion that each value is the sum of \f$p_i\f$ and \f$q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p + q << std::endl; //[6, 8, 10, 12]
+     * ```
+     */
+    Quat<_Tp> operator+(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Addition assignment operator of two quaternions p and q.
+     * It adds right operand to the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p += q; // equivalent to p = p + q
+     * std::cout << p << std::endl; //[6, 8, 10, 12]
+     *
+     * ```
+     */
+    Quat<_Tp>& operator+=(const Quat<_Tp>&);
+
+    /**
+     * @brief Subtraction operator of two quaternions p and q.
+     * It returns a new quaternion that each value is the sum of \f$p_i\f$ and \f$-q_i\f$.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p - q << std::endl; //[-4, -4, -4, -4]
+     * ```
+     */
+    Quat<_Tp> operator-(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Subtraction assignment operator of two quaternions p and q.
+     * It subtracts right operand from the left operand and assign the result to left operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p -= q; // equivalent to p = p - q
+     * std::cout << p << std::endl; //[-4, -4, -4, -4]
+     *
+     * ```
+     */
+    Quat<_Tp>& operator-=(const Quat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of two quaternions q and p.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p *= q; // equivalent to p = p * q
+     * std::cout << p << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
+    Quat<_Tp>& operator*=(const Quat<_Tp>&);
+
+    /**
+     * @brief Multiplication assignment operator of a quaternion and a scalar.
+     * It multiplies right operand with the left operand and assign the result to left operand.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p *= s; // equivalent to p = p * s
+     * std::cout << p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    Quat<_Tp>& operator*=(const _Tp s);
+
+    /**
+     * @brief Multiplication operator of two quaternions q and p.
+     * Multiplies values on either side of the operator.
+     *
+     * Rule of quaternion multiplication:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * q &= [p_0, \boldsymbol{u}]*[q_0, \boldsymbol{v}]\\
+     * &=[p_0q_0 - \boldsymbol{u}\cdot \boldsymbol{v}, p_0\boldsymbol{v} + q_0\boldsymbol{u}+ \boldsymbol{u}\times \boldsymbol{v}].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     * where \f$\cdot\f$ means dot product and \f$\times \f$ means cross product.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p * q << std::endl; //[-60, 12, 30, 24]
+     * ```
+     */
+    Quat<_Tp> operator*(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Division operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand and returns the result as a new quaternion.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w/s, x/s, y/s, z/s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * Quatd r = p / s; // p itself is left unchanged
+     * std::cout << r << std::endl; //[0.5, 1, 1.5, 2]
+     * ```
+     * @note the type of scalar should be equal to this quaternion.
+     */
+    Quat<_Tp> operator/(const _Tp s) const;
+
+    /**
+     * @brief Division operator of two quaternions p and q.
+     * Divides left hand operand by right hand operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q &= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * std::cout << p / q << std::endl; // equivalent to p * q.inv()
+     * ```
+     */
+    Quat<_Tp> operator/(const Quat<_Tp>&) const;
+
+    /**
+     * @brief Division assignment operator of a quaternion and a scalar.
+     * It divides the left operand by the right operand and assigns the result to the left operand.
+     *
+     * Rule of quaternion division with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / s &= [w, x, y, z] / s\\
+     * &=[w / s, x / s, y / s, z / s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * p /= s; // equivalent to p = p / s
+     * std::cout << p << std::endl; //[0.5, 1.0, 1.5, 2.0]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    Quat<_Tp>& operator/=(const _Tp s);
+
+    /**
+     * @brief Division assignment operator of two quaternions p and q.
+     * It divides left operand with the right operand and assign the result to left operand.
+     *
+     * Rule of quaternion division with a quaternion:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p / q&= p * q.inv()\\
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * Quatd q{5, 6, 7, 8};
+     * p /= q; // equivalent to p = p * q.inv()
+     * std::cout << p << std::endl;
+     * ```
+     */
+    Quat<_Tp>& operator/=(const Quat<_Tp>&);
+
+    _Tp& operator[](std::size_t n);
+
+    const _Tp& operator[](std::size_t n) const;
+
+    /**
+     * @brief Subtraction operator of a scalar and a quaternion.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar - p << std::endl; //[1.0, -2, -3, -4]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator-(const T s, const Quat<T>&);
+
+    /**
+     * @brief Subtraction operator of a quaternion and a scalar.
+     * Subtracts right hand operand from left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p - scalar << std::endl; //[-1.0, 2, 3, 4]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator-(const Quat<T>&, const T s);
+
+    /**
+     * @brief Addition operator of a scalar and a quaternion.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << scalar + p << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator+(const T s, const Quat<T>&);
+
+    /**
+     * @brief Addition operator of a quaternion and a scalar.
+     * Adds the right hand operand to the left hand operand.
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double scalar = 2.0;
+     * std::cout << p + scalar << std::endl; //[3.0, 2, 3, 4]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator+(const Quat<T>&, const T s);
+
+    /**
+     * @brief Multiplication operator of a scalar and a quaternion.
+     * It multiplies the left operand by the right operand and returns the result as a new quaternion.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << s * p << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator*(const T s, const Quat<T>&);
+
+    /**
+     * @brief Multiplication operator of a quaternion and a scalar.
+     * It multiplies the left operand by the right operand and returns the result as a new quaternion.
+     *
+     * Rule of quaternion multiplication with a scalar:
+     * \f[
+     * \begin{equation}
+     * \begin{split}
+     * p * s &= [w, x, y, z] * s\\
+     * &=[w * s, x * s, y * s, z * s].
+     * \end{split}
+     * \end{equation}
+     * \f]
+     *
+     * For example
+     * ```
+     * Quatd p{1, 2, 3, 4};
+     * double s = 2.0;
+     * std::cout << p * s << std::endl; //[2.0, 4.0, 6.0, 8.0]
+     * ```
+     * @note the type of scalar should be equal to the quaternion.
+     */
+    template <typename T>
+    friend Quat<T> cv::operator*(const Quat<T>&, const T s);
+
+    template <typename S>
+    friend std::ostream& cv::operator<<(std::ostream&, const Quat<S>&);
+
+    /**
+     * @brief Transform a quaternion q to Euler angles.
+     *
+     *
+     * When transforming a quaternion \f$q = w + x\boldsymbol{i} + y\boldsymbol{j} + z\boldsymbol{k}\f$ to Euler angles, rotation matrix M can be calculated by:
+     * \f[ \begin{aligned} {M} &={\begin{bmatrix}1-2(y^{2}+z^{2})&2(xy-zw)&2(xz+yw)\\2(xy+zw)&1-2(x^{2}+z^{2})&2(yz-xw)\\2(xz-yw)&2(yz+xw)&1-2(x^{2}+y^{2})\end{bmatrix}}\end{aligned}.\f]
+     * On the other hand, the rotation matrix can be obtained from Euler angles.
+     * Using intrinsic rotations with Euler angles type XYZ as an example,
+     * \f$\theta_1 \f$, \f$\theta_2 \f$, \f$\theta_3 \f$ are three angles for Euler angles, the rotation matrix R can be calculated by:\f[R =X(\theta_1)Y(\theta_2)Z(\theta_3)
+     * ={\begin{bmatrix}\cos\theta_{2}\cos\theta_{3}&-\cos\theta_{2}\sin\theta_{3}&\sin\theta_{2}\\\cos\theta_{1}\sin\theta_{3}+\cos\theta_{3}\sin\theta_{1}\sin\theta_{2}&\cos\theta_{1}\cos\theta_{3}-\sin\theta_{1}\sin\theta_{2}\sin\theta_{3}&-\cos\theta_{2}\sin\theta_{1}\\\sin\theta_{1}\sin\theta_{3}-\cos\theta_{1}\cos\theta_{3}\sin\theta_{2}&\cos\theta_{3}\sin\theta_{1}+\cos\theta_{1}\sin\theta_{2}\sin\theta_{3}&\cos\theta_{1}\cos\theta_{2}\end{bmatrix}}\f]
+     * Rotation matrix M and R are equal. As long as \f$ s_{2} = \sin\theta_{2} \neq \pm 1 \f$, by comparing each element of the two matrices, the solution is \f$\begin{cases} \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = \arcsin(m_{13}) \\\theta_3 = \arctan2(-m_{12},m_{11}) \end{cases}\f$.
+     *
+     * When \f$ s_{2}=1\f$ or \f$ s_{2}=-1\f$, the gimbal lock occurs. The function will prompt "WARNING: Gimbal Lock will occur. Euler angles is non-unique. For intrinsic rotations, we set the third angle to 0, and for external rotation, we set the first angle to 0.".
+     *
+     * When \f$ s_{2}=1\f$ ,
+     * The rotation matrix R is \f$R = {\begin{bmatrix}0&0&1\\\sin(\theta_1+\theta_3)&\cos(\theta_1+\theta_3)&0\\-\cos(\theta_1+\theta_3)&\sin(\theta_1+\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1+\theta_3 = \arctan2(m_{21},m_{22})\\ \theta_2=\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$ \theta_3 = 0\f$, the solution is \f$\begin{cases} \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \end{cases}\f$.
+     *
+     * When \f$ s_{2}=-1\f$,
+     * The rotation matrix R is \f$X_{1}Y_{2}Z_{3}={\begin{bmatrix}0&0&-1\\-\sin(\theta_1-\theta_3)&\cos(\theta_1-\theta_3)&0\\\cos(\theta_1-\theta_3)&\sin(\theta_1-\theta_3)&0\end{bmatrix}}\f$.
+     *
+     * The number of solutions is infinite with the condition \f$\begin{cases} \theta_1-\theta_3 = \arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2 \end{cases}\ \f$.
+     *
+     * We set \f$ \theta_3 = 0\f$, the solution is \f$ \begin{cases}\theta_1=\arctan2(m_{32},m_{22}) \\ \theta_2=-\pi/2\\  \theta_3=0\end{cases}\f$.
+     *
+     * Since \f$ sin \theta\in [-1,1] \f$ and \f$ cos \theta \in [-1,1] \f$, the unnormalized quaternion will cause computational troubles. For this reason, this function will normalize the quaternion at first and @ref QuatAssumeType is not needed.
+     *
+     * When the gimbal lock occurs, we set \f$\theta_3 = 0\f$ for intrinsic rotations or \f$\theta_1 = 0\f$ for extrinsic rotations.
+     *
+     * As a result, for every Euler angles type, we can get solution as shown in the following table.
+     * EulerAnglesType  | Ordinary | \f$\theta_2 = \pi/2\f$ | \f$\theta_2 = -\pi/2\f$
+     * ------------- | -------------| -------------| -------------
+     * INT_XYZ|\f$ \theta_1 = \arctan2(-m_{23},m_{33})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{12},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{32},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_XZY|\f$ \theta_1 = \arctan2(m_{32},m_{22})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{13},m_{11}) \f$|\f$ \theta_1=\arctan2(m_{31},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{23},m_{33})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_YXZ|\f$ \theta_1 = \arctan2(m_{13},m_{33})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{12},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_YZX|\f$ \theta_1 = \arctan2(-m_{31},m_{11})\\\theta_2 = \arcsin(m_{21}) \\\theta_3= \arctan2(-m_{23},m_{22}) \f$|\f$ \theta_1=\arctan2(m_{13},m_{33})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{13},m_{12})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_ZXY|\f$ \theta_1 = \arctan2(-m_{12},m_{22})\\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{31},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * INT_ZYX|\f$ \theta_1 = \arctan2(m_{21},m_{11})\\\theta_2 = \arcsin(-m_{31}) \\\theta_3= \arctan2(m_{32},m_{33}) \f$|\f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi/2\\ \theta_3=0 \f$|\f$ \theta_1=\arctan2(-m_{12},m_{22})\\ \theta_2=-\pi/2\\ \theta_3=0 \f$
+     * EXT_XYZ|\f$ \theta_1 = \arctan2(m_{32},m_{33})\\\theta_2 = \arcsin(-m_{31}) \\\ \theta_3 = \arctan2(m_{21},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{23},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{22}) \f$
+     * EXT_XZY|\f$ \theta_1 = \arctan2(-m_{23},m_{22})\\\theta_2 = \arcsin(m_{21}) \\\theta_3=  \arctan2(-m_{31},m_{11})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{13},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{13},m_{12}) \f$
+     * EXT_YXZ|\f$ \theta_1 = \arctan2(-m_{31},m_{33}) \\\theta_2 = \arcsin(m_{32}) \\\theta_3= \arctan2(-m_{12},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     * EXT_YZX|\f$ \theta_1 = \arctan2(m_{13},m_{11})\\\theta_2 = -\arcsin(m_{12}) \\\theta_3= \arctan2(m_{32},m_{22})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{31},m_{33}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{23},m_{33}) \f$
+     * EXT_ZXY|\f$ \theta_1 = \arctan2(m_{21},m_{22})\\\theta_2 = -\arcsin(m_{23}) \\\theta_3= \arctan2(m_{13},m_{33})\f$|\f$ \theta_1= 0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{12},m_{11}) \f$|\f$ \theta_1= 0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(-m_{12},m_{11}) \f$
+     * EXT_ZYX|\f$ \theta_1 = \arctan2(-m_{12},m_{11})\\\theta_2 = \arcsin(m_{13}) \\\theta_3= \arctan2(-m_{23},m_{33})\f$|\f$ \theta_1=0\\ \theta_2=\pi/2\\ \theta_3=\arctan2(m_{21},m_{22}) \f$|\f$ \theta_1=0\\ \theta_2=-\pi/2\\ \theta_3=\arctan2(m_{32},m_{22}) \f$
+     *
+     *  EulerAnglesType  | Ordinary | \f$\theta_2 = 0\f$ | \f$\theta_2 = \pi\f$
+     * ------------- | -------------| -------------| -------------
+     * INT_XYX| \f$ \theta_1 = \arctan2(m_{21},-m_{31})\\\theta_2 =\arccos(m_{11}) \\\theta_3 = \arctan2(m_{12},m_{13}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{23},m_{22})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_XZX| \f$ \theta_1 = \arctan2(m_{31},m_{21})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{13},-m_{12}) \f$| \f$ \theta_1=\arctan2(m_{32},m_{33})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{32},m_{33})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_YXY| \f$ \theta_1 = \arctan2(m_{12},m_{32})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{21},-m_{23}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(-m_{31},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_YZY| \f$ \theta_1 = \arctan2(m_{32},-m_{12})\\\theta_2 = \arccos(m_{22}) \\\theta_3 =\arctan2(m_{23},m_{21}) \f$| \f$ \theta_1=\arctan2(m_{13},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{13},-m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_ZXZ| \f$ \theta_1 = \arctan2(-m_{13},m_{23})\\\theta_2 = \arccos(m_{33}) \\\theta_3 =\arctan2(m_{31},m_{32}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{22})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * INT_ZYZ| \f$ \theta_1 = \arctan2(m_{23},m_{13})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{32},-m_{31}) \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=0\\ \theta_3=0 \f$| \f$ \theta_1=\arctan2(m_{21},m_{11})\\ \theta_2=\pi\\ \theta_3=0 \f$
+     * EXT_XYX| \f$ \theta_1 = \arctan2(m_{12},m_{13}) \\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{21},-m_{31})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3= \arctan2(m_{23},m_{22}) \f$
+     * EXT_XZX| \f$ \theta_1 = \arctan2(m_{13},-m_{12})\\\theta_2 = \arccos(m_{11}) \\\theta_3 = \arctan2(m_{31},m_{21})\f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{32},m_{33}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{32},m_{33}) \f$
+     * EXT_YXY| \f$ \theta_1 = \arctan2(m_{21},-m_{23})\\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{12},m_{32}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(-m_{31},m_{11}) \f$
+     * EXT_YZY| \f$ \theta_1 = \arctan2(m_{23},m_{21}) \\\theta_2 = \arccos(m_{22}) \\\theta_3 = \arctan2(m_{32},-m_{12}) \f$| \f$ \theta_1= 0\\ \theta_2=0\\ \theta_3=\arctan2(m_{13},m_{11}) \f$| \f$ \theta_1=0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{13},-m_{11}) \f$
+     * EXT_ZXZ| \f$ \theta_1 = \arctan2(m_{31},m_{32}) \\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(-m_{13},m_{23})\f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{22}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     * EXT_ZYZ| \f$ \theta_1 = \arctan2(m_{32},-m_{31})\\\theta_2 = \arccos(m_{33}) \\\theta_3 = \arctan2(m_{23},m_{13}) \f$| \f$ \theta_1=0\\ \theta_2=0\\ \theta_3=\arctan2(m_{21},m_{11}) \f$| \f$ \theta_1= 0\\ \theta_2=\pi\\ \theta_3=\arctan2(m_{21},m_{11}) \f$
+     *
+     * @param eulerAnglesType the conversion Euler angles type
+     */
+
+    Vec<_Tp, 3> toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType);
+
+    _Tp w, x, y, z;
+
+};
+
+// Non-member declarations that accompany Quat. The matching definitions live
+// in quaternion.inl.hpp; the named math functions there forward to the Quat
+// member function of the same name.
+
+// Free-function form of q.inv(assumeUnit).
+template <typename T>
+Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+// Free-function form of q.sinh().
+template <typename T>
+Quat<T> sinh(const Quat<T> &q);
+
+// Free-function form of q.cosh().
+template <typename T>
+Quat<T> cosh(const Quat<T> &q);
+
+// Free-function form of q.tanh().
+template <typename T>
+Quat<T> tanh(const Quat<T> &q);
+
+// Free-function form of q.sin().
+template <typename T>
+Quat<T> sin(const Quat<T> &q);
+
+// Free-function form of q.cos().
+template <typename T>
+Quat<T> cos(const Quat<T> &q);
+
+// Free-function form of q.tan().
+template <typename T>
+Quat<T> tan(const Quat<T> &q);
+
+// Free-function form of q.asinh().
+template <typename T>
+Quat<T> asinh(const Quat<T> &q);
+
+// Free-function form of q.acosh().
+template <typename T>
+Quat<T> acosh(const Quat<T> &q);
+
+// Free-function form of q.atanh().
+template <typename T>
+Quat<T> atanh(const Quat<T> &q);
+
+// Free-function form of q.asin().
+template <typename T>
+Quat<T> asin(const Quat<T> &q);
+
+// Free-function form of q.acos().
+template <typename T>
+Quat<T> acos(const Quat<T> &q);
+
+// Free-function form of q.atan().
+template <typename T>
+Quat<T> atan(const Quat<T> &q);
+
+// Quaternion raised to a quaternion exponent: forwards to q.power(p, assumeUnit).
+template <typename T>
+Quat<T> power(const Quat<T> &q, const Quat<T> &p, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+// Free-function form of q.exp().
+template <typename T>
+Quat<T> exp(const Quat<T> &q);
+
+// Free-function form of q.log(assumeUnit).
+template <typename T>
+Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+// Quaternion raised to a scalar exponent: forwards to q.power(x, assumeUnit).
+// Both arguments deduce T, so the scalar must already have type T.
+template <typename T>
+Quat<T> power(const Quat<T>& q, const T x, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+// Free-function form of p.crossProduct(q).
+template <typename T>
+Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q);
+
+// Free-function form of q.sqrt(assumeUnit).
+template <typename S>
+Quat<S> sqrt(const Quat<S> &q, QuatAssumeType assumeUnit=QUAT_ASSUME_NOT_UNIT);
+
+// Scalar * quaternion: every component is multiplied by the scalar.
+template <typename T>
+Quat<T> operator*(const T, const Quat<T>&);
+
+// Quaternion * scalar: every component is multiplied by the scalar.
+template <typename T>
+Quat<T> operator*(const Quat<T>&, const T);
+
+// Streams the text "Quat " followed by the (w, x, y, z) components as a Vec.
+template <typename S>
+std::ostream& operator<<(std::ostream&, const Quat<S>&);
+
+using Quatd = Quat<double>;
+using Quatf = Quat<float>;
+
+//! @} core
+}
+
+#include "opencv2/core/quaternion.inl.hpp"
+
+#endif /* OPENCV_CORE_QUATERNION_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.inl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.inl.hpp
new file mode 100644
index 0000000..29a16d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/quaternion.inl.hpp
@@ -0,0 +1,1063 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2020, Huawei Technologies Co., Ltd. All rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Liangqian Kong <chargerKong@126.com>
+//         Longbu Wang <riskiest@gmail.com>
+
+#ifndef OPENCV_CORE_QUATERNION_INL_HPP
+#define OPENCV_CORE_QUATERNION_INL_HPP
+
+#ifndef OPENCV_CORE_QUATERNION_HPP
+#erorr This is not a standalone header. Include quaternion.hpp instead.
+#endif
+
+//@cond IGNORE
+///////////////////////////////////////////////////////////////////////////////////////
+//Implementation
+namespace cv {
+
+// Default constructor: all four components start at zero.
+template <typename T>
+Quat<T>::Quat()
+    : w(T(0)), x(T(0)), y(T(0)), z(T(0))
+{
+}
+
+// Builds a quaternion from a 4-vector laid out as (w, x, y, z).
+template <typename T>
+Quat<T>::Quat(const Vec<T, 4> &coeff)
+    : w(coeff(0)), x(coeff(1)), y(coeff(2)), z(coeff(3))
+{
+}
+
+// Builds a quaternion from its four components given individually.
+template <typename T>
+Quat<T>::Quat(const T qw, const T qx, const T qy, const T qz)
+    : w(qw), x(qx), y(qy), z(qz)
+{
+}
+
+// Creates the quaternion (cos(angle/2), sin(angle/2) * axis / |axis|).
+// The axis does not need to be normalised, but raises StsBadArg when its
+// length is below CV_QUAT_EPS.
+template <typename T>
+Quat<T> Quat<T>::createFromAngleAxis(const T angle, const Vec<T, 3> &axis)
+{
+    const T axisLength = std::sqrt(axis.dot(axis));
+    if (axisLength < CV_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "this quaternion does not represent a rotation");
+    }
+    const T halfAngle = angle * T(0.5);
+    // sin(angle/2) divided by the axis length normalises the axis on the fly.
+    const T scale = std::sin(halfAngle) / axisLength;
+    return Quat<T>(std::cos(halfAngle), scale * axis[0], scale * axis[1], scale * axis[2]);
+}
+
+// Builds a quaternion from a 3x3 matrix.
+// The element type of _R must match T, and the matrix must be exactly 3x3
+// (otherwise StsBadArg is raised). Which component is computed from the
+// square root depends on the trace and on the largest diagonal entry, so
+// that S, the divisor of the other three components, is taken from the
+// dominant term.
+template <typename T>
+Quat<T> Quat<T>::createFromRotMat(InputArray _R)
+{
+    CV_CheckTypeEQ(_R.type(), cv::traits::Type<T>::value, "");
+    if (_R.rows() != 3 || _R.cols() != 3)
+    {
+        CV_Error(Error::StsBadArg, "Cannot convert matrix to quaternion: rotation matrix should be a 3x3 matrix");
+    }
+    Matx<T, 3, 3> R;
+    _R.copyTo(R);
+
+    T S, w, x, y, z;
+    T trace = R(0, 0) + R(1, 1) + R(2, 2);
+    if (trace > 0)
+    {
+        // Positive trace: S is derived from trace + 1 and w from S.
+        S = std::sqrt(trace + 1) * T(2);
+        x = (R(1, 2) - R(2, 1)) / S;
+        y = (R(2, 0) - R(0, 2)) / S;
+        z = (R(0, 1) - R(1, 0)) / S;
+        w = -T(0.25) * S;
+    }
+    else if (R(0, 0) > R(1, 1) && R(0, 0) > R(2, 2))
+    {
+
+        // R(0, 0) is the largest diagonal entry: x is derived from S.
+        S = std::sqrt(T(1.0) + R(0, 0) - R(1, 1) - R(2, 2)) * T(2);
+        x = -T(0.25) * S;
+        y = -(R(1, 0) + R(0, 1)) / S;
+        z = -(R(0, 2) + R(2, 0)) / S;
+        w = (R(1, 2) - R(2, 1)) / S;
+    }
+    else if (R(1, 1) > R(2, 2))
+    {
+        // R(1, 1) is the largest diagonal entry: y is derived from S.
+        S = std::sqrt(T(1.0) - R(0, 0) + R(1, 1) - R(2, 2)) * T(2);
+        x = (R(0, 1) + R(1, 0)) / S;
+        y = T(0.25) * S;
+        z = (R(1, 2) + R(2, 1)) / S;
+        w = (R(0, 2) - R(2, 0)) / S;
+    }
+    else
+    {
+        // Remaining case: z is derived from S.
+        S = std::sqrt(T(1.0) - R(0, 0) - R(1, 1) + R(2, 2)) * T(2);
+        x = (R(0, 2) + R(2, 0)) / S;
+        y = (R(1, 2) + R(2, 1)) / S;
+        z = T(0.25) * S;
+        w = -(R(0, 1) - R(1, 0)) / S;
+    }
+    return Quat<T> (w, x, y, z);
+}
+
+// Builds a quaternion from a rotation vector (3x1 or 1x3).
+// The vector's length is used as the rotation angle and its direction as the
+// axis. A vector shorter than CV_QUAT_EPS yields the identity (1, 0, 0, 0).
+template <typename T>
+Quat<T> Quat<T>::createFromRvec(InputArray _rvec)
+{
+    const bool isColumn = _rvec.cols() == 1 && _rvec.rows() == 3;
+    const bool isRow = _rvec.cols() == 3 && _rvec.rows() == 1;
+    if (!isColumn && !isRow)
+    {
+        CV_Error(Error::StsBadArg, "Cannot convert rotation vector to quaternion: The length of rotation vector should be 3");
+    }
+
+    Vec<T, 3> rotVec;
+    _rvec.copyTo(rotVec);
+
+    const T theta = std::sqrt(rotVec.dot(rotVec));
+    if (abs(theta) < CV_QUAT_EPS)
+    {
+        return Quat<T>(1, 0, 0, 0);
+    }
+
+    Vec<T, 3> unitAxis = rotVec / theta;
+    return createFromAngleAxis(theta, unitAxis);
+}
+
+// Unary minus: negates all four components.
+template <typename T>
+inline Quat<T> Quat<T>::operator-() const
+{
+    const Quat<T> negated(-w, -x, -y, -z);
+    return negated;
+}
+
+
+// Approximate equality: true when every component differs from its
+// counterpart by less than CV_QUAT_EPS.
+template <typename T>
+inline bool Quat<T>::operator==(const Quat<T> &q) const
+{
+    if (!(abs(w - q.w) < CV_QUAT_EPS))
+        return false;
+    if (!(abs(x - q.x) < CV_QUAT_EPS))
+        return false;
+    if (!(abs(y - q.y) < CV_QUAT_EPS))
+        return false;
+    return abs(z - q.z) < CV_QUAT_EPS;
+}
+
+// Component-wise sum of two quaternions.
+template <typename T>
+inline Quat<T> Quat<T>::operator+(const Quat<T> &q1) const
+{
+    const T sumW = w + q1.w;
+    const T sumX = x + q1.x;
+    const T sumY = y + q1.y;
+    const T sumZ = z + q1.z;
+    return Quat<T>(sumW, sumX, sumY, sumZ);
+}
+
+// scalar + quaternion: the scalar is added to the real part only.
+template <typename T>
+inline Quat<T> operator+(const T a, const Quat<T>& q)
+{
+    const T realPart = q.w + a;
+    return Quat<T>(realPart, q.x, q.y, q.z);
+}
+
+// quaternion + scalar: the scalar is added to the real part only.
+template <typename T>
+inline Quat<T> operator+(const Quat<T>& q, const T a)
+{
+    const T realPart = q.w + a;
+    return Quat<T>(realPart, q.x, q.y, q.z);
+}
+
+// scalar - quaternion: the real part becomes a - w and the vector part is negated.
+template <typename T>
+inline Quat<T> operator-(const T a, const Quat<T>& q)
+{
+    const T realPart = a - q.w;
+    return Quat<T>(realPart, -q.x, -q.y, -q.z);
+}
+
+// quaternion - scalar: the scalar is subtracted from the real part only.
+template <typename T>
+inline Quat<T> operator-(const Quat<T>& q, const T a)
+{
+    const T realPart = q.w - a;
+    return Quat<T>(realPart, q.x, q.y, q.z);
+}
+
+// Component-wise difference of two quaternions.
+template <typename T>
+inline Quat<T> Quat<T>::operator-(const Quat<T> &q1) const
+{
+    const T diffW = w - q1.w;
+    const T diffX = x - q1.x;
+    const T diffY = y - q1.y;
+    const T diffZ = z - q1.z;
+    return Quat<T>(diffW, diffX, diffY, diffZ);
+}
+
+// In-place component-wise addition; returns *this for chaining.
+template <typename T>
+inline Quat<T>& Quat<T>::operator+=(const Quat<T> &q1)
+{
+    w = w + q1.w;
+    x = x + q1.x;
+    y = y + q1.y;
+    z = z + q1.z;
+    return *this;
+}
+
+// In-place component-wise subtraction; returns *this for chaining.
+template <typename T>
+inline Quat<T>& Quat<T>::operator-=(const Quat<T> &q1)
+{
+    w = w - q1.w;
+    x = x - q1.x;
+    y = y - q1.y;
+    z = z - q1.z;
+    return *this;
+}
+
+// Quaternion product, delegated to the product of the two operands viewed
+// as 4-vectors in (w, x, y, z) order.
+template <typename T>
+inline Quat<T> Quat<T>::operator*(const Quat<T> &q1) const
+{
+    return Quat<T>(Vec<T, 4>{w, x, y, z} * Vec<T, 4>{q1.w, q1.x, q1.y, q1.z});
+}
+
+
+// quaternion * scalar: every component is multiplied by a.
+template <typename T>
+Quat<T> operator*(const Quat<T> &q1, const T a)
+{
+    const T sw = a * q1.w;
+    const T sx = a * q1.x;
+    const T sy = a * q1.y;
+    const T sz = a * q1.z;
+    return Quat<T>(sw, sx, sy, sz);
+}
+
+// scalar * quaternion: every component is multiplied by a.
+template <typename T>
+Quat<T> operator*(const T a, const Quat<T> &q1)
+{
+    const T sw = a * q1.w;
+    const T sx = a * q1.x;
+    const T sy = a * q1.y;
+    const T sz = a * q1.z;
+    return Quat<T>(sw, sx, sy, sz);
+}
+
+// In-place quaternion product: *this = *this * q1.
+// All four results are computed from the old components before any of
+// them is overwritten.
+template <typename T>
+inline Quat<T>& Quat<T>::operator*=(const Quat<T> &q1)
+{
+    const T newW = w * q1.w - x * q1.x - y * q1.y - z * q1.z;
+    const T newX = x * q1.w + w * q1.x + y * q1.z - z * q1.y;
+    const T newY = y * q1.w + w * q1.y + z * q1.x - x * q1.z;
+    const T newZ = z * q1.w + w * q1.z + x * q1.y - y * q1.x;
+    w = newW;
+    x = newX;
+    y = newY;
+    z = newZ;
+    return *this;
+}
+
+// In-place division by a quaternion: *this = *this * q1.inv().
+template <typename T>
+inline Quat<T>& Quat<T>::operator/=(const Quat<T> &q1)
+{
+    const Quat<T> quotient(*this * q1.inv());
+    *this = quotient;
+    return *this;
+}
+// In-place scaling: every component is multiplied by q1.
+template <typename T>
+Quat<T>& Quat<T>::operator*=(const T q1)
+{
+    w = w * q1;
+    x = x * q1;
+    y = y * q1;
+    z = z * q1;
+    return *this;
+}
+
+// In-place division by a scalar; returns *this for chaining.
+// The reciprocal is formed once in T, matching operator/(const T), so no
+// intermediate double is computed and narrowed when T is float.
+// No check is made for a == 0.
+template <typename T>
+inline Quat<T>& Quat<T>::operator/=(const T a)
+{
+    const T a_inv = T(1.0) / a;
+    w *= a_inv;
+    x *= a_inv;
+    y *= a_inv;
+    z *= a_inv;
+    return *this;
+}
+
+// Division by a scalar, implemented as multiplication by its reciprocal.
+// No check is made for a == 0.
+template <typename T>
+inline Quat<T> Quat<T>::operator/(const T a) const
+{
+    const T reciprocal = T(1.0) / a;
+    const T qw = w * reciprocal;
+    const T qx = x * reciprocal;
+    const T qy = y * reciprocal;
+    const T qz = z * reciprocal;
+    return Quat<T>(qw, qx, qy, qz);
+}
+
+// Division by a quaternion: multiplies *this by the inverse of q on the right.
+template <typename T>
+inline Quat<T> Quat<T>::operator/(const Quat<T> &q) const
+{
+    const Quat<T> inverse = q.inv();
+    return *this * inverse;
+}
+
+// Read-only component access: 0 -> w, 1 -> x, 2 -> y, 3 -> z.
+// Any other index raises StsOutOfRange.
+template <typename T>
+inline const T& Quat<T>::operator[](std::size_t n) const
+{
+    if (n == 0)
+        return w;
+    if (n == 1)
+        return x;
+    if (n == 2)
+        return y;
+    if (n == 3)
+        return z;
+    CV_Error(Error::StsOutOfRange, "subscript exceeds the index range");
+}
+
+// Mutable component access: 0 -> w, 1 -> x, 2 -> y, 3 -> z.
+// Any other index raises StsOutOfRange.
+template <typename T>
+inline T& Quat<T>::operator[](std::size_t n)
+{
+    if (n == 0)
+        return w;
+    if (n == 1)
+        return x;
+    if (n == 2)
+        return y;
+    if (n == 3)
+        return z;
+    CV_Error(Error::StsOutOfRange, "subscript exceeds the index range");
+}
+
+// Writes "Quat " followed by the components printed as a 4-vector (w, x, y, z).
+template <typename T>
+std::ostream & operator<<(std::ostream &os, const Quat<T> &q)
+{
+    os << "Quat ";
+    os << Vec<T, 4>{q.w, q.x, q.y, q.z};
+    return os;
+}
+
+// Returns component `index` by value; same indexing and range check as
+// the const operator[].
+template <typename T>
+inline T Quat<T>::at(size_t index) const
+{
+    return this->operator[](index);
+}
+
+// Conjugate: keeps the real part and negates the vector part.
+template <typename T>
+inline Quat<T> Quat<T>::conjugate() const
+{
+    const Quat<T> conj(w, -x, -y, -z);
+    return conj;
+}
+
+// Euclidean norm: square root of the quaternion's dot product with itself.
+template <typename T>
+inline T Quat<T>::norm() const
+{
+    const T squared = dot(*this);
+    return std::sqrt(squared);
+}
+
+// Free-function form of Quat::exp().
+template <typename T>
+Quat<T> exp(const Quat<T> &q)
+{
+    return q.exp();
+}
+
+// Quaternion exponential: e^w * (cos|v|, v * sin|v| / |v|), where v is the
+// vector part. When |v| < CV_QUAT_EPS the vector part is left unscaled.
+template <typename T>
+Quat<T> Quat<T>::exp() const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    T scale;
+    if (imagNorm < CV_QUAT_EPS)
+    {
+        scale = 1;
+    }
+    else
+    {
+        scale = std::sin(imagNorm) / imagNorm;
+    }
+    const Quat<T> direction(std::cos(imagNorm), imag[0] * scale, imag[1] * scale, imag[2] * scale);
+    return std::exp(w) * direction;
+}
+
+// Free-function form of Quat::log().
+template <typename T>
+Quat<T> log(const Quat<T> &q, QuatAssumeType assumeUnit)
+{
+    return q.log(assumeUnit);
+}
+
+// Quaternion logarithm.
+// With assumeUnit set the real part of the result is 0 and acos(w) is used
+// directly. Otherwise the real part is log(|q|) and acos(w / |q|) is used;
+// StsBadArg is raised when |q| < CV_QUAT_EPS. In both cases the vector part
+// is left unscaled when its length is below CV_QUAT_EPS.
+template <typename T>
+Quat<T> Quat<T>::log(QuatAssumeType assumeUnit) const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    const bool tinyImag = imagNorm < CV_QUAT_EPS;
+
+    if (assumeUnit)
+    {
+        T unitScale;
+        if (tinyImag)
+            unitScale = 1;
+        else
+            unitScale = std::acos(w) / imagNorm;
+        return Quat<T>(0, imag[0] * unitScale, imag[1] * unitScale, imag[2] * unitScale);
+    }
+
+    const T fullNorm = norm();
+    if (fullNorm < CV_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "Cannot apply this quaternion to log function: undefined");
+    }
+    T scale;
+    if (tinyImag)
+        scale = 1;
+    else
+        scale = std::acos(w / fullNorm) / imagNorm;
+    return Quat<T>(std::log(fullNorm), imag[0] * scale, imag[1] * scale, imag[2] * scale);
+}
+
+// Free-function form of Quat::power(alpha, assumeUnit).
+template <typename T>
+inline Quat<T> power(const Quat<T> &q1, const T alpha, QuatAssumeType assumeUnit)
+{
+    return q1.power(alpha, assumeUnit);
+}
+
+// Raises the quaternion to the scalar power alpha.
+// When the squared length of the vector part exceeds CV_QUAT_EPS, the
+// rotation angle is multiplied by alpha about the same axis, and (unless
+// assumeUnit is set) the result is scaled by norm()^alpha.
+template <typename T>
+inline Quat<T> Quat<T>::power(const T alpha, QuatAssumeType assumeUnit) const
+{
+    // Note: the squared length of the vector part is compared with CV_QUAT_EPS here.
+    if (x * x + y * y + z * z > CV_QUAT_EPS)
+    {
+        T angle = getAngle(assumeUnit);
+        Vec<T, 3> axis = getAxis(assumeUnit);
+        if (assumeUnit)
+        {
+            return createFromAngleAxis(alpha * angle, axis);
+        }
+        return std::pow(norm(), alpha) * createFromAngleAxis(alpha * angle, axis);
+    }
+    else
+    {
+        // (Nearly) real quaternion: no axis can be extracted.
+        // NOTE(review): this multiplies norm()^alpha by the unnormalised
+        // quaternion, so a non-unit real input such as (4, 0, 0, 0) with
+        // alpha = 0.5 yields (8, 0, 0, 0) - confirm whether that is intended.
+        return std::pow(norm(), alpha) * Quat<T>(w, x, y, z);
+    }
+}
+
+
+// Free-function form of Quat::sqrt().
+template <typename T>
+inline Quat<T> sqrt(const Quat<T> &q, QuatAssumeType assumeUnit)
+{
+    return q.sqrt(assumeUnit);
+}
+
+// Square root, defined as the quaternion raised to the power 0.5.
+template <typename T>
+inline Quat<T> Quat<T>::sqrt(QuatAssumeType assumeUnit) const
+{
+    const Quat<T> root = power(0.5, assumeUnit);
+    return root;
+}
+
+
+// Free-function form of Quat::power(q, assumeUnit): p raised to the power q.
+template <typename T>
+inline Quat<T> power(const Quat<T> &p, const Quat<T> &q, QuatAssumeType assumeUnit)
+{
+    return p.power(q, assumeUnit);
+}
+
+
+// Raises this quaternion to the quaternion power q, computed as
+// exp(q * log(*this)).
+template <typename T>
+inline Quat<T> Quat<T>::power(const Quat<T> &q, QuatAssumeType assumeUnit) const
+{
+    const Quat<T> exponent = q * log(assumeUnit);
+    return cv::exp(exponent);
+}
+
+// Four-dimensional dot product of the two quaternions' components.
+template <typename T>
+inline T Quat<T>::dot(Quat<T> q1) const
+{
+    const T ww = w * q1.w;
+    const T xx = x * q1.x;
+    const T yy = y * q1.y;
+    const T zz = z * q1.z;
+    return ww + xx + yy + zz;
+}
+
+
+// Free-function form of Quat::crossProduct().
+template <typename T>
+inline Quat<T> crossProduct(const Quat<T> &p, const Quat<T> &q)
+{
+    return p.crossProduct(q);
+}
+
+
+// Cross product of the two vector parts, returned as a quaternion with a
+// zero real part. The real parts of the operands are ignored.
+template <typename T>
+inline Quat<T> Quat<T>::crossProduct(const Quat<T> &q) const
+{
+    const T cx = y * q.z - z * q.y;
+    const T cy = z * q.x - x * q.z;
+    const T cz = x * q.y - q.x * y;
+    return Quat<T>(0, cx, cy, cz);
+}
+
+// Returns a copy scaled to unit norm.
+// Raises StsBadArg when the norm is below CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::normalize() const
+{
+    const T length = norm();
+    if (length < CV_QUAT_EPS)
+    {
+        CV_Error(Error::StsBadArg, "Cannot normalize this quaternion: the norm is too small.");
+    }
+    const T nw = w / length;
+    const T nx = x / length;
+    const T ny = y / length;
+    const T nz = z / length;
+    return Quat<T>(nw, nx, ny, nz);
+}
+
+// Free-function form of Quat::inv().
+template <typename T>
+inline Quat<T> inv(const Quat<T> &q, QuatAssumeType assumeUnit)
+{
+    return q.inv(assumeUnit);
+}
+
+
+// Multiplicative inverse.
+// With assumeUnit set this is just the conjugate. Otherwise it is the
+// conjugate divided by the squared norm, and StsBadArg is raised when the
+// squared norm is below CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::inv(QuatAssumeType assumeUnit) const
+{
+    if (!assumeUnit)
+    {
+        const T squaredNorm = dot(*this);
+        if (squaredNorm < CV_QUAT_EPS)
+        {
+            CV_Error(Error::StsBadArg, "This quaternion do not have inverse quaternion");
+        }
+        return conjugate() / squaredNorm;
+    }
+    return conjugate();
+}
+
+// Free-function form of Quat::sinh().
+template <typename T>
+inline Quat<T> sinh(const Quat<T> &q)
+{
+    return q.sinh();
+}
+
+
+// Hyperbolic sine: real part sinh(w) * cos|v|, vector part
+// v * cosh(w) * sin|v| / |v|. The vector part is left unscaled when
+// |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::sinh() const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    T scale;
+    if (imagNorm < CV_QUAT_EPS)
+        scale = 1;
+    else
+        scale = std::cosh(w) * std::sin(imagNorm) / imagNorm;
+    const T realPart = std::sinh(w) * std::cos(imagNorm);
+    return Quat<T>(realPart, imag[0] * scale, imag[1] * scale, imag[2] * scale);
+}
+
+
+// Free-function form of Quat::cosh().
+template <typename T>
+inline Quat<T> cosh(const Quat<T> &q)
+{
+    return q.cosh();
+}
+
+
+// Hyperbolic cosine: real part cosh(w) * cos|v|, vector part
+// v * sinh(w) * sin|v| / |v|. The vector part is left unscaled when
+// |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::cosh() const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    T scale;
+    if (imagNorm < CV_QUAT_EPS)
+        scale = 1;
+    else
+        scale = std::sinh(w) * std::sin(imagNorm) / imagNorm;
+    const T realPart = std::cosh(w) * std::cos(imagNorm);
+    return Quat<T>(realPart, imag[0] * scale, imag[1] * scale, imag[2] * scale);
+}
+
+// Free-function form of Quat::tanh().
+template <typename T>
+inline Quat<T> tanh(const Quat<T> &q)
+{
+    return q.tanh();
+}
+
+// Hyperbolic tangent: sinh() multiplied on the right by the inverse of cosh().
+template <typename T>
+inline Quat<T> Quat<T>::tanh() const
+{
+    const Quat<T> numerator = sinh();
+    const Quat<T> denominator = cosh();
+    return numerator * denominator.inv();
+}
+
+
+// Free-function form of Quat::sin().
+template <typename T>
+inline Quat<T> sin(const Quat<T> &q)
+{
+    return q.sin();
+}
+
+
+// Sine: real part sin(w) * cosh|v|, vector part v * cos(w) * sinh|v| / |v|.
+// The vector part is left unscaled when |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::sin() const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    T scale;
+    if (imagNorm < CV_QUAT_EPS)
+        scale = 1;
+    else
+        scale = std::cos(w) * std::sinh(imagNorm) / imagNorm;
+    const T realPart = std::sin(w) * std::cosh(imagNorm);
+    return Quat<T>(realPart, imag[0] * scale, imag[1] * scale, imag[2] * scale);
+}
+
+// Free-function form of Quat::cos().
+template <typename T>
+inline Quat<T> cos(const Quat<T> &q)
+{
+    return q.cos();
+}
+
+// Cosine: real part cos(w) * cosh|v|, vector part -v * sin(w) * sinh|v| / |v|.
+// When |v| < CV_QUAT_EPS the vector part is only negated, not scaled.
+template <typename T>
+inline Quat<T> Quat<T>::cos() const
+{
+    const Vec<T, 3> imag{x, y, z};
+    const T imagNorm = std::sqrt(imag.dot(imag));
+    T scale;
+    if (imagNorm < CV_QUAT_EPS)
+        scale = 1;
+    else
+        scale = std::sin(w) * std::sinh(imagNorm) / imagNorm;
+    const T realPart = std::cos(w) * std::cosh(imagNorm);
+    return Quat<T>(realPart, -imag[0] * scale, -imag[1] * scale, -imag[2] * scale);
+}
+
+// Free-function form of Quat::tan().
+template <typename T>
+inline Quat<T> tan(const Quat<T> &q)
+{
+    return q.tan();
+}
+
+// Tangent: sin() multiplied on the right by the inverse of cos().
+template <typename T>
+inline Quat<T> Quat<T>::tan() const
+{
+    const Quat<T> numerator = sin();
+    const Quat<T> denominator = cos();
+    return numerator * denominator.inv();
+}
+
+// Free-function form of Quat::asinh().
+template <typename T>
+inline Quat<T> asinh(const Quat<T> &q)
+{
+    return q.asinh();
+}
+
+// Inverse hyperbolic sine: log(q + (q*q + 1)^0.5).
+// The exponent is written as T(0.5) because cv::power deduces T from both
+// arguments; a plain double literal makes that deduction fail for
+// Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::asinh() const
+{
+    return cv::log(*this + cv::power(*this * *this + Quat<T>(1, 0, 0, 0), T(0.5)));
+}
+
+// Free-function form of Quat::acosh().
+template <typename T>
+inline Quat<T> acosh(const Quat<T> &q)
+{
+    return q.acosh();
+}
+
+// Inverse hyperbolic cosine: log(q + (q*q - 1)^0.5).
+// The exponent is written as T(0.5) because cv::power deduces T from both
+// arguments; a plain double literal makes that deduction fail for
+// Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::acosh() const
+{
+    return cv::log(*this + cv::power(*this * *this - Quat<T>(1,0,0,0), T(0.5)));
+}
+
+// Free-function form of Quat::atanh().
+template <typename T>
+inline Quat<T> atanh(const Quat<T> &q)
+{
+    return q.atanh();
+}
+
+// Inverse hyperbolic tangent: 0.5 * (log(1 + q) - log(1 - q)).
+// The factor is written as T(0.5) because operator*(T, Quat<T>) deduces T
+// from both operands; a plain double literal makes that deduction fail for
+// Quat<float>.
+template <typename T>
+inline Quat<T> Quat<T>::atanh() const
+{
+    Quat<T> ident(1, 0, 0, 0);
+    Quat<T> c1 = (ident + *this).log();
+    Quat<T> c2 = (ident - *this).log();
+    return T(0.5) * (c1 - c2);
+}
+
+// Free-function form of Quat::asin().
+template <typename T>
+inline Quat<T> asin(const Quat<T> &q)
+{
+    return q.asin();
+}
+
+// Inverse sine: -(v/|v|) * asinh(q * v/|v|), where v is the pure-vector
+// part of q. The division by |v| is replaced by a division by 1 when
+// |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::asin() const
+{
+    const Quat<T> imag(0, x, y, z);
+    const T imagNorm = imag.norm();
+    T divisor;
+    if (imagNorm < CV_QUAT_EPS)
+        divisor = 1;
+    else
+        divisor = imagNorm;
+    const Quat<T> negUnit = -imag / divisor;
+    const Quat<T> argument = *this * imag / divisor;
+    return negUnit * argument.asinh();
+}
+
+// Free-function form of Quat::acos().
+template <typename T>
+inline Quat<T> acos(const Quat<T> &q)
+{
+    return q.acos();
+}
+
+// Inverse cosine: -(v/|v|) * acosh(q), where v is the pure-vector part of
+// q. The division by |v| is replaced by a division by 1 when
+// |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::acos() const
+{
+    const Quat<T> imag(0, x, y, z);
+    const T imagNorm = imag.norm();
+    T divisor;
+    if (imagNorm < CV_QUAT_EPS)
+        divisor = 1;
+    else
+        divisor = imagNorm;
+    const Quat<T> negUnit = -imag / divisor;
+    return negUnit * acosh();
+}
+
+// Free-function form of Quat::atan().
+template <typename T>
+inline Quat<T> atan(const Quat<T> &q)
+{
+    return q.atan();
+}
+
+// Inverse tangent: -(v/|v|) * atanh(q * v/|v|), where v is the pure-vector
+// part of q. The division by |v| is replaced by a division by 1 when
+// |v| < CV_QUAT_EPS.
+template <typename T>
+inline Quat<T> Quat<T>::atan() const
+{
+    const Quat<T> imag(0, x, y, z);
+    const T imagNorm = imag.norm();
+    T divisor;
+    if (imagNorm < CV_QUAT_EPS)
+        divisor = 1;
+    else
+        divisor = imagNorm;
+    const Quat<T> negUnit = -imag / divisor;
+    const Quat<T> argument = *this * imag / divisor;
+    return negUnit * argument.atanh();
+}
+
+// Rotation angle: 2 * acos(w) for a unit quaternion, 2 * acos(w / |q|)
+// otherwise. Raises StsBadArg when assumeUnit is not set and |q| is below
+// CV_QUAT_EPS.
+template <typename T>
+inline T Quat<T>::getAngle(QuatAssumeType assumeUnit) const
+{
+    if (!assumeUnit)
+    {
+        const T length = norm();
+        if (length < CV_QUAT_EPS)
+        {
+            CV_Error(Error::StsBadArg, "This quaternion does not represent a rotation");
+        }
+        return 2 * std::acos(w / length);
+    }
+    return 2 * std::acos(w);
+}
+
+// Rotation axis: the vector part divided by sin(angle / 2), and additionally
+// by the norm when assumeUnit is not set. No guard is applied for
+// sin(angle / 2) == 0.
+template <typename T>
+inline Vec<T, 3> Quat<T>::getAxis(QuatAssumeType assumeUnit) const
+{
+    const T theta = getAngle(assumeUnit);
+    const T sinHalf = std::sin(theta * 0.5);
+    const Vec<T, 3> imag{x, y, z};
+    if (!assumeUnit)
+    {
+        return imag / (norm() * sinHalf);
+    }
+    return imag / sinHalf;
+}
+
+// Converts to a 4x4 matrix: the 3x3 rotation block in the upper-left
+// corner, zeros elsewhere in the last row and column, and 1 in the
+// bottom-right corner. The quaternion is normalised first unless
+// assumeUnit is set.
+template <typename T>
+Matx<T, 4, 4> Quat<T>::toRotMat4x4(QuatAssumeType assumeUnit) const
+{
+    T qw = w, qx = x, qy = y, qz = z;
+    if (!assumeUnit)
+    {
+        const Quat<T> unit = normalize();
+        qw = unit.w;
+        qx = unit.x;
+        qy = unit.y;
+        qz = unit.z;
+    }
+    Matx<T, 4, 4> rot{
+        1 - 2 * (qy * qy + qz * qz), 2 * (qx * qy - qw * qz)    , 2 * (qx * qz + qw * qy)    , 0,
+        2 * (qx * qy + qw * qz)    , 1 - 2 * (qx * qx + qz * qz), 2 * (qy * qz - qw * qx)    , 0,
+        2 * (qx * qz - qw * qy)    , 2 * (qy * qz + qw * qx)    , 1 - 2 * (qx * qx + qy * qy), 0,
+        0                          , 0                          , 0                          , 1,
+    };
+    return rot;
+}
+
+// Converts to a 3x3 rotation matrix. The quaternion is normalised first
+// unless assumeUnit is set.
+template <typename T>
+Matx<T, 3, 3> Quat<T>::toRotMat3x3(QuatAssumeType assumeUnit) const
+{
+    T qw = w, qx = x, qy = y, qz = z;
+    if (!assumeUnit)
+    {
+        const Quat<T> unit = normalize();
+        qw = unit.w;
+        qx = unit.x;
+        qy = unit.y;
+        qz = unit.z;
+    }
+    Matx<T, 3, 3> rot{
+        1 - 2 * (qy * qy + qz * qz), 2 * (qx * qy - qw * qz)    , 2 * (qx * qz + qw * qy),
+        2 * (qx * qy + qw * qz)    , 1 - 2 * (qx * qx + qz * qz), 2 * (qy * qz - qw * qx),
+        2 * (qx * qz - qw * qy)    , 2 * (qy * qz + qw * qx)    , 1 - 2 * (qx * qx + qy * qy)
+    };
+    return rot;
+}
+
+// Rotation vector: the rotation axis scaled by the rotation angle.
+template <typename T>
+Vec<T, 3> Quat<T>::toRotVec(QuatAssumeType assumeUnit) const
+{
+    const T theta = getAngle(assumeUnit);
+    const Vec<T, 3> direction = getAxis(assumeUnit);
+    return theta * direction;
+}
+
+// Returns the components as a 4-vector in (w, x, y, z) order.
+template <typename T>
+Vec<T, 4> Quat<T>::toVec() const
+{
+    Vec<T, 4> components{w, x, y, z};
+    return components;
+}
+
+// Linear interpolation: (1 - t) * q0 + t * q1. The result is not normalised.
+template <typename T>
+Quat<T> Quat<T>::lerp(const Quat<T> &q0, const Quat<T> &q1, const T t)
+{
+    const T weight0 = 1 - t;
+    const Quat<T> from = weight0 * q0;
+    const Quat<T> to = t * q1;
+    return from + to;
+}
+
+// Spherical linear interpolation between q0 and q1 at parameter t.
+//
+// - Both inputs are normalised first unless assumeUnit is set.
+// - When the inputs are nearly parallel (dot product above 0.995) the
+//   result falls back to nlerp, which avoids dividing by a tiny sin(theta).
+// - When directChange is true and the dot product is negative, q0 is
+//   negated before interpolating.
+//
+// The working copies use Quat<T> rather than a fixed double-precision
+// type, so the function can also be instantiated for Quat<float>.
+template <typename T>
+Quat<T> Quat<T>::slerp(const Quat<T> &q0, const Quat<T> &q1, const T t, QuatAssumeType assumeUnit, bool directChange)
+{
+    Quat<T> v0(q0);
+    Quat<T> v1(q1);
+    if (!assumeUnit)
+    {
+        v0 = v0.normalize();
+        v1 = v1.normalize();
+    }
+    T cosTheta = v0.dot(v1);
+    constexpr T DOT_THRESHOLD = T(0.995);
+    if (cosTheta > DOT_THRESHOLD)
+    {
+        return nlerp(v0, v1, t, QUAT_ASSUME_UNIT);
+    }
+
+    if (directChange && cosTheta < 0)
+    {
+        v0 = -v0;
+        cosTheta = -cosTheta;
+    }
+    T sinTheta = std::sqrt(1 - cosTheta * cosTheta);
+    T angle = std::atan2(sinTheta, cosTheta);
+    return (std::sin((1 - t) * angle) / (sinTheta) * v0 + std::sin(t * angle) / (sinTheta) * v1).normalize();
+}
+
+
+// Normalised linear interpolation: lerp followed by normalisation.
+// q0 is negated first when its dot product with q1 is negative. Unless
+// assumeUnit is set, both inputs are normalised before blending.
+template <typename T>
+inline Quat<T> Quat<T>::nlerp(const Quat<T> &q0, const Quat<T> &q1, const T t, QuatAssumeType assumeUnit)
+{
+    Quat<T> from(q0), to(q1);
+    if (to.dot(from) < 0)
+    {
+        from = -from;
+    }
+    if (!assumeUnit)
+    {
+        from = from.normalize();
+        to = to.normalize();
+    }
+    return ((1 - t) * from + t * to).normalize();
+}
+
+
+// True when the norm lies strictly between 1 - eps and 1 + eps.
+template <typename T>
+inline bool Quat<T>::isNormal(T eps) const
+{
+    const double length = norm();
+    return (length > 1 - eps) && (length < 1 + eps);
+}
+
+// Raises StsBadArg unless isNormal(eps) holds.
+template <typename T>
+inline void Quat<T>::assertNormal(T eps) const
+{
+    if (isNormal(eps))
+    {
+        return;
+    }
+    CV_Error(Error::StsBadArg, "Quaternion should be normalized");
+}
+
+
+// Spherical quadrangle interpolation built from three slerp calls:
+// slerp(slerp(q0, q3, t), slerp(q1, q2, t), 2t(1 - t)).
+// The four inputs are normalised first unless assumeUnit is set; assumeUnit
+// and directChange are passed on to every slerp call unchanged.
+template <typename T>
+inline Quat<T> Quat<T>::squad(const Quat<T> &q0, const Quat<T> &q1,
+                            const Quat<T> &q2, const Quat<T> &q3,
+                            const T t, QuatAssumeType assumeUnit,
+                            bool directChange)
+{
+    Quat<T> a(q0), b(q1), c(q2), d(q3);
+    if (!assumeUnit)
+    {
+        a = a.normalize();
+        b = b.normalize();
+        c = c.normalize();
+        d = d.normalize();
+    }
+
+    const Quat<T> outer = slerp(a, d, t, assumeUnit, directChange);
+    const Quat<T> inner = slerp(b, c, t, assumeUnit, directChange);
+    const T blend = 2 * t * (1 - t);
+    return slerp(outer, inner, blend, assumeUnit, directChange);
+}
+
+// Computes q1 * exp(-(log(q1* q0) + log(q1* q2)) / 4), where q1* is the
+// conjugate of q1. The three inputs are normalised first unless assumeUnit
+// is set.
+template <typename T>
+Quat<T> Quat<T>::interPoint(const Quat<T> &q0, const Quat<T> &q1,
+                            const Quat<T> &q2, QuatAssumeType assumeUnit)
+{
+    Quat<T> prev(q0), mid(q1), next(q2);
+    if (!assumeUnit)
+    {
+        prev = prev.normalize();
+        mid = mid.normalize();
+        next = next.normalize();
+    }
+    const Quat<T> logToPrev = cv::log(mid.conjugate() * prev, assumeUnit);
+    const Quat<T> logToNext = cv::log(mid.conjugate() * next, assumeUnit);
+    const Quat<T> logSum = logToPrev + (logToNext);
+    return mid * cv::exp(-logSum / 4);
+}
+
+// Spline interpolation between q1 and q2 at parameter t, using q0 and q3
+// as the neighbouring control quaternions.
+//
+// Steps: normalise the inputs (unless assumeUnit is set), negate each
+// quaternion whose dot product with its predecessor is negative, compute
+// the two intermediate points with interPoint, then evaluate squad.
+//
+// The working copies use Quat<T> rather than a fixed double-precision
+// type, so the function can also be instantiated for Quat<float>.
+template <typename T>
+Quat<T> Quat<T>::spline(const Quat<T> &q0, const Quat<T> &q1, const Quat<T> &q2, const Quat<T> &q3, const T t, QuatAssumeType assumeUnit)
+{
+    Quat<T> v0(q0), v1(q1), v2(q2), v3(q3);
+    if (!assumeUnit)
+    {
+        v0 = v0.normalize();
+        v1 = v1.normalize();
+        v2 = v2.normalize();
+        v3 = v3.normalize();
+    }
+    std::vector<Quat<T>> vec{v0, v1, v2, v3};
+    for (size_t i = 0; i < 3; ++i)
+    {
+        // Flip the successor when it points away from the current element.
+        const T cosTheta = vec[i].dot(vec[i + 1]);
+        if (cosTheta < 0)
+        {
+            vec[i + 1] = -vec[i + 1];
+        }
+    }
+    Quat<T> s1 = interPoint(vec[0], vec[1], vec[2], QUAT_ASSUME_UNIT);
+    Quat<T> s2 = interPoint(vec[1], vec[2], vec[3], QUAT_ASSUME_UNIT);
+    // NOTE(review): the last argument fills squad's `bool directChange`
+    // parameter with a QuatAssumeType enumerator - confirm the intended value.
+    return squad(vec[1], s1, s2, vec[2], t, assumeUnit, QUAT_ASSUME_NOT_UNIT);
+}
+
+namespace detail {
+
+// Dispatches to createFromXRot / createFromYRot / createFromZRot for axis
+// index 0 / 1 / 2. Any other index trips CV_Assert.
+template <typename T> static
+Quat<T> createFromAxisRot(int axis, const T theta)
+{
+    switch (axis)
+    {
+        case 0:
+            return Quat<T>::createFromXRot(theta);
+        case 1:
+            return Quat<T>::createFromYRot(theta);
+        case 2:
+            return Quat<T>::createFromZRot(theta);
+        default:
+            break;
+    }
+    CV_Assert(0);
+}
+
+// True for every Euler-angle type whose enumerator value is below EXT_XYZ.
+inline bool isIntAngleType(QuatEnum::EulerAnglesType eulerAnglesType)
+{
+    const bool precedesExtrinsic = eulerAnglesType < QuatEnum::EXT_XYZ;
+    return precedesExtrinsic;
+}
+
+// True when the enumerator value divided by 6 is 1 or 3.
+// NOTE(review): in the axis table of createFromEulerAngles those index
+// ranges are the rows with a repeated first/third axis (X-Y-X, ...), which
+// is not what the function name suggests - confirm against the callers.
+inline bool isTaitBryan(QuatEnum::EulerAnglesType eulerAnglesType)
+{
+    const auto group = eulerAnglesType/6;
+    return group == 1 || group == 3;
+}
+}  // namespace detail
+
+// Quaternion (cos(theta/2), 0, sin(theta/2), 0).
+template <typename T>
+Quat<T> Quat<T>::createFromYRot(const T theta)
+{
+    const T half = theta * 0.5f;
+    return Quat<T>{std::cos(half), 0, std::sin(half), 0};
+}
+
+// Quaternion (cos(theta/2), sin(theta/2), 0, 0).
+template <typename T>
+Quat<T> Quat<T>::createFromXRot(const T theta)
+{
+    const T half = theta * 0.5f;
+    return Quat<T>{std::cos(half), std::sin(half), 0, 0};
+}
+
+// Quaternion (cos(theta/2), 0, 0, sin(theta/2)).
+template <typename T>
+Quat<T> Quat<T>::createFromZRot(const T theta)
+{
+    const T half = theta * 0.5f;
+    return Quat<T>{std::cos(half), 0, 0, std::sin(half)};
+}
+
+
+// Builds a quaternion from three Euler angles.
+// Each angle is turned into a single-axis rotation, with the axis taken
+// from the table row of the requested type. Types for which
+// detail::isIntAngleType holds are composed as first * second * third; all
+// other types are composed in the reverse order. Asserts that the type is
+// below EULER_ANGLES_MAX_VALUE.
+template <typename T>
+Quat<T> Quat<T>::createFromEulerAngles(const Vec<T, 3> &angles, QuatEnum::EulerAnglesType eulerAnglesType) {
+    CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE);
+    // Axis index (0 = X, 1 = Y, 2 = Z) for each of the three angles, one
+    // row per Euler-angle type: twelve intrinsic rows, then twelve extrinsic.
+    static const int rotationAxis[24][3] = {
+        {0, 1, 2}, // Intrinsic X-Y-Z
+        {0, 2, 1}, // Intrinsic X-Z-Y
+        {1, 0, 2}, // Intrinsic Y-X-Z
+        {1, 2, 0}, // Intrinsic Y-Z-X
+        {2, 0, 1}, // Intrinsic Z-X-Y
+        {2, 1, 0}, // Intrinsic Z-Y-X
+        {0, 1, 0}, // Intrinsic X-Y-X
+        {0, 2, 0}, // Intrinsic X-Z-X
+        {1, 0, 1}, // Intrinsic Y-X-Y
+        {1, 2, 1}, // Intrinsic Y-Z-Y
+        {2, 0, 2}, // Intrinsic Z-X-Z
+        {2, 1, 2}, // Intrinsic Z-Y-Z
+        {0, 1, 2}, // Extrinsic X-Y-Z
+        {0, 2, 1}, // Extrinsic X-Z-Y
+        {1, 0, 2}, // Extrinsic Y-X-Z
+        {1, 2, 0}, // Extrinsic Y-Z-X
+        {2, 0, 1}, // Extrinsic Z-X-Y
+        {2, 1, 0}, // Extrinsic Z-Y-X
+        {0, 1, 0}, // Extrinsic X-Y-X
+        {0, 2, 0}, // Extrinsic X-Z-X
+        {1, 0, 1}, // Extrinsic Y-X-Y
+        {1, 2, 1}, // Extrinsic Y-Z-Y
+        {2, 0, 2}, // Extrinsic Z-X-Z
+        {2, 1, 2}  // Extrinsic Z-Y-Z
+    };
+    const int *axes = rotationAxis[eulerAnglesType];
+    const Quat<T> first = detail::createFromAxisRot(axes[0], angles(0));
+    const Quat<T> second = detail::createFromAxisRot(axes[1], angles(1));
+    const Quat<T> third = detail::createFromAxisRot(axes[2], angles(2));
+    if (!detail::isIntAngleType(eulerAnglesType))
+    {
+        return third * second * first;
+    }
+    return first * second * third;
+}
+
+template <typename T>
+Vec<T, 3> Quat<T>::toEulerAngles(QuatEnum::EulerAnglesType eulerAnglesType){  // Returns the three Euler angles of this quaternion for the requested convention.
+    CV_Assert(eulerAnglesType < QuatEnum::EulerAnglesType::EULER_ANGLES_MAX_VALUE);
+    Matx33d R = toRotMat3x3();  // rotation matrix of this quaternion; the table below picks entries out of it
+    enum {  // selector codes for rotationR_: constant codes first, then one code per entry of R
+        C_ZERO,
+        C_PI,
+        C_PI_2,
+        N_CONSTANTS,  // number of constant codes; the R_i_j codes start at this value
+        R_0_0 = N_CONSTANTS, R_0_1, R_0_2,
+        R_1_0, R_1_1, R_1_2,
+        R_2_0, R_2_1, R_2_2
+    };
+    static const T constants_[N_CONSTANTS] = {
+        0,  // C_ZERO
+        (T)CV_PI,  // C_PI
+        (T)(CV_PI * 0.5)  // C_PI_2 (the table writes -C_PI_2 to get the negated value)
+    };
+    static const int rotationR_[24][12] = {  // one row per EulerAnglesType value; a leading '-' means "negate the selected value"
+        {+R_0_2,    +R_1_0, +R_1_1, C_PI_2,     +R_2_1, +R_1_1, -C_PI_2,    -R_1_2, +R_2_2,    +R_0_2,    -R_0_1, +R_0_0},  // INT_XYZ
+        {+R_0_1,    -R_1_2, +R_2_2, -C_PI_2,    +R_2_0, +R_2_2, C_PI_2,     +R_2_1, +R_1_1,    -R_0_1,    +R_0_2, +R_0_0},  // INT_XZY
+        {+R_1_2,    -R_0_1, +R_0_0, -C_PI_2,    +R_0_1, +R_0_0, C_PI_2,     +R_0_2, +R_2_2,    -R_1_2,    +R_1_0, +R_1_1},  // INT_YXZ
+        {+R_1_0,    +R_0_2, +R_2_2, C_PI_2,     +R_0_2, +R_0_1, -C_PI_2,    -R_2_0, +R_0_0,    +R_1_0,    -R_1_2, +R_1_1},  // INT_YZX
+        {+R_2_1,    +R_1_0, +R_0_0, C_PI_2,     +R_1_0, +R_0_0, -C_PI_2,    -R_0_1, +R_1_1,    +R_2_1,    -R_2_0, +R_2_2},  // INT_ZXY
+        {+R_2_0,    -R_0_1, +R_1_1, -C_PI_2,    +R_1_2, +R_1_1, C_PI_2,     +R_1_0, +R_0_0,    -R_2_0,    +R_2_1, +R_2_2},  // INT_ZYX
+        {+R_0_0,    +R_2_1, +R_2_2, C_ZERO,     +R_1_2, +R_1_1, C_PI,       +R_1_0, -R_2_0,    +R_0_0,    +R_0_1, +R_0_2},  // INT_XYX
+        {+R_0_0,    +R_2_1, +R_2_2, C_ZERO,     -R_2_1, +R_2_2, C_PI,       +R_2_0, +R_1_0,    +R_0_0,    +R_0_2, -R_0_1},  // INT_XZX
+        {+R_1_1,    +R_0_2, +R_0_0, C_ZERO,     -R_2_0, +R_0_0, C_PI,       +R_0_1, +R_2_1,    +R_1_1,    +R_1_0, -R_1_2},  // INT_YXY
+        {+R_1_1,    +R_0_2, +R_0_0, C_ZERO,     +R_0_2, -R_0_0, C_PI,       +R_2_1, -R_0_1,    +R_1_1,    +R_1_2, +R_1_0},  // INT_YZY
+        {+R_2_2,    +R_1_0, +R_1_1, C_ZERO,     +R_1_0, +R_0_0, C_PI,       +R_0_2, -R_1_2,    +R_2_2,    +R_2_0, +R_2_1},  // INT_ZXZ
+        {+R_2_2,    +R_1_0, +R_0_0, C_ZERO,     +R_1_0, +R_0_0, C_PI,       +R_1_2, +R_0_2,    +R_2_2,    +R_2_1, -R_2_0},  // INT_ZYZ
+
+        {+R_2_0,    -C_PI_2, -R_0_1, +R_1_1,    C_PI_2,  +R_1_2, +R_1_1,    +R_2_1, +R_2_2,    -R_2_0,    +R_1_0, +R_0_0},  // EXT_XYZ
+        {+R_1_0,    C_PI_2,  +R_0_2, +R_2_2,    -C_PI_2, +R_0_2, +R_0_1,    -R_1_2, +R_1_1,    +R_1_0,    -R_2_0, +R_0_0},  // EXT_XZY
+        {+R_2_1,    C_PI_2,  +R_1_0, +R_0_0,    -C_PI_2, +R_1_0, +R_0_0,    -R_2_0, +R_2_2,    +R_2_1,    -R_0_1, +R_1_1},  // EXT_YXZ
+        {+R_0_2,    -C_PI_2, -R_1_2, +R_2_2,    C_PI_2,  +R_2_0, +R_2_2,    +R_0_2, +R_0_0,    -R_0_1,    +R_2_1, +R_1_1},  // EXT_YZX
+        {+R_1_2,    -C_PI_2, -R_0_1, +R_0_0,    C_PI_2,  +R_0_1, +R_0_0,    +R_1_0, +R_1_1,    -R_1_2,    +R_0_2, +R_2_2},  // EXT_ZXY
+        {+R_0_2,    C_PI_2,  +R_1_0, +R_1_1,    -C_PI_2, +R_2_1, +R_1_1,    -R_0_1, +R_0_0,    +R_0_2,    -R_1_2, +R_2_2},  // EXT_ZYX
+        {+R_0_0,    C_ZERO,  +R_2_1, +R_2_2,    C_PI,    +R_1_2, +R_1_1,    +R_0_1, +R_0_2,    +R_0_0,    +R_1_0, -R_2_0},  // EXT_XYX
+        {+R_0_0,    C_ZERO,  +R_2_1, +R_2_2,    C_PI,    +R_2_1, +R_2_2,    +R_0_2, -R_0_1,    +R_0_0,    +R_2_0, +R_1_0},  // EXT_XZX
+        {+R_1_1,    C_ZERO,  +R_0_2, +R_0_0,    C_PI,    -R_2_0, +R_0_0,    +R_1_0, -R_1_2,    +R_1_1,    +R_0_1, +R_2_1},  // EXT_YXY
+        {+R_1_1,    C_ZERO,  +R_0_2, +R_0_0,    C_PI,    +R_0_2, -R_0_0,    +R_1_2, +R_1_0,    +R_1_1,    +R_2_1, -R_0_1},  // EXT_YZY
+        {+R_2_2,    C_ZERO,  +R_1_0, +R_1_1,    C_PI,    +R_1_0, +R_0_0,    +R_2_0, +R_2_1,    +R_2_2,    +R_0_2, -R_1_2},  // EXT_ZXZ
+        {+R_2_2,    C_ZERO,  +R_1_0, +R_0_0,    C_PI,    +R_1_0, +R_0_0,    +R_2_1, -R_2_0,    +R_2_2,    +R_1_2, +R_0_2},  // EXT_ZYZ
+    };
+    T rotationR[12];  // the selected table row, resolved to actual values
+    for (int i = 0; i < 12; i++)
+    {
+        int id = rotationR_[eulerAnglesType][i];
+        unsigned idx = std::abs(id);  // drop the sign to get the code; the sign is re-applied below
+        T value = 0.0f;
+        if (idx < N_CONSTANTS)
+        {
+            value = constants_[idx];
+        }
+        else
+        {
+            unsigned r_idx = idx - N_CONSTANTS;  // code R_i_j becomes offset 3*i + j into R.val
+            CV_DbgAssert(r_idx < 9);
+            value = R.val[r_idx];
+        }
+        bool isNegative = id < 0;
+        if (isNegative)
+            value = -value;
+        rotationR[i] = value;
+    }
+    Vec<T, 3> angles;
+    if (detail::isIntAngleType(eulerAnglesType))
+    {
+        if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD)  // entry 0 is close to +1: angles come from entries 1..3
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the third angle to 0");
+            angles = {std::atan2(rotationR[1], rotationR[2]), rotationR[3], 0};
+            return angles;
+        }
+        else if(abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD)  // entry 0 is close to -1: angles come from entries 4..6
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the third angle to 0");
+            angles = {std::atan2(rotationR[4], rotationR[5]), rotationR[6], 0};
+            return angles;
+        }
+    }
+    else // !detail::isIntAngleType(eulerAnglesType)
+    {
+        if (abs(rotationR[0] - 1) < CV_QUAT_CONVERT_THRESHOLD)  // entry 0 is close to +1: angles come from entries 1..3
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0");
+            angles = {0, rotationR[1], std::atan2(rotationR[2], rotationR[3])};
+            return angles;
+        }
+        else if (abs(rotationR[0] + 1) < CV_QUAT_CONVERT_THRESHOLD)  // entry 0 is close to -1: angles come from entries 4..6
+        {
+            CV_LOG_WARNING(NULL,"Gimbal Lock occurs. Euler angles are non-unique, we set the first angle to 0");
+            angles = {0, rotationR[4], std::atan2(rotationR[5], rotationR[6])};
+            return angles;
+        }
+    }
+
+    angles(0) = std::atan2(rotationR[7], rotationR[8]);  // regular case: angles come from entries 7..11
+    if (detail::isTaitBryan(eulerAnglesType))  // NOTE(review): entry 9 is a diagonal R_i_i only in the X-Y-X style rows; confirm isTaitBryan() is true for exactly those
+        angles(1) = std::acos(rotationR[9]);
+    else
+        angles(1) = std::asin(rotationR[9]);
+    angles(2) = std::atan2(rotationR[10], rotationR[11]);
+    return angles;
+}
+
+}  // namespace
+//! @endcond
+
+#endif /*OPENCV_CORE_QUATERNION_INL_HPP*/
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/saturate.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/saturate.hpp
new file mode 100644
index 0000000..8127e3d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/saturate.hpp
@@ -0,0 +1,179 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2014, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_SATURATE_HPP
+#define OPENCV_CORE_SATURATE_HPP
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/fast_math.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+/** @brief Template function for accurate conversion from one primitive type to another.
+
+ The function saturate_cast resembles the standard C++ cast operations, such as static_cast\<T\>()
+ and others. It performs an efficient and accurate conversion from one primitive type to another
+ (see the introduction chapter). saturate in the name means that when the input value v is out of the
+ range of the target type, the result is not formed just by taking low bits of the input, but instead
+ the value is clipped. For example:
+ @code
+ uchar a = saturate_cast<uchar>(-100); // a = 0 (UCHAR_MIN)
+ short b = saturate_cast<short>(33333.33333); // b = 32767 (SHRT_MAX)
+ @endcode
+ Such clipping is done when the target type is unsigned char, signed char, unsigned short or
+ signed short. For 32-bit integer targets, floating-point inputs are not clipped (integer inputs that do not fit are).
+
+ When the parameter is a floating-point value and the target type is an integer (8-, 16- or 32-bit),
+ the floating-point value is first rounded to the nearest integer and then clipped if needed (when
+ the target type is 8- or 16-bit).
+
+ @param v Function parameter.
+ @sa add, subtract, multiply, divide, Mat::convertTo
+ */
+template<typename _Tp> static inline _Tp saturate_cast(uchar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(schar v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(ushort v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(short v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(unsigned v) { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
+// Target uchar: clamp to [0, UCHAR_MAX]; float/double inputs go through cvRound first.
+template<> inline uchar saturate_cast<uchar>(schar v)        { return (uchar)std::max((int)v, 0); }
+template<> inline uchar saturate_cast<uchar>(ushort v)       { return (uchar)std::min((unsigned)v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(int v)          { return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_cast<uchar>((int)v); }
+template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
+template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
+// Target schar: clamp to [SCHAR_MIN, SCHAR_MAX]; float/double inputs go through cvRound first.
+template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(int v)          { return (schar)((unsigned)(v-SCHAR_MIN) <= (unsigned)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(short v)        { return saturate_cast<schar>((int)v); }
+template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
+template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
+// Target ushort: clamp to [0, USHRT_MAX]; float/double inputs go through cvRound first.
+template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
+template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
+template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
+// Target short: clamp to [SHRT_MIN, SHRT_MAX]; float/double inputs go through cvRound first.
+template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
+template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
+template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
+// Target int: unsigned/int64/uint64 inputs are clamped to [INT_MIN, INT_MAX]; float/double return cvRound(v) as is.
+template<> inline int saturate_cast<int>(unsigned v)         { return (int)std::min(v, (unsigned)INT_MAX); }
+template<> inline int saturate_cast<int>(int64 v)            { return (int)((uint64)(v - INT_MIN) <= (uint64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
+template<> inline int saturate_cast<int>(uint64 v)           { return (int)std::min(v, (uint64)INT_MAX); }
+template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
+template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
+// Target unsigned: negative integer inputs become 0 and 64-bit inputs are capped at UINT_MAX.
+template<> inline unsigned saturate_cast<unsigned>(schar v)  { return (unsigned)std::max(v, (schar)0); }
+template<> inline unsigned saturate_cast<unsigned>(short v)  { return (unsigned)std::max(v, (short)0); }
+template<> inline unsigned saturate_cast<unsigned>(int v)    { return (unsigned)std::max(v, (int)0); }
+template<> inline unsigned saturate_cast<unsigned>(int64 v)  { return (unsigned)((uint64)v <= (uint64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
+template<> inline unsigned saturate_cast<unsigned>(uint64 v) { return (unsigned)std::min(v, (uint64)UINT_MAX); }
+// we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
+template<> inline unsigned saturate_cast<unsigned>(float v)  { return static_cast<unsigned>(cvRound(v)); }
+template<> inline unsigned saturate_cast<unsigned>(double v) { return static_cast<unsigned>(cvRound(v)); }
+// Target uint64: negative signed inputs become 0.
+template<> inline uint64 saturate_cast<uint64>(schar v)      { return (uint64)std::max(v, (schar)0); }
+template<> inline uint64 saturate_cast<uint64>(short v)      { return (uint64)std::max(v, (short)0); }
+template<> inline uint64 saturate_cast<uint64>(int v)        { return (uint64)std::max(v, (int)0); }
+template<> inline uint64 saturate_cast<uint64>(int64 v)      { return (uint64)std::max(v, (int64)0); }
+// Target int64: a uint64 input is capped at LLONG_MAX.
+template<> inline int64 saturate_cast<int64>(uint64 v)       { return (int64)std::min(v, (uint64)LLONG_MAX); }
+
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); }
+
+// in theory, we could use a LUT for 8u/8s->16f conversion,
+// but with hardware support for FP32->FP16 conversion the current approach is preferable
+template<> inline float16_t saturate_cast<float16_t>(uchar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(schar v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(ushort v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(short v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(unsigned v){ return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int v)     { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(uint64 v)  { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(int64 v)   { return float16_t((float)v); }
+template<> inline float16_t saturate_cast<float16_t>(float v)   { return float16_t(v); }
+template<> inline float16_t saturate_cast<float16_t>(double v)  { return float16_t((float)v); }
+
+//! @}
+
+} // cv
+
+#endif // OPENCV_CORE_SATURATE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/simd_intrinsics.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/simd_intrinsics.hpp
new file mode 100644
index 0000000..2658d92
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/simd_intrinsics.hpp
@@ -0,0 +1,87 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_SIMD_INTRINSICS_HPP
+#define OPENCV_CORE_SIMD_INTRINSICS_HPP
+
+/**
+Helper header to support SIMD intrinsics (universal intrinsics) in user code.
+Intrinsics documentation: https://docs.opencv.org/4.x/df/d91/group__core__hal__intrin.html
+
+
+Checks of target CPU instruction set based on compiler definitions don't work well enough.
+More reliable solutions require utilization of configuration systems (like CMake).
+
+So, probably you need to specify your own configuration.
+
+You can do that via CMake in this way:
+    add_definitions(/DOPENCV_SIMD_CONFIG_HEADER=opencv_simd_config_custom.hpp)
+or
+    add_definitions(/DOPENCV_SIMD_CONFIG_INCLUDE_DIR=1)
+
+Additionally you may need to add include directory to your files:
+    include_directories("${CMAKE_CURRENT_LIST_DIR}/opencv_config_${MYTARGET}")
+
+These files can be pre-generated for target configurations of your application
+or generated by CMake on the fly (use CMAKE_BINARY_DIR for that).
+
+Notes:
+- H/W capability checks are still responsibility of your application
+- runtime dispatching is not covered by this helper header
+*/
+
+#ifdef __OPENCV_BUILD
+#error "Use core/hal/intrin.hpp during OpenCV build"
+#endif  // __OPENCV_BUILD
+
+#ifdef OPENCV_HAL_INTRIN_HPP
+#error "core/simd_intrinsics.hpp must be included before core/hal/intrin.hpp"
+#endif  // OPENCV_HAL_INTRIN_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#ifdef OPENCV_SIMD_CONFIG_HEADER
+#include CVAUX_STR(OPENCV_SIMD_CONFIG_HEADER)
+#elif defined(OPENCV_SIMD_CONFIG_INCLUDE_DIR)
+#include "opencv_simd_config.hpp"  // corresponding directory should be added via -I compiler parameter
+#else  // custom config headers
+
+#if (!defined(CV_AVX_512F) || !CV_AVX_512F) && (defined(__AVX512__) || defined(__AVX512F__))
+#  include <immintrin.h>
+#  undef CV_AVX_512F
+#  define CV_AVX_512F 1
+#  ifndef OPENCV_SIMD_DONT_ASSUME_SKX  // Skylake-X with AVX-512F/CD/BW/DQ/VL
+#    undef CV_AVX512_SKX
+#    define CV_AVX512_SKX 1
+#    undef CV_AVX_512CD
+#    define CV_AVX_512CD 1
+#    undef CV_AVX_512BW
+#    define CV_AVX_512BW 1
+#    undef CV_AVX_512DQ
+#    define CV_AVX_512DQ 1
+#    undef CV_AVX_512VL
+#    define CV_AVX_512VL 1
+#  endif  // OPENCV_SIMD_DONT_ASSUME_SKX
+#endif // AVX512
+
+// GCC/Clang: -mavx2
+// MSVC: /arch:AVX2
+#if defined __AVX2__
+#  include <immintrin.h>
+#  undef CV_AVX2
+#  define CV_AVX2 1
+#  if defined __F16C__
+#    undef CV_FP16
+#    define CV_FP16 1
+#  endif  // __F16C__
+#endif  // __AVX2__
+
+#endif  // OPENCV_SIMD_CONFIG_HEADER / OPENCV_SIMD_CONFIG_INCLUDE_DIR / compiler-macro fallback
+
+// SSE / NEON / VSX is handled by cv_cpu_dispatch.h compatibility block
+#include "cv_cpu_dispatch.h"
+
+#include "hal/intrin.hpp"
+
+#endif // OPENCV_CORE_SIMD_INTRINSICS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/softfloat.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/softfloat.hpp
new file mode 100644
index 0000000..485e15c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/softfloat.hpp
@@ -0,0 +1,514 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+// This file is based on files from package issued with the following license:
+
+/*============================================================================
+
+This C header file is part of the SoftFloat IEEE Floating-Point Arithmetic
+Package, Release 3c, by John R. Hauser.
+
+Copyright 2011, 2012, 2013, 2014, 2015, 2016, 2017 The Regents of the
+University of California.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions, and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions, and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+ 3. Neither the name of the University nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE
+DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=============================================================================*/
+
+#pragma once
+#ifndef softfloat_h
+#define softfloat_h 1
+
+#include "cvdef.h"
+
+namespace cv
+{
+
+/** @addtogroup core_utils_softfloat
+
+  [SoftFloat](http://www.jhauser.us/arithmetic/SoftFloat.html) is a software implementation
+  of floating-point calculations according to IEEE 754 standard.
+  All calculations are done in integers, that's why they are machine-independent and bit-exact.
+  This library can be useful in accuracy-critical parts like look-up tables generation, tests, etc.
+  OpenCV contains a subset of SoftFloat partially rewritten to C++.
+
+  ### Types
+
+  There are two basic types: @ref softfloat and @ref softdouble.
+  These types are binary compatible with float and double types respectively
+  and support conversions to/from them.
+  Other types from original SoftFloat library like fp16 or fp128 were thrown away
+  as well as quiet/signaling NaN support, on-the-fly rounding mode switch
+  and exception flags (though exceptions can be implemented in the future).
+
+  ### Operations
+
+  Both types support the following:
+  - Construction from signed and unsigned 32-bit and 64-bit integers,
+  float/double or raw binary representation
+  - Conversions between each other, to float or double and to int
+  using @ref cvRound, @ref cvTrunc, @ref cvFloor, @ref cvCeil or a bunch of
+  saturate_cast functions
+  - Add, subtract, multiply, divide, remainder, square root, FMA with absolute precision
+  - Comparison operations
+  - Explicit sign, exponent and significand manipulation through get/set methods,
+ number state indicators (isInf, isNan, isSubnormal)
+  - Type-specific constants like eps, minimum/maximum value, best pi approximation, etc.
+  - min(), max(), abs(), exp(), log() and pow() functions
+
+*/
+//! @{
+
+struct softfloat;
+struct softdouble;
+
+struct CV_EXPORTS softfloat
+{
+public:
+    /** @brief Default constructor */
+    softfloat() { v = 0; }
+    /** @brief Copy constructor */
+    softfloat( const softfloat& c) { v = c.v; }
+    /** @brief Copy assignment operator */
+    softfloat& operator=( const softfloat& c )
+    {
+        if(&c != this) v = c.v;
+        return *this;
+    }
+    /** @brief Construct from raw
+
+    Builds new value from raw binary representation
+    */
+    static const softfloat fromRaw( const uint32_t a ) { softfloat x; x.v = a; return x; }
+
+    /** @brief Construct from integer */
+    explicit softfloat( const uint32_t );
+    explicit softfloat( const uint64_t );
+    explicit softfloat( const int32_t );
+    explicit softfloat( const int64_t );
+
+#ifdef CV_INT32_T_IS_LONG_INT
+    // for platforms with int32_t = long int
+    explicit softfloat( const int a ) { *this = softfloat(static_cast<int32_t>(a)); }
+#endif
+
+    /** @brief Construct from float */
+    explicit softfloat( const float a ) { Cv32suf s; s.f = a; v = s.u; }
+
+    /** @brief Type casts  */
+    operator softdouble() const;
+    operator float() const { Cv32suf s; s.u = v; return s.f; }
+
+    /** @brief Basic arithmetic */
+    softfloat operator + (const softfloat&) const;
+    softfloat operator - (const softfloat&) const;
+    softfloat operator * (const softfloat&) const;
+    softfloat operator / (const softfloat&) const;
+    softfloat operator - () const { softfloat x; x.v = v ^ (1U << 31); return x; }  // negation only flips bit 31
+
+    /** @brief Remainder operator
+
+    A quote from original SoftFloat manual:
+
+    > The IEEE Standard remainder operation computes the value
+    > a - n * b, where n is the integer closest to a / b.
+    > If a / b is exactly halfway between two integers, n is the even integer
+    > closest to a / b. The IEEE Standard’s remainder operation is always exact and so requires no rounding.
+    > Depending on the relative magnitudes of the operands, the remainder functions
+    > can take considerably longer to execute than the other SoftFloat functions.
+    > This is an inherent characteristic of the remainder operation itself and is not a flaw
+    > in the SoftFloat implementation.
+    */
+    softfloat operator % (const softfloat&) const;
+
+    softfloat& operator += (const softfloat& a) { *this = *this + a; return *this; }
+    softfloat& operator -= (const softfloat& a) { *this = *this - a; return *this; }
+    softfloat& operator *= (const softfloat& a) { *this = *this * a; return *this; }
+    softfloat& operator /= (const softfloat& a) { *this = *this / a; return *this; }
+    softfloat& operator %= (const softfloat& a) { *this = *this % a; return *this; }
+
+    /** @brief Comparison operations
+
+     - Any operation with NaN produces false
+       + The only exception is when x is NaN: x != y for any y.
+     - Positive and negative zeros are equal
+    */
+    bool operator == ( const softfloat& ) const;
+    bool operator != ( const softfloat& ) const;
+    bool operator >  ( const softfloat& ) const;
+    bool operator >= ( const softfloat& ) const;
+    bool operator <  ( const softfloat& ) const;
+    bool operator <= ( const softfloat& ) const;
+
+    /** @brief NaN state indicator */
+    inline bool isNaN() const { return (v & 0x7fffffff)  > 0x7f800000; }
+    /** @brief Inf state indicator */
+    inline bool isInf() const { return (v & 0x7fffffff) == 0x7f800000; }
+    /** @brief Subnormal number indicator (true whenever the 8-bit exponent field is zero, which includes +0 and -0) */
+    inline bool isSubnormal() const { return ((v >> 23) & 0xFF) == 0; }
+
+    /** @brief Get sign bit */
+    inline bool getSign() const { return (v >> 31) != 0; }
+    /** @brief Construct a copy with new sign bit */
+    inline softfloat setSign(bool sign) const { softfloat x; x.v = (v & ((1U << 31) - 1)) | ((uint32_t)sign << 31); return x; }
+    /** @brief Get 0-based exponent (the stored exponent field minus 127) */
+    inline int getExp() const { return ((v >> 23) & 0xFF) - 127; }
+    /** @brief Construct a copy with new 0-based exponent (e + 127 is masked to 8 bits before being stored) */
+    inline softfloat setExp(int e) const { softfloat x; x.v = (v & 0x807fffff) | (((e + 127) & 0xFF) << 23 ); return x; }
+
+    /** @brief Get a fraction part
+
+    Returns a number 1 <= x < 2 with the same significand
+    */
+    inline softfloat getFrac() const
+    {
+        uint_fast32_t vv = (v & 0x007fffff) | (127 << 23);  // keep the 23 significand bits, force sign 0 and exponent field 127
+        return softfloat::fromRaw(vv);
+    }
+    /** @brief Construct a copy with provided significand
+
+    Constructs a copy of a number with significand taken from parameter
+    */
+    inline softfloat setFrac(const softfloat& s) const
+    {
+        softfloat x;
+        x.v = (v & 0xff800000) | (s.v & 0x007fffff);  // sign and exponent from *this, low 23 bits from s
+        return x;
+    }
+
+    /** @brief Zero constant */
+    static softfloat zero() { return softfloat::fromRaw( 0 ); }
+    /** @brief Positive infinity constant */
+    static softfloat  inf() { return softfloat::fromRaw( 0xFF << 23 ); }
+    /** @brief Default NaN constant */
+    static softfloat  nan() { return softfloat::fromRaw( 0x7fffffff ); }
+    /** @brief One constant */
+    static softfloat  one() { return softfloat::fromRaw(  127 << 23 ); }
+    /** @brief Smallest normalized value */
+    static softfloat  min() { return softfloat::fromRaw( 0x01 << 23 ); }
+    /** @brief Difference between 1 and next representable value */
+    static softfloat  eps() { return softfloat::fromRaw( (127 - 23) << 23 ); }
+    /** @brief Biggest finite value */
+    static softfloat  max() { return softfloat::fromRaw( (0xFF << 23) - 1 ); }
+    /** @brief Correct pi approximation */
+    static softfloat   pi() { return softfloat::fromRaw( 0x40490fdb ); }
+
+    uint32_t v;  // raw 32-bit representation; copied to/from a float through Cv32suf in the float constructor and cast
+};
+
+/*----------------------------------------------------------------------------
+*----------------------------------------------------------------------------*/
+
+struct CV_EXPORTS softdouble
+{
+public:
+    /** @brief Default constructor */
+    softdouble() : v(0) { }
+    /** @brief Copy constructor */
+    softdouble( const softdouble& c) { v = c.v; }
+    /** @brief Copy assignment operator */
+    softdouble& operator=( const softdouble& c )
+    {
+        if(&c != this) v = c.v;
+        return *this;
+    }
+    /** @brief Construct from raw
+
+    Builds new value from raw binary representation
+    */
+    static softdouble fromRaw( const uint64_t a ) { softdouble x; x.v = a; return x; }
+
+    /** @brief Construct from integer */
+    explicit softdouble( const uint32_t );
+    explicit softdouble( const uint64_t );
+    explicit softdouble( const  int32_t );
+    explicit softdouble( const  int64_t );
+
+#ifdef CV_INT32_T_IS_LONG_INT
+    // for platforms with int32_t = long int
+    explicit softdouble( const int a ) { *this = softdouble(static_cast<int32_t>(a)); }
+#endif
+
+    /** @brief Construct from double */
+    explicit softdouble( const double a ) { Cv64suf s; s.f = a; v = s.u; }
+
+    /** @brief Type casts  */
+    operator softfloat() const;
+    operator double() const { Cv64suf s; s.u = v; return s.f; }
+
+    /** @brief Basic arithmetics */
+    softdouble operator + (const softdouble&) const;
+    softdouble operator - (const softdouble&) const;
+    softdouble operator * (const softdouble&) const;
+    softdouble operator / (const softdouble&) const;
+    softdouble operator - () const { softdouble x; x.v = v ^ (1ULL << 63); return x; }
+
+    /** @brief Remainder operator
+
+    A quote from original SoftFloat manual:
+
+    > The IEEE Standard remainder operation computes the value
+    > a - n * b, where n is the integer closest to a / b.
+    > If a / b is exactly halfway between two integers, n is the even integer
+    > closest to a / b. The IEEE Standard’s remainder operation is always exact and so requires no rounding.
+    > Depending on the relative magnitudes of the operands, the remainder functions
+    > can take considerably longer to execute than the other SoftFloat functions.
+    > This is an inherent characteristic of the remainder operation itself and is not a flaw
+    > in the SoftFloat implementation.
+    */
+    softdouble operator % (const softdouble&) const;
+
+    softdouble& operator += (const softdouble& a) { *this = *this + a; return *this; }
+    softdouble& operator -= (const softdouble& a) { *this = *this - a; return *this; }
+    softdouble& operator *= (const softdouble& a) { *this = *this * a; return *this; }
+    softdouble& operator /= (const softdouble& a) { *this = *this / a; return *this; }
+    softdouble& operator %= (const softdouble& a) { *this = *this % a; return *this; }
+
+    /** @brief Comparison operations
+
+     - Any operation with NaN produces false
+       + The only exception is when x is NaN: x != y for any y.
+     - Positive and negative zeros are equal
+    */
+    bool operator == ( const softdouble& ) const;
+    bool operator != ( const softdouble& ) const;
+    bool operator >  ( const softdouble& ) const;
+    bool operator >= ( const softdouble& ) const;
+    bool operator <  ( const softdouble& ) const;
+    bool operator <= ( const softdouble& ) const;
+
+    /** @brief NaN state indicator */
+    inline bool isNaN() const { return (v & 0x7fffffffffffffff)  > 0x7ff0000000000000; }
+    /** @brief Inf state indicator */
+    inline bool isInf() const { return (v & 0x7fffffffffffffff) == 0x7ff0000000000000; }
+    /** @brief Subnormal number indicator */
+    inline bool isSubnormal() const { return ((v >> 52) & 0x7FF) == 0; }
+
+    /** @brief Get sign bit */
+    inline bool getSign() const { return (v >> 63) != 0; }
+    /** @brief Construct a copy with new sign bit */
+    softdouble setSign(bool sign) const { softdouble x; x.v = (v & ((1ULL << 63) - 1)) | ((uint_fast64_t)(sign) << 63); return x; }
+    /** @brief Get 0-based exponent */
+    inline int getExp() const { return ((v >> 52) & 0x7FF) - 1023; }
+    /** @brief Construct a copy with new 0-based exponent */
+    inline softdouble setExp(int e) const
+    {
+        softdouble x;
+        x.v = (v & 0x800FFFFFFFFFFFFF) | ((uint_fast64_t)((e + 1023) & 0x7FF) << 52);
+        return x;
+    }
+
+    /** @brief Get a fraction part
+
+    Returns a number 1 <= x < 2 with the same significand
+    */
+    inline softdouble getFrac() const
+    {
+        uint_fast64_t vv = (v & 0x000FFFFFFFFFFFFF) | ((uint_fast64_t)(1023) << 52);
+        return softdouble::fromRaw(vv);
+    }
+    /** @brief Construct a copy with provided significand
+
+    Constructs a copy of a number with significand taken from parameter
+    */
+    inline softdouble setFrac(const softdouble& s) const
+    {
+        softdouble x;
+        x.v = (v & 0xFFF0000000000000) | (s.v & 0x000FFFFFFFFFFFFF);
+        return x;
+    }
+
+    /** @brief Zero constant */
+    static softdouble zero() { return softdouble::fromRaw( 0 ); }
+    /** @brief Positive infinity constant */
+    static softdouble  inf() { return softdouble::fromRaw( (uint_fast64_t)(0x7FF) << 52 ); }
+    /** @brief Default NaN constant */
+    static softdouble  nan() { return softdouble::fromRaw( CV_BIG_INT(0x7FFFFFFFFFFFFFFF) ); }
+    /** @brief One constant */
+    static softdouble  one() { return softdouble::fromRaw( (uint_fast64_t)( 1023) << 52 ); }
+    /** @brief Smallest normalized value */
+    static softdouble  min() { return softdouble::fromRaw( (uint_fast64_t)( 0x01) << 52 ); }
+    /** @brief Difference between 1 and next representable value */
+    static softdouble  eps() { return softdouble::fromRaw( (uint_fast64_t)( 1023 - 52 ) << 52 ); }
+    /** @brief Biggest finite value */
+    static softdouble  max() { return softdouble::fromRaw( ((uint_fast64_t)(0x7FF) << 52) - 1 ); }
+    /** @brief Correct pi approximation */
+    static softdouble   pi() { return softdouble::fromRaw( CV_BIG_INT(0x400921FB54442D18) ); }
+
+    uint64_t v;
+};
+
+/*----------------------------------------------------------------------------
+*----------------------------------------------------------------------------*/
+
+/** @brief Fused Multiplication and Addition
+
+Computes (a*b)+c with single rounding
+*/
+CV_EXPORTS softfloat  mulAdd( const softfloat&  a, const softfloat&  b, const softfloat & c);
+CV_EXPORTS softdouble mulAdd( const softdouble& a, const softdouble& b, const softdouble& c);
+
+/** @brief Square root */
+CV_EXPORTS softfloat  sqrt( const softfloat&  a );
+CV_EXPORTS softdouble sqrt( const softdouble& a );
+}
+
+/*----------------------------------------------------------------------------
+| Ported from OpenCV and added for usability
+*----------------------------------------------------------------------------*/
+
+/** @brief Truncates number to integer with minimum magnitude */
+CV_EXPORTS int cvTrunc(const cv::softfloat&  a);
+CV_EXPORTS int cvTrunc(const cv::softdouble& a);
+
+/** @brief Rounds a number to nearest even integer */
+CV_EXPORTS int cvRound(const cv::softfloat&  a);
+CV_EXPORTS int cvRound(const cv::softdouble& a);
+
+/** @brief Rounds a number to nearest even long long integer */
+CV_EXPORTS int64_t cvRound64(const cv::softdouble& a);
+
+/** @brief Rounds a number down to integer */
+CV_EXPORTS int cvFloor(const cv::softfloat&  a);
+CV_EXPORTS int cvFloor(const cv::softdouble& a);
+
+/** @brief Rounds number up to integer */
+CV_EXPORTS int  cvCeil(const cv::softfloat&  a);
+CV_EXPORTS int  cvCeil(const cv::softdouble& a);
+
+namespace cv
+{
+/** @brief Saturate casts: integer targets are rounded with cvRound/cvRound64, narrow ones are then clamped */
+template<typename _Tp> static inline _Tp saturate_cast(softfloat  a) { return static_cast<_Tp>(a); }
+template<typename _Tp> static inline _Tp saturate_cast(softdouble a) { return static_cast<_Tp>(a); }
+
+template<> inline uchar saturate_cast<uchar>(softfloat  a) { return static_cast<uchar>(std::min(std::max(cvRound(a), 0), (int)UCHAR_MAX)); }
+template<> inline uchar saturate_cast<uchar>(softdouble a) { return static_cast<uchar>(std::min(std::max(cvRound(a), 0), (int)UCHAR_MAX)); }
+
+template<> inline schar saturate_cast<schar>(softfloat  a) { return static_cast<schar>(std::max(std::min(cvRound(a), (int)SCHAR_MAX), (int)SCHAR_MIN)); }
+template<> inline schar saturate_cast<schar>(softdouble a) { return static_cast<schar>(std::max(std::min(cvRound(a), (int)SCHAR_MAX), (int)SCHAR_MIN)); }
+
+template<> inline ushort saturate_cast<ushort>(softfloat  a) { return static_cast<ushort>(std::min(std::max(cvRound(a), 0), (int)USHRT_MAX)); }
+template<> inline ushort saturate_cast<ushort>(softdouble a) { return static_cast<ushort>(std::min(std::max(cvRound(a), 0), (int)USHRT_MAX)); }
+
+template<> inline short saturate_cast<short>(softfloat  a) { return static_cast<short>(std::max(std::min(cvRound(a), (int)SHRT_MAX), (int)SHRT_MIN)); }
+template<> inline short saturate_cast<short>(softdouble a) { return static_cast<short>(std::max(std::min(cvRound(a), (int)SHRT_MAX), (int)SHRT_MIN)); }
+
+template<> inline int saturate_cast<int>(softfloat  a) { const int rounded = cvRound(a); return rounded; }
+template<> inline int saturate_cast<int>(softdouble a) { const int rounded = cvRound(a); return rounded; }
+
+template<> inline int64_t saturate_cast<int64_t>(softfloat  a) { return static_cast<int64_t>(cvRound(a)); }
+template<> inline int64_t saturate_cast<int64_t>(softdouble a) { const int64_t rounded = cvRound64(a); return rounded; }
+
+/** @brief Saturate cast to unsigned integer and unsigned long long integer
+Negative results are deliberately not clipped, so that -1 becomes 0xffffffff etc.
+*/
+template<> inline unsigned saturate_cast<unsigned>(softfloat  a) { return static_cast<unsigned>(cvRound(a)); }
+template<> inline unsigned saturate_cast<unsigned>(softdouble a) { return static_cast<unsigned>(cvRound(a)); }
+
+template<> inline uint64_t saturate_cast<uint64_t>(softfloat  a) { return static_cast<uint64_t>(cvRound(a)); }
+template<> inline uint64_t saturate_cast<uint64_t>(softdouble a) { return static_cast<uint64_t>(cvRound64(a)); }
+
+/** @brief Min and Max functions (when a > b is false, min returns a and max returns b) */
+inline softfloat  min(const softfloat&  a, const softfloat&  b) { if (a > b) return b; return a; }
+inline softdouble min(const softdouble& a, const softdouble& b) { if (a > b) return b; return a; }
+
+inline softfloat  max(const softfloat&  a, const softfloat&  b) { if (a > b) return a; return b; }
+inline softdouble max(const softdouble& a, const softdouble& b) { if (a > b) return a; return b; }
+
+/** @brief Absolute value (clears the sign bit, every other bit is kept) */
+inline softfloat  abs( softfloat  a) { a.v &= 0x7FFFFFFFU;            return a; }
+inline softdouble abs( softdouble a) { a.v &= 0x7FFFFFFFFFFFFFFFULL;  return a; }
+
+/** @brief Exponent
+
+Special cases:
+- exp(NaN) is NaN
+- exp(-Inf) == 0
+- exp(+Inf) == +Inf
+*/
+CV_EXPORTS softfloat  exp( const softfloat&  a);
+CV_EXPORTS softdouble exp( const softdouble& a);
+
+/** @brief Natural logarithm
+
+Special cases:
+- log(NaN), log(x < 0) are NaN
+- log(0) == -Inf
+*/
+CV_EXPORTS softfloat  log( const softfloat&  a );
+CV_EXPORTS softdouble log( const softdouble& a );
+
+/** @brief Raising to the power
+
+Special cases:
+- x**NaN is NaN for any x
+- ( |x| == 1 )**Inf is NaN
+- ( |x|  > 1 )**+Inf or ( |x| < 1 )**-Inf is +Inf
+- ( |x|  > 1 )**-Inf or ( |x| < 1 )**+Inf is 0
+- x ** 0 == 1 for any x
+- x ** 1 == x for any x
+- NaN ** y is NaN for any other y
+- Inf**(y < 0) == 0
+- Inf ** y is +Inf for any other y
+- (x < 0)**y is NaN for any other y if x can't be correctly rounded to integer
+- 0 ** 0 == 1
+- 0 ** (y < 0) is +Inf
+- 0 ** (y > 0) is 0
+*/
+CV_EXPORTS softfloat  pow( const softfloat&  a, const softfloat&  b);
+CV_EXPORTS softdouble pow( const softdouble& a, const softdouble& b);
+
+/** @brief Cube root
+
+Special cases:
+- cbrt(NaN) is NaN
+- cbrt(+/-Inf) is +/-Inf
+*/
+CV_EXPORTS softfloat cbrt( const softfloat& a );
+
+/** @brief Sine
+
+Special cases:
+- sin(Inf) or sin(NaN) is NaN
+- sin(x) == x when sin(x) is close to zero
+*/
+CV_EXPORTS softdouble sin( const softdouble& a );
+
+/** @brief Cosine
+ *
+Special cases:
+- cos(Inf) or cos(NaN) is NaN
+- cos(x) == +/- 1 when cos(x) is close to +/- 1
+*/
+CV_EXPORTS softdouble cos( const softdouble& a );
+
+//! @} core_utils_softfloat
+
+} // cv::
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/sse_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/sse_utils.hpp
new file mode 100644
index 0000000..0906583
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/sse_utils.hpp
@@ -0,0 +1,652 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_SSE_UTILS_HPP
+#define OPENCV_CORE_SSE_UTILS_HPP
+
+#ifndef __cplusplus
+#  error sse_utils.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core/cvdef.h"
+
+//! @addtogroup core_utils_sse
+//! @{
+
+#if CV_SSE2
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // 2-plane byte de-interleave, in place: five unpacklo/unpackhi passes over the four registers.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);
+    // Every later pass repeats the pairing of the first one: chunk i is unpacked against chunk i+2.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);
+    // Fifth pass writes back: v_r0:v_r1 get the even-indexed and v_g0:v_g1 the odd-indexed bytes of the 64 input bytes.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
+}
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // 3-plane byte de-interleave, in place: five unpacklo/unpackhi passes over the six registers.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);
+    // Every later pass repeats the pairing of the first one: chunk i is unpacked against chunk i+3.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);
+    // Fifth pass writes back: v_r* get input bytes 0,3,6,..., v_g* bytes 1,4,7,..., v_b* bytes 2,5,8,... of the 96 input bytes.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
+}
+
+inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // 4-plane byte de-interleave, in place: five unpacklo/unpackhi passes over the eight registers.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);
+    // Every later pass repeats the pairing of the first one: chunk i is unpacked against chunk i+4.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);
+
+    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
+    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
+    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
+    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
+    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);
+    // Fifth pass writes back: v_r*, v_g*, v_b*, v_a* get the input bytes whose index is 0, 1, 2, 3 modulo 4 respectively.
+    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
+    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
+    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
+    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
+    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
+    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
+    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
+    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // 2-plane byte interleave, in place: five pack passes that undo the 4-register _mm_deinterleave_epi8.
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+    // Each pass packs the even bytes (AND with mask) and odd bytes (>> 8) of a register pair; values stay <= 255, so packus never saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    // Fifth pass writes back: read in argument order, the registers now alternate one byte of plane r with one byte of plane g.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // 3-plane byte interleave, in place: five pack passes that undo the 6-register _mm_deinterleave_epi8.
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+    // Each pass packs the even bytes (AND with mask) and odd bytes (>> 8) of a register pair; values stay <= 255, so packus never saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+    // Fifth pass writes back: read in argument order, the registers now cycle through one byte each of planes r, g, b.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+}
+
+inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // 4-plane byte interleave, in place: five pack passes that undo the 8-register _mm_deinterleave_epi8.
+    __m128i v_mask = _mm_set1_epi16(0x00ff);
+    // Each pass packs the even bytes (AND with mask) and odd bytes (>> 8) of a register pair; values stay <= 255, so packus never saturates.
+    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
+    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
+    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
+    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));
+
+    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
+    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
+    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
+    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));
+
+    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
+    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
+    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
+    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));
+
+    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
+    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
+    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
+    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));
+    // Fifth pass writes back: read in argument order, the registers now cycle through one byte each of planes r, g, b, a.
+    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
+    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
+    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
+    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // 2-plane 16-bit de-interleave, in place: four unpacklo/unpackhi passes over the four registers.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);
+    // Every later pass repeats the pairing of the first one: chunk i is unpacked against chunk i+2.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);
+    // Fourth pass writes back: v_r0:v_r1 get the even-indexed and v_g0:v_g1 the odd-indexed 16-bit elements of the 32 inputs.
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // 3-plane 16-bit de-interleave, in place: four unpacklo/unpackhi passes over the six registers.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);
+    // Every later pass repeats the pairing of the first one: chunk i is unpacked against chunk i+3.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);
+
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);
+    // Fourth pass writes back: v_r* get 16-bit elements 0,3,6,..., v_g* elements 1,4,7,..., v_b* elements 2,5,8,... of the 48 inputs.
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
+}
+
+inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // In-place shuffle of 16-bit lanes across eight registers; argument order is r0, r1, g0, g1, b0, b1, a0, a1.
+    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
+    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
+    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
+    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
+    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
+    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
+    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
+    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);
+    // Every round zips register k with register k+4 (low halves, then high halves); rounds 2 and 3 repeat round 1.
+    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
+    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
+    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
+    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
+    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);
+    // NOTE(review): presumably the inputs are consecutive loads of 4-channel interleaved data and come back planar - confirm against callers.
+    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
+    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
+    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
+    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
+    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);
+    // Fourth and final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
+    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
+    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
+    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
+    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
+    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
+    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
+    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
+}
+
+#if CV_SSE4_1
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
+{   // In-place shuffle of 16-bit lanes across four registers, built from four even/odd split rounds.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+    // "x & v_mask" keeps the low 16 bits of each 32-bit lane (even 16-bit elements); "x >> 16" brings down the high 16 bits (odd elements).
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    // Every 32-bit value handed to _mm_packus_epi32 is already <= 0xffff, so the pack's unsigned saturation never changes data.
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+2.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
+                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
+{   // In-place shuffle of 16-bit lanes across six registers, built from four even/odd split rounds.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+    // "x & v_mask" keeps the low 16 bits of each 32-bit lane (even 16-bit elements); "x >> 16" brings down the high 16 bits (odd elements).
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+    // Every 32-bit value handed to _mm_packus_epi32 is already <= 0xffff, so the pack's unsigned saturation never changes data.
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+3.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+}
+
+inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
+                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
+{   // In-place shuffle of 16-bit lanes across eight registers, built from four even/odd split rounds.
+    __m128i v_mask = _mm_set1_epi32(0x0000ffff);
+    // "x & v_mask" keeps the low 16 bits of each 32-bit lane (even 16-bit elements); "x >> 16" brings down the high 16 bits (odd elements).
+    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
+    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
+    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
+    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
+    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
+    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
+    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
+    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));
+    // Every 32-bit value handed to _mm_packus_epi32 is already <= 0xffff, so the pack's unsigned saturation never changes data.
+    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
+    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
+    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
+    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
+    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
+    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
+    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
+    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+4.
+    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
+    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
+    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
+    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
+    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
+    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
+    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
+    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
+    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
+    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
+    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
+    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
+    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
+    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
+    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
+}
+
+#endif // CV_SSE4_1
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{   // In-place shuffle of 32-bit float lanes across four registers; argument order is r0, r1, g0, g1.
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);
+    // Every round zips register k with register k+2 (low halves, then high halves); three rounds in total.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);
+    // Third and final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{   // In-place shuffle of 32-bit float lanes across six registers; argument order is r0, r1, g0, g1, b0, b1.
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);
+    // Every round zips register k with register k+3 (low halves, then high halves); three rounds in total.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);
+    // Third and final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
+}
+
+inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{   // In-place shuffle of 32-bit float lanes across eight registers; argument order is r0, r1, g0, g1, b0, b1, a0, a1.
+    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
+    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
+    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
+    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
+    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
+    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
+    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
+    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);
+    // Every round zips register k with register k+4 (low halves, then high halves); three rounds in total.
+    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
+    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
+    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
+    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
+    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);
+    // Third and final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
+    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
+    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
+    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
+    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
+    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
+    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
+    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
+{   // In-place shuffle of 32-bit float lanes across four registers, built from three even/odd split rounds.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
+    // mask_lo gathers elements 0 and 2 of each operand (even lanes); mask_hi gathers elements 1 and 3 (odd lanes).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+2.
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
+                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
+{   // In-place shuffle of 32-bit float lanes across six registers, built from three even/odd split rounds.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
+    // mask_lo gathers elements 0 and 2 of each operand (even lanes); mask_hi gathers elements 1 and 3 (odd lanes).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+3.
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+}
+
+inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
+                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
+{   // In-place shuffle of 32-bit float lanes across eight registers, built from three even/odd split rounds.
+    enum { mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1) };
+    // mask_lo gathers elements 0 and 2 of each operand (even lanes); mask_hi gathers elements 1 and 3 (odd lanes).
+    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
+    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
+    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
+    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
+    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
+    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
+    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
+    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);
+    // Each round packs the even lanes of registers (2j, 2j+1) into chunk j and their odd lanes into chunk j+4.
+    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
+    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
+    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
+    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
+    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
+    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
+    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
+    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);
+    // Final round: results overwrite the caller's registers through the references.
+    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
+    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
+    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
+    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
+    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
+    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
+    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
+    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
+}
+
+#endif // CV_SSE2
+
+//! @}
+
+#endif //OPENCV_CORE_SSE_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/traits.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/traits.hpp
new file mode 100644
index 0000000..52ab083
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/traits.hpp
@@ -0,0 +1,417 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TRAITS_HPP
+#define OPENCV_CORE_TRAITS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+namespace cv
+{
+
+//#define OPENCV_TRAITS_ENABLE_DEPRECATED
+
+//! @addtogroup core_basic
+//! @{
+
+/** @brief Template "trait" class for OpenCV primitive data types.
+
+@note Deprecated. This is replaced by "single purpose" traits: traits::Type and traits::Depth
+
+A primitive OpenCV data type is one of unsigned char, bool, signed char, unsigned short, signed
+short, int, float, double, or a tuple of values of one of these types, where all the values in the
+tuple have the same type. Any primitive type from the list can be defined by an identifier in the
+form CV_\<bit-depth\>{U|S|F}C(\<number_of_channels\>), for example: uchar \~ CV_8UC1, 3-element
+floating-point tuple \~ CV_32FC3, and so on. A universal OpenCV structure that is able to store a
+single instance of such a primitive data type is Vec. Multiple instances of such a type can be
+stored in a std::vector, Mat, Mat_, SparseMat, SparseMat_, or any other container that is able to
+store Vec instances.
+
+The DataType class is basically used to provide a description of such primitive data types without
+adding any fields or methods to the corresponding classes (and it is actually impossible to add
+anything to primitive C/C++ data types). This technique is known in C++ as class traits. It is not
+DataType itself that is used but its specialized versions, such as:
+@code
+    template<> class DataType<uchar>
+    {
+        typedef uchar value_type;
+        typedef int work_type;
+        typedef uchar channel_type;
+        enum { depth = CV_8U, channels = 1, fmt='u', type = CV_8U };
+    };
+    ...
+    template<typename _Tp> class DataType<std::complex<_Tp> >
+    {
+        typedef std::complex<_Tp> value_type;
+        typedef std::complex<_Tp> work_type;
+        typedef _Tp channel_type;
+        // DataDepth is another helper trait class
+        enum { depth = DataDepth<_Tp>::value, channels=2,
+            fmt=(channels-1)*256+DataDepth<_Tp>::fmt,
+            type=CV_MAKETYPE(depth, channels) };
+    };
+    ...
+@endcode
+The main purpose of this class is to convert compilation-time type information to an
+OpenCV-compatible data type identifier, for example:
+@code
+    // allocates a 30x40 floating-point matrix
+    Mat A(30, 40, DataType<float>::type);
+
+    Mat B = Mat_<std::complex<double> >(3, 3);
+    // the statement below will print 6, 2, that is depth == CV_64F, channels == 2
+    cout << B.depth() << ", " << B.channels() << endl;
+@endcode
+So, such traits are used to tell OpenCV which data type you are working with, even if such a type is
+not native to OpenCV. For example, the matrix B initialization above is compiled because OpenCV
+defines the proper specialized template class DataType\<complex\<_Tp\> \> . This mechanism is also
+useful (and used in OpenCV this way) for generic algorithms implementations.
+
+@note Default values were dropped to stop confusing developers about the use of unsupported types (see #7599)
+*/
+template<typename _Tp> class DataType
+{
+public:
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+    // Legacy catch-all, compiled only on request: an unspecialized _Tp is tagged generic with depth -1.
+    typedef _Tp value_type;
+    typedef _Tp work_type;
+    typedef _Tp channel_type;
+    typedef _Tp vec_type;
+    enum
+    {
+        generic_type = 1, depth = -1, channels = 1, fmt = 0,
+        type = CV_MAKETYPE(depth, channels)
+    };
+#endif
+};
+
+template<> class DataType<bool>  // bool elements are described as single-channel CV_8U
+{
+public:
+    typedef bool value_type;
+    typedef int  work_type;
+    typedef bool channel_type;
+    typedef bool vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_8U, channels = 1,
+        fmt  = static_cast<int>('u'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<uchar>  // uchar elements are described as single-channel CV_8U
+{
+public:
+    typedef uchar value_type;
+    typedef int   work_type;
+    typedef uchar channel_type;
+    typedef uchar vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_8U, channels = 1,
+        fmt  = static_cast<int>('u'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<schar>  // schar elements are described as single-channel CV_8S
+{
+public:
+    typedef schar value_type;
+    typedef int   work_type;
+    typedef schar channel_type;
+    typedef schar vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_8S, channels = 1,
+        fmt  = static_cast<int>('c'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<char>  // plain char shares the schar description (value_type is schar, depth CV_8S)
+{
+public:
+    typedef schar value_type;
+    typedef int   work_type;
+    typedef schar channel_type;
+    typedef schar vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_8S, channels = 1,
+        fmt  = static_cast<int>('c'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<ushort>  // ushort elements are described as single-channel CV_16U
+{
+public:
+    typedef ushort value_type;
+    typedef int    work_type;
+    typedef ushort channel_type;
+    typedef ushort vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_16U, channels = 1,
+        fmt  = static_cast<int>('w'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<short>  // short elements are described as single-channel CV_16S
+{
+public:
+    typedef short value_type;
+    typedef int   work_type;
+    typedef short channel_type;
+    typedef short vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_16S, channels = 1,
+        fmt  = static_cast<int>('s'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<int>  // int elements are described as single-channel CV_32S
+{
+public:
+    typedef int value_type;
+    typedef int work_type;
+    typedef int channel_type;
+    typedef int vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_32S, channels = 1,
+        fmt  = static_cast<int>('i'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<float>  // float elements are described as single-channel CV_32F
+{
+public:
+    typedef float value_type;
+    typedef float work_type;
+    typedef float channel_type;
+    typedef float vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_32F, channels = 1,
+        fmt  = static_cast<int>('f'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<double>  // double elements are described as single-channel CV_64F
+{
+public:
+    typedef double value_type;
+    typedef double work_type;
+    typedef double channel_type;
+    typedef double vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_64F, channels = 1,
+        fmt  = static_cast<int>('d'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+template<> class DataType<float16_t>  // half-precision elements: depth CV_16F, with float as the work type
+{
+public:
+    typedef float16_t value_type;
+    typedef float     work_type;
+    typedef float16_t channel_type;
+    typedef float16_t vec_type;
+    enum
+    {
+        generic_type = 0, depth = CV_16F, channels = 1,
+        fmt  = static_cast<int>('h'),
+        type = CV_MAKETYPE(depth, channels)
+    };
+};
+
+/** @brief A helper class for cv::DataType
+
+The class is specialized for each fundamental numerical data type supported by OpenCV. It provides
+DataDepth<T>::value constant.
+*/
+template<typename _Tp> class DataDepth
+{
+public:
+    // value: depth code of _Tp; fmt: format code of _Tp.
+    // Both are forwarded unchanged from DataType<_Tp>.
+    enum { value = DataType<_Tp>::depth,
+           fmt   = DataType<_Tp>::fmt
+         };
+};
+
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+
+template<int _depth> struct TypeDepth  // deprecated reverse trait: depth code -> element type
+{
+#ifdef OPENCV_TRAITS_ENABLE_LEGACY_DEFAULTS
+    enum { depth = CV_USRTYPE1 };
+    typedef void value_type;
+#endif
+};
+// Declared as struct so that `depth` and `value_type` are public; with `class` and no access specifier they were private and unusable.
+template<> struct TypeDepth<CV_8U>
+{
+    enum { depth = CV_8U };
+    typedef uchar value_type;
+};
+
+template<> struct TypeDepth<CV_8S>
+{
+    enum { depth = CV_8S };
+    typedef schar value_type;
+};
+
+template<> struct TypeDepth<CV_16U>
+{
+    enum { depth = CV_16U };
+    typedef ushort value_type;
+};
+
+template<> struct TypeDepth<CV_16S>
+{
+    enum { depth = CV_16S };
+    typedef short value_type;
+};
+
+template<> struct TypeDepth<CV_32S>
+{
+    enum { depth = CV_32S };
+    typedef int value_type;
+};
+
+template<> struct TypeDepth<CV_32F>
+{
+    enum { depth = CV_32F };
+    typedef float value_type;
+};
+
+template<> struct TypeDepth<CV_64F>
+{
+    enum { depth = CV_64F };
+    typedef double value_type;
+};
+
+template<> struct TypeDepth<CV_16F>
+{
+    enum { depth = CV_16F };
+    typedef float16_t value_type;
+};
+
+#endif
+
+//! @}
+
+namespace traits {
+
+namespace internal { // compile-time "does T have a member named X?" detectors
+#define CV_CREATE_MEMBER_CHECK(X) /* defines CheckMember_X<T>; T must be a class type because Derived inherits from it */ \
+template<typename T> class CheckMember_##X { \
+    struct Fallback { int X; }; /* always provides a member named X */ \
+    struct Derived : T, Fallback { }; /* lookup of Derived::X is ambiguous exactly when T also has a member X */ \
+    template<typename U, U> struct Check; \
+    typedef char CV_NO[1]; \
+    typedef char CV_YES[2]; \
+    template<typename U> static CV_NO & func(Check<int Fallback::*, &U::X> *); /* viable only when &U::X resolves to Fallback::X */ \
+    template<typename U> static CV_YES & func(...); /* fallback chosen when the overload above is removed by substitution failure */ \
+public: \
+    typedef CheckMember_##X type; \
+    enum { value = sizeof(func<Derived>(0)) == sizeof(CV_YES) }; /* 1 when T has a member named X, 0 otherwise */ \
+};
+// Detectors for the two member names probed by the SafeFmt and SafeType traits.
+CV_CREATE_MEMBER_CHECK(fmt)
+CV_CREATE_MEMBER_CHECK(type)
+
+} // namespace internal
+
+
+// traits::Depth<T>::value is the depth code published by DataType<T>.
+template<typename T> struct Depth
+{ enum { value = DataType<T>::depth }; };
+
+// traits::Type<T>::value is the full type code published by DataType<T>.
+template<typename T> struct Type
+{ enum { value = DataType<T>::type }; };
+
+/** Similar to traits::Type<T> but has value = -1 in case of unknown type (instead of compiler error) */
+// The primary template is deliberately empty: the defaulted bool is either true or false,
+// so one of the two partial specializations below always provides `value`.
+template<typename T, bool available = internal::CheckMember_type< DataType<T> >::value >
+struct SafeType {};
+
+template<typename T> struct SafeType<T, false>  // DataType<T> has no `type` member
+{ enum { value = -1 }; };
+
+template<typename T> struct SafeType<T, true>   // DataType<T> has a `type` member: forward it
+{ enum { value = DataType<T>::type }; };
+
+
+template<typename T, bool available = internal::CheckMember_fmt< DataType<T> >::value >
+struct SafeFmt {};
+
+template<typename T>
+struct SafeFmt<T, false>
+{ enum { fmt = 0 }; };
+
+template<typename T>
+struct SafeFmt<T, true>
+{ enum { fmt = DataType<T>::fmt }; };
+
+
+} // namespace
+
+} // cv
+
+#endif // OPENCV_CORE_TRAITS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types.hpp
new file mode 100644
index 0000000..2867520
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types.hpp
@@ -0,0 +1,2439 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TYPES_HPP
+#define OPENCV_CORE_TYPES_HPP
+
+#ifndef __cplusplus
+#  error types.hpp header must be compiled as C++
+#endif
+
+#include <climits>
+#include <cfloat>
+#include <vector>
+#include <limits>
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/matx.hpp"
+
+namespace cv
+{
+
+//! @addtogroup core_basic
+//! @{
+
+//////////////////////////////// Complex //////////////////////////////
+
+/** @brief  A complex number class.
+
+  The template class is similar to and compatible with std::complex; however, it provides slightly
+  more convenient access to the real and imaginary parts through simple field access, as opposed
+  to std::complex::real() and std::complex::imag().
+*/
+template<typename _Tp> class Complex
+{
+public:
+
+    //! default constructor
+    Complex();
+    Complex( _Tp _re, _Tp _im = 0 ); //!< constructor taking the real part and an optional imaginary part (defaults to 0)
+
+    //! conversion to another data type
+    template<typename T2> operator Complex<T2>() const;
+    //! conjugation
+    Complex conj() const;
+
+    _Tp re, im; //!< the real and the imaginary parts
+};
+
+typedef Complex<float> Complexf;
+typedef Complex<double> Complexd;
+
+template<typename _Tp> class DataType< Complex<_Tp> >
+{
+public:
+    typedef Complex<_Tp> value_type;
+    typedef value_type   work_type;
+    typedef _Tp          channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+    };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Complex<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Complex<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Point_ ////////////////////////////////
+
+/** @brief Template class for 2D points specified by its coordinates `x` and `y`.
+
+An instance of the class is interchangeable with C structures, CvPoint and CvPoint2D32f . There is
+also a cast operator to convert point coordinates to the specified type. The conversion from
+floating-point coordinates to integer coordinates is done by rounding. Commonly, the conversion
+uses this operation for each of the coordinates. Besides the class members listed in the
+declaration above, the following operations on points are implemented:
+@code
+    pt1 = pt2 + pt3;
+    pt1 = pt2 - pt3;
+    pt1 = pt2 * a;
+    pt1 = a * pt2;
+    pt1 = pt2 / a;
+    pt1 += pt2;
+    pt1 -= pt2;
+    pt1 *= a;
+    pt1 /= a;
+    double value = norm(pt); // L2 norm
+    pt1 == pt2;
+    pt1 != pt2;
+@endcode
+For your convenience, the following type aliases are defined:
+@code
+    typedef Point_<int> Point2i;
+    typedef Point2i Point;
+    typedef Point_<float> Point2f;
+    typedef Point_<double> Point2d;
+@endcode
+Example:
+@code
+    Point2f a(0.3f, 0.f), b(0.f, 0.4f);
+    Point pt = (a + b)*10.f;
+    cout << pt.x << ", " << pt.y << endl;
+@endcode
+*/
+template<typename _Tp> class Point_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Point_();
+    Point_(_Tp _x, _Tp _y);
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+    Point_(const Point_& pt);
+    Point_(Point_&& pt) CV_NOEXCEPT = default;
+#elif OPENCV_ABI_COMPATIBILITY < 500
+    Point_(const Point_& pt) = default;
+    Point_(Point_&& pt) CV_NOEXCEPT = default;
+#endif
+    Point_(const Size_<_Tp>& sz);
+    Point_(const Vec<_Tp, 2>& v);
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+    Point_& operator = (const Point_& pt);
+    Point_& operator = (Point_&& pt) CV_NOEXCEPT = default;
+#elif OPENCV_ABI_COMPATIBILITY < 500
+    Point_& operator = (const Point_& pt) = default;
+    Point_& operator = (Point_&& pt) CV_NOEXCEPT = default;
+#endif
+    //! conversion to another data type
+    template<typename _Tp2> operator Point_<_Tp2>() const;
+
+    //! conversion to cv::Vec<>
+    operator Vec<_Tp, 2>() const;
+
+    //! dot product
+    _Tp dot(const Point_& pt) const;
+    //! dot product computed in double-precision arithmetic
+    double ddot(const Point_& pt) const;
+    //! cross-product
+    double cross(const Point_& pt) const;
+    //! checks whether the point is inside the specified rectangle
+    bool inside(const Rect_<_Tp>& r) const;
+    _Tp x; //!< x coordinate of the point
+    _Tp y; //!< y coordinate of the point
+};
+
+typedef Point_<int> Point2i;
+typedef Point_<int64> Point2l;
+typedef Point_<float> Point2f;
+typedef Point_<double> Point2d;
+typedef Point2i Point;
+
+template<typename _Tp> class DataType< Point_<_Tp> >
+{
+public:
+    typedef Point_<_Tp>                               value_type;
+    typedef Point_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                       channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Point_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Point_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Point3_ ////////////////////////////////
+
+/** @brief Template class for 3D points specified by its coordinates `x`, `y` and `z`.
+
+An instance of the class is interchangeable with the C structure CvPoint3D32f . Similarly to
+Point_ , the coordinates of 3D points can be converted to another type. The vector arithmetic and
+comparison operations are also supported.
+
+The following Point3_\<\> aliases are available:
+@code
+    typedef Point3_<int> Point3i;
+    typedef Point3_<float> Point3f;
+    typedef Point3_<double> Point3d;
+@endcode
+@see cv::Point3i, cv::Point3f and cv::Point3d
+*/
+template<typename _Tp> class Point3_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Point3_();
+    Point3_(_Tp _x, _Tp _y, _Tp _z);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Point3_(const Point3_& pt) = default;
+    Point3_(Point3_&& pt) CV_NOEXCEPT = default;
+#endif
+    explicit Point3_(const Point_<_Tp>& pt);
+    Point3_(const Vec<_Tp, 3>& v);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Point3_& operator = (const Point3_& pt) = default;
+    Point3_& operator = (Point3_&& pt) CV_NOEXCEPT = default;
+#endif
+    //! conversion to another data type
+    template<typename _Tp2> operator Point3_<_Tp2>() const;
+    //! conversion to cv::Vec<>
+    operator Vec<_Tp, 3>() const;
+
+    //! dot product
+    _Tp dot(const Point3_& pt) const;
+    //! dot product computed in double-precision arithmetics
+    double ddot(const Point3_& pt) const;
+    //! cross product of the 2 3D points
+    Point3_ cross(const Point3_& pt) const;
+    _Tp x; //!< x coordinate of the 3D point
+    _Tp y; //!< y coordinate of the 3D point
+    _Tp z; //!< z coordinate of the 3D point
+};
+
+typedef Point3_<int> Point3i;
+typedef Point3_<float> Point3f;
+typedef Point3_<double> Point3d;
+
+template<typename _Tp> class DataType< Point3_<_Tp> >
+{
+public:
+    typedef Point3_<_Tp>                               value_type;
+    typedef Point3_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 3,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Point3_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Point3_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 3) }; };
+} // namespace
+
+//////////////////////////////// Size_ ////////////////////////////////
+
+/** @brief Template class for specifying the size of an image or rectangle.
+
+The class includes two members called width and height. The structure can be converted to and from
+the old OpenCV structures CvSize and CvSize2D32f . The same set of arithmetic and comparison
+operations as for Point_ is available.
+
+OpenCV defines the following Size_\<\> aliases:
+@code
+    typedef Size_<int> Size2i;
+    typedef Size2i Size;
+    typedef Size_<float> Size2f;
+@endcode
+*/
+template<typename _Tp> class Size_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Size_();
+    Size_(_Tp _width, _Tp _height);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Size_(const Size_& sz) = default;
+    Size_(Size_&& sz) CV_NOEXCEPT = default;
+#endif
+    Size_(const Point_<_Tp>& pt);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Size_& operator = (const Size_& sz) = default;
+    Size_& operator = (Size_&& sz) CV_NOEXCEPT = default;
+#endif
+    //! the area (width*height)
+    _Tp area() const;
+    //! aspect ratio (width/height)
+    double aspectRatio() const;
+    //! true if empty
+    bool empty() const;
+
+    //! conversion to another data type.
+    template<typename _Tp2> operator Size_<_Tp2>() const;
+
+    _Tp width; //!< the width
+    _Tp height; //!< the height
+};
+
+typedef Size_<int> Size2i;
+typedef Size_<int64> Size2l;
+typedef Size_<float> Size2f;
+typedef Size_<double> Size2d;
+typedef Size2i Size;
+
+template<typename _Tp> class DataType< Size_<_Tp> >
+{
+public:
+    typedef Size_<_Tp>                               value_type;
+    typedef Size_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                      channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Size_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Size_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 2) }; };
+} // namespace
+
+//////////////////////////////// Rect_ ////////////////////////////////
+
+/** @brief Template class for 2D rectangles
+
+described by the following parameters:
+-   Coordinates of the top-left corner. This is a default interpretation of Rect_::x and Rect_::y
+    in OpenCV. Though, in your algorithms you may count x and y from the bottom-left corner.
+-   Rectangle width and height.
+
+OpenCV typically assumes that the top and left boundary of the rectangle are inclusive, while the
+right and bottom boundaries are not. For example, the method Rect_::contains returns true if
+
+\f[x  \leq pt.x < x+width,
+      y  \leq pt.y < y+height\f]
+
+Virtually every loop over an image ROI in OpenCV (where ROI is specified by Rect_\<int\> ) is
+implemented as:
+@code
+    for(int y = roi.y; y < roi.y + roi.height; y++)
+        for(int x = roi.x; x < roi.x + roi.width; x++)
+        {
+            // ...
+        }
+@endcode
+In addition to the class members, the following operations on rectangles are implemented:
+-   \f$\texttt{rect} = \texttt{rect} \pm \texttt{point}\f$ (shifting a rectangle by a certain offset)
+-   \f$\texttt{rect} = \texttt{rect} \pm \texttt{size}\f$ (expanding or shrinking a rectangle by a
+    certain amount)
+-   rect += point, rect -= point, rect += size, rect -= size (augmenting operations)
+-   rect = rect1 & rect2 (rectangle intersection)
+-   rect = rect1 | rect2 (minimum area rectangle containing rect1 and rect2 )
+-   rect &= rect1, rect |= rect1 (and the corresponding augmenting operations)
+-   rect == rect1, rect != rect1 (rectangle comparison)
+
+This is an example of how a partial ordering on rectangles can be established (rect1 \f$\subseteq\f$
+rect2):
+@code
+    template<typename _Tp> inline bool
+    operator <= (const Rect_<_Tp>& r1, const Rect_<_Tp>& r2)
+    {
+        return (r1 & r2) == r1;
+    }
+@endcode
+For your convenience, the Rect_\<\> alias is available: cv::Rect
+*/
+template<typename _Tp> class Rect_
+{
+public:
+    typedef _Tp value_type;
+
+    //! default constructor
+    Rect_();
+    Rect_(_Tp _x, _Tp _y, _Tp _width, _Tp _height);
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Rect_(const Rect_& r) = default;
+    Rect_(Rect_&& r) CV_NOEXCEPT = default;
+#endif
+    Rect_(const Point_<_Tp>& org, const Size_<_Tp>& sz);
+    Rect_(const Point_<_Tp>& pt1, const Point_<_Tp>& pt2);
+
+#if OPENCV_ABI_COMPATIBILITY < 500
+    Rect_& operator = (const Rect_& r) = default;
+    Rect_& operator = (Rect_&& r) CV_NOEXCEPT = default;
+#endif
+    //! the top-left corner
+    Point_<_Tp> tl() const;
+    //! the bottom-right corner
+    Point_<_Tp> br() const;
+
+    //! size (width, height) of the rectangle
+    Size_<_Tp> size() const;
+    //! area (width*height) of the rectangle
+    _Tp area() const;
+    //! true if empty
+    bool empty() const;
+
+    //! conversion to another data type
+    template<typename _Tp2> operator Rect_<_Tp2>() const;
+
+    //! checks whether the rectangle contains the point
+    bool contains(const Point_<_Tp>& pt) const;
+
+    _Tp x; //!< x coordinate of the top-left corner
+    _Tp y; //!< y coordinate of the top-left corner
+    _Tp width; //!< width of the rectangle
+    _Tp height; //!< height of the rectangle
+};
+
+typedef Rect_<int> Rect2i;
+typedef Rect_<float> Rect2f;
+typedef Rect_<double> Rect2d;
+typedef Rect2i Rect;
+
+template<typename _Tp> class DataType< Rect_<_Tp> >
+{
+public:
+    typedef Rect_<_Tp>                               value_type;
+    typedef Rect_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                      channel_type;
+
+    enum { generic_type = 0,
+           channels     = 4,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Rect_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Rect_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 4) }; };
+} // namespace
+
+///////////////////////////// RotatedRect /////////////////////////////
+
+/** @brief The class represents rotated (i.e. not up-right) rectangles on a plane.
+
+Each rectangle is specified by the center point (mass center), length of each side (represented by
+#Size2f structure) and the rotation angle in degrees.
+
+The sample below demonstrates how to use RotatedRect:
+@snippet snippets/core_various.cpp RotatedRect_demo
+![image](pics/rotatedrect.png)
+
+@sa CamShift, fitEllipse, minAreaRect, CvBox2D
+*/
+class CV_EXPORTS RotatedRect
+{
+public:
+    //! default constructor
+    RotatedRect();
+    /** full constructor
+    @param center The rectangle mass center.
+    @param size Width and height of the rectangle.
+    @param angle The rotation angle in a clockwise direction. When the angle is 0, 90, 180, 270 etc.,
+    the rectangle becomes an up-right rectangle.
+    */
+    RotatedRect(const Point2f& center, const Size2f& size, float angle);
+    /**
+    Constructs from any 3 end points of the RotatedRect. They must be given in order (either
+    clockwise or anticlockwise).
+     */
+    RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3);
+
+    /** returns 4 vertices of the rectangle
+    @param pts The points array for storing rectangle vertices. The order is bottomLeft, topLeft, topRight, bottomRight.
+    */
+    void points(Point2f pts[]) const;
+    //! returns the minimal up-right integer rectangle containing the rotated rectangle
+    Rect boundingRect() const;
+    //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images
+    Rect_<float> boundingRect2f() const;
+    //! the rectangle mass center
+    Point2f center;
+    //! width and height of the rectangle
+    Size2f size;
+    //! the rotation angle. When the angle is 0, 90, 180, 270 etc., the rectangle becomes an up-right rectangle.
+    float angle;
+};
+
+template<> class DataType< RotatedRect >
+{
+public:
+    typedef RotatedRect  value_type;
+    typedef value_type   work_type;
+    typedef float        channel_type;
+
+    enum { generic_type = 0,
+           channels     = (int)sizeof(value_type)/sizeof(channel_type), // 5
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<>
+struct Depth< RotatedRect > { enum { value = Depth<float>::value }; };
+template<>
+struct Type< RotatedRect > { enum { value = CV_MAKETYPE(Depth<float>::value, (int)sizeof(RotatedRect)/sizeof(float)) }; };
+} // namespace
+
+
+//////////////////////////////// Range /////////////////////////////////
+
+/** @brief Class specifying a continuous subsequence (slice) of a sequence.
+
+The class is used to specify a row or a column span in a matrix ( Mat ) and for many other purposes.
+Range(a,b) is basically the same as a:b in Matlab or a..b in Python. As in Python, start is an
+inclusive left boundary of the range and end is an exclusive right boundary of the range. Such a
+half-opened interval is usually denoted as \f$[start,end)\f$ .
+
+The static method Range::all() returns a special variable that means "the whole sequence" or "the
+whole range", just like " : " in Matlab or " ... " in Python. All the methods and functions in
+OpenCV that take Range support this special Range::all() value. But, of course, in case of your own
+custom processing, you will probably have to check and handle it explicitly:
+@code
+    void my_function(..., const Range& r, ....)
+    {
+        if(r == Range::all()) {
+            // process all the data
+        }
+        else {
+            // process [r.start, r.end)
+        }
+    }
+@endcode
+*/
+class CV_EXPORTS Range
+{
+public:
+    Range();
+    Range(int _start, int _end);
+    int size() const;
+    bool empty() const;
+    static Range all(); //!< the special value meaning "the whole sequence"
+
+    int start, end; //!< inclusive left boundary and exclusive right boundary of the range
+};
+
+template<> class DataType<Range>
+{
+public:
+    typedef Range      value_type;
+    typedef value_type work_type;
+    typedef int        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 2,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<>
+struct Depth< Range > { enum { value = Depth<int>::value }; };
+template<>
+struct Type< Range > { enum { value = CV_MAKETYPE(Depth<int>::value, 2) }; };
+} // namespace
+
+
+//////////////////////////////// Scalar_ ///////////////////////////////
+
+/** @brief Template class for a 4-element vector derived from Vec.
+
+Being derived from Vec\<_Tp, 4\> , Scalar\_ and Scalar can be used just as typical 4-element
+vectors. In addition, they can be converted to/from CvScalar . The type Scalar is widely used in
+OpenCV to pass pixel values.
+*/
+template<typename _Tp> class Scalar_ : public Vec<_Tp, 4>
+{
+public:
+    //! default constructor
+    Scalar_();
+    Scalar_(_Tp v0, _Tp v1, _Tp v2=0, _Tp v3=0);
+    Scalar_(_Tp v0);
+
+    Scalar_(const Scalar_& s);
+    Scalar_(Scalar_&& s) CV_NOEXCEPT;
+
+    Scalar_& operator=(const Scalar_& s);
+    Scalar_& operator=(Scalar_&& s) CV_NOEXCEPT;
+
+    template<typename _Tp2, int cn>
+    Scalar_(const Vec<_Tp2, cn>& v);
+
+    //! returns a scalar with all elements set to v0
+    static Scalar_<_Tp> all(_Tp v0);
+
+    //! conversion to another data type
+    template<typename T2> operator Scalar_<T2>() const;
+
+    //! per-element product
+    Scalar_<_Tp> mul(const Scalar_<_Tp>& a, double scale=1 ) const;
+
+    //! returns (v0, -v1, -v2, -v3)
+    Scalar_<_Tp> conj() const;
+
+    //! returns true iff v1 == v2 == v3 == 0
+    bool isReal() const;
+};
+
+typedef Scalar_<double> Scalar;
+
+template<typename _Tp> class DataType< Scalar_<_Tp> >
+{
+public:
+    typedef Scalar_<_Tp>                               value_type;
+    typedef Scalar_<typename DataType<_Tp>::work_type> work_type;
+    typedef _Tp                                        channel_type;
+
+    enum { generic_type = 0,
+           channels     = 4,
+           fmt          = traits::SafeFmt<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+template<typename _Tp>
+struct Depth< Scalar_<_Tp> > { enum { value = Depth<_Tp>::value }; };
+template<typename _Tp>
+struct Type< Scalar_<_Tp> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, 4) }; };
+} // namespace
+
+
+/////////////////////////////// KeyPoint ////////////////////////////////
+
+/** @brief Data structure for salient point detectors.
+
+The class instance stores a keypoint, i.e. a point feature found by one of many available keypoint
+detectors, such as Harris corner detector, #FAST, %StarDetector, %SURF, %SIFT etc.
+
+The keypoint is characterized by the 2D position, scale (proportional to the diameter of the
+neighborhood that needs to be taken into account), orientation and some other parameters. The
+keypoint neighborhood is then analyzed by another algorithm that builds a descriptor (usually
+represented as a feature vector). The keypoints representing the same object in different images
+can then be matched using %KDTree or another method.
+*/
+class CV_EXPORTS_W_SIMPLE KeyPoint
+{
+public:
+    //! the default constructor
+    CV_WRAP KeyPoint();
+    /**
+    @param pt x & y coordinates of the keypoint
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+     */
+    KeyPoint(Point2f pt, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
+    /**
+    @param x x-coordinate of the keypoint
+    @param y y-coordinate of the keypoint
+    @param size keypoint diameter
+    @param angle keypoint orientation
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+     */
+    CV_WRAP KeyPoint(float x, float y, float size, float angle=-1, float response=0, int octave=0, int class_id=-1);
+
+    size_t hash() const;
+
+    /**
+    This method converts a vector of keypoints to a vector of points or the reverse, where each keypoint is
+    assigned the same size and the same orientation.
+
+    @param keypoints Keypoints obtained from any feature detection algorithm like SIFT/SURF/ORB
+    @param points2f Array of (x,y) coordinates of each keypoint
+    @param keypointIndexes Array of indexes of keypoints to be converted to points. (Acts like a mask to
+    convert only specified keypoints)
+    */
+    CV_WRAP static void convert(const std::vector<KeyPoint>& keypoints,
+                                CV_OUT std::vector<Point2f>& points2f,
+                                const std::vector<int>& keypointIndexes=std::vector<int>());
+    /** @overload
+    @param points2f Array of (x,y) coordinates of each keypoint
+    @param keypoints Keypoints obtained from any feature detection algorithm like SIFT/SURF/ORB
+    @param size keypoint diameter
+    @param response keypoint detector response on the keypoint (that is, strength of the keypoint)
+    @param octave pyramid octave in which the keypoint has been detected
+    @param class_id object id
+    */
+    CV_WRAP static void convert(const std::vector<Point2f>& points2f,
+                                CV_OUT std::vector<KeyPoint>& keypoints,
+                                float size=1, float response=1, int octave=0, int class_id=-1);
+
+    /**
+    This method computes the overlap for a pair of keypoints. Overlap is the ratio between the area of the
+    keypoint regions' intersection and the area of their union (considering a keypoint region as a circle).
+    If they don't overlap, we get zero. If they coincide at same location with same size, we get 1.
+    @param kp1 First keypoint
+    @param kp2 Second keypoint
+    */
+    CV_WRAP static float overlap(const KeyPoint& kp1, const KeyPoint& kp2);
+
+    CV_PROP_RW Point2f pt; //!< coordinates of the keypoint
+    CV_PROP_RW float size; //!< diameter of the meaningful keypoint neighborhood
+    CV_PROP_RW float angle; //!< computed orientation of the keypoint (-1 if not applicable);
+                            //!< it's in [0,360) degrees and measured relative to
+                            //!< the image coordinate system, i.e. clockwise.
+    CV_PROP_RW float response; //!< the response by which the strongest keypoints have been selected. Can be used for further sorting or subsampling
+    CV_PROP_RW int octave; //!< octave (pyramid layer) from which the keypoint has been extracted
+    CV_PROP_RW int class_id; //!< object class (if the keypoints need to be clustered by an object they belong to)
+};
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+template<> class DataType<KeyPoint>
+{
+public:
+    typedef KeyPoint      value_type;
+    typedef float         work_type;
+    typedef float         channel_type;
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 7
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+           type         = CV_MAKETYPE(depth, channels)
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+#endif
+
+
+//////////////////////////////// DMatch /////////////////////////////////
+
+/** @brief Class for matching keypoint descriptors
+
+query descriptor index, train descriptor index, train image index, and distance between
+descriptors.
+*/
+class CV_EXPORTS_W_SIMPLE DMatch
+{
+public:
+    CV_WRAP DMatch();
+    CV_WRAP DMatch(int _queryIdx, int _trainIdx, float _distance);
+    CV_WRAP DMatch(int _queryIdx, int _trainIdx, int _imgIdx, float _distance);
+
+    CV_PROP_RW int queryIdx; //!< query descriptor index
+    CV_PROP_RW int trainIdx; //!< train descriptor index
+    CV_PROP_RW int imgIdx;   //!< train image index
+
+    CV_PROP_RW float distance; //!< distance between descriptors
+
+    // ordering by match quality: less is better
+    bool operator<(const DMatch &m) const;
+};
+
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+template<> class DataType<DMatch>
+{
+public:
+    typedef DMatch      value_type;
+    typedef int         work_type;
+    typedef int         channel_type;
+
+    enum { generic_type = 0,
+           depth        = DataType<channel_type>::depth,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 4
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8),
+           type         = CV_MAKETYPE(depth, channels)
+         };
+
+    typedef Vec<channel_type, channels> vec_type;
+};
+#endif
+
+
+///////////////////////////// TermCriteria //////////////////////////////
+
+/** @brief The class defining termination criteria for iterative algorithms.
+
+You can initialize it by default constructor and then override any parameters, or the structure may
+be fully initialized using the advanced variant of the constructor.
+*/
+class CV_EXPORTS TermCriteria
+{
+public:
+    /**
+      Criteria type, can be one of: COUNT, EPS or COUNT + EPS
+    */
+    enum Type
+    {
+        COUNT=1, //!< the maximum number of iterations or elements to compute
+        MAX_ITER=COUNT, //!< ditto
+        EPS=2 //!< the desired accuracy or change in parameters at which the iterative algorithm stops
+    };
+
+    //! default constructor
+    TermCriteria();
+    /**
+    @param type The type of termination criteria, one of TermCriteria::Type
+    @param maxCount The maximum number of iterations or elements to compute.
+    @param epsilon The desired accuracy or change in parameters at which the iterative algorithm stops.
+    */
+    TermCriteria(int type, int maxCount, double epsilon);
+
+    inline bool isValid() const
+    {
+        const bool isCount = (type & COUNT) && maxCount > 0; // COUNT bit set and a positive iteration limit
+        const bool isEps = (type & EPS) && !cvIsNaN(epsilon); // EPS bit set and epsilon is not NaN
+        return isCount || isEps; // valid when at least one usable criterion is present
+    }
+
+    int type; //!< the type of termination criteria: COUNT, EPS or COUNT + EPS
+    int maxCount; //!< the maximum number of iterations/elements
+    double epsilon; //!< the desired accuracy
+};
+
+
+//! @} core_basic
+
+///////////////////////// raster image moments //////////////////////////
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @brief struct returned by cv::moments
+
+The spatial moments \f$\texttt{Moments::m}_{ji}\f$ are computed as:
+
+\f[\texttt{m} _{ji}= \sum _{x,y}  \left ( \texttt{array} (x,y)  \cdot x^j  \cdot y^i \right )\f]
+
+The central moments \f$\texttt{Moments::mu}_{ji}\f$ are computed as:
+
+\f[\texttt{mu} _{ji}= \sum _{x,y}  \left ( \texttt{array} (x,y)  \cdot (x -  \bar{x} )^j  \cdot (y -  \bar{y} )^i \right )\f]
+
+where \f$(\bar{x}, \bar{y})\f$ is the mass center:
+
+\f[\bar{x} = \frac{\texttt{m}_{10}}{\texttt{m}_{00}} , \; \bar{y} = \frac{\texttt{m}_{01}}{\texttt{m}_{00}}\f]
+
+The normalized central moments \f$\texttt{Moments::nu}_{ji}\f$ are computed as:
+
+\f[\texttt{nu} _{ji}= \frac{\texttt{mu}_{ji}}{\texttt{m}_{00}^{(i+j)/2+1}} .\f]
+
+@note
+\f$\texttt{mu}_{00}=\texttt{m}_{00}\f$, \f$\texttt{nu}_{00}=1\f$
+\f$\texttt{nu}_{10}=\texttt{nu}_{01}=\texttt{mu}_{10}=\texttt{mu}_{01}=0\f$ , hence the values are not
+stored.
+
+The moments of a contour are defined in the same way but computed using the Green's formula (see
+<http://en.wikipedia.org/wiki/Green_theorem>). So, due to a limited raster resolution, the moments
+computed for a contour are slightly different from the moments computed for the same rasterized
+contour.
+
+@note
+Since the contour moments are computed using Green formula, you may get seemingly odd results for
+contours with self-intersections, e.g. a zero area (m00) for butterfly-shaped contours.
+ */
+// Aggregate of 24 doubles: 10 spatial, 7 central and 7 normalized central moments.
+// DataType<Moments> and traits::Type<Moments> derive their channel count from sizeof(Moments),
+// so adding or removing a field changes the reported type.
+class CV_EXPORTS_W_MAP Moments
+{
+public:
+    //! the default constructor
+    Moments();
+    //! the full constructor; takes the ten spatial moments only
+    Moments(double m00, double m10, double m01, double m20, double m11,
+            double m02, double m30, double m21, double m12, double m03 );
+    ////! the conversion from CvMoments
+    //Moments( const CvMoments& moments );
+    ////! the conversion to CvMoments
+    //operator CvMoments() const;
+
+    //! @name spatial moments
+    //! @{
+    CV_PROP_RW double  m00, m10, m01, m20, m11, m02, m30, m21, m12, m03;
+    //! @}
+
+    //! @name central moments
+    //! @{
+    CV_PROP_RW double  mu20, mu11, mu02, mu30, mu21, mu12, mu03;
+    //! @}
+
+    //! @name central normalized moments
+    //! @{
+    CV_PROP_RW double  nu20, nu11, nu02, nu30, nu21, nu12, nu03;
+    //! @}
+};
+
+// DataType traits for Moments, describing the struct as a tuple of double channels.
+// depth and type are only provided when OPENCV_TRAITS_ENABLE_DEPRECATED is defined.
+template<> class DataType<Moments>
+{
+public:
+    typedef Moments     value_type;
+    typedef double      work_type;
+    typedef double      channel_type;
+
+    enum { generic_type = 0,
+           channels     = (int)(sizeof(value_type)/sizeof(channel_type)), // 24
+           fmt          = DataType<channel_type>::fmt + ((channels - 1) << 8)
+#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED
+           ,depth        = DataType<channel_type>::depth
+           ,type         = CV_MAKETYPE(depth, channels)
+#endif
+         };
+
+    // Moments viewed as a fixed-size vector of doubles
+    typedef Vec<channel_type, channels> vec_type;
+};
+
+namespace traits {
+// Depth of a Moments value is the depth of its double fields.
+template<>
+struct Depth< Moments >
+{
+    enum { value = Depth<double>::value };
+};
+// Type of a Moments value: double depth, one channel per double-sized slot of the struct.
+template<>
+struct Type< Moments >
+{
+    enum { value = CV_MAKETYPE(Depth<double>::value, (int)(sizeof(Moments)/sizeof(double))) };
+};
+} // namespace
+
+//! @} imgproc_shape
+
+//! @cond IGNORED
+
+/////////////////////////////////////////////////////////////////////////
+///////////////////////////// Implementation ////////////////////////////
+/////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////// Complex ////////////////////////////////
+
+template<typename _Tp> inline
+Complex<_Tp>::Complex()
+    : re(0), im(0) {}
+
+template<typename _Tp> inline
+Complex<_Tp>::Complex( _Tp _re, _Tp _im )
+    : re(_re), im(_im) {}
+
+// Converts to a complex number with component type T2; each part goes through saturate_cast.
+template<typename _Tp> template<typename T2> inline
+Complex<_Tp>::operator Complex<T2>() const
+{
+    const T2 real = saturate_cast<T2>(re);
+    const T2 imag = saturate_cast<T2>(im);
+    return Complex<T2>(real, imag);
+}
+
+template<typename _Tp> inline
+Complex<_Tp> Complex<_Tp>::conj() const
+{
+    return Complex<_Tp>(re, -im);
+}
+
+
+template<typename _Tp> static inline
+bool operator == (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    return a.re == b.re && a.im == b.im;
+}
+
+// Two complex numbers differ unless both the real and the imaginary parts compare equal.
+template<typename _Tp> static inline
+bool operator != (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    return !(a.re == b.re && a.im == b.im);
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    return Complex<_Tp>( a.re + b.re, a.im + b.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator += (Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    a.re += b.re; a.im += b.im;
+    return a;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    return Complex<_Tp>( a.re - b.re, a.im - b.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator -= (Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    a.re -= b.re; a.im -= b.im;
+    return a;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& a)
+{
+    return Complex<_Tp>(-a.re, -a.im);
+}
+
+// Complex product: (a.re + i*a.im) * (b.re + i*b.im).
+template<typename _Tp> static inline
+Complex<_Tp> operator * (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    const _Tp real = a.re*b.re - a.im*b.im;
+    const _Tp imag = a.re*b.im + a.im*b.re;
+    return Complex<_Tp>( real, imag );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator * (const Complex<_Tp>& a, _Tp b)
+{
+    return Complex<_Tp>( a.re*b, a.im*b );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator * (_Tp b, const Complex<_Tp>& a)
+{
+    return Complex<_Tp>( a.re*b, a.im*b );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (const Complex<_Tp>& a, _Tp b)
+{
+    return Complex<_Tp>( a.re + b, a.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (const Complex<_Tp>& a, _Tp b)
+{ return Complex<_Tp>( a.re - b, a.im ); }
+
+template<typename _Tp> static inline
+Complex<_Tp> operator + (_Tp b, const Complex<_Tp>& a)
+{
+    return Complex<_Tp>( a.re + b, a.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator - (_Tp b, const Complex<_Tp>& a)
+{
+    return Complex<_Tp>( b - a.re, -a.im );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator += (Complex<_Tp>& a, _Tp b)
+{
+    a.re += b; return a;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator -= (Complex<_Tp>& a, _Tp b)
+{
+    a.re -= b; return a;
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator *= (Complex<_Tp>& a, _Tp b)
+{
+    a.re *= b; a.im *= b; return a;
+}
+
+// Modulus of a complex number; both parts are widened to double before squaring.
+template<typename _Tp> static inline
+double abs(const Complex<_Tp>& a)
+{
+    const double real = (double)a.re;
+    const double imag = (double)a.im;
+    return std::sqrt( real*real + imag*imag );
+}
+
+// Complex quotient a / b: multiplies a by the conjugate of b and scales by 1 / |b|^2.
+// The scale factor is computed in double; a zero divisor is not checked.
+template<typename _Tp> static inline
+Complex<_Tp> operator / (const Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    const double invNormSqr = 1./((double)b.re*b.re + (double)b.im*b.im);
+    const _Tp real = (_Tp)((a.re*b.re + a.im*b.im)*invNormSqr);
+    const _Tp imag = (_Tp)((-a.re*b.im + a.im*b.re)*invNormSqr);
+    return Complex<_Tp>( real, imag );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp>& operator /= (Complex<_Tp>& a, const Complex<_Tp>& b)
+{
+    a = a / b;
+    return a;
+}
+
+// Divides both parts of a complex number by a real scalar b (a zero divisor is not checked).
+// Integer component types are divided directly: the reciprocal (_Tp)1/b truncates to zero for
+// any |b| > 1 and would zero the whole result. Other component types keep the original
+// multiplication by the reciprocal, so their results are unchanged.
+template<typename _Tp> static inline
+Complex<_Tp> operator / (const Complex<_Tp>& a, _Tp b)
+{
+    if (std::numeric_limits<_Tp>::is_integer)
+        return Complex<_Tp>( a.re/b, a.im/b );
+    _Tp t = (_Tp)1/b;
+    return Complex<_Tp>( a.re*t, a.im*t );
+}
+
+template<typename _Tp> static inline
+Complex<_Tp> operator / (_Tp b, const Complex<_Tp>& a)
+{
+    return Complex<_Tp>(b)/a;
+}
+
+// Divides a complex number in place by a real scalar b (a zero divisor is not checked).
+// The left operand is a non-const reference and the result is returned by reference, like the
+// other compound assignment operators of Complex; writing through a const reference does not
+// compile once the template is instantiated.
+template<typename _Tp> static inline
+Complex<_Tp>& operator /= (Complex<_Tp>& a, _Tp b)
+{
+    _Tp t = (_Tp)1/b;
+    a.re *= t; a.im *= t; return a;
+}
+
+
+
+//////////////////////////////// 2D Point ///////////////////////////////
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_()
+    : x(0), y(0) {}
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(_Tp _x, _Tp _y)
+    : x(_x), y(_y) {}
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Point_& pt)
+    : x(pt.x), y(pt.y) {}
+#endif
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Size_<_Tp>& sz)
+    : x(sz.width), y(sz.height) {}
+
+template<typename _Tp> inline
+Point_<_Tp>::Point_(const Vec<_Tp,2>& v)
+    : x(v[0]), y(v[1]) {}
+
+#if (defined(__GNUC__) && __GNUC__ < 5) && !defined(__clang__)  // GCC 4.x bug. Details: https://github.com/opencv/opencv/pull/20837
+template<typename _Tp> inline
+Point_<_Tp>& Point_<_Tp>::operator = (const Point_& pt)
+{
+    x = pt.x; y = pt.y;
+    return *this;
+}
+#endif
+
+template<typename _Tp> template<typename _Tp2> inline
+Point_<_Tp>::operator Point_<_Tp2>() const
+{
+    return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+}
+
+template<typename _Tp> inline
+Point_<_Tp>::operator Vec<_Tp, 2>() const
+{
+    return Vec<_Tp, 2>(x, y);
+}
+
+template<typename _Tp> inline
+_Tp Point_<_Tp>::dot(const Point_& pt) const
+{
+    return saturate_cast<_Tp>(x*pt.x + y*pt.y);
+}
+
+// Dot product with pt, with every coordinate widened to double before multiplying.
+template<typename _Tp> inline
+double Point_<_Tp>::ddot(const Point_& pt) const
+{
+    const double xx = (double)x*(double)(pt.x);
+    const double yy = (double)y*(double)(pt.y);
+    return xx + yy;
+}
+
+// 2D cross product x*pt.y - y*pt.x, evaluated in double.
+template<typename _Tp> inline
+double Point_<_Tp>::cross(const Point_& pt) const
+{
+    const double lhs = (double)x*pt.y;
+    const double rhs = (double)y*pt.x;
+    return lhs - rhs;
+}
+
+template<typename _Tp> inline bool
+Point_<_Tp>::inside( const Rect_<_Tp>& r ) const
+{
+    return r.contains(*this);
+}
+
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator += (Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator -= (Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& a, int b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& a, float b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator *= (Point_<_Tp>& a, double b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& a, int b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& a, float b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp>& operator /= (Point_<_Tp>& a, double b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    return a;
+}
+
+// Euclidean length of a 2D point, computed in double.
+template<typename _Tp> static inline
+double norm(const Point_<_Tp>& pt)
+{
+    const double px = (double)pt.x;
+    const double py = (double)pt.y;
+    return std::sqrt(px*px + py*py);
+}
+
+template<typename _Tp> static inline
+bool operator == (const Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return a.x == b.x && a.y == b.y;
+}
+
+template<typename _Tp> static inline
+bool operator != (const Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return a.x != b.x || a.y != b.y;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator + (const Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x + b.x), saturate_cast<_Tp>(a.y + b.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator - (const Point_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x - b.x), saturate_cast<_Tp>(a.y - b.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator - (const Point_<_Tp>& a)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(-a.x), saturate_cast<_Tp>(-a.y) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& a, int b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x*b), saturate_cast<_Tp>(a.y*b) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (int a, const Point_<_Tp>& b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(b.x*a), saturate_cast<_Tp>(b.y*a) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& a, float b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x*b), saturate_cast<_Tp>(a.y*b) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (float a, const Point_<_Tp>& b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(b.x*a), saturate_cast<_Tp>(b.y*a) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Point_<_Tp>& a, double b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x*b), saturate_cast<_Tp>(a.y*b) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (double a, const Point_<_Tp>& b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(b.x*a), saturate_cast<_Tp>(b.y*a) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator * (const Matx<_Tp, 2, 2>& a, const Point_<_Tp>& b)
+{
+    Matx<_Tp, 2, 1> tmp = a * Vec<_Tp,2>(b.x, b.y);
+    return Point_<_Tp>(tmp.val[0], tmp.val[1]);
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Matx<_Tp, 3, 3>& a, const Point_<_Tp>& b)
+{
+    Matx<_Tp, 3, 1> tmp = a * Vec<_Tp,3>(b.x, b.y, 1);
+    return Point3_<_Tp>(tmp.val[0], tmp.val[1], tmp.val[2]);
+}
+
+// Divides both coordinates by an int and saturates each quotient back to _Tp.
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& a, int b)
+{
+    return Point_<_Tp>( saturate_cast<_Tp>(a.x / b), saturate_cast<_Tp>(a.y / b) );
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& a, float b)
+{
+    Point_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Point_<_Tp> operator / (const Point_<_Tp>& a, double b)
+{
+    Point_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<int>& pt);
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<int64>& pt);
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<float>& pt);
+template<typename _AccTp> static inline _AccTp normL2Sqr(const Point_<double>& pt);
+
+// Squared L2 norm of a 2D point; the accumulator type is the explicit template argument.
+// Accumulators of the point's own type reuse dot(); double accumulators use ddot(), which
+// widens every coordinate to double before multiplying.
+template<> inline int normL2Sqr<int>(const Point_<int>& pt) { return pt.dot(pt); }
+template<> inline int64 normL2Sqr<int64>(const Point_<int64>& pt) { return pt.dot(pt); }
+template<> inline float normL2Sqr<float>(const Point_<float>& pt) { return pt.dot(pt); }
+// ddot() rather than dot(): dot() sums the products in int and can overflow before the
+// result is widened to double.
+template<> inline double normL2Sqr<double>(const Point_<int>& pt) { return pt.ddot(pt); }
+
+template<> inline double normL2Sqr<double>(const Point_<float>& pt) { return pt.ddot(pt); }
+template<> inline double normL2Sqr<double>(const Point_<double>& pt) { return pt.ddot(pt); }
+
+
+
+//////////////////////////////// 3D Point ///////////////////////////////
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_()
+    : x(0), y(0), z(0) {}
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(_Tp _x, _Tp _y, _Tp _z)
+    : x(_x), y(_y), z(_z) {}
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(const Point_<_Tp>& pt)
+    : x(pt.x), y(pt.y), z(_Tp()) {}
+
+template<typename _Tp> inline
+Point3_<_Tp>::Point3_(const Vec<_Tp, 3>& v)
+    : x(v[0]), y(v[1]), z(v[2]) {}
+
+template<typename _Tp> template<typename _Tp2> inline
+Point3_<_Tp>::operator Point3_<_Tp2>() const
+{
+    return Point3_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(z));
+}
+
+template<typename _Tp> inline
+Point3_<_Tp>::operator Vec<_Tp, 3>() const
+{
+    return Vec<_Tp, 3>(x, y, z);
+}
+
+template<typename _Tp> inline
+_Tp Point3_<_Tp>::dot(const Point3_& pt) const
+{
+    return saturate_cast<_Tp>(x*pt.x + y*pt.y + z*pt.z);
+}
+
+template<typename _Tp> inline
+double Point3_<_Tp>::ddot(const Point3_& pt) const
+{
+    return (double)x*pt.x + (double)y*pt.y + (double)z*pt.z;
+}
+
+// 3D cross product of this point with pt, component by component.
+template<typename _Tp> inline
+Point3_<_Tp> Point3_<_Tp>::cross(const Point3_<_Tp>& pt) const
+{
+    const _Tp cx = y*pt.z - z*pt.y;
+    const _Tp cy = z*pt.x - x*pt.z;
+    const _Tp cz = x*pt.y - y*pt.x;
+    return Point3_<_Tp>(cx, cy, cz);
+}
+
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator += (Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator -= (Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& a, int b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    a.z = saturate_cast<_Tp>(a.z * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& a, float b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    a.z = saturate_cast<_Tp>(a.z * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator *= (Point3_<_Tp>& a, double b)
+{
+    a.x = saturate_cast<_Tp>(a.x * b);
+    a.y = saturate_cast<_Tp>(a.y * b);
+    a.z = saturate_cast<_Tp>(a.z * b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& a, int b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    a.z = saturate_cast<_Tp>(a.z / b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& a, float b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    a.z = saturate_cast<_Tp>(a.z / b);
+    return a;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp>& operator /= (Point3_<_Tp>& a, double b)
+{
+    a.x = saturate_cast<_Tp>(a.x / b);
+    a.y = saturate_cast<_Tp>(a.y / b);
+    a.z = saturate_cast<_Tp>(a.z / b);
+    return a;
+}
+
+// Euclidean length of a 3D point, computed in double.
+template<typename _Tp> static inline
+double norm(const Point3_<_Tp>& pt)
+{
+    const double px = (double)pt.x;
+    const double py = (double)pt.y;
+    const double pz = (double)pt.z;
+    return std::sqrt(px*px + py*py + pz*pz);
+}
+
+template<typename _Tp> static inline
+bool operator == (const Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    return a.x == b.x && a.y == b.y && a.z == b.z;
+}
+
+template<typename _Tp> static inline
+bool operator != (const Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    return a.x != b.x || a.y != b.y || a.z != b.z;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator + (const Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(a.x + b.x), saturate_cast<_Tp>(a.y + b.y), saturate_cast<_Tp>(a.z + b.z));
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator - (const Point3_<_Tp>& a, const Point3_<_Tp>& b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(a.x - b.x), saturate_cast<_Tp>(a.y - b.y), saturate_cast<_Tp>(a.z - b.z));
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator - (const Point3_<_Tp>& a)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(-a.x), saturate_cast<_Tp>(-a.y), saturate_cast<_Tp>(-a.z) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& a, int b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(a.x*b), saturate_cast<_Tp>(a.y*b), saturate_cast<_Tp>(a.z*b) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (int a, const Point3_<_Tp>& b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(b.x * a), saturate_cast<_Tp>(b.y * a), saturate_cast<_Tp>(b.z * a) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& a, float b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(a.x * b), saturate_cast<_Tp>(a.y * b), saturate_cast<_Tp>(a.z * b) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (float a, const Point3_<_Tp>& b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(b.x * a), saturate_cast<_Tp>(b.y * a), saturate_cast<_Tp>(b.z * a) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Point3_<_Tp>& a, double b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(a.x * b), saturate_cast<_Tp>(a.y * b), saturate_cast<_Tp>(a.z * b) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (double a, const Point3_<_Tp>& b)
+{
+    return Point3_<_Tp>( saturate_cast<_Tp>(b.x * a), saturate_cast<_Tp>(b.y * a), saturate_cast<_Tp>(b.z * a) );
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator * (const Matx<_Tp, 3, 3>& a, const Point3_<_Tp>& b)
+{
+    Matx<_Tp, 3, 1> tmp = a * Vec<_Tp,3>(b.x, b.y, b.z);
+    return Point3_<_Tp>(tmp.val[0], tmp.val[1], tmp.val[2]);
+}
+
+template<typename _Tp> static inline
+Matx<_Tp, 4, 1> operator * (const Matx<_Tp, 4, 4>& a, const Point3_<_Tp>& b)
+{
+    return a * Matx<_Tp, 4, 1>(b.x, b.y, b.z, 1);
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& a, int b)
+{
+    Point3_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& a, float b)
+{
+    Point3_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Point3_<_Tp> operator / (const Point3_<_Tp>& a, double b)
+{
+    Point3_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+
+
+////////////////////////////////// Size /////////////////////////////////
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_()
+    : width(0), height(0) {}
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_(_Tp _width, _Tp _height)
+    : width(_width), height(_height) {}
+
+template<typename _Tp> inline
+Size_<_Tp>::Size_(const Point_<_Tp>& pt)
+    : width(pt.x), height(pt.y) {}
+
+template<typename _Tp> template<typename _Tp2> inline
+Size_<_Tp>::operator Size_<_Tp2>() const
+{
+    return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+}
+
+// Area (width * height) in the size's own value type.
+template<typename _Tp> inline
+_Tp Size_<_Tp>::area() const
+{
+    const _Tp product = width * height;
+    // Debug-only overflow check for integer types: dividing the product by width must give height back.
+    CV_DbgAssert(!std::numeric_limits<_Tp>::is_integer
+        || width == 0 || product / width == height);
+    return product;
+}
+
+// Ratio width / height as a double; height is not checked against zero.
+template<typename _Tp> inline
+double Size_<_Tp>::aspectRatio() const
+{
+    const double h = static_cast<double>(height);
+    return width / h;
+}
+
+// A size is empty when either dimension is zero or negative.
+template<typename _Tp> inline
+bool Size_<_Tp>::empty() const
+{
+    if (width <= 0)
+        return true;
+    return height <= 0;
+}
+
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator *= (Size_<_Tp>& a, _Tp b)
+{
+    a.width *= b;
+    a.height *= b;
+    return a;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator * (const Size_<_Tp>& a, _Tp b)
+{
+    Size_<_Tp> tmp(a);
+    tmp *= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator /= (Size_<_Tp>& a, _Tp b)
+{
+    a.width /= b;
+    a.height /= b;
+    return a;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator / (const Size_<_Tp>& a, _Tp b)
+{
+    Size_<_Tp> tmp(a);
+    tmp /= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator += (Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    a.width += b.width;
+    a.height += b.height;
+    return a;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator + (const Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    Size_<_Tp> tmp(a);
+    tmp += b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp>& operator -= (Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    a.width -= b.width;
+    a.height -= b.height;
+    return a;
+}
+
+template<typename _Tp> static inline
+Size_<_Tp> operator - (const Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    Size_<_Tp> tmp(a);
+    tmp -= b;
+    return tmp;
+}
+
+template<typename _Tp> static inline
+bool operator == (const Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    return a.width == b.width && a.height == b.height;
+}
+
+template<typename _Tp> static inline
+bool operator != (const Size_<_Tp>& a, const Size_<_Tp>& b)
+{
+    return !(a == b);
+}
+
+
+
+////////////////////////////////// Rect /////////////////////////////////
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_()
+    : x(0), y(0), width(0), height(0) {}
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(_Tp _x, _Tp _y, _Tp _width, _Tp _height)
+    : x(_x), y(_y), width(_width), height(_height) {}
+
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(const Point_<_Tp>& org, const Size_<_Tp>& sz)
+    : x(org.x), y(org.y), width(sz.width), height(sz.height) {}
+
+// Builds the rectangle spanned by two corner points given in any order: the origin is the
+// per-axis minimum and the extent is the per-axis distance between the points.
+template<typename _Tp> inline
+Rect_<_Tp>::Rect_(const Point_<_Tp>& pt1, const Point_<_Tp>& pt2)
+{
+    const _Tp left = std::min(pt1.x, pt2.x);
+    const _Tp top = std::min(pt1.y, pt2.y);
+    x = left;
+    y = top;
+    width = std::max(pt1.x, pt2.x) - left;
+    height = std::max(pt1.y, pt2.y) - top;
+}
+
+template<typename _Tp> inline
+Point_<_Tp> Rect_<_Tp>::tl() const
+{
+    return Point_<_Tp>(x,y);
+}
+
+template<typename _Tp> inline
+Point_<_Tp> Rect_<_Tp>::br() const
+{
+    return Point_<_Tp>(x + width, y + height);
+}
+
+template<typename _Tp> inline
+Size_<_Tp> Rect_<_Tp>::size() const
+{
+    return Size_<_Tp>(width, height);
+}
+
+// Area (width * height) in the rectangle's own value type.
+template<typename _Tp> inline
+_Tp Rect_<_Tp>::area() const
+{
+    const _Tp product = width * height;
+    // Debug-only overflow check for integer types: dividing the product by width must give height back.
+    CV_DbgAssert(!std::numeric_limits<_Tp>::is_integer
+        || width == 0 || product / width == height);
+    return product;
+}
+
+template<typename _Tp> inline
+bool Rect_<_Tp>::empty() const
+{
+    return width <= 0 || height <= 0;
+}
+
+template<typename _Tp> template<typename _Tp2> inline
+Rect_<_Tp>::operator Rect_<_Tp2>() const
+{
+    return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+}
+
+// Half-open containment test: the top and left edges are inside, the bottom and right edges are not.
+template<typename _Tp> inline
+bool Rect_<_Tp>::contains(const Point_<_Tp>& pt) const
+{
+    if (!(x <= pt.x && pt.x < x + width))
+        return false;
+    return y <= pt.y && pt.y < y + height;
+}
+
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Point_<_Tp>& b )
+{
+    a.x += b.x;
+    a.y += b.y;
+    return a;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator -= ( Rect_<_Tp>& a, const Point_<_Tp>& b )
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    return a;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Size_<_Tp>& b )
+{
+    a.width += b.width;
+    a.height += b.height;
+    return a;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp>& operator -= ( Rect_<_Tp>& a, const Size_<_Tp>& b )
+{
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    a.width = width;
+    a.height = height;
+    return a;
+}
+
+// Intersects a with b in place. a becomes the default-constructed rectangle when either
+// operand is empty or when the two rectangles do not overlap.
+// The reset value is Rect_<_Tp>() rather than Rect(), so it does not depend on a conversion
+// from Rect_<int> to Rect_<_Tp>.
+template<typename _Tp> static inline
+Rect_<_Tp>& operator &= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
+{
+    if (a.empty() || b.empty()) {
+        a = Rect_<_Tp>();
+        return a;
+    }
+    // Per axis: the operand that starts first (min) and the one that starts last (max).
+    // These may alias a, so a.x and a.y are only overwritten after both extents are computed.
+    const Rect_<_Tp>& Rx_min = (a.x < b.x) ? a : b;
+    const Rect_<_Tp>& Rx_max = (a.x < b.x) ? b : a;
+    const Rect_<_Tp>& Ry_min = (a.y < b.y) ? a : b;
+    const Rect_<_Tp>& Ry_max = (a.y < b.y) ? b : a;
+    // Looking at the formula below, we will compute Rx_min.width - (Rx_max.x - Rx_min.x)
+    // but we want to avoid overflows. Rx_min.width >= 0 and (Rx_max.x - Rx_min.x) >= 0
+    // by definition so the difference does not overflow. The only thing that can overflow
+    // is (Rx_max.x - Rx_min.x). And it can only overflow if Rx_min.x < 0.
+    // Let us first deal with the following case.
+    if ((Rx_min.x < 0 && Rx_min.x + Rx_min.width < Rx_max.x) ||
+        (Ry_min.y < 0 && Ry_min.y + Ry_min.height < Ry_max.y)) {
+        a = Rect_<_Tp>();
+        return a;
+    }
+    // We now know that either Rx_min.x >= 0, or
+    // Rx_min.x < 0 && Rx_min.x + Rx_min.width >= Rx_max.x and therefore
+    // Rx_min.width >= (Rx_max.x - Rx_min.x), which means (Rx_max.x - Rx_min.x)
+    // is no larger than a representable width and therefore does not overflow.
+    a.width = std::min(Rx_min.width - (Rx_max.x - Rx_min.x), Rx_max.width);
+    a.height = std::min(Ry_min.height - (Ry_max.y - Ry_min.y), Ry_max.height);
+    a.x = Rx_max.x;
+    a.y = Ry_max.y;
+    // A non-positive extent means there is no overlap: normalize to the default rectangle.
+    if (a.empty())
+        a = Rect_<_Tp>();
+    return a;
+}
+
+// Replaces a with the bounding rectangle of a and b. An empty operand is ignored:
+// an empty a takes the value of b, and an empty b leaves a untouched.
+template<typename _Tp> static inline
+Rect_<_Tp>& operator |= ( Rect_<_Tp>& a, const Rect_<_Tp>& b )
+{
+    if (a.empty())
+    {
+        a = b;
+        return a;
+    }
+    if (b.empty())
+        return a;
+
+    const _Tp left = std::min(a.x, b.x);
+    const _Tp top = std::min(a.y, b.y);
+    // The extents still need the old a.x / a.y, so the origin is written last.
+    a.width = std::max(a.x + a.width, b.x + b.width) - left;
+    a.height = std::max(a.y + a.height, b.y + b.height) - top;
+    a.x = left;
+    a.y = top;
+    return a;
+}
+
+template<typename _Tp> static inline
+bool operator == (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    return a.x == b.x && a.y == b.y && a.width == b.width && a.height == b.height;
+}
+
+// Rectangles differ when any of x, y, width or height differs.
+template<typename _Tp> static inline
+bool operator != (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    return !(a.x == b.x && a.y == b.y && a.width == b.width && a.height == b.height);
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator + (const Rect_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return Rect_<_Tp>( a.x + b.x, a.y + b.y, a.width, a.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& a, const Point_<_Tp>& b)
+{
+    return Rect_<_Tp>( a.x - b.x, a.y - b.y, a.width, a.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator + (const Rect_<_Tp>& a, const Size_<_Tp>& b)
+{
+    return Rect_<_Tp>( a.x, a.y, a.width + b.width, a.height + b.height );
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& a, const Size_<_Tp>& b)
+{
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    return Rect_<_Tp>( a.x, a.y, width, height );
+}
+
+// Intersection of two rectangles, computed on a copy of a via operator &=.
+template<typename _Tp> static inline
+Rect_<_Tp> operator & (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> result(a);
+    result &= b;
+    return result;
+}
+
+template<typename _Tp> static inline
+Rect_<_Tp> operator | (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> c = a;
+    return c |= b;
+}
+
+/**
+ * @brief measure dissimilarity between two rectangles
+ *
+ * Computes the complement of the Jaccard Index as described in <https://en.wikipedia.org/wiki/Jaccard_index>.
+ * For rectangles the index is the intersection area over the union area, so the returned value is
+ * 1 - intersection / union. When the summed area of both rectangles does not exceed
+ * std::numeric_limits<_Tp>::epsilon(), the distance is reported as 0.
+ */
+template<typename _Tp> static inline
+double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) {
+    const _Tp areaA = a.area();
+    const _Tp areaB = b.area();
+
+    // both rectangles are (numerically) empty: jaccard_index = 1 -> distance = 0
+    if ((areaA + areaB) <= std::numeric_limits<_Tp>::epsilon())
+        return 0.0;
+
+    const double intersection = (a & b).area();
+    const double unionArea = areaA + areaB - intersection;
+    // distance = 1 - jaccard_index
+    return 1.0 - intersection / unionArea;
+}
+
+////////////////////////////// RotatedRect //////////////////////////////
+
+inline
+RotatedRect::RotatedRect()
+    : center(), size(), angle(0) {}
+
+inline
+RotatedRect::RotatedRect(const Point2f& _center, const Size2f& _size, float _angle)
+    : center(_center), size(_size), angle(_angle) {}
+
+///////////////////////////////// Range /////////////////////////////////
+
+inline
+Range::Range()
+    : start(0), end(0) {}
+
+inline
+Range::Range(int _start, int _end)
+    : start(_start), end(_end) {}
+
+inline
+int Range::size() const
+{
+    return end - start;
+}
+
+inline bool Range::empty() const
+{
+    // Only start == end counts as empty; an inverted range does not.
+    const bool hasExtent = (start != end);
+    return !hasExtent;
+}
+
+inline Range Range::all()
+{
+    // Sentinel range whose bounds are INT_MIN and INT_MAX.
+    const Range whole(INT_MIN, INT_MAX);
+    return whole;
+}
+
+
+static inline bool operator == (const Range& r1, const Range& r2)
+{
+    // Ranges are equal only when both bounds match.
+    if (r1.start != r2.start) return false;
+    return r1.end == r2.end;
+}
+
+static inline bool operator != (const Range& r1, const Range& r2)
+{
+    // Defined as the negation of operator== so the two can never disagree.
+    const bool same = (r1 == r2);
+    return !same;
+}
+
+static inline bool operator !(const Range& r)
+{
+    // True exactly when start == end, the same test Range::empty() performs.
+    const bool hasExtent = (r.start != r.end);
+    return !hasExtent;
+}
+
+static inline Range operator & (const Range& r1, const Range& r2)
+{
+    // Intersection: the larger start paired with the smaller end.
+    const int lo = std::max(r1.start, r2.start);
+    const int hi = std::min(r1.end, r2.end);
+    return Range(lo, std::max(hi, lo));  // disjoint inputs collapse to an empty range at lo
+}
+
+static inline Range& operator &= (Range& r1, const Range& r2)
+{
+    // In-place intersection; returns r1 so calls can be chained.
+    const Range common = r1 & r2;
+    r1 = common;
+    return r1;
+}
+
+static inline Range operator + (const Range& r1, int delta)
+{
+    // Translate both bounds by delta.
+    const int shiftedStart = r1.start + delta, shiftedEnd = r1.end + delta;
+    return Range(shiftedStart, shiftedEnd);
+}
+
+static inline Range operator + (int delta, const Range& r1)
+{
+    // Offset-first spelling: translate both bounds by delta.
+    const int shiftedStart = delta + r1.start, shiftedEnd = delta + r1.end;
+    return Range(shiftedStart, shiftedEnd);
+}
+
+static inline Range operator - (const Range& r1, int delta)
+{
+    // Subtraction is expressed as adding the negated offset.
+    const int negated = -delta;
+    return r1 + negated;
+}
+
+
+
+///////////////////////////////// Scalar ////////////////////////////////
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_()
+{
+    for (int i = 0; i < 4; i++) this->val[i] = 0;  // all four channels start at zero
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(_Tp v0, _Tp v1, _Tp v2, _Tp v3)
+{
+    // Gather the arguments so the channels can be filled in a single pass.
+    const _Tp channels[4] = { v0, v1, v2, v3 };
+    for (int i = 0; i < 4; i++)
+        this->val[i] = channels[i];
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(const Scalar_<_Tp>& s)
+    : Vec<_Tp, 4>(s) {}  // the Vec<_Tp, 4> base copy does all the work
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(Scalar_<_Tp>&& s) CV_NOEXCEPT
+{
+    // Element-wise move; no base initialiser is given, so Vec is default-constructed first.
+    for (int i = 0; i < 4; i++)
+        this->val[i] = std::move(s.val[i]);
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>& Scalar_<_Tp>::operator=(const Scalar_<_Tp>& s)
+{
+    // Channel-by-channel copy; returns *this for chaining.
+    for (int i = 0; i < 4; i++)
+        this->val[i] = s.val[i];
+    return *this;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>& Scalar_<_Tp>::operator=(Scalar_<_Tp>&& s) CV_NOEXCEPT
+{
+    // Channel-by-channel move; returns *this for chaining.
+    for (int i = 0; i < 4; i++)
+        this->val[i] = std::move(s.val[i]);
+    return *this;
+}
+
+template<typename _Tp> template<typename _Tp2, int cn> inline
+Scalar_<_Tp>::Scalar_(const Vec<_Tp2, cn>& v)
+{
+    const int copied = cn < 4 ? cn : 4;  // never read past the source nor write past val[3]
+    for( int i = 0; i < copied; i++ )
+        this->val[i] = cv::saturate_cast<_Tp>(v.val[i]);
+    for( int i = copied; i < 4; i++ )    // channels the source did not provide
+        this->val[i] = 0;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp>::Scalar_(_Tp v0)
+{
+    for (int i = 1; i < 4; i++) this->val[i] = 0;  // channels 1..3 are zeroed
+    this->val[0] = v0;                             // only channel 0 takes the argument
+}
+
+template<typename _Tp> inline Scalar_<_Tp> Scalar_<_Tp>::all(_Tp v0)
+{
+    // Broadcast one value to all four channels.
+    const Scalar_<_Tp> filled(v0, v0, v0, v0);
+    return filled;
+}
+
+
+template<typename _Tp> inline
+Scalar_<_Tp> Scalar_<_Tp>::mul(const Scalar_<_Tp>& a, double scale ) const
+{
+    Scalar_<_Tp> product;  // filled per channel with this * a * scale, saturated to _Tp
+    for (int i = 0; i < 4; i++)
+        product.val[i] = saturate_cast<_Tp>(this->val[i] * a.val[i] * scale);
+    return product;
+}
+
+template<typename _Tp> inline
+Scalar_<_Tp> Scalar_<_Tp>::conj() const
+{
+    // Keep channel 0 and negate channels 1..3 (each saturated to _Tp).
+    Scalar_<_Tp> flipped(saturate_cast<_Tp>(this->val[0]));
+    for (int i = 1; i < 4; i++) flipped.val[i] = saturate_cast<_Tp>(-this->val[i]);
+    return flipped;
+}
+
+template<typename _Tp> inline
+bool Scalar_<_Tp>::isReal() const
+{
+    return !(this->val[1] != 0 || this->val[2] != 0 || this->val[3] != 0);  // channels 1..3 all zero
+}
+
+
+template<typename _Tp> template<typename T2> inline
+Scalar_<_Tp>::operator Scalar_<T2>() const
+{
+    Scalar_<T2> converted;  // each channel is saturate-cast to the target element type
+    for (int i = 0; i < 4; i++)
+        converted.val[i] = saturate_cast<T2>(this->val[i]);
+    return converted;
+}
+
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator += (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // Channel-wise accumulation into a; plain +=, no saturate_cast.
+    int i = 0;
+    for (; i < 4; i++)
+        a.val[i] += b.val[i];
+    return a;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator -= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    // Channel-wise subtraction in place; plain -=, no saturate_cast.
+    int i = 0;
+    for (; i < 4; i++)
+        a.val[i] -= b.val[i];
+    return a;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator *= ( Scalar_<_Tp>& a, _Tp v )
+{
+    // Scale every channel of a by v in place; plain *=, no saturate_cast.
+    int i = 0;
+    for (; i < 4; i++)
+        a.val[i] *= v;
+    return a;
+}
+
+template<typename _Tp> static inline
+bool operator == ( const Scalar_<_Tp>& a, const Scalar_<_Tp>& b )
+{
+    for (int i = 0; i < 4; i++) if (!(a.val[i] == b.val[i])) return false;  // first mismatch decides
+    return true;
+}
+
+template<typename _Tp> static inline
+bool operator != ( const Scalar_<_Tp>& a, const Scalar_<_Tp>& b )
+{
+    for (int i = 0; i < 4; i++) if (a.val[i] != b.val[i]) return true;  // any differing channel
+    return false;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator + (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    Scalar_<_Tp> sum;  // channel-wise a + b; no saturate_cast is applied here
+    for (int i = 0; i < 4; i++)
+        sum.val[i] = a.val[i] + b.val[i];
+    return sum;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator - (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    Scalar_<_Tp> diff;  // channel-wise a - b, each result passed through saturate_cast
+    for (int i = 0; i < 4; i++)
+        diff.val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]);
+    return diff;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (const Scalar_<_Tp>& a, _Tp alpha)
+{
+    Scalar_<_Tp> scaled;  // every channel multiplied by alpha; no saturate_cast
+    for (int i = 0; i < 4; i++)
+        scaled.val[i] = a.val[i] * alpha;
+    return scaled;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (_Tp alpha, const Scalar_<_Tp>& a)
+{
+    return Scalar_<_Tp>(a * alpha);  // operands swapped: delegates to Scalar_ * scalar
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator - (const Scalar_<_Tp>& a)
+{
+    Scalar_<_Tp> negated;  // each channel negated, then passed through saturate_cast
+    for (int i = 0; i < 4; i++)
+        negated.val[i] = saturate_cast<_Tp>(-a.val[i]);
+    return negated;
+}
+
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator * (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{   // Quaternion (Hamilton) product with channel 0 as the real part; NOT a per-channel multiply.
+    return Scalar_<_Tp>(saturate_cast<_Tp>(a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3]),  // real part
+                        saturate_cast<_Tp>(a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2]),  // i component
+                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1]),  // j component
+                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0])); // k component
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator *= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    const Scalar_<_Tp> product = a * b;  // Scalar_ * Scalar_ overload, evaluated before a is touched
+    return a = product;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (const Scalar_<_Tp>& a, _Tp alpha)
+{
+    Scalar_<_Tp> quotient;  // every channel divided by alpha; alpha == 0 is not checked
+    for (int i = 0; i < 4; i++)
+        quotient.val[i] = a.val[i] / alpha;
+    return quotient;
+}
+
+template<typename _Tp> static inline  // NOTE(review): _Tp is absent from the signature, so it cannot be deduced
+Scalar_<float> operator / (const Scalar_<float>& a, float alpha)
+{
+    const float inv = 1 / alpha;  // one reciprocal, then four multiplications
+    return Scalar_<float>(inv * a.val[0], inv * a.val[1], inv * a.val[2], inv * a.val[3]);
+}
+
+template<typename _Tp> static inline  // NOTE(review): _Tp is absent from the signature, so it cannot be deduced
+Scalar_<double> operator / (const Scalar_<double>& a, double alpha)
+{
+    const double inv = 1 / alpha;  // one reciprocal, then four multiplications
+    return Scalar_<double>(inv * a.val[0], inv * a.val[1], inv * a.val[2], inv * a.val[3]);
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator /= (Scalar_<_Tp>& a, _Tp alpha)
+{
+    const Scalar_<_Tp> quotient = a / alpha;  // computed in full before a is overwritten
+    return a = quotient;
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (_Tp a, const Scalar_<_Tp>& b)
+{
+    const _Tp scale = a / (b[0]*b[0] + b[1]*b[1] + b[2]*b[2] + b[3]*b[3]);  // a over the sum of squares of b
+    return b.conj() * scale;  // conj(b) scaled by that ratio
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp> operator / (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    return a * (static_cast<_Tp>(1) / b);  // a times (1 / b), both via the Scalar_ overloads
+}
+
+template<typename _Tp> static inline
+Scalar_<_Tp>& operator /= (Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
+{
+    const Scalar_<_Tp> quotient = a / b;  // Scalar_ / Scalar_ overload, evaluated before a is touched
+    return a = quotient;
+}
+
+template<typename _Tp> static inline
+Scalar operator * (const Matx<_Tp, 4, 4>& a, const Scalar& b)
+{   // 4x4 matrix times a Scalar treated as a 4x1 column; a is converted to double first.
+    Matx<double, 4, 1> c((Matx<double, 4, 4>)a, b, Matx_MatMulOp());
+    return reinterpret_cast<const Scalar&>(c);  // NOTE(review): views the 4x1 Matx as a Scalar; presumably relies on identical layout - confirm
+}
+
+template<> inline
+Scalar operator * (const Matx<double, 4, 4>& a, const Scalar& b)
+{   // Specialisation for double matrices: same product, without the element-type conversion.
+    Matx<double, 4, 1> c(a, b, Matx_MatMulOp());
+    return reinterpret_cast<const Scalar&>(c);  // NOTE(review): views the 4x1 Matx as a Scalar; presumably relies on identical layout - confirm
+}
+
+
+
+//////////////////////////////// KeyPoint ///////////////////////////////
+
+inline KeyPoint::KeyPoint()
+    // angle and class_id default to -1; every other field is zero
+    : pt(0,0), size(0), angle(-1), response(0), octave(0), class_id(-1) {}
+
+inline KeyPoint::KeyPoint(Point2f _pt, float _size, float _angle, float _response, int _octave, int _class_id)
+    // location supplied as a ready-made point; all fields are copied as given
+    : pt(_pt), size(_size), angle(_angle), response(_response), octave(_octave), class_id(_class_id) {}
+
+inline KeyPoint::KeyPoint(float x, float y, float _size, float _angle, float _response, int _octave, int _class_id)
+    // location supplied as separate coordinates; pt is built from (x, y)
+    : pt(x, y), size(_size), angle(_angle), response(_response), octave(_octave), class_id(_class_id) {}
+
+
+
+///////////////////////////////// DMatch ////////////////////////////////
+
+inline DMatch::DMatch()
+    // all three indices start at -1 and the distance at FLT_MAX
+    : queryIdx(-1), trainIdx(-1), imgIdx(-1), distance(FLT_MAX) {}
+
+inline DMatch::DMatch(int _queryIdx, int _trainIdx, float _distance)
+    // no image index given: imgIdx is set to -1
+    : queryIdx(_queryIdx), trainIdx(_trainIdx), imgIdx(-1), distance(_distance) {}
+
+inline DMatch::DMatch(int _queryIdx, int _trainIdx, int _imgIdx, float _distance)
+    // every field is taken from the corresponding argument
+    : queryIdx(_queryIdx), trainIdx(_trainIdx), imgIdx(_imgIdx), distance(_distance) {}
+
+inline bool DMatch::operator < (const DMatch &m) const
+{
+    // Ordering considers the distance only; the index fields are ignored.
+    const bool closer = distance < m.distance;
+    return closer;
+}
+
+
+
+////////////////////////////// TermCriteria /////////////////////////////
+
+inline TermCriteria::TermCriteria()
+    // type 0, zero iterations, zero epsilon
+    : type(0), maxCount(0), epsilon(0) {}
+
+inline TermCriteria::TermCriteria(int _type, int _maxCount, double _epsilon)
+    // stores the three arguments unchanged; no validation happens here
+    : type(_type), maxCount(_maxCount), epsilon(_epsilon) {}
+
+//! @endcond
+
+} // cv
+
+#endif //OPENCV_CORE_TYPES_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types_c.h
new file mode 100644
index 0000000..32f3c8c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/types_c.h
@@ -0,0 +1,2126 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_TYPES_H
+#define OPENCV_CORE_TYPES_H
+
+#ifdef CV__ENABLE_C_API_CTORS  // invalid C API ctors (must be removed)
+#if defined(_WIN32) && !defined(CV__SKIP_MESSAGE_MALFORMED_C_API_CTORS)
+#error "C API ctors don't work on Win32: https://github.com/opencv/opencv/issues/15990"
+#endif
+#endif
+
+//#define CV__VALIDATE_UNUNITIALIZED_VARS 1  // C++11 & GCC only
+
+#ifdef __cplusplus
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#define CV_STRUCT_INITIALIZER {0,}
+#else
+#if defined(__GNUC__) && __GNUC__ == 4  // GCC 4.x warns on "= {}" initialization, fixed in GCC 5.0
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+#endif
+#define CV_STRUCT_INITIALIZER {}
+#endif
+
+#else
+#define CV_STRUCT_INITIALIZER {0}
+#endif
+
+
+#ifdef HAVE_IPL
+#  ifndef __IPL_H__
+#    if defined _WIN32
+#      include <ipl.h>
+#    else
+#      include <ipl/ipl.h>
+#    endif
+#  endif
+#elif defined __IPL_H__
+#  define HAVE_IPL
+#endif
+
+#include "opencv2/core/cvdef.h"
+
+#ifndef SKIP_INCLUDES
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#endif // SKIP_INCLUDES
+
+#if defined _WIN32
+#  define CV_CDECL __cdecl
+#  define CV_STDCALL __stdcall
+#else
+#  define CV_CDECL
+#  define CV_STDCALL
+#endif
+
+#ifndef CV_DEFAULT
+#  ifdef __cplusplus
+#    define CV_DEFAULT(val) = val
+#  else
+#    define CV_DEFAULT(val)
+#  endif
+#endif
+
+#ifndef CV_EXTERN_C_FUNCPTR
+#  ifdef __cplusplus
+#    define CV_EXTERN_C_FUNCPTR(x) extern "C" { typedef x; }
+#  else
+#    define CV_EXTERN_C_FUNCPTR(x) typedef x
+#  endif
+#endif
+
+#ifndef CVAPI
+#  define CVAPI(rettype) CV_EXTERN_C CV_EXPORTS rettype CV_CDECL
+#endif
+
+#ifndef CV_IMPL
+#  define CV_IMPL CV_EXTERN_C
+#endif
+
+#ifdef __cplusplus
+#  include "opencv2/core.hpp"
+#endif
+
+/** @addtogroup core_c
+    @{
+*/
+
+/** @brief This is the "metatype" used *only* as a function parameter.
+
+It denotes that the function accepts arrays of multiple types, such as IplImage*, CvMat* or even
+CvSeq* sometimes. The particular array type is determined at runtime by analyzing the first 4
+bytes of the header. In C++ interface the role of CvArr is played by InputArray and OutputArray.
+ */
+typedef void CvArr;
+
+typedef int CVStatus;
+
+/** C API status codes: CV_StsOk (0) is success, every other value is negative. @see cv::Error::Code */
+enum {
+ CV_StsOk=                       0,  /**< everything is ok                */
+ CV_StsBackTrace=               -1,  /**< pseudo error for back trace     */
+ CV_StsError=                   -2,  /**< unknown /unspecified error      */
+ CV_StsInternal=                -3,  /**< internal error (bad state)      */
+ CV_StsNoMem=                   -4,  /**< insufficient memory             */
+ CV_StsBadArg=                  -5,  /**< function arg/param is bad       */
+ CV_StsBadFunc=                 -6,  /**< unsupported function            */
+ CV_StsNoConv=                  -7,  /**< iter. didn't converge           */
+ CV_StsAutoTrace=               -8,  /**< tracing                         */
+ CV_HeaderIsNull=               -9,  /**< image header is NULL            */
+ CV_BadImageSize=              -10,  /**< image size is invalid           */
+ CV_BadOffset=                 -11,  /**< offset is invalid               */
+ CV_BadDataPtr=                -12,  /**/
+ CV_BadStep=                   -13,  /**< image step is wrong, this may happen for a non-continuous matrix */
+ CV_BadModelOrChSeq=           -14,  /**/
+ CV_BadNumChannels=            -15,  /**< bad number of channels, for example, some functions accept only single channel matrices */
+ CV_BadNumChannel1U=           -16,  /**/
+ CV_BadDepth=                  -17,  /**< input image depth is not supported by the function */
+ CV_BadAlphaChannel=           -18,  /**/
+ CV_BadOrder=                  -19,  /**< number of dimensions is out of range */
+ CV_BadOrigin=                 -20,  /**< incorrect input origin               */
+ CV_BadAlign=                  -21,  /**< incorrect input align                */
+ CV_BadCallBack=               -22,  /**/
+ CV_BadTileSize=               -23,  /**/
+ CV_BadCOI=                    -24,  /**< input COI is not supported           */
+ CV_BadROISize=                -25,  /**< incorrect input roi                  */
+ CV_MaskIsTiled=               -26,  /**/
+ CV_StsNullPtr=                -27,  /**< null pointer */
+ CV_StsVecLengthErr=           -28,  /**< incorrect vector length */
+ CV_StsFilterStructContentErr= -29,  /**< incorrect filter structure content */
+ CV_StsKernelStructContentErr= -30,  /**< incorrect transform kernel content */
+ CV_StsFilterOffsetErr=        -31,  /**< incorrect filter offset value */
+ CV_StsBadSize=                -201, /**< the input/output structure size is incorrect  */
+ CV_StsDivByZero=              -202, /**< division by zero */
+ CV_StsInplaceNotSupported=    -203, /**< in-place operation is not supported */
+ CV_StsObjectNotFound=         -204, /**< request can't be completed */
+ CV_StsUnmatchedFormats=       -205, /**< formats of input/output arrays differ */
+ CV_StsBadFlag=                -206, /**< flag is wrong or not supported */
+ CV_StsBadPoint=               -207, /**< bad CvPoint */
+ CV_StsBadMask=                -208, /**< bad format of mask (neither 8uC1 nor 8sC1)*/
+ CV_StsUnmatchedSizes=         -209, /**< sizes of input/output structures do not match */
+ CV_StsUnsupportedFormat=      -210, /**< the data format/type is not supported by the function*/
+ CV_StsOutOfRange=             -211, /**< some of parameters are out of range */
+ CV_StsParseError=             -212, /**< invalid syntax/structure of the parsed file */
+ CV_StsNotImplemented=         -213, /**< the requested function/feature is not implemented */
+ CV_StsBadMemBlock=            -214, /**< an allocated block has been corrupted */
+ CV_StsAssert=                 -215, /**< assertion failed   */
+ CV_GpuNotSupported=           -216, /**< no CUDA support    */
+ CV_GpuApiCallError=           -217, /**< GPU API call error */
+ CV_OpenGlNotSupported=        -218, /**< no OpenGL support  */
+ CV_OpenGlApiCallError=        -219, /**< OpenGL API call error */
+ CV_OpenCLApiCallError=        -220, /**< OpenCL API call error */
+ CV_OpenCLDoubleNotSupported=  -221, /**< presumably: the OpenCL device lacks double support - TODO confirm */
+ CV_OpenCLInitError=           -222, /**< OpenCL initialization error */
+ CV_OpenCLNoAMDBlasFft=        -223  /**< presumably: AMD BLAS/FFT for OpenCL is unavailable - TODO confirm */
+};
+
+/****************************************************************************************\
+*                             Common macros and inline functions                         *
+\****************************************************************************************/
+
+#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))  /* t is caller-supplied scratch storage */
+
+/** min & max without jumps: the mask is 0 when the comparison holds (result a), all ones otherwise (result b); arguments are evaluated more than once */
+#define  CV_IMIN(a, b)  ((a) ^ (((a)^(b)) & (((a) < (b)) - 1)))
+
+#define  CV_IMAX(a, b)  ((a) ^ (((a)^(b)) & (((a) > (b)) - 1)))
+
+/** absolute value: XOR/subtract with a 0 or -1 mask when compiled as C, plain abs() when compiled as C++ */
+#ifndef __cplusplus
+#  define  CV_IABS(a)     (((a) ^ ((a) < 0 ? -1 : 0)) - ((a) < 0 ? -1 : 0))
+#else
+#  define  CV_IABS(a)     abs(a)
+#endif
+#define  CV_CMP(a,b)    (((a) > (b)) - ((a) < (b)))  /* yields -1, 0 or 1 */
+#define  CV_SIGN(a)     CV_CMP((a),0)                /* sign of a as -1, 0 or 1 */
+
+#define cvInvSqrt(value) ((float)(1./sqrt(value)))  /* reciprocal square root, result cast to float */
+#define cvSqrt(value)  ((float)sqrt(value))         /* square root, result cast to float */
+
+
+/*************** Random number generation *******************/
+
+typedef uint64 CvRNG;
+
+#define CV_RNG_COEFF 4164903690U
+
+/** @brief Initializes a random number generator state.
+
+The function initializes a random number generator and returns the state. The pointer to the state
+can be then passed to the cvRandInt, cvRandReal and cvRandArr functions. In the current
+implementation a multiply-with-carry generator is used.
+@param seed 64-bit value used to initiate a random sequence
+@sa the C++ class RNG replaced CvRNG.
+ */
+CV_INLINE CvRNG cvRNG( int64 seed CV_DEFAULT(-1))
+{
+    if( seed == 0 )                   /* a zero seed is replaced by all-ones */
+        return (CvRNG)(uint64)(int64)-1;
+    return (CvRNG)(uint64)seed;
+}
+
+/** @brief Returns a 32-bit unsigned integer and updates RNG.
+
+The function returns a uniformly-distributed random 32-bit unsigned integer and updates the RNG
+state. It is similar to the rand() function from the C runtime library, except that OpenCV functions
+always generates a 32-bit random number, regardless of the platform.
+@param rng CvRNG state initialized by cvRNG.
+ */
+CV_INLINE unsigned cvRandInt( CvRNG* rng )
+{
+    const uint64 prev = *rng;  /* low 32 bits feed the multiply, high 32 bits are added as the carry */
+    const uint64 next = (uint64)(unsigned)prev*CV_RNG_COEFF + (prev >> 32);
+    *rng = next;
+    return (unsigned)next;
+}
+
+/** @brief Returns a floating-point random number and updates RNG.
+
+The function returns a uniformly-distributed random floating-point number between 0 and 1 (1 is not
+included).
+@param rng RNG state initialized by cvRNG
+ */
+CV_INLINE double cvRandReal( CvRNG* rng )
+{   /* scale the 32-bit draw by 2^-32 so that 1 itself is never returned */
+    return 2.3283064365386962890625e-10 * cvRandInt(rng);
+}
+
+/****************************************************************************************\
+*                                  Image type (IplImage)                                 *
+\****************************************************************************************/
+
+#ifndef HAVE_IPL
+
+/*
+ * The following definitions (until #endif)
+ * is an extract from IPL headers.
+ * Copyright (c) 1995 Intel Corporation.
+ */
+#define IPL_DEPTH_SIGN 0x80000000
+
+#define IPL_DEPTH_1U     1
+#define IPL_DEPTH_8U     8
+#define IPL_DEPTH_16U   16
+#define IPL_DEPTH_32F   32
+
+#define IPL_DEPTH_8S  (IPL_DEPTH_SIGN| 8)
+#define IPL_DEPTH_16S (IPL_DEPTH_SIGN|16)
+#define IPL_DEPTH_32S (IPL_DEPTH_SIGN|32)
+
+#define IPL_DATA_ORDER_PIXEL  0
+#define IPL_DATA_ORDER_PLANE  1
+
+#define IPL_ORIGIN_TL 0
+#define IPL_ORIGIN_BL 1
+
+#define IPL_ALIGN_4BYTES   4
+#define IPL_ALIGN_8BYTES   8
+#define IPL_ALIGN_16BYTES 16
+#define IPL_ALIGN_32BYTES 32
+
+#define IPL_ALIGN_DWORD   IPL_ALIGN_4BYTES
+#define IPL_ALIGN_QWORD   IPL_ALIGN_8BYTES
+
+#define IPL_BORDER_CONSTANT   0
+#define IPL_BORDER_REPLICATE  1
+#define IPL_BORDER_REFLECT    2
+#define IPL_BORDER_WRAP       3
+
+#ifdef __cplusplus
+typedef struct _IplImage IplImage;
+CV_EXPORTS _IplImage cvIplImage(const cv::Mat& m);
+#endif
+
+/** The IplImage is taken from the Intel Image Processing Library, in which the format is native. OpenCV
+only supports a subset of possible IplImage formats, as outlined in the field comments below.
+
+In addition to the above restrictions, OpenCV handles ROIs differently. OpenCV functions require
+that the image size or ROI size of all source and destination images match exactly. On the other
+hand, the Intel Image Processing Library processes the area of intersection between the source and
+destination images (or ROIs), allowing them to vary independently.
+*/
+typedef struct
+_IplImage
+{
+    int  nSize;             /**< sizeof(IplImage); CV_IS_IMAGE_HDR compares against this value */
+    int  ID;                /**< version (=0)*/
+    int  nChannels;         /**< Most of OpenCV functions support 1,2,3 or 4 channels */
+    int  alphaChannel;      /**< Ignored by OpenCV */
+    int  depth;             /**< Pixel depth in bits: IPL_DEPTH_8U, IPL_DEPTH_8S, IPL_DEPTH_16S,
+                               IPL_DEPTH_32S, IPL_DEPTH_32F and IPL_DEPTH_64F are supported.  */
+    char colorModel[4];     /**< Ignored by OpenCV */
+    char channelSeq[4];     /**< Ignored by OpenCV */
+    int  dataOrder;         /**< 0 - interleaved color channels, 1 - separate color channels.
+                               cvCreateImage can only create interleaved images */
+    int  origin;            /**< 0 - top-left origin,
+                               1 - bottom-left origin (Windows bitmaps style).  */
+    int  align;             /**< Alignment of image rows (4 or 8).
+                               OpenCV ignores it and uses widthStep instead.    */
+    int  width;             /**< Image width in pixels.                           */
+    int  height;            /**< Image height in pixels.                          */
+    struct _IplROI *roi;    /**< Image ROI. If NULL, the whole image is selected. */
+    struct _IplImage *maskROI;      /**< Must be NULL. */
+    void  *imageId;                 /**< Must be NULL. */
+    struct _IplTileInfo *tileInfo;  /**< Must be NULL. */
+    int  imageSize;         /**< Image data size in bytes
+                               (==image->height*image->widthStep
+                               in case of interleaved data)*/
+    char *imageData;        /**< Pointer to aligned image data.         */
+    int  widthStep;         /**< Size of aligned image row in bytes.    */
+    int  BorderMode[4];     /**< Ignored by OpenCV.                     */
+    int  BorderConst[4];    /**< Ignored by OpenCV.                     */
+    char *imageDataOrigin;  /**< Pointer to very origin of image data
+                               (not necessarily aligned) -
+                               needed for correct deallocation */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    _IplImage()
+    {
+        memset(this, 0, sizeof(*this));  // valid for POD structure
+        nSize = sizeof(IplImage);
+    }
+    _IplImage(const cv::Mat& m) { *this = cvIplImage(m); }  // header filled in by cvIplImage(const cv::Mat&)
+#endif
+}
+IplImage;
+
+CV_INLINE IplImage cvIplImage()
+{
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    return _IplImage();  /* the constructor zero-fills the struct and sets nSize */
+#else
+    IplImage self = CV_STRUCT_INITIALIZER; self.nSize = sizeof(IplImage); return self;
+#endif
+}
+
+typedef struct _IplTileInfo IplTileInfo;
+
+typedef struct _IplROI  /* region of interest; IplImage::roi points to one of these */
+{
+    int  coi; /**< 0 - no COI (all channels are selected), 1 - 0th channel is selected ...*/
+    int  xOffset;  /* NOTE(review): presumably the ROI's left edge within the image - confirm */
+    int  yOffset;  /* NOTE(review): presumably the ROI's top edge within the image - confirm */
+    int  width;    /* NOTE(review): presumably the ROI extent in pixels - confirm */
+    int  height;
+}
+IplROI;
+
+typedef struct _IplConvKernel  /* NOTE(review): presumably an IPL convolution kernel with int coefficients - confirm */
+{
+    int  nCols;
+    int  nRows;
+    int  anchorX;
+    int  anchorY;
+    int *values;   /* pointer only; this struct carries no ownership information */
+    int  nShiftR;  /* NOTE(review): meaning not visible in this header - confirm against IPL docs */
+}
+IplConvKernel;
+
+typedef struct _IplConvKernelFP  /* same fields as IplConvKernel, but float values and no nShiftR */
+{
+    int  nCols;
+    int  nRows;
+    int  anchorX;
+    int  anchorY;
+    float *values;  /* pointer only; this struct carries no ownership information */
+}
+IplConvKernelFP;
+
+#define IPL_IMAGE_HEADER 1
+#define IPL_IMAGE_DATA   2
+#define IPL_IMAGE_ROI    4
+
+#endif/*HAVE_IPL*/
+
+/** extra border mode */
+#define IPL_BORDER_REFLECT_101    4
+#define IPL_BORDER_TRANSPARENT    5
+
+#define IPL_IMAGE_MAGIC_VAL  ((int)sizeof(IplImage))
+#define CV_TYPE_NAME_IMAGE "opencv-image"
+
+#define CV_IS_IMAGE_HDR(img) \
+    ((img) != NULL && ((const IplImage*)(img))->nSize == sizeof(IplImage))
+
+#define CV_IS_IMAGE(img) /* valid header AND attached pixel data; img is parenthesised so expressions cast as a whole */ \
+    (CV_IS_IMAGE_HDR(img) && ((const IplImage*)(img))->imageData != NULL)
+
+/** for storing double-precision
+   floating point data in IplImage's */
+#define IPL_DEPTH_64F  64
+
+/** get reference to pixel at (col,row),
+   for multi-channel images (col) should be multiplied by number of channels */
+#define CV_IMAGE_ELEM( image, elemtype, row, col )       \
+    (((elemtype*)((image)->imageData + (image)->widthStep*(row)))[(col)])
+
+/****************************************************************************************\
+*                                  Matrix type (CvMat)                                   *
+\****************************************************************************************/
+
+#define CV_AUTO_STEP  0x7fffffff
+#define CV_WHOLE_ARR  cvSlice( 0, 0x3fffffff )
+
+#define CV_MAGIC_MASK       0xFFFF0000
+#define CV_MAT_MAGIC_VAL    0x42420000
+#define CV_TYPE_NAME_MAT    "opencv-matrix"
+
+#ifdef __cplusplus
+typedef struct CvMat CvMat;
+CV_INLINE CvMat cvMat(const cv::Mat& m);
+#endif
+
+/** Matrix elements are stored row by row. Element (i, j) (i - 0-based row index, j - 0-based column
+index) of a matrix can be retrieved or modified using CV_MAT_ELEM macro:
+
+    uchar pixval = CV_MAT_ELEM(grayimg, uchar, i, j)
+    CV_MAT_ELEM(cameraMatrix, float, 0, 2) = image.width*0.5f;
+
+To access multiple-channel matrices, you can use
+CV_MAT_ELEM(matrix, type, i, j\*nchannels + channel_idx).
+
+@deprecated CvMat is now obsolete; consider using Mat instead.
+ */
+typedef struct CvMat
+{
+    int type;  /* cvMat() composes this as CV_MAT_MAGIC_VAL | CV_MAT_CONT_FLAG | element type */
+    int step;  /* byte distance between rows; cvMat() sets it to cols * element size */
+
+    /* for internal use only */
+    int* refcount;
+    int hdr_refcount;
+
+    union  /* a single data pointer, viewable through several element types */
+    {
+        uchar* ptr;
+        short* s;
+        int* i;
+        float* fl;
+        double* db;
+    } data;
+
+#ifdef __cplusplus
+    union  /* C++ only: rows and height are two names for the same int */
+    {
+        int rows;
+        int height;
+    };
+
+    union  /* C++ only: cols and width are two names for the same int */
+    {
+        int cols;
+        int width;
+    };
+#else
+    int rows;
+    int cols;
+#endif
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMat() {}  // note: leaves every field uninitialised
+    CvMat(const cv::Mat& m) { *this = cvMat(m); }
+#endif
+}
+CvMat;
+
+
+#define CV_IS_MAT_HDR(mat) \
+    ((mat) != NULL && \
+    (((const CvMat*)(mat))->type & CV_MAGIC_MASK) == CV_MAT_MAGIC_VAL && \
+    ((const CvMat*)(mat))->cols > 0 && ((const CvMat*)(mat))->rows > 0)
+
+#define CV_IS_MAT_HDR_Z(mat) \
+    ((mat) != NULL && \
+    (((const CvMat*)(mat))->type & CV_MAGIC_MASK) == CV_MAT_MAGIC_VAL && \
+    ((const CvMat*)(mat))->cols >= 0 && ((const CvMat*)(mat))->rows >= 0)
+
+#define CV_IS_MAT(mat) \
+    (CV_IS_MAT_HDR(mat) && ((const CvMat*)(mat))->data.ptr != NULL)
+
+#define CV_IS_MASK_ARR(mat) \
+    (((mat)->type & (CV_MAT_TYPE_MASK & ~CV_8SC1)) == 0)
+
+#define CV_ARE_TYPES_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_TYPE_MASK) == 0)
+
+#define CV_ARE_CNS_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_CN_MASK) == 0)
+
+#define CV_ARE_DEPTHS_EQ(mat1, mat2) \
+    ((((mat1)->type ^ (mat2)->type) & CV_MAT_DEPTH_MASK) == 0)
+
+#define CV_ARE_SIZES_EQ(mat1, mat2) \
+    ((mat1)->rows == (mat2)->rows && (mat1)->cols == (mat2)->cols)
+
+#define CV_IS_MAT_CONST(mat)  \
+    (((mat)->rows|(mat)->cols) == 1)
+
+#define IPL2CV_DEPTH(depth) \
+    ((((CV_8U)+(CV_16U<<4)+(CV_32F<<8)+(CV_64F<<16)+(CV_8S<<20)+ \
+    (CV_16S<<24)+(CV_32S<<28)) >> ((((depth) & 0xF0) >> 2) + \
+    (((depth) & IPL_DEPTH_SIGN) ? 20 : 0))) & 15)
+
+/** Inline constructor. No data is allocated internally!!!
+ * (Use together with cvCreateData, or use cvCreateMat instead to
+ * get a matrix with allocated data):
+ */
+CV_INLINE CvMat cvMat( int rows, int cols, int type, void* data CV_DEFAULT(NULL))
+{
+    CvMat hdr;
+
+    assert( (unsigned)CV_MAT_DEPTH(type) <= CV_64F );
+    type = CV_MAT_TYPE(type);
+    /* geometry first, then the fields derived from it */
+    hdr.rows = rows;
+    hdr.cols = cols;
+    hdr.step = hdr.cols*CV_ELEM_SIZE(type);
+    hdr.type = CV_MAT_MAGIC_VAL | CV_MAT_CONT_FLAG | type;
+    hdr.data.ptr = (uchar*)data;              /* borrowed: nothing is allocated or copied */
+    hdr.hdr_refcount = 0;
+    hdr.refcount = NULL;
+    return hdr;
+}
+
+#ifdef __cplusplus
+
+CV_INLINE CvMat cvMat(const cv::Mat& m)
+{
+    CV_DbgAssert(m.dims <= 2);
+    CvMat self = cvMat(m.rows, m.dims == 1 ? 1 : m.cols, m.type(), m.data);  // header over m's own data
+    self.step = (int)m.step[0];
+    const int contFlag = m.flags & cv::Mat::CONTINUOUS_FLAG;  // continuity bit of the source
+    self.type = (self.type & ~cv::Mat::CONTINUOUS_FLAG) | contFlag;
+    return self;
+}
+CV_INLINE CvMat cvMat()
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMat self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMat();
+#endif
+}
+CV_INLINE CvMat cvMat(const CvMat& m)
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMat self = CV_STRUCT_INITIALIZER; memcpy(&self, &m, sizeof(self)); return self;
+#else
+    return CvMat(m);
+#endif
+}
+
+#endif // __cplusplus
+
+
+#define CV_MAT_ELEM_PTR_FAST( mat, row, col, pix_size )  \
+    (assert( (unsigned)(row) < (unsigned)(mat).rows &&   \
+             (unsigned)(col) < (unsigned)(mat).cols ),   \
+     (mat).data.ptr + (size_t)(mat).step*(row) + (pix_size)*(col))
+
+#define CV_MAT_ELEM_PTR( mat, row, col )                 \
+    CV_MAT_ELEM_PTR_FAST( mat, row, col, CV_ELEM_SIZE((mat).type) )
+
+#define CV_MAT_ELEM( mat, elemtype, row, col )           \
+    (*(elemtype*)CV_MAT_ELEM_PTR_FAST( mat, row, col, sizeof(elemtype)))
+
+/** @brief Returns the particular element of single-channel floating-point matrix.
+
+The function is a fast replacement for cvGetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat Input matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+ */
+CV_INLINE  double  cvmGet( const CvMat* mat, int row, int col )
+{
+    const int elem_type = CV_MAT_TYPE(mat->type);
+    void* row_start;
+    /* Index range is verified only when assertions are enabled. */
+    assert( (unsigned)row < (unsigned)mat->rows &&
+            (unsigned)col < (unsigned)mat->cols );
+    row_start = (void*)(mat->data.ptr + (size_t)mat->step*row);
+
+    if( elem_type != CV_32FC1 )
+    {
+        assert( elem_type == CV_64FC1 );
+        return ((double*)row_start)[col];
+    }
+    return ((float*)row_start)[col];
+}
+
+/** @brief Sets a specific element of a single-channel floating-point matrix.
+
+The function is a fast replacement for cvSetReal2D in the case of single-channel floating-point
+matrices. It is faster because it is inline, it does fewer checks for array type and array element
+type, and it checks for the row and column ranges only in debug mode.
+@param mat The matrix
+@param row The zero-based index of row
+@param col The zero-based index of column
+@param value The new value of the matrix element
+ */
+CV_INLINE  void  cvmSet( CvMat* mat, int row, int col, double value )
+{
+    const int elem_type = CV_MAT_TYPE(mat->type);
+    void* row_start;
+    assert( (unsigned)row < (unsigned)mat->rows &&
+            (unsigned)col < (unsigned)mat->cols );
+    row_start = (void*)(mat->data.ptr + (size_t)mat->step*row);
+    if( elem_type == CV_32FC1 )
+    {
+        ((float*)row_start)[col] = (float)value;  /* stored narrowed to float */
+        return;
+    }
+    assert( elem_type == CV_64FC1 );
+    ((double*)row_start)[col] = value;
+}
+
+
+CV_INLINE int cvIplDepth( int type )
+{
+    const int depth = CV_MAT_DEPTH(type);
+    const int is_signed = depth == CV_8S || depth == CV_16S || depth == CV_32S;
+    return CV_ELEM_SIZE1(depth)*8 | (is_signed ? IPL_DEPTH_SIGN : 0);  /* element size in bits, OR-ed with the sign flag */
+}
+
+
+/****************************************************************************************\
+*                       Multi-dimensional dense array (CvMatND)                          *
+\****************************************************************************************/
+
+#define CV_MATND_MAGIC_VAL    0x42430000
+#define CV_TYPE_NAME_MATND    "opencv-nd-matrix"
+
+#define CV_MAX_DIM            32
+
+#ifdef __cplusplus
+typedef struct CvMatND CvMatND;
+CV_EXPORTS CvMatND cvMatND(const cv::Mat& m);
+#endif
+
+/**
+  @deprecated consider using cv::Mat instead
+  */
+typedef struct
+CvMatND
+{
+    int type;
+    int dims;
+
+    int* refcount;
+    int hdr_refcount;
+
+    union
+    {
+        uchar* ptr;
+        float* fl;
+        double* db;
+        int* i;
+        short* s;
+    } data;
+
+    struct
+    {
+        int size;
+        int step;
+    }
+    dim[CV_MAX_DIM];
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMatND() {}
+    CvMatND(const cv::Mat& m) { *this = cvMatND(m); }
+#endif
+}
+CvMatND;
+
+
+CV_INLINE CvMatND cvMatND()
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvMatND self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMatND();
+#endif
+}
+
+#define CV_IS_MATND_HDR(mat) \
+    ((mat) != NULL && (((const CvMatND*)(mat))->type & CV_MAGIC_MASK) == CV_MATND_MAGIC_VAL)
+
+#define CV_IS_MATND(mat) \
+    (CV_IS_MATND_HDR(mat) && ((const CvMatND*)(mat))->data.ptr != NULL)
+
+
+/****************************************************************************************\
+*                      Multi-dimensional sparse array (CvSparseMat)                      *
+\****************************************************************************************/
+
+#define CV_SPARSE_MAT_MAGIC_VAL    0x42440000
+#define CV_TYPE_NAME_SPARSE_MAT    "opencv-sparse-matrix"
+
+struct CvSet;
+
+typedef struct CvSparseMat
+{
+    int type;          /* its CV_MAGIC_MASK bits are compared with CV_SPARSE_MAT_MAGIC_VAL by CV_IS_SPARSE_MAT_HDR */
+    int dims;          /* presumably the number of dimensions in use (size[] holds CV_MAX_DIM entries) - confirm */
+    int* refcount;
+    int hdr_refcount;
+
+    struct CvSet* heap;
+    void** hashtable;
+    int hashsize;
+    int valoffset;     /* byte offset from a node's start to its value; used by CV_NODE_VAL */
+    int idxoffset;     /* byte offset from a node's start to its int index data; used by CV_NODE_IDX */
+    int size[CV_MAX_DIM];
+
+#ifdef __cplusplus
+    CV_EXPORTS void copyToSparseMat(cv::SparseMat& m) const;
+#endif
+}
+CvSparseMat;
+
+#ifdef __cplusplus
+CV_EXPORTS CvSparseMat* cvCreateSparseMat(const cv::SparseMat& m);
+#endif
+
+#define CV_IS_SPARSE_MAT_HDR(mat) \
+    ((mat) != NULL && \
+    (((const CvSparseMat*)(mat))->type & CV_MAGIC_MASK) == CV_SPARSE_MAT_MAGIC_VAL)
+
+#define CV_IS_SPARSE_MAT(mat) \
+    CV_IS_SPARSE_MAT_HDR(mat)
+
+/**************** iteration through a sparse array *****************/
+
+typedef struct CvSparseNode
+{
+    unsigned hashval;
+    struct CvSparseNode* next;
+}
+CvSparseNode;
+
+typedef struct CvSparseMatIterator
+{
+    CvSparseMat* mat;
+    CvSparseNode* node;
+    int curidx;
+}
+CvSparseMatIterator;
+
+#define CV_NODE_VAL(mat,node)   ((void*)((uchar*)(node) + (mat)->valoffset))
+#define CV_NODE_IDX(mat,node)   ((int*)((uchar*)(node) + (mat)->idxoffset))
+
+/****************************************************************************************\
+*                                         Histogram                                      *
+\****************************************************************************************/
+
+typedef int CvHistType;
+
+#define CV_HIST_MAGIC_VAL     0x42450000
+#define CV_HIST_UNIFORM_FLAG  (1 << 10)
+
+/** indicates whether bin ranges are set already or not */
+#define CV_HIST_RANGES_FLAG   (1 << 11)
+
+#define CV_HIST_ARRAY         0
+#define CV_HIST_SPARSE        1
+#define CV_HIST_TREE          CV_HIST_SPARSE
+
+/** Should be used as a function parameter only;
+   it is turned into CV_HIST_UNIFORM_FLAG in hist->type. */
+#define CV_HIST_UNIFORM       1
+
+typedef struct CvHistogram
+{
+    int     type;
+    CvArr*  bins;
+    float   thresh[CV_MAX_DIM][2];  /**< For uniform histograms.                      */
+    float** thresh2;                /**< For non-uniform histograms.                  */
+    CvMatND mat;                    /**< Embedded matrix header for array histograms. */
+}
+CvHistogram;
+
+#define CV_IS_HIST( hist ) \
+    ((hist) != NULL  && \
+     (((CvHistogram*)(hist))->type & CV_MAGIC_MASK) == CV_HIST_MAGIC_VAL && \
+     (hist)->bins != NULL)
+
+#define CV_IS_UNIFORM_HIST( hist ) \
+    (((hist)->type & CV_HIST_UNIFORM_FLAG) != 0)
+
+#define CV_IS_SPARSE_HIST( hist ) \
+    CV_IS_SPARSE_MAT((hist)->bins)
+
+#define CV_HIST_HAS_RANGES( hist ) \
+    (((hist)->type & CV_HIST_RANGES_FLAG) != 0)
+
+/****************************************************************************************\
+*                      Other supplementary data type definitions                         *
+\****************************************************************************************/
+
+/*************************************** CvRect *****************************************/
+/** @sa Rect_ */
+typedef struct CvRect
+{
+    int x;
+    int y;
+    int width;
+    int height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvRect() __attribute__(( warning("Non-initialized variable") )) {};
+    template<typename _Tp> CvRect(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 4);
+        x = y = width = height = 0;
+        if (list.size() == 4)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; width = list.begin()[2]; height = list.begin()[3];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvRect(int _x = 0, int _y = 0, int w = 0, int h = 0): x(_x), y(_y), width(w), height(h) {}
+    template<typename _Tp>
+    CvRect(const cv::Rect_<_Tp>& r): x(cv::saturate_cast<int>(r.x)), y(cv::saturate_cast<int>(r.y)), width(cv::saturate_cast<int>(r.width)), height(cv::saturate_cast<int>(r.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Rect_<_Tp>() const { return cv::Rect_<_Tp>((_Tp)x, (_Tp)y, (_Tp)width, (_Tp)height); }
+#endif
+}
+CvRect;
+
+/** constructs CvRect structure. */
+CV_INLINE  CvRect  cvRect( int x, int y, int width, int height )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvRect r = {x, y, width, height};
+#else
+    CvRect r(x, y , width, height);
+#endif
+    return r;
+}
+#ifdef __cplusplus
+CV_INLINE CvRect cvRect(const cv::Rect& rc) { return cvRect(rc.x, rc.y, rc.width, rc.height); }
+#endif
+
+CV_INLINE  IplROI  cvRectToROI( CvRect rect, int coi )
+{
+    IplROI result;
+    /* Copy the rectangle's origin and extent field by field; coi is passed through unchanged. */
+    result.coi = coi;
+    result.width = rect.width;
+    result.height = rect.height;
+    result.xOffset = rect.x;
+    result.yOffset = rect.y;
+    return result;
+}
+
+/** Builds a CvRect from the ROI's xOffset, yOffset, width and height; the coi field is not used. */
+CV_INLINE  CvRect  cvROIToRect( IplROI roi )
+{
+    return cvRect( roi.xOffset, roi.yOffset, roi.width, roi.height );
+}
+
+/*********************************** CvTermCriteria *************************************/
+
+#define CV_TERMCRIT_ITER    1
+#define CV_TERMCRIT_NUMBER  CV_TERMCRIT_ITER
+#define CV_TERMCRIT_EPS     2
+
+/** @sa TermCriteria
+ */
+typedef struct CvTermCriteria
+{
+    int    type;  /**< may be combination of
+                     CV_TERMCRIT_ITER
+                     CV_TERMCRIT_EPS */
+    int    max_iter;
+    double epsilon;
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvTermCriteria(int _type = 0, int _iter = 0, double _eps = 0) : type(_type), max_iter(_iter), epsilon(_eps)  {}
+    CvTermCriteria(const cv::TermCriteria& t) : type(t.type), max_iter(t.maxCount), epsilon(t.epsilon)  {}
+#endif
+#ifdef __cplusplus
+    operator cv::TermCriteria() const { return cv::TermCriteria(type, max_iter, epsilon); }
+#endif
+}
+CvTermCriteria;
+/** Constructs a CvTermCriteria from type flags, max_iter and epsilon; epsilon is stored as a double in both build branches. */
+CV_INLINE  CvTermCriteria  cvTermCriteria( int type, int max_iter, double epsilon )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvTermCriteria t = { type, max_iter, epsilon };  /* no (float) cast: the field is double, narrowing would drop precision */
+#else
+    CvTermCriteria t(type, max_iter, epsilon);
+#endif
+    return t;
+}
+#ifdef __cplusplus
+CV_INLINE CvTermCriteria cvTermCriteria(const cv::TermCriteria& t) { return cvTermCriteria(t.type, t.maxCount, t.epsilon); }
+#endif
+
+
+/******************************* CvPoint and variants ***********************************/
+
+typedef struct CvPoint
+{
+    int x;
+    int y;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint(int _x = 0, int _y = 0): x(_x), y(_y) {}
+    template<typename _Tp>
+    CvPoint(const cv::Point_<_Tp>& pt): x((int)pt.x), y((int)pt.y) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); }
+#endif
+}
+CvPoint;
+
+/** constructs CvPoint structure. */
+CV_INLINE  CvPoint  cvPoint( int x, int y )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint p = {x, y};
+#else
+    CvPoint p(x, y);
+#endif
+    return p;
+}
+#ifdef __cplusplus
+CV_INLINE CvPoint cvPoint(const cv::Point& pt) { return cvPoint(pt.x, pt.y); }
+#endif
+
+typedef struct CvPoint2D32f
+{
+    float x;
+    float y;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint2D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint2D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint2D32f(float _x = 0, float _y = 0): x(_x), y(_y) {}
+    template<typename _Tp>
+    CvPoint2D32f(const cv::Point_<_Tp>& pt): x((float)pt.x), y((float)pt.y) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point_<_Tp>() const { return cv::Point_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y)); }
+#endif
+}
+CvPoint2D32f;
+
+/** constructs CvPoint2D32f structure. */
+CV_INLINE  CvPoint2D32f  cvPoint2D32f( double x, double y )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint2D32f p = { (float)x, (float)y };
+#else
+    CvPoint2D32f p((float)x, (float)y);
+#endif
+    return p;
+}
+
+#ifdef __cplusplus
+template<typename _Tp>
+CvPoint2D32f cvPoint2D32f(const cv::Point_<_Tp>& pt)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint2D32f p = { (float)pt.x, (float)pt.y };
+#else
+    CvPoint2D32f p((float)pt.x, (float)pt.y);
+#endif
+    return p;
+}
+#endif
+
+/** converts CvPoint to CvPoint2D32f. */
+CV_INLINE  CvPoint2D32f  cvPointTo32f( CvPoint point )
+{
+    return cvPoint2D32f( (float)point.x, (float)point.y );
+}
+
+/** converts CvPoint2D32f to CvPoint. */
+CV_INLINE  CvPoint  cvPointFrom32f( CvPoint2D32f point )
+{
+    /* Convert each coordinate with cvRound, then let cvPoint() choose the
+     * aggregate-initializer or constructor form that matches the current
+     * CV__ENABLE_C_API_CTORS configuration. */
+    const int ix = cvRound(point.x);
+    const int iy = cvRound(point.y);
+    return cvPoint(ix, iy);
+}
+
+
+typedef struct CvPoint3D32f
+{
+    float x;
+    float y;
+    float z;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint3D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint3D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 3);
+        x = y = z = 0;
+        if (list.size() == 3)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvPoint3D32f(float _x = 0, float _y = 0, float _z = 0): x(_x), y(_y), z(_z) {}
+    template<typename _Tp>
+    CvPoint3D32f(const cv::Point3_<_Tp>& pt): x((float)pt.x), y((float)pt.y), z((float)pt.z) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Point3_<_Tp>() const { return cv::Point3_<_Tp>(cv::saturate_cast<_Tp>(x), cv::saturate_cast<_Tp>(y), cv::saturate_cast<_Tp>(z)); }
+#endif
+}
+CvPoint3D32f;
+
+/** constructs CvPoint3D32f structure. */
+CV_INLINE  CvPoint3D32f  cvPoint3D32f( double x, double y, double z )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint3D32f p = { (float)x, (float)y, (float)z };
+#else
+    CvPoint3D32f p((float)x, (float)y, (float)z);
+#endif
+    return p;
+}
+
+#ifdef __cplusplus
+template<typename _Tp>
+CvPoint3D32f cvPoint3D32f(const cv::Point3_<_Tp>& pt)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvPoint3D32f p  = { (float)pt.x, (float)pt.y, (float)pt.z };
+#else
+    CvPoint3D32f p((float)pt.x, (float)pt.y, (float)pt.z);
+#endif
+    return p;
+}
+#endif
+
+
+typedef struct CvPoint2D64f
+{
+    double x;
+    double y;
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint2D64f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint2D64f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        x = y = 0;
+        if (list.size() == 2)
+        {
+            x = list.begin()[0]; y = list.begin()[1];
+        }
+    };
+#endif
+}
+CvPoint2D64f;
+
+/** constructs CvPoint2D64f structure.*/
+CV_INLINE  CvPoint2D64f  cvPoint2D64f( double x, double y )
+{
+    CvPoint2D64f p = { x, y };
+    return p;
+}
+
+
+typedef struct CvPoint3D64f
+{
+    double x;
+    double y;
+    double z;
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvPoint3D64f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvPoint3D64f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 3);
+        x = y = z = 0;
+        if (list.size() == 3)
+        {
+            x = list.begin()[0]; y = list.begin()[1]; z = list.begin()[2];
+        }
+    };
+#endif
+}
+CvPoint3D64f;
+
+/** constructs CvPoint3D64f structure. */
+CV_INLINE  CvPoint3D64f  cvPoint3D64f( double x, double y, double z )
+{
+    CvPoint3D64f p = { x, y, z };
+    return p;
+}
+
+
+/******************************** CvSize's & CvBox **************************************/
+
+typedef struct CvSize
+{
+    int width;
+    int height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSize() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSize(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        width = 0; height = 0;
+        if (list.size() == 2)
+        {
+            width = list.begin()[0]; height = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvSize(int w = 0, int h = 0): width(w), height(h) {}
+    template<typename _Tp>
+    CvSize(const cv::Size_<_Tp>& sz): width(cv::saturate_cast<int>(sz.width)), height(cv::saturate_cast<int>(sz.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); }
+#endif
+}
+CvSize;
+
+/** constructs CvSize structure. */
+CV_INLINE  CvSize  cvSize( int width, int height )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize s = { width, height };
+#else
+    CvSize s(width, height);
+#endif
+    return s;
+}
+
+#ifdef __cplusplus
+CV_INLINE CvSize cvSize(const cv::Size& sz)
+{
+    /* Forward the two extents to the (int, int) overload, which already
+     * selects between aggregate initialization and the constructor form;
+     * this mirrors how cvPoint(const cv::Point&) is written. */
+    const int w = sz.width;
+    const int h = sz.height;
+    return cvSize(w, h);
+}
+#endif
+
+typedef struct CvSize2D32f
+{
+    float width;
+    float height;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSize2D32f() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSize2D32f(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        width = 0; height = 0;
+        if (list.size() == 2)
+        {
+            width = list.begin()[0]; height = list.begin()[1];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvSize2D32f(float w = 0, float h = 0): width(w), height(h) {}
+    template<typename _Tp>
+    CvSize2D32f(const cv::Size_<_Tp>& sz): width(cv::saturate_cast<float>(sz.width)), height(cv::saturate_cast<float>(sz.height)) {}
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Size_<_Tp>() const { return cv::Size_<_Tp>(cv::saturate_cast<_Tp>(width), cv::saturate_cast<_Tp>(height)); }
+#endif
+}
+CvSize2D32f;
+
+/** constructs CvSize2D32f structure. */
+CV_INLINE  CvSize2D32f  cvSize2D32f( double width, double height )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize2D32f s = { (float)width, (float)height };
+#else
+    CvSize2D32f s((float)width, (float)height);
+#endif
+    return s;
+}
+#ifdef __cplusplus
+template<typename _Tp>
+CvSize2D32f cvSize2D32f(const cv::Size_<_Tp>& sz)
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvSize2D32f s = { (float)sz.width, (float)sz.height };
+#else
+    CvSize2D32f s((float)sz.width, (float)sz.height);
+#endif
+    return s;
+}
+#endif
+
+/** @sa RotatedRect
+ */
+typedef struct CvBox2D
+{
+    CvPoint2D32f center;  /**< Center of the box.                          */
+    CvSize2D32f  size;    /**< Box width and length.                       */
+    float angle;          /**< Angle between the horizontal axis           */
+                          /**< and the first side (i.e. length) in degrees */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0) : center(c), size(s), angle(a) {}
+    CvBox2D(const cv::RotatedRect& rr) : center(rr.center), size(rr.size), angle(rr.angle) {}
+#endif
+#ifdef __cplusplus
+    operator cv::RotatedRect() const { return cv::RotatedRect(center, size, angle); }
+#endif
+}
+CvBox2D;
+
+
+#ifdef __cplusplus
+CV_INLINE CvBox2D cvBox2D(CvPoint2D32f c = CvPoint2D32f(), CvSize2D32f s = CvSize2D32f(), float a = 0)
+{
+    CvBox2D self;
+    self.center = c;
+    self.size = s;
+    self.angle = a;
+    return self;
+}
+CV_INLINE CvBox2D cvBox2D(const cv::RotatedRect& rr)
+{
+    CvBox2D self;
+    self.center = cvPoint2D32f(rr.center);
+    self.size = cvSize2D32f(rr.size);
+    self.angle = rr.angle;
+    return self;
+}
+#endif
+
+
+/** Line iterator state: */
+typedef struct CvLineIterator
+{
+    /** Pointer to the current point: */
+    uchar* ptr;
+
+    /* Bresenham algorithm state: */
+    int  err;
+    int  plus_delta;
+    int  minus_delta;
+    int  plus_step;
+    int  minus_step;
+}
+CvLineIterator;
+
+
+
+/************************************* CvSlice ******************************************/
+#define CV_WHOLE_SEQ_END_INDEX 0x3fffffff
+#define CV_WHOLE_SEQ  cvSlice(0, CV_WHOLE_SEQ_END_INDEX)
+
+typedef struct CvSlice
+{
+    int  start_index, end_index;
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvSlice() __attribute__(( warning("Non-initialized variable") )) {}
+    template<typename _Tp> CvSlice(const std::initializer_list<_Tp> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 2);
+        start_index = end_index = 0;
+        if (list.size() == 2)
+        {
+            start_index = list.begin()[0]; end_index = list.begin()[1];
+        }
+    };
+#endif
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) && !defined(__CUDACC__)
+    CvSlice(int start = 0, int end = 0) : start_index(start), end_index(end) {}
+    CvSlice(const cv::Range& r) { *this = (r.start != INT_MIN && r.end != INT_MAX) ? CvSlice(r.start, r.end) : CvSlice(0, CV_WHOLE_SEQ_END_INDEX); }
+    operator cv::Range() const { return (start_index == 0 && end_index == CV_WHOLE_SEQ_END_INDEX ) ? cv::Range::all() : cv::Range(start_index, end_index); }
+#endif
+}
+CvSlice;
+
+CV_INLINE  CvSlice  cvSlice( int start, int end )
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus) && !defined(__CUDACC__))
+    CvSlice slice = { start, end };
+#else
+    CvSlice slice(start, end);
+#endif
+    return slice;
+}
+
+#if defined(__cplusplus)
+CV_INLINE  CvSlice  cvSlice(const cv::Range& r)
+{
+    const bool unbounded = r.start == INT_MIN || r.end == INT_MAX;  /* either sentinel selects the whole sequence */
+    return unbounded ? cvSlice(0, CV_WHOLE_SEQ_END_INDEX) : cvSlice(r.start, r.end);
+}
+#endif
+
+
+/************************************* CvScalar *****************************************/
+/** @sa Scalar_
+ */
+typedef struct CvScalar
+{
+    double val[4];
+
+#ifdef CV__VALIDATE_UNUNITIALIZED_VARS
+    CvScalar() __attribute__(( warning("Non-initialized variable") )) {}
+    CvScalar(const std::initializer_list<double> list)
+    {
+        CV_Assert(list.size() == 0 || list.size() == 4);
+        val[0] = val[1] = val[2] = val[3] = 0;
+        if (list.size() == 4)
+        {
+            val[0] = list.begin()[0]; val[1] = list.begin()[1]; val[2] = list.begin()[2]; val[3] = list.begin()[3];
+        }
+    };
+#elif defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar() {}
+    CvScalar(double d0, double d1 = 0, double d2 = 0, double d3 = 0) { val[0] = d0; val[1] = d1; val[2] = d2; val[3] = d3; }
+    template<typename _Tp>
+    CvScalar(const cv::Scalar_<_Tp>& s) { val[0] = s.val[0]; val[1] = s.val[1]; val[2] = s.val[2]; val[3] = s.val[3]; }
+    template<typename _Tp, int cn>
+    CvScalar(const cv::Vec<_Tp, cn>& v)
+    {
+        int i;
+        for( i = 0; i < (cn < 4 ? cn : 4); i++ ) val[i] = v.val[i];
+        for( ; i < 4; i++ ) val[i] = 0;
+    }
+#endif
+#ifdef __cplusplus
+    template<typename _Tp>
+    operator cv::Scalar_<_Tp>() const { return cv::Scalar_<_Tp>(cv::saturate_cast<_Tp>(val[0]), cv::saturate_cast<_Tp>(val[1]), cv::saturate_cast<_Tp>(val[2]), cv::saturate_cast<_Tp>(val[3])); }
+#endif
+}
+CvScalar;
+
+CV_INLINE  CvScalar  cvScalar( double val0, double val1 CV_DEFAULT(0),
+                               double val2 CV_DEFAULT(0), double val3 CV_DEFAULT(0))
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvScalar scalar = CV_STRUCT_INITIALIZER;
+#else
+    CvScalar scalar;
+#endif
+    scalar.val[0] = val0; scalar.val[1] = val1;
+    scalar.val[2] = val2; scalar.val[3] = val3;
+    return scalar;
+}
+
+#ifdef __cplusplus
+CV_INLINE CvScalar cvScalar()
+{
+#if !(defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus))
+    CvScalar scalar = CV_STRUCT_INITIALIZER;
+#else
+    CvScalar scalar;
+#endif
+    scalar.val[0] = scalar.val[1] = scalar.val[2] = scalar.val[3] = 0;
+    return scalar;
+}
+CV_INLINE CvScalar cvScalar(const cv::Scalar& s)
+{
+    int k;
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar copy;
+#else
+    CvScalar copy = CV_STRUCT_INITIALIZER;
+#endif
+    /* Transfer the four components one by one, in index order. */
+    for( k = 0; k < 4; k++ )
+        copy.val[k] = s.val[k];
+    return copy;
+}
+#endif
+
+CV_INLINE  CvScalar  cvRealScalar( double val0 )
+{
+    /* Only the first component carries the value; the other three are
+     * zero. Delegating to cvScalar() keeps the struct-initialization
+     * details (CV_STRUCT_INITIALIZER vs. constructor) in one place.
+     * All four arguments are spelled out explicitly rather than relying
+     * on CV_DEFAULT. */
+    const double zero = 0;
+    CvScalar real = cvScalar( val0, zero, zero, zero );
+    return real;
+}
+
+CV_INLINE  CvScalar  cvScalarAll( double val0123 )
+{
+    int k;
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvScalar filled;
+#else
+    CvScalar filled = CV_STRUCT_INITIALIZER;
+#endif
+    /* Broadcast the same value into each of the four components. */
+    for( k = 0; k < 4; k++ )
+        filled.val[k] = val0123;
+    return filled;
+}
+
+/****************************************************************************************\
+*                                   Dynamic Data structures                              *
+\****************************************************************************************/
+
+/******************************** Memory storage ****************************************/
+
+typedef struct CvMemBlock
+{
+    struct CvMemBlock*  prev;
+    struct CvMemBlock*  next;
+}
+CvMemBlock;
+
+#define CV_STORAGE_MAGIC_VAL    0x42890000
+
+typedef struct CvMemStorage
+{
+    int signature;
+    CvMemBlock* bottom;           /**< First allocated block.                   */
+    CvMemBlock* top;              /**< Current memory block - top of the stack. */
+    struct  CvMemStorage* parent; /**< We get new blocks from parent as needed. */
+    int block_size;               /**< Block size.                              */
+    int free_space;               /**< Remaining free space in current block.   */
+}
+CvMemStorage;
+
+#define CV_IS_STORAGE(storage)  \
+    ((storage) != NULL &&       \
+    (((CvMemStorage*)(storage))->signature & CV_MAGIC_MASK) == CV_STORAGE_MAGIC_VAL)
+
+
+typedef struct CvMemStoragePos
+{
+    CvMemBlock* top;   /* same name and type as CvMemStorage::top; presumably a saved copy of it - confirm */
+    int free_space;    /* same name and type as CvMemStorage::free_space; presumably a saved copy of it - confirm */
+}
+CvMemStoragePos;
+
+
+/*********************************** Sequence *******************************************/
+
+typedef struct CvSeqBlock
+{
+    struct CvSeqBlock*  prev; /**< Previous sequence block.                   */
+    struct CvSeqBlock*  next; /**< Next sequence block.                       */
+    int    start_index;       /**< Index of the first element in the block +  */
+                              /**< sequence->first->start_index.              */
+    int    count;             /**< Number of elements in the block.           */
+    schar* data;              /**< Pointer to the first element of the block. */
+}
+CvSeqBlock;
+
+
+#define CV_TREE_NODE_FIELDS(node_type)                               \
+    int       flags;             /**< Miscellaneous flags.     */      \
+    int       header_size;       /**< Size of sequence header. */      \
+    struct    node_type* h_prev; /**< Previous sequence.       */      \
+    struct    node_type* h_next; /**< Next sequence.           */      \
+    struct    node_type* v_prev; /**< 2nd previous sequence.   */      \
+    struct    node_type* v_next  /**< 2nd next sequence.       */
+
+/**
+   Read/Write sequence.
+   Elements can be dynamically inserted to or deleted from the sequence.
+*/
+#define CV_SEQUENCE_FIELDS()                                              \
+    CV_TREE_NODE_FIELDS(CvSeq);                                           \
+    int       total;          /**< Total number of elements.            */  \
+    int       elem_size;      /**< Size of sequence element in bytes.   */  \
+    schar*    block_max;      /**< Maximal bound of the last block.     */  \
+    schar*    ptr;            /**< Current write pointer.               */  \
+    int       delta_elems;    /**< Grow seq this many at a time.        */  \
+    CvMemStorage* storage;    /**< Where the seq is stored.             */  \
+    CvSeqBlock* free_blocks;  /**< Free blocks list.                    */  \
+    CvSeqBlock* first;        /**< Pointer to the first sequence block. */
+
+typedef struct CvSeq
+{
+    CV_SEQUENCE_FIELDS()
+}
+CvSeq;
+
+#define CV_TYPE_NAME_SEQ             "opencv-sequence"
+#define CV_TYPE_NAME_SEQ_TREE        "opencv-sequence-tree"
+
+/*************************************** Set ********************************************/
+/** @brief Set
+  Order is not preserved. There can be gaps between sequence elements.
+  After the element has been inserted it stays in the same place all the time.
+  The MSB(most-significant or sign bit) of the first field (flags) is 0 iff the element exists.
+*/
+#define CV_SET_ELEM_FIELDS(elem_type)   \
+    int  flags;                         \
+    struct elem_type* next_free;
+
+typedef struct CvSetElem
+{
+    CV_SET_ELEM_FIELDS(CvSetElem)
+}
+CvSetElem;
+
+#define CV_SET_FIELDS()      \
+    CV_SEQUENCE_FIELDS()     \
+    CvSetElem* free_elems;   \
+    int active_count;
+
+typedef struct CvSet
+{
+    CV_SET_FIELDS()
+}
+CvSet;
+
+
+#define CV_SET_ELEM_IDX_MASK   ((1 << 26) - 1)
+#define CV_SET_ELEM_FREE_FLAG  (1 << (sizeof(int)*8-1))
+
+/** Checks whether the element pointed to by ptr is an occupied set element (its flags field is non-negative) */
+#define CV_IS_SET_ELEM( ptr )  (((CvSetElem*)(ptr))->flags >= 0)
+
+/************************************* Graph ********************************************/
+
+/** @name Graph
+
+We represent a graph as a set of vertices. Vertices contain their adjacency lists (more exactly,
+pointers to the first incoming or outgoing edge (or 0 if the vertex is isolated)). Edges are stored in
+another set. There is a singly-linked list of incoming/outgoing edges for each vertex.
+
+Each edge consists of:
+
+- Two pointers to the starting and ending vertices (vtx[0] and vtx[1] respectively).
+
+    A graph may be oriented or not. In the latter case, the edge from vertex i to vertex j and the
+edge from vertex j to vertex i are not distinguished during search operations.
+
+- Two pointers to next edges for the starting and ending vertices, where next[0] points to the
+next edge in the vtx[0] adjacency list and next[1] points to the next edge in the vtx[1]
+adjacency list.
+
+@see CvGraphEdge, CvGraphVtx, CvGraphVtx2D, CvGraph
+@{
+*/
+#define CV_GRAPH_EDGE_FIELDS()      \
+    int flags;                      \
+    float weight;                   \
+    struct CvGraphEdge* next[2];    \
+    struct CvGraphVtx* vtx[2];
+
+
+#define CV_GRAPH_VERTEX_FIELDS()    \
+    int flags;                      \
+    struct CvGraphEdge* first;
+
+
+typedef struct CvGraphEdge
+{
+    CV_GRAPH_EDGE_FIELDS()
+}
+CvGraphEdge;
+
+typedef struct CvGraphVtx
+{
+    CV_GRAPH_VERTEX_FIELDS()
+}
+CvGraphVtx;
+
+typedef struct CvGraphVtx2D
+{
+    CV_GRAPH_VERTEX_FIELDS()
+    CvPoint2D32f* ptr;
+}
+CvGraphVtx2D;
+
+/**
+   Graph is "derived" from the set (it is a set of vertices)
+   and includes another set (edges)
+*/
+#define  CV_GRAPH_FIELDS()   \
+    CV_SET_FIELDS()          \
+    CvSet* edges;
+
+typedef struct CvGraph
+{
+    CV_GRAPH_FIELDS()
+}
+CvGraph;
+
+#define CV_TYPE_NAME_GRAPH "opencv-graph"
+
+/** @} */
+
+/*********************************** Chain/Contour *************************************/
+
+typedef struct CvChain
+{
+    CV_SEQUENCE_FIELDS()
+    CvPoint  origin;
+}
+CvChain;
+
+#define CV_CONTOUR_FIELDS()  \
+    CV_SEQUENCE_FIELDS()     \
+    CvRect rect;             \
+    int color;               \
+    int reserved[3];
+
+typedef struct CvContour
+{
+    CV_CONTOUR_FIELDS()
+}
+CvContour;
+
+typedef CvContour CvPoint2DSeq;
+
+/****************************************************************************************\
+*                                    Sequence types                                      *
+\****************************************************************************************/
+
+#define CV_SEQ_MAGIC_VAL             0x42990000
+
+#define CV_IS_SEQ(seq) \
+    ((seq) != NULL && (((CvSeq*)(seq))->flags & CV_MAGIC_MASK) == CV_SEQ_MAGIC_VAL)
+
+#define CV_SET_MAGIC_VAL             0x42980000
+#define CV_IS_SET(set) \
+    ((set) != NULL && (((CvSeq*)(set))->flags & CV_MAGIC_MASK) == CV_SET_MAGIC_VAL)
+
+#define CV_SEQ_ELTYPE_BITS           12
+#define CV_SEQ_ELTYPE_MASK           ((1 << CV_SEQ_ELTYPE_BITS) - 1)
+
+#define CV_SEQ_ELTYPE_POINT          CV_32SC2  /**< (x,y) */
+#define CV_SEQ_ELTYPE_CODE           CV_8UC1   /**< freeman code: 0..7 */
+#define CV_SEQ_ELTYPE_GENERIC        0
+#define CV_SEQ_ELTYPE_PTR            CV_MAKE_TYPE(CV_8U, 8 /*sizeof(void*)*/)
+#define CV_SEQ_ELTYPE_PPOINT         CV_SEQ_ELTYPE_PTR  /**< &(x,y) */
+#define CV_SEQ_ELTYPE_INDEX          CV_32SC1  /**< #(x,y) */
+#define CV_SEQ_ELTYPE_GRAPH_EDGE     0  /**< &next_o, &next_d, &vtx_o, &vtx_d */
+#define CV_SEQ_ELTYPE_GRAPH_VERTEX   0  /**< first_edge, &(x,y) */
+#define CV_SEQ_ELTYPE_TRIAN_ATR      0  /**< vertex of the binary tree   */
+#define CV_SEQ_ELTYPE_CONNECTED_COMP 0  /**< connected component  */
+#define CV_SEQ_ELTYPE_POINT3D        CV_32FC3  /**< (x,y,z)  */
+
+#define CV_SEQ_KIND_BITS        2
+#define CV_SEQ_KIND_MASK        (((1 << CV_SEQ_KIND_BITS) - 1)<<CV_SEQ_ELTYPE_BITS)
+
+/** types of sequences */
+#define CV_SEQ_KIND_GENERIC     (0 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_CURVE       (1 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_BIN_TREE    (2 << CV_SEQ_ELTYPE_BITS)
+
+/** types of sparse sequences (sets) */
+#define CV_SEQ_KIND_GRAPH       (1 << CV_SEQ_ELTYPE_BITS)
+#define CV_SEQ_KIND_SUBDIV2D    (2 << CV_SEQ_ELTYPE_BITS)
+
+#define CV_SEQ_FLAG_SHIFT       (CV_SEQ_KIND_BITS + CV_SEQ_ELTYPE_BITS)
+
+/** flags for curves (bits at and above CV_SEQ_FLAG_SHIFT) */
+#define CV_SEQ_FLAG_CLOSED     (1 << CV_SEQ_FLAG_SHIFT)
+#define CV_SEQ_FLAG_SIMPLE     (0 << CV_SEQ_FLAG_SHIFT)  /**< evaluates to 0: sets no bit */
+#define CV_SEQ_FLAG_CONVEX     (0 << CV_SEQ_FLAG_SHIFT)  /**< evaluates to 0: sets no bit */
+#define CV_SEQ_FLAG_HOLE       (2 << CV_SEQ_FLAG_SHIFT)
+
+/** flags for graphs */
+#define CV_GRAPH_FLAG_ORIENTED (1 << CV_SEQ_FLAG_SHIFT)
+
+#define CV_GRAPH               CV_SEQ_KIND_GRAPH
+#define CV_ORIENTED_GRAPH      (CV_SEQ_KIND_GRAPH|CV_GRAPH_FLAG_ORIENTED)
+
+/** point sets */
+#define CV_SEQ_POINT_SET       (CV_SEQ_KIND_GENERIC| CV_SEQ_ELTYPE_POINT)
+#define CV_SEQ_POINT3D_SET     (CV_SEQ_KIND_GENERIC| CV_SEQ_ELTYPE_POINT3D)
+#define CV_SEQ_POLYLINE        (CV_SEQ_KIND_CURVE  | CV_SEQ_ELTYPE_POINT)
+#define CV_SEQ_POLYGON         (CV_SEQ_FLAG_CLOSED | CV_SEQ_POLYLINE )
+#define CV_SEQ_CONTOUR         CV_SEQ_POLYGON
+#define CV_SEQ_SIMPLE_POLYGON  (CV_SEQ_FLAG_SIMPLE | CV_SEQ_POLYGON  )
+
+/** chain-coded curves */
+#define CV_SEQ_CHAIN           (CV_SEQ_KIND_CURVE  | CV_SEQ_ELTYPE_CODE)
+#define CV_SEQ_CHAIN_CONTOUR   (CV_SEQ_FLAG_CLOSED | CV_SEQ_CHAIN)
+
+/** binary tree for the contour */
+#define CV_SEQ_POLYGON_TREE    (CV_SEQ_KIND_BIN_TREE  | CV_SEQ_ELTYPE_TRIAN_ATR)
+
+/** sequence of the connected components */
+#define CV_SEQ_CONNECTED_COMP  (CV_SEQ_KIND_GENERIC  | CV_SEQ_ELTYPE_CONNECTED_COMP)
+
+/** sequence of the integer numbers */
+#define CV_SEQ_INDEX           (CV_SEQ_KIND_GENERIC  | CV_SEQ_ELTYPE_INDEX)
+
+#define CV_SEQ_ELTYPE( seq )   ((seq)->flags & CV_SEQ_ELTYPE_MASK)
+#define CV_SEQ_KIND( seq )     ((seq)->flags & CV_SEQ_KIND_MASK )
+
+/** flag checking: each macro tests the flags field of the sequence header pointed to by seq */
+#define CV_IS_SEQ_INDEX( seq )      ((CV_SEQ_ELTYPE(seq) == CV_SEQ_ELTYPE_INDEX) && \
+                                     (CV_SEQ_KIND(seq) == CV_SEQ_KIND_GENERIC))
+
+#define CV_IS_SEQ_CURVE( seq )      (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE)
+#define CV_IS_SEQ_CLOSED( seq )     (((seq)->flags & CV_SEQ_FLAG_CLOSED) != 0)
+#define CV_IS_SEQ_CONVEX( seq )     0  /**< constant 0; seq is not inspected */
+#define CV_IS_SEQ_HOLE( seq )       (((seq)->flags & CV_SEQ_FLAG_HOLE) != 0)
+#define CV_IS_SEQ_SIMPLE( seq )     1  /**< constant 1; seq is not inspected */
+
+/** type checking macros */
+#define CV_IS_SEQ_POINT_SET( seq ) \
+    ((CV_SEQ_ELTYPE(seq) == CV_32SC2 || CV_SEQ_ELTYPE(seq) == CV_32FC2))
+
+#define CV_IS_SEQ_POINT_SUBSET( seq ) \
+    (CV_IS_SEQ_INDEX( seq ) || CV_SEQ_ELTYPE(seq) == CV_SEQ_ELTYPE_PPOINT)
+
+#define CV_IS_SEQ_POLYLINE( seq )   \
+    (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE && CV_IS_SEQ_POINT_SET(seq))
+
+#define CV_IS_SEQ_POLYGON( seq )   \
+    (CV_IS_SEQ_POLYLINE(seq) && CV_IS_SEQ_CLOSED(seq))
+
+#define CV_IS_SEQ_CHAIN( seq )   \
+    (CV_SEQ_KIND(seq) == CV_SEQ_KIND_CURVE && (seq)->elem_size == 1)
+
+#define CV_IS_SEQ_CONTOUR( seq )   \
+    (CV_IS_SEQ_CLOSED(seq) && (CV_IS_SEQ_POLYLINE(seq) || CV_IS_SEQ_CHAIN(seq)))
+
+#define CV_IS_SEQ_CHAIN_CONTOUR( seq ) \
+    (CV_IS_SEQ_CHAIN( seq ) && CV_IS_SEQ_CLOSED( seq ))
+
+#define CV_IS_SEQ_POLYGON_TREE( seq ) \
+    (CV_SEQ_ELTYPE (seq) ==  CV_SEQ_ELTYPE_TRIAN_ATR &&    \
+    CV_SEQ_KIND( seq ) ==  CV_SEQ_KIND_BIN_TREE )
+
+#define CV_IS_GRAPH( seq )    \
+    (CV_IS_SET(seq) && CV_SEQ_KIND((CvSet*)(seq)) == CV_SEQ_KIND_GRAPH)
+
+#define CV_IS_GRAPH_ORIENTED( seq )   \
+    (((seq)->flags & CV_GRAPH_FLAG_ORIENTED) != 0)
+
+#define CV_IS_SUBDIV2D( seq )  \
+    (CV_IS_SET(seq) && CV_SEQ_KIND((CvSet*)(seq)) == CV_SEQ_KIND_SUBDIV2D)
+
+/****************************************************************************************/
+/*                            Sequence writer & reader                                  */
+/****************************************************************************************/
+
+#define CV_SEQ_WRITER_FIELDS()                                     \
+    int          header_size;                                      \
+    CvSeq*       seq;        /**< the sequence written */            \
+    CvSeqBlock*  block;      /**< current block */                   \
+    schar*       ptr;        /**< pointer to free space */           \
+    schar*       block_min;  /**< pointer to the beginning of block*/\
+    schar*       block_max;  /**< pointer to the end of block */
+
+typedef struct CvSeqWriter
+{
+    CV_SEQ_WRITER_FIELDS()
+}
+CvSeqWriter;
+
+
+#define CV_SEQ_READER_FIELDS()                                      \
+    int          header_size;                                       \
+    CvSeq*       seq;        /**< sequence being read */              \
+    CvSeqBlock*  block;      /**< current block */                    \
+    schar*       ptr;        /**< pointer to the element to be read next */ \
+    schar*       block_min;  /**< pointer to the beginning of block */\
+    schar*       block_max;  /**< pointer to the end of block */      \
+    int          delta_index;/**< = seq->first->start_index   */      \
+    schar*       prev_elem;  /**< pointer to previous element */
+
+typedef struct CvSeqReader
+{
+    CV_SEQ_READER_FIELDS()
+}
+CvSeqReader;
+
+/****************************************************************************************/
+/*                                Operations on sequences                               */
+/****************************************************************************************/
+
+#define  CV_SEQ_ELEM( seq, elem_type, index )                    \
+/** Yields an (elem_type*) to element <index> of <seq>: taken from the first block when index < first->count, otherwise from cvGetSeqElem. The assert gives some guarantee that <seq> parameter is valid */  \
+(   assert(sizeof((seq)->first[0]) == sizeof(CvSeqBlock) &&      \
+    (seq)->elem_size == sizeof(elem_type)),                      \
+    (elem_type*)((seq)->first && (unsigned)(index) <             \
+    (unsigned)((seq)->first->count) ?                            \
+    (seq)->first->data + (index) * sizeof(elem_type) :           \
+    cvGetSeqElem( (CvSeq*)(seq), (index) )))
+#define CV_GET_SEQ_ELEM( elem_type, seq, index ) CV_SEQ_ELEM( (seq), elem_type, (index) )
+
+/** Add element to sequence: copies seq->elem_size bytes from elem_ptr to writer.ptr, calling cvCreateSeqBlock first when ptr has reached block_max */
+#define CV_WRITE_SEQ_ELEM_VAR( elem_ptr, writer )     \
+{                                                     \
+    if( (writer).ptr >= (writer).block_max )          \
+    {                                                 \
+        cvCreateSeqBlock( &(writer) );                \
+    }                                                 \
+    memcpy((writer).ptr, elem_ptr, (writer).seq->elem_size);\
+    (writer).ptr += (writer).seq->elem_size;          \
+}
+
+#define CV_WRITE_SEQ_ELEM( elem, writer ) /* appends a copy of elem; sizeof(elem) must equal seq->elem_size */ \
+{                                                     \
+    assert( (writer).seq->elem_size == sizeof(elem)); \
+    if( (writer).ptr >= (writer).block_max )          \
+    {                                                 \
+        cvCreateSeqBlock( &(writer) );                \
+    }                                                 \
+    assert( (writer).ptr <= (writer).block_max - sizeof(elem));\
+    memcpy((writer).ptr, &(elem), sizeof(elem));      \
+    (writer).ptr += sizeof(elem);                     \
+}
+
+
+/** Move reader position forward by elem_size bytes; calls cvChangeSeqBlock( &reader, 1 ) once ptr reaches block_max: */
+#define CV_NEXT_SEQ_ELEM( elem_size, reader )                 \
+{                                                             \
+    if( ((reader).ptr += (elem_size)) >= (reader).block_max ) \
+    {                                                         \
+        cvChangeSeqBlock( &(reader), 1 );                     \
+    }                                                         \
+}
+
+
+/** Move reader position backward by elem_size bytes; calls cvChangeSeqBlock( &reader, -1 ) once ptr drops below block_min: */
+#define CV_PREV_SEQ_ELEM( elem_size, reader )                \
+{                                                            \
+    if( ((reader).ptr -= (elem_size)) < (reader).block_min ) \
+    {                                                        \
+        cvChangeSeqBlock( &(reader), -1 );                   \
+    }                                                        \
+}
+
+/** Read element and move read position forward: copies sizeof(elem) bytes from reader.ptr into elem (asserted equal to seq->elem_size), then applies CV_NEXT_SEQ_ELEM */
+#define CV_READ_SEQ_ELEM( elem, reader )                       \
+{                                                              \
+    assert( (reader).seq->elem_size == sizeof(elem));          \
+    memcpy( &(elem), (reader).ptr, sizeof((elem)));            \
+    CV_NEXT_SEQ_ELEM( sizeof(elem), reader )                   \
+}
+
+/** Read element and move read position backward: copies sizeof(elem) bytes from reader.ptr into elem (asserted equal to seq->elem_size), then applies CV_PREV_SEQ_ELEM */
+#define CV_REV_READ_SEQ_ELEM( elem, reader )                     \
+{                                                                \
+    assert( (reader).seq->elem_size == sizeof(elem));            \
+    memcpy(&(elem), (reader).ptr, sizeof((elem)));               \
+    CV_PREV_SEQ_ELEM( sizeof(elem), reader )                     \
+}
+
+
+#define CV_READ_CHAIN_POINT( _pt, reader ) /* _pt = reader.pt; then, if reader.ptr is non-null, reads the next code (asserted to be 0..7) and adds deltas[code] to reader.pt */ \
+{                                                                       \
+    (_pt) = (reader).pt;                                                \
+    if( (reader).ptr )                                                  \
+    {                                                                   \
+        CV_READ_SEQ_ELEM( (reader).code, (reader));                     \
+        assert( ((reader).code & ~7) == 0 );                            \
+        (reader).pt.x += (reader).deltas[(int)(reader).code][0];        \
+        (reader).pt.y += (reader).deltas[(int)(reader).code][1];        \
+    }                                                                   \
+}
+
+#define CV_CURRENT_POINT( reader )  (*((CvPoint*)((reader).ptr)))
+#define CV_PREV_POINT( reader )     (*((CvPoint*)((reader).prev_elem)))
+
+#define CV_READ_EDGE( pt1, pt2, reader ) /* pt1 = previous point, pt2 = current point; then prev_elem = ptr and the reader advances by one CvPoint */ \
+{                                                      \
+    assert( sizeof(pt1) == sizeof(CvPoint) &&          \
+            sizeof(pt2) == sizeof(CvPoint) &&          \
+            (reader).seq->elem_size == sizeof(CvPoint)); \
+    (pt1) = CV_PREV_POINT( reader );                   \
+    (pt2) = CV_CURRENT_POINT( reader );                \
+    (reader).prev_elem = (reader).ptr;                 \
+    CV_NEXT_SEQ_ELEM( sizeof(CvPoint), (reader));      \
+}
+
+/************ Graph macros ************/
+
+/** Return next graph edge for given vertex: next[1] when vertex is edge->vtx[1], otherwise next[0]; asserts that vertex is one of the edge's two endpoints */
+#define  CV_NEXT_GRAPH_EDGE( edge, vertex )                              \
+     (assert((edge)->vtx[0] == (vertex) || (edge)->vtx[1] == (vertex)),  \
+      (edge)->next[(edge)->vtx[1] == (vertex)])
+
+
+
+/****************************************************************************************\
+*             Data structures for persistence (a.k.a serialization) functionality        *
+\****************************************************************************************/
+
+#if 0
+
+/** "black box" file storage */
+typedef struct CvFileStorage CvFileStorage;
+
+/** Storage flags: */
+#define CV_STORAGE_READ          0
+#define CV_STORAGE_WRITE         1
+#define CV_STORAGE_WRITE_TEXT    CV_STORAGE_WRITE
+#define CV_STORAGE_WRITE_BINARY  CV_STORAGE_WRITE
+#define CV_STORAGE_APPEND        2
+#define CV_STORAGE_MEMORY        4
+#define CV_STORAGE_FORMAT_MASK   (7<<3)
+#define CV_STORAGE_FORMAT_AUTO   0
+#define CV_STORAGE_FORMAT_XML    8
+#define CV_STORAGE_FORMAT_YAML  16
+#define CV_STORAGE_FORMAT_JSON  24
+#define CV_STORAGE_BASE64       64
+#define CV_STORAGE_WRITE_BASE64  (CV_STORAGE_BASE64 | CV_STORAGE_WRITE)
+
+/** @brief List of attributes. :
+
+In the current implementation, attributes are used to pass extra parameters when writing user
+objects (see cvWrite). XML attributes inside tags are not supported, aside from the object type
+specification (type_id attribute).
+@see cvAttrList, cvAttrValue
+ */
+typedef struct CvAttrList
+{
+    const char** attr;         /**< NULL-terminated array of (attribute_name,attribute_value) pairs. */
+    struct CvAttrList* next;   /**< Pointer to next chunk of the attributes list.                    */
+}
+CvAttrList;
+
+/** Returns, by value, a CvAttrList whose attr and next members are set to the given arguments (both default to NULL) */
+CV_INLINE CvAttrList cvAttrList( const char** attr CV_DEFAULT(NULL),
+                                 CvAttrList* next CV_DEFAULT(NULL) )
+{
+    CvAttrList l;
+    l.attr = attr;
+    l.next = next;
+
+    return l;
+}
+
+struct CvTypeInfo;
+
+#define CV_NODE_NONE        0
+#define CV_NODE_INT         1
+#define CV_NODE_INTEGER     CV_NODE_INT
+#define CV_NODE_REAL        2
+#define CV_NODE_FLOAT       CV_NODE_REAL
+#define CV_NODE_STR         3
+#define CV_NODE_STRING      CV_NODE_STR
+#define CV_NODE_REF         4 /**< not used */
+#define CV_NODE_SEQ         5
+#define CV_NODE_MAP         6
+#define CV_NODE_TYPE_MASK   7
+
+#define CV_NODE_TYPE(flags)  ((flags) & CV_NODE_TYPE_MASK)
+
+/** file node flags */
+#define CV_NODE_FLOW        8 /**<Used only for writing structures in YAML format. */
+#define CV_NODE_USER        16
+#define CV_NODE_EMPTY       32
+#define CV_NODE_NAMED       64
+
+#define CV_NODE_IS_INT(flags)        (CV_NODE_TYPE(flags) == CV_NODE_INT)
+#define CV_NODE_IS_REAL(flags)       (CV_NODE_TYPE(flags) == CV_NODE_REAL)
+#define CV_NODE_IS_STRING(flags)     (CV_NODE_TYPE(flags) == CV_NODE_STRING)
+#define CV_NODE_IS_SEQ(flags)        (CV_NODE_TYPE(flags) == CV_NODE_SEQ)
+#define CV_NODE_IS_MAP(flags)        (CV_NODE_TYPE(flags) == CV_NODE_MAP)
+#define CV_NODE_IS_COLLECTION(flags) (CV_NODE_TYPE(flags) >= CV_NODE_SEQ)
+#define CV_NODE_IS_FLOW(flags)       (((flags) & CV_NODE_FLOW) != 0)
+#define CV_NODE_IS_EMPTY(flags)      (((flags) & CV_NODE_EMPTY) != 0)
+#define CV_NODE_IS_USER(flags)       (((flags) & CV_NODE_USER) != 0)
+#define CV_NODE_HAS_NAME(flags)      (((flags) & CV_NODE_NAMED) != 0)
+
+#define CV_NODE_SEQ_SIMPLE 256
+#define CV_NODE_SEQ_IS_SIMPLE(seq) (((seq)->flags & CV_NODE_SEQ_SIMPLE) != 0)
+
+typedef struct CvString
+{
+    int len;
+    char* ptr;
+}
+CvString;
+
+/** All the keys (names) of elements in the read file storage
+   are stored in the hash to speed up the lookup operations: */
+typedef struct CvStringHashNode
+{
+    unsigned hashval;
+    CvString str;
+    struct CvStringHashNode* next;
+}
+CvStringHashNode;
+
+typedef struct CvGenericHash CvFileNodeHash;
+
+/** Basic element of the file storage - scalar or collection: */
+typedef struct CvFileNode
+{
+    int tag;
+    struct CvTypeInfo* info; /**< type information
+            (only for user-defined object, for others it is 0) */
+    union
+    {
+        double f; /**< scalar floating-point number */
+        int i;    /**< scalar integer number */
+        CvString str; /**< text string */
+        CvSeq* seq; /**< sequence (ordered collection of file nodes) */
+        CvFileNodeHash* map; /**< map (collection of named file nodes) */
+    } data;
+}
+CvFileNode;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef int (CV_CDECL *CvIsInstanceFunc)( const void* struct_ptr );
+typedef void (CV_CDECL *CvReleaseFunc)( void** struct_dblptr );
+typedef void* (CV_CDECL *CvReadFunc)( CvFileStorage* storage, CvFileNode* node );
+typedef void (CV_CDECL *CvWriteFunc)( CvFileStorage* storage, const char* name,
+                                      const void* struct_ptr, CvAttrList attributes );
+typedef void* (CV_CDECL *CvCloneFunc)( const void* struct_ptr );
+#ifdef __cplusplus
+}
+#endif
+
+/** @brief Type information
+
+The structure contains information about one of the standard or user-defined types. Instances of the
+type may or may not contain a pointer to the corresponding CvTypeInfo structure. In any case, there
+is a way to find the type info structure for a given object using the cvTypeOf function.
+Alternatively, type info can be found by type name using cvFindType, which is used when an object
+is read from file storage. The user can register a new type with cvRegisterType that adds the type
+information structure into the beginning of the type list. Thus, it is possible to create
+specialized types from generic standard types and override the basic methods.
+ */
+typedef struct CvTypeInfo
+{
+    int flags; /**< not used */
+    int header_size; /**< sizeof(CvTypeInfo) */
+    struct CvTypeInfo* prev; /**< previous registered type in the list */
+    struct CvTypeInfo* next; /**< next registered type in the list */
+    const char* type_name; /**< type name, written to file storage */
+    CvIsInstanceFunc is_instance; /**< checks if the passed object belongs to the type */
+    CvReleaseFunc release; /**< releases object (memory etc.) */
+    CvReadFunc read; /**< reads object from file storage */
+    CvWriteFunc write; /**< writes object to file storage */
+    CvCloneFunc clone; /**< creates a copy of the object */
+}
+CvTypeInfo;
+#endif
+
+/** @} */
+
+#endif /*OPENCV_CORE_TYPES_H*/
+
+/* End of file. */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utility.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utility.hpp
new file mode 100644
index 0000000..108c0d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utility.hpp
@@ -0,0 +1,1229 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_UTILITY_H
+#define OPENCV_CORE_UTILITY_H
+
+#ifndef __cplusplus
+#  error utility.hpp header must be compiled as C++
+#endif
+
+#if defined(check)
+#  warning Detected Apple 'check' macro definition, it can cause build conflicts. Please, include this header before any Apple headers.
+#endif
+
+#include "opencv2/core.hpp"
+#include <ostream>
+
+#include <functional>
+
+#if !defined(_M_CEE)
+#include <mutex>  // std::mutex, std::lock_guard
+#endif
+
+namespace cv
+{
+
+//! @addtogroup core_utils
+//! @{
+
+/** @brief  Automatically Allocated Buffer Class
+
+ The class is used for temporary buffers in functions and methods.
+ If a temporary buffer is usually small (a few K's of memory),
+ but its size depends on the parameters, it makes sense to create a small
+ fixed-size array on stack and use it if it's large enough. If the required buffer size
+ is larger than the fixed size, another buffer of sufficient size is allocated dynamically
+ and released after the processing. Therefore, in typical cases, when the buffer size is small,
+ there is no overhead associated with malloc()/free().
+ At the same time, there is no limit on the size of processed data.
+
+ This is what AutoBuffer does. The template takes 2 parameters - type of the buffer elements and
+ the number of stack-allocated elements. Here is how the class is used:
+
+ \code
+ void my_func(const cv::Mat& m)
+ {
+    cv::AutoBuffer<float> buf(1000); // create automatic buffer containing 1000 floats
+
+    buf.allocate(m.rows); // if m.rows <= 1000, the pre-allocated buffer is used,
+                          // otherwise the buffer of "m.rows" floats will be allocated
+                          // dynamically and deallocated in cv::AutoBuffer destructor
+    ...
+ }
+ \endcode
+*/
+#ifdef OPENCV_ENABLE_MEMORY_SANITIZER
+template<typename _Tp, size_t fixed_size = 0> class AutoBuffer
+#else
+template<typename _Tp, size_t fixed_size = 1024/sizeof(_Tp)+8> class AutoBuffer
+#endif
+{
+public:
+    typedef _Tp value_type;
+
+    //! the default constructor
+    AutoBuffer();
+    //! constructor taking the real buffer size
+    explicit AutoBuffer(size_t _size);
+
+    //! the copy constructor
+    AutoBuffer(const AutoBuffer<_Tp, fixed_size>& buf);
+    //! the assignment operator
+    AutoBuffer<_Tp, fixed_size>& operator = (const AutoBuffer<_Tp, fixed_size>& buf);
+
+    //! destructor. calls deallocate()
+    ~AutoBuffer();
+
+    //! allocates the new buffer of size _size. if the _size is small enough, stack-allocated buffer is used
+    void allocate(size_t _size);
+    //! deallocates the buffer if it was dynamically allocated
+    void deallocate();
+    //! resizes the buffer and preserves the content
+    void resize(size_t _size);
+    //! returns the current buffer size
+    size_t size() const;
+    //! returns pointer to the real buffer, stack-allocated or heap-allocated
+    inline _Tp* data() { return ptr; }
+    //! returns read-only pointer to the real buffer, stack-allocated or heap-allocated
+    inline const _Tp* data() const { return ptr; }
+
+#if !defined(OPENCV_DISABLE_DEPRECATED_COMPATIBILITY) // deprecated implicit conversions; use .data() calls instead
+    //! returns pointer to the real buffer, stack-allocated or heap-allocated
+    operator _Tp* () { return ptr; }
+    //! returns read-only pointer to the real buffer, stack-allocated or heap-allocated
+    operator const _Tp* () const { return ptr; }
+#else
+    //! returns a reference to the element at specified location. No bounds checking is performed in Release builds.
+    inline _Tp& operator[] (size_t i) { CV_DbgCheckLT(i, sz, "out of range"); return ptr[i]; }
+    //! returns a reference to the element at specified location. No bounds checking is performed in Release builds.
+    inline const _Tp& operator[] (size_t i) const { CV_DbgCheckLT(i, sz, "out of range"); return ptr[i]; }
+#endif
+
+protected:
+    //! pointer to the real buffer, can point to buf if the buffer is small enough
+    _Tp* ptr;
+    //! size of the real buffer
+    size_t sz;
+    //! pre-allocated buffer. At least 1 element to conform to C++ standard requirements (no zero-length arrays)
+    _Tp buf[(fixed_size > 0) ? fixed_size : 1];
+};
+
+/**  @brief Sets/resets the break-on-error mode.
+
+When the break-on-error mode is set, the default error handler issues a hardware exception, which
+can make debugging more convenient.
+
+\return the previous state
+ */
+CV_EXPORTS bool setBreakOnError(bool flag);
+
+extern "C" typedef int (*ErrorCallback)( int status, const char* func_name,
+                                       const char* err_msg, const char* file_name,
+                                       int line, void* userdata );
+
+
+/** @brief Sets the new error handler and the optional user data.
+
+  The function sets the new error handler, called from cv::error().
+
+  \param errCallback the new error handler. If NULL, the default error handler is used.
+  \param userdata the optional user data pointer, passed to the callback.
+  \param prevUserdata the optional output parameter where the previous user data pointer is stored
+
+  \return the previous error handler
+*/
+CV_EXPORTS ErrorCallback redirectError( ErrorCallback errCallback, void* userdata=0, void** prevUserdata=0);
+
+CV_EXPORTS String tempfile( const char* suffix = 0);
+CV_EXPORTS void glob(String pattern, std::vector<String>& result, bool recursive = false);
+
+/** @brief OpenCV will try to set the number of threads for the next parallel region.
+
+If threads == 0, OpenCV will disable threading optimizations and run all its functions
+sequentially. Passing threads \< 0 will reset threads number to system default. This function must
+be called outside of parallel region.
+
+OpenCV will try to run its functions with the specified number of threads, but the behaviour differs
+between frameworks:
+-   `TBB` - User-defined parallel constructions will run with the same threads number, if
+    another is not specified. If later on user creates his own scheduler, OpenCV will use it.
+-   `OpenMP` - No special defined behaviour.
+-   `Concurrency` - If threads == 1, OpenCV will disable threading optimizations and run its
+    functions sequentially.
+-   `GCD` - Supports only values \<= 0.
+-   `C=` - No special defined behaviour.
+@param nthreads Number of threads used by OpenCV.
+@sa getNumThreads, getThreadNum
+ */
+CV_EXPORTS_W void setNumThreads(int nthreads);
+
+/** @brief Returns the number of threads used by OpenCV for parallel regions.
+
+Always returns 1 if OpenCV is built without threading support.
+
+The exact meaning of return value depends on the threading framework used by OpenCV library:
+- `TBB` - The number of threads, that OpenCV will try to use for parallel regions. If there is
+  any tbb::thread_scheduler_init in user code conflicting with OpenCV, then function returns
+  default number of threads used by TBB library.
+- `OpenMP` - An upper bound on the number of threads that could be used to form a new team.
+- `Concurrency` - The number of threads, that OpenCV will try to use for parallel regions.
+- `GCD` - Unsupported; returns the GCD thread pool limit (512) for compatibility.
+- `C=` - The number of threads, that OpenCV will try to use for parallel regions, if before
+  called setNumThreads with threads \> 0, otherwise returns the number of logical CPUs,
+  available for the process.
+@sa setNumThreads, getThreadNum
+ */
+CV_EXPORTS_W int getNumThreads();
+
+/** @brief Returns the index of the currently executed thread within the current parallel region. Always
+returns 0 if called outside of parallel region.
+
+@deprecated Current implementation doesn't correspond to this documentation.
+
+The exact meaning of the return value depends on the threading framework used by OpenCV library:
+- `TBB` - Unsupported with current 4.1 TBB release. Maybe will be supported in future.
+- `OpenMP` - The thread number, within the current team, of the calling thread.
+- `Concurrency` - An ID for the virtual processor that the current context is executing on (0
+  for master thread and unique number for others, but not necessary 1,2,3,...).
+- `GCD` - System calling thread's ID. Never returns 0 inside parallel region.
+- `C=` - The index of the current parallel task.
+@sa setNumThreads, getNumThreads
+ */
+CV_EXPORTS_W int getThreadNum();
+
+/** @brief Returns full configuration time cmake output.
+
+Returned value is raw cmake output including version control system revision, compiler version,
+compiler flags, enabled modules and third party libraries, etc. Output format depends on target
+architecture.
+ */
+CV_EXPORTS_W const String& getBuildInformation();
+
+/** @brief Returns library version string
+
+For example "3.4.1-dev".
+
+@sa getVersionMajor, getVersionMinor, getVersionRevision
+*/
+CV_EXPORTS_W String getVersionString();
+
+/** @brief Returns major library version */
+CV_EXPORTS_W int getVersionMajor();
+
+/** @brief Returns minor library version */
+CV_EXPORTS_W int getVersionMinor();
+
+/** @brief Returns revision field of the library version */
+CV_EXPORTS_W int getVersionRevision();
+
+/** @brief Returns the number of ticks.
+
+The function returns the number of ticks since a certain event (for example, when the machine was
+turned on). It can be used to initialize RNG or to measure a function execution time by reading the
+tick count before and after the function call.
+@sa getTickFrequency, TickMeter
+ */
+CV_EXPORTS_W int64 getTickCount();
+
+/** @brief Returns the number of ticks per second.
+
+The function returns the number of ticks per second. That is, the following code computes the
+execution time in seconds:
+@code
+    double t = (double)getTickCount();
+    // do something ...
+    t = ((double)getTickCount() - t)/getTickFrequency();
+@endcode
+@sa getTickCount, TickMeter
+ */
+CV_EXPORTS_W double getTickFrequency();
+
+/** @brief A class to measure elapsed time.
+
+The class measures elapsed time by accumulating tick counts and dividing them by the number of ticks per second. That is, the following code computes the
+execution time in seconds:
+@snippet snippets/core_various.cpp TickMeter_total
+
+It is also possible to compute the average time over multiple runs:
+@snippet snippets/core_various.cpp TickMeter_average
+
+@sa getTickCount, getTickFrequency
+*/
+class CV_EXPORTS_W TickMeter
+{
+public:
+    //! creates a meter with every counter cleared.
+    CV_WRAP TickMeter()
+    {
+        reset();
+    }
+
+    //! remembers the current tick count as the beginning of a measured interval.
+    CV_WRAP void start()
+    {
+        startTime = cv::getTickCount();
+    }
+
+    //! closes the interval opened by start() and adds it to the running total.
+    //! Does nothing when no interval is open (startTime is 0).
+    CV_WRAP void stop()
+    {
+        const int64 now = cv::getTickCount();
+        if (startTime != 0)
+        {
+            sumTime += (now - startTime);
+            ++counter;
+            startTime = 0;
+        }
+    }
+
+    //! returns the accumulated length of all completed intervals, in ticks.
+    CV_WRAP int64 getTimeTicks() const
+    {
+        return sumTime;
+    }
+
+    //! returns the accumulated time in microseconds.
+    CV_WRAP double getTimeMicro() const
+    {
+        const double millis = getTimeMilli();
+        return millis*1e3;
+    }
+
+    //! returns the accumulated time in milliseconds.
+    CV_WRAP double getTimeMilli() const
+    {
+        const double seconds = getTimeSec();
+        return seconds*1e3;
+    }
+
+    //! returns the accumulated time in seconds.
+    CV_WRAP double getTimeSec()   const
+    {
+        const double ticks = (double)getTimeTicks();
+        return ticks / getTickFrequency();
+    }
+
+    //! returns how many start()/stop() intervals have been completed.
+    CV_WRAP int64 getCounter() const
+    {
+        return counter;
+    }
+
+    //! returns completed intervals per second of accumulated time (0 when almost no time was measured).
+    CV_WRAP double getFPS() const
+    {
+        const double elapsed = getTimeSec();
+        return (elapsed < DBL_EPSILON) ? 0. : counter / elapsed;
+    }
+
+    //! returns the mean interval length in seconds (0 when nothing was measured).
+    CV_WRAP double getAvgTimeSec() const
+    {
+        return (counter > 0) ? getTimeSec() / counter : 0.;
+    }
+
+    //! returns the mean interval length in milliseconds.
+    CV_WRAP double getAvgTimeMilli() const
+    {
+        const double avgSeconds = getAvgTimeSec();
+        return avgSeconds * 1e3;
+    }
+
+    //! clears the interval count, the accumulated ticks and any open interval.
+    CV_WRAP void reset()
+    {
+        counter = 0;
+        sumTime = 0;
+        startTime = 0;
+    }
+
+private:
+    int64 counter;    // number of completed start()/stop() intervals
+    int64 sumTime;    // ticks accumulated over completed intervals
+    int64 startTime;  // tick count at the last start(); 0 means no interval is open
+};
+
+/** @brief output operator
+@code
+TickMeter tm;
+tm.start();
+// do something ...
+tm.stop();
+std::cout << tm;
+@endcode
+*/
+
+static inline
+std::ostream& operator << (std::ostream& out, const TickMeter& tm)
+{
+    // Writes the accumulated time in seconds, immediately followed by the "sec" suffix.
+    const double seconds = tm.getTimeSec();
+    out << seconds;
+    out << "sec";
+    return out;
+}
+
+/** @brief Returns the number of CPU ticks.
+
+The function returns the current number of CPU ticks on some architectures (such as x86, x64,
+PowerPC). On other platforms the function is equivalent to getTickCount. It can also be used for
+very accurate time measurements, as well as for RNG initialization. Note that in case of multi-CPU
+systems a thread, from which getCPUTickCount is called, can be suspended and resumed at another CPU
+with its own counter. So, theoretically (and practically) the subsequent calls to the function do
+not necessarily return monotonically increasing values. Also, since a modern CPU varies the CPU
+frequency depending on the load, the number of CPU clocks spent in some code cannot be directly
+converted to time units. Therefore, getTickCount is generally a preferable solution for measuring
+execution time.
+ */
+CV_EXPORTS_W int64 getCPUTickCount();
+
+/** @brief Returns true if the specified feature is supported by the host hardware.
+
+The function returns true if the host hardware supports the specified feature. When user calls
+setUseOptimized(false), the subsequent calls to checkHardwareSupport() will return false until
+setUseOptimized(true) is called. This way user can dynamically switch on and off the optimized code
+in OpenCV.
+@param feature The feature of interest, one of cv::CpuFeatures
+ */
+CV_EXPORTS_W bool checkHardwareSupport(int feature);
+
+/** @brief Returns feature name by ID
+
+Returns empty string if feature is not defined
+*/
+CV_EXPORTS_W String getHardwareFeatureName(int feature);
+
+/** @brief Returns list of CPU features enabled during compilation.
+
+Returned value is a string containing space separated list of CPU features with following markers:
+
+- no markers - baseline features
+- prefix `*` - features enabled in dispatcher
+- suffix `?` - features enabled but not available in HW
+
+Example: `SSE SSE2 SSE3 *SSE4.1 *SSE4.2 *FP16 *AVX *AVX2 *AVX512-SKX?`
+*/
+CV_EXPORTS_W std::string getCPUFeaturesLine();
+
+/** @brief Returns the number of logical CPUs available for the process.
+ */
+CV_EXPORTS_W int getNumberOfCPUs();
+
+
+/** @brief Aligns a pointer to the specified number of bytes.
+
+The function returns the aligned pointer of the same type as the input pointer:
+\f[\texttt{(_Tp*)(((size_t)ptr + n-1) & -n)}\f]
+@param ptr Pointer to align.
+@param n Alignment size that must be a power of two.
+ */
+template<typename _Tp> static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp))
+{
+    CV_DbgAssert((n & (n - 1)) == 0); // n is a power of 2
+    // Step past the next boundary, then clear the low bits with the mask -n.
+    const size_t bumped = (size_t)ptr + n-1;
+    return (_Tp*)(bumped & -n);
+}
+
+/** @brief Aligns a buffer size to the specified number of bytes.
+
+The function returns the minimum number that is greater than or equal to sz and is divisible by n :
+\f[\texttt{(sz + n-1) & -n}\f]
+@param sz Buffer size to align.
+@param n Alignment size that must be a power of two.
+ */
+static inline size_t alignSize(size_t sz, int n)
+{
+    CV_DbgAssert((n & (n - 1)) == 0); // n is a power of 2
+    // Step past the next boundary, then clear the low bits with the mask -n.
+    const size_t bumped = sz + n-1;
+    return bumped & -n;
+}
+
+/** @brief Integer division with the result rounded up.
+
+Use this function instead of `ceil((float)a / b)` expressions.
+
+@sa alignSize
+*/
+static inline int divUp(int a, unsigned int b)
+{
+    CV_DbgAssert(a >= 0);
+    // a is converted to unsigned by the addition, so the whole computation is unsigned.
+    const unsigned int padded = a + b - 1;
+    return padded / b;
+}
+/** @overload */
+static inline size_t divUp(size_t a, unsigned int b)
+{
+    // Adding b - 1 before dividing turns the truncating division into a ceiling.
+    const size_t padded = a + b - 1;
+    return padded / b;
+}
+
+/** @brief Round first value up to the nearest multiple of second value.
+
+Use this function instead of `ceil((float)a / b) * b` expressions.
+
+@sa divUp
+*/
+static inline int roundUp(int a, unsigned int b)
+{
+    CV_DbgAssert(a >= 0);
+    // Overshoot by b - 1, then drop the remainder to land on a multiple of b.
+    const unsigned int padded = a + b - 1;
+    return padded - padded % b;
+}
+/** @overload */
+static inline size_t roundUp(size_t a, unsigned int b)
+{
+    // Overshoot by b - 1, then drop the remainder to land on a multiple of b.
+    const size_t padded = a + b - 1;
+    return padded - padded % b;
+}
+
+/** @brief Alignment check of passed values
+
+Usage: `isAligned<sizeof(int)>(...)`
+
+@note Alignment(N) must be a power of 2 (2**k, 2^k)
+*/
+template<int N, typename T> static inline
+bool isAligned(const T& data)
+{
+    CV_StaticAssert((N & (N - 1)) == 0, "");  // power of 2
+    // For a power-of-two N, the value is aligned exactly when its low bits under N - 1 are all zero.
+    const size_t lowBits = ((size_t)data) & (N - 1);
+    return lowBits == 0;
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1)
+{
+    // Check the pointer's address through the integer overload.
+    const size_t address = (size_t)p1;
+    return isAligned<N>(address);
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2)
+{
+    // OR-ing the addresses keeps every low bit that is set in either pointer.
+    const size_t combined = ((size_t)p1)|((size_t)p2);
+    return isAligned<N>(combined);
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3)
+{
+    // OR-ing the addresses keeps every low bit that is set in any of the pointers.
+    const size_t combined = ((size_t)p1)|((size_t)p2)|((size_t)p3);
+    return isAligned<N>(combined);
+}
+/** @overload */
+template<int N> static inline
+bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4)
+{
+    // OR-ing the addresses keeps every low bit that is set in any of the pointers.
+    const size_t combined = ((size_t)p1)|((size_t)p2)|((size_t)p3)|((size_t)p4);
+    return isAligned<N>(combined);
+}
+
+/** @brief Enables or disables the optimized code.
+
+The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2,
+and other instructions on the platforms that support it). It sets a global flag that is further
+checked by OpenCV functions. Since the flag is not checked in the inner OpenCV loops, it is only
+safe to call the function on the very top level in your application where you can be sure that no
+other OpenCV function is currently executed.
+
+By default, the optimized code is enabled unless you disable it in CMake. The current status can be
+retrieved using useOptimized.
+@param onoff The boolean flag specifying whether the optimized code should be used (onoff=true)
+or not (onoff=false).
+ */
+CV_EXPORTS_W void setUseOptimized(bool onoff);
+
+/** @brief Returns the status of optimized code usage.
+
+The function returns true if the optimized code is enabled. Otherwise, it returns false.
+ */
+CV_EXPORTS_W bool useOptimized();
+
+static inline size_t getElemSize(int type)
+{
+    // Thin wrapper: evaluates CV_ELEM_SIZE(type) and converts the result to size_t.
+    return static_cast<size_t>(CV_ELEM_SIZE(type));
+}
+
+/////////////////////////////// Parallel Primitives //////////////////////////////////
+
+/** @brief Base class for parallel data processors
+
+@ingroup core_parallel
+*/
+class CV_EXPORTS ParallelLoopBody
+{
+public:
+    //! Virtual destructor (defined out of line) so subclasses can be destroyed through a base pointer.
+    virtual ~ParallelLoopBody();
+    //! Invoked with the Range to process; pure virtual, so every subclass must implement it.
+    virtual void operator() (const Range& range) const = 0;
+};
+
+/** @brief Parallel data processor
+
+@ingroup core_parallel
+*/
+CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body, double nstripes=-1.);
+
+//! @ingroup core_parallel
+class ParallelLoopBodyLambdaWrapper : public ParallelLoopBody
+{
+    // Callable that operator() forwards every range to.
+    std::function<void(const Range&)> m_functor;
+
+public:
+    //! Stores a copy of @p functor for later invocation.
+    ParallelLoopBodyLambdaWrapper(std::function<void(const Range&)> functor) : m_functor(functor) {}
+
+    //! Forwards @p range to the wrapped functor.
+    virtual void operator() (const cv::Range& range) const CV_OVERRIDE
+    {
+        m_functor(range);
+    }
+};
+
+//! @ingroup core_parallel
+static inline
+void parallel_for_(const Range& range, std::function<void(const Range&)> functor, double nstripes=-1.)
+{
+    // Adapt the functor to the ParallelLoopBody interface and delegate to the body-based overload.
+    const ParallelLoopBodyLambdaWrapper body(functor);
+    parallel_for_(range, body, nstripes);
+}
+
+
+/////////////////////////////// forEach method of cv::Mat ////////////////////////////
+// Applies `operation` to every element of the matrix by splitting the matrix into
+// lines along its last dimension and handing ranges of those lines to parallel_for_.
+template<typename _Tp, typename Functor> inline
+void Mat::forEach_impl(const Functor& operation) {
+    // Never executed: this call exists only so the compiler checks the functor's signature.
+    if (false) {
+        operation(*reinterpret_cast<_Tp*>(0), reinterpret_cast<int*>(0));
+        // If your compiler fails in this line.
+        // Please check that your functor signature is
+        //     (_Tp&, const int*)   <- multi-dimensional
+        //  or (_Tp&, void*)        <- in case you don't need current idx.
+    }
+
+    CV_Assert(!empty());
+    // The number of lines (total elements / length of the last dimension) must fit in an int.
+    CV_Assert(this->total() / this->size[this->dims - 1] <= INT_MAX);
+    const int LINES = static_cast<int>(this->total() / this->size[this->dims - 1]);
+
+    // Loop body handed to parallel_for_: each invocation processes the lines in [range.start, range.end).
+    class PixelOperationWrapper :public ParallelLoopBody
+    {
+    public:
+        PixelOperationWrapper(Mat_<_Tp>* const frame, const Functor& _operation)
+            : mat(frame), op(_operation) {}
+        virtual ~PixelOperationWrapper(){}
+        // ! Overloaded virtual operator
+        // convert range call to row call.
+        virtual void operator()(const Range &range) const CV_OVERRIDE
+        {
+            const int DIMS = mat->dims;
+            const int COLS = mat->size[DIMS - 1];
+            if (DIMS <= 2) {
+                // Up to 2 dimensions: the line number is the row index itself.
+                for (int row = range.start; row < range.end; ++row) {
+                    this->rowCall2(row, COLS);
+                }
+            } else {
+                std::vector<int> idx(DIMS); /// idx is modified in this->rowCall
+                // Start one line before the range; the loop increments before each use.
+                idx[DIMS - 2] = range.start - 1;
+
+                for (int line_num = range.start; line_num < range.end; ++line_num) {
+                    idx[DIMS - 2]++;
+                    // Propagate overflow of each index into the next more significant dimension.
+                    for (int i = DIMS - 2; i >= 0; --i) {
+                        if (idx[i] >= mat->size[i]) {
+                            idx[i - 1] += idx[i] / mat->size[i];
+                            idx[i] %= mat->size[i];
+                            continue; // carry-over;
+                        }
+                        else {
+                            break;
+                        }
+                    }
+                    this->rowCall(&idx[0], COLS, DIMS);
+                }
+            }
+        }
+    private:
+        Mat_<_Tp>* const mat;
+        const Functor op;
+        // ! Call operator for each elements in this row.
+        // idx[DIMS - 1] is used as the running column index and is left at 0 on return.
+        inline void rowCall(int* const idx, const int COLS, const int DIMS) const {
+            int &col = idx[DIMS - 1];
+            col = 0;
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+
+            while (col < COLS) {
+                op(*pixel, const_cast<const int*>(idx));
+                pixel++; col++;
+            }
+            col = 0;
+        }
+        // ! Call operator for each elements in this row. 2d mat special version.
+        inline void rowCall2(const int row, const int COLS) const {
+            union Index{
+                int body[2];
+                operator const int*() const {
+                    return reinterpret_cast<const int*>(this);
+                }
+                int& operator[](const int i) {
+                    return body[i];
+                }
+            } idx = {{row, 0}};
+            // Special union is needed to avoid
+            // "error: array subscript is above array bounds [-Werror=array-bounds]"
+            // when call the functor `op` such that access idx[3].
+
+            _Tp* pixel = &(mat->template at<_Tp>(idx));
+            const _Tp* const pixel_end = pixel + COLS;
+            while(pixel < pixel_end) {
+                op(*pixel++, static_cast<const int*>(idx));
+                idx[1]++; // advance the column index passed to the functor
+            }
+        }
+        // Assignment is not supported; it asserts if it is ever called.
+        PixelOperationWrapper& operator=(const PixelOperationWrapper &) {
+            CV_Assert(false);
+            // We can not remove this implementation because Visual Studio warning C4822.
+            return *this;
+        }
+    };
+
+    // Run the wrapper over all lines, viewing this Mat as a Mat_<_Tp>.
+    parallel_for_(cv::Range(0, LINES), PixelOperationWrapper(reinterpret_cast<Mat_<_Tp>*>(this), operation));
+}
+
+/////////////////////////// Synchronization Primitives ///////////////////////////////
+
+#if !defined(_M_CEE)
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
+// Threading enabled: cv::Mutex is the standard recursive mutex and
+// cv::AutoLock is the matching scoped lock guard.
+typedef std::recursive_mutex Mutex;
+typedef std::lock_guard<cv::Mutex> AutoLock;
+#else // OPENCV_DISABLE_THREAD_SUPPORT
+// Custom (failing) implementation of `std::recursive_mutex`.
+// Both lock() and unlock() report cv::Error::StsNotImplemented through CV_Error.
+struct Mutex {
+    void lock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+    void unlock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+};
+// Stub for cv::AutoLock when threads are disabled.
+// The constructor ignores the mutex, so it never calls the failing Mutex::lock().
+struct AutoLock {
+    AutoLock(Mutex &) { }
+};
+#endif // OPENCV_DISABLE_THREAD_SUPPORT
+#endif // !defined(_M_CEE)
+
+
+/** @brief Designed for command line parsing
+
+The sample below demonstrates how to use CommandLineParser:
+@code
+    CommandLineParser parser(argc, argv, keys);
+    parser.about("Application name v1.0.0");
+
+    if (parser.has("help"))
+    {
+        parser.printMessage();
+        return 0;
+    }
+
+    int N = parser.get<int>("N");
+    double fps = parser.get<double>("fps");
+    String path = parser.get<String>("path");
+
+    use_time_stamp = parser.has("timestamp");
+
+    String img1 = parser.get<String>(0);
+    String img2 = parser.get<String>(1);
+
+    int repeat = parser.get<int>(2);
+
+    if (!parser.check())
+    {
+        parser.printErrors();
+        return 0;
+    }
+@endcode
+
+### Keys syntax
+
+The keys parameter is a string containing several blocks, each one is enclosed in curly braces and
+describes one argument. Each argument contains three parts separated by the `|` symbol:
+
+-# argument names is a space-separated list of option synonyms (to mark argument as positional, prefix it with the `@` symbol)
+-# default value will be used if the argument was not provided (can be empty)
+-# help message (can be empty)
+
+For example:
+
+@code{.cpp}
+    const String keys =
+        "{help h usage ? |      | print this message   }"
+        "{@image1        |      | image1 for compare   }"
+        "{@image2        |<none>| image2 for compare   }"
+        "{@repeat        |1     | number               }"
+        "{path           |.     | path to file         }"
+        "{fps            | -1.0 | fps for output video }"
+        "{N count        |100   | count of objects     }"
+        "{ts timestamp   |      | use time stamp       }"
+        ;
+}
+@endcode
+
+Note that there are no default values for `help` and `timestamp` so we can check their presence using the `has()` method.
+Arguments with default values are considered to be always present. Use the `get()` method in these cases to check their
+actual value instead.
+
+String keys like `get<String>("@image1")` return the empty string `""` by default - even with an empty default value.
+Use the special `<none>` default value to enforce that the returned string must not be empty. (like in `get<String>("@image2")`)
+
+### Usage
+
+For the described keys:
+
+@code{.sh}
+    # Good call (3 positional parameters: image1, image2 and repeat; N is 200, ts is true)
+    $ ./app -N=200 1.png 2.jpg 19 -ts
+
+    # Bad call
+    $ ./app -fps=aaa
+    ERRORS:
+    Parameter 'fps': can not convert: [aaa] to [double]
+@endcode
+ */
+class CV_EXPORTS CommandLineParser
+{
+public:
+
+    /** @brief Constructor
+
+    Initializes command line parser object
+
+    @param argc number of command line arguments (from main())
+    @param argv array of command line arguments (from main())
+    @param keys string describing acceptable command line parameters (see class description for syntax)
+    */
+    CommandLineParser(int argc, const char* const argv[], const String& keys);
+
+    /** @brief Copy constructor */
+    CommandLineParser(const CommandLineParser& parser);
+
+    /** @brief Assignment operator */
+    CommandLineParser& operator = (const CommandLineParser& parser);
+
+    /** @brief Destructor */
+    ~CommandLineParser();
+
+    /** @brief Returns application path
+
+    This method returns the path to the executable from the command line (`argv[0]`).
+
+    For example, if the application has been started with such a command:
+    @code{.sh}
+    $ ./bin/my-executable
+    @endcode
+    this method will return `./bin`.
+    */
+    String getPathToApplication() const;
+
+    /** @brief Access arguments by name
+
+    Returns argument converted to selected type. If the argument is not known or can not be
+    converted to selected type, the error flag is set (can be checked with @ref check).
+
+    For example, define:
+    @code{.cpp}
+    String keys = "{N count||}";
+    @endcode
+
+    Call:
+    @code{.sh}
+    $ ./my-app -N=20
+    # or
+    $ ./my-app --count=20
+    @endcode
+
+    Access:
+    @code{.cpp}
+    int N = parser.get<int>("N");
+    @endcode
+
+    @param name name of the argument
+    @param space_delete remove spaces from the left and right of the string
+    @tparam T the argument will be converted to this type if possible
+
+    @note You can access positional arguments by their `@`-prefixed name:
+    @code{.cpp}
+    parser.get<String>("@image");
+    @endcode
+     */
+    template <typename T>
+    T get(const String& name, bool space_delete = true) const
+    {
+        // Start from a value-initialized T; getByName receives its address through dst.
+        T val = T();
+        getByName(name, space_delete, ParamType<T>::type, (void*)&val);
+        return val;
+    }
+
+    /** @brief Access positional arguments by index
+
+    Returns argument converted to selected type. Indexes are counted from zero.
+
+    For example, define:
+    @code{.cpp}
+    String keys = "{@arg1||}{@arg2||}";
+    @endcode
+
+    Call:
+    @code{.sh}
+    ./my-app abc qwe
+    @endcode
+
+    Access arguments:
+    @code{.cpp}
+    String val_1 = parser.get<String>(0); // returns "abc", arg1
+    String val_2 = parser.get<String>(1); // returns "qwe", arg2
+    @endcode
+
+    @param index index of the argument
+    @param space_delete remove spaces from the left and right of the string
+    @tparam T the argument will be converted to this type if possible
+     */
+    template <typename T>
+    T get(int index, bool space_delete = true) const
+    {
+        // Start from a value-initialized T; getByIndex receives its address through dst.
+        T val = T();
+        getByIndex(index, space_delete, ParamType<T>::type, (void*)&val);
+        return val;
+    }
+
+    /** @brief Check if field was provided in the command line
+
+    @param name argument name to check
+    */
+    bool has(const String& name) const;
+
+    /** @brief Check for parsing errors
+
+    Returns false if error occurred while accessing the parameters (bad conversion, missing arguments,
+    etc.). Call @ref printErrors to print error messages list.
+     */
+    bool check() const;
+
+    /** @brief Set the about message
+
+    The about message will be shown when @ref printMessage is called, right before arguments table.
+     */
+    void about(const String& message);
+
+    /** @brief Print help message
+
+    This method will print standard help message containing the about message and arguments description.
+
+    @sa about
+    */
+    void printMessage() const;
+
+    /** @brief Print the list of errors that occurred
+
+    @sa check
+    */
+    void printErrors() const;
+
+protected:
+    // Type-erased back ends of get<T>(): `type` carries ParamType<T>::type and
+    // `dst` points to the T object that get<T>() returns.
+    void getByName(const String& name, bool space_delete, Param type, void* dst) const;
+    void getByIndex(int index, bool space_delete, Param type, void* dst) const;
+
+    // Implementation struct, only forward-declared in this header.
+    struct Impl;
+    Impl* impl;
+};
+
+//! @} core_utils
+
+//! @cond IGNORED
+
+/////////////////////////////// AutoBuffer implementation ////////////////////////////////////////
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer()
+{
+    // Start on the built-in array with its full fixed capacity.
+    sz = fixed_size;
+    ptr = buf;
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer(size_t _size)
+{
+    // Start on the built-in array, then let allocate() switch to the heap if _size needs it.
+    sz = fixed_size;
+    ptr = buf;
+    allocate(_size);
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::AutoBuffer(const AutoBuffer<_Tp, fixed_size>& abuf )
+{
+    // Begin as an empty default buffer, size it like the source, then copy element-wise.
+    sz = fixed_size;
+    ptr = buf;
+    allocate(abuf.size());
+    size_t k = 0;
+    while( k < sz )
+    {
+        ptr[k] = abuf.ptr[k];
+        k++;
+    }
+}
+
+template<typename _Tp, size_t fixed_size> inline AutoBuffer<_Tp, fixed_size>&
+AutoBuffer<_Tp, fixed_size>::operator = (const AutoBuffer<_Tp, fixed_size>& abuf)
+{
+    // Self-assignment leaves the buffer untouched.
+    if( this == &abuf )
+        return *this;
+
+    // Drop the current storage, size the buffer like the source, then copy element-wise.
+    deallocate();
+    allocate(abuf.size());
+    size_t k = 0;
+    while( k < sz )
+    {
+        ptr[k] = abuf.ptr[k];
+        k++;
+    }
+    return *this;
+}
+
+template<typename _Tp, size_t fixed_size> inline
+AutoBuffer<_Tp, fixed_size>::~AutoBuffer()
+{
+    // Frees the heap block, if one is in use.
+    deallocate();
+}
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::allocate(size_t _size)
+{
+    if(_size > sz)
+    {
+        // Growing: discard the old storage (contents are not preserved) and
+        // move to the heap only when the built-in array is too small.
+        deallocate();
+        sz = _size;
+        if(_size > fixed_size)
+            ptr = new _Tp[_size];
+    }
+    else
+    {
+        // Shrinking or same size: keep the storage and just record the new size.
+        sz = _size;
+    }
+}
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::deallocate()
+{
+    // Nothing to release while the built-in array is in use.
+    if( ptr == buf )
+        return;
+
+    // Free the heap block and fall back to the built-in array at full fixed capacity.
+    delete[] ptr;
+    ptr = buf;
+    sz = fixed_size;
+}
+
+template<typename _Tp, size_t fixed_size> inline void
+AutoBuffer<_Tp, fixed_size>::resize(size_t _size)
+{
+    // Shrinking (or same size) keeps the storage and only records the new size.
+    if(_size <= sz)
+    {
+        sz = _size;
+        return;
+    }
+
+    const size_t oldSize = sz;
+    const size_t keepCount = MIN(oldSize, _size);
+    _Tp* const oldPtr = ptr;
+
+    // Pick the new storage: the heap when the built-in array is too small.
+    if( _size > fixed_size )
+        ptr = new _Tp[_size];
+    else
+        ptr = buf;
+    sz = _size;
+
+    size_t k;
+    // Carry the existing elements over when the storage actually changed.
+    if( ptr != oldPtr )
+    {
+        for( k = 0; k < keepCount; k++ )
+            ptr[k] = oldPtr[k];
+    }
+    // Value-initialize the newly exposed tail.
+    for( k = oldSize; k < _size; k++ )
+        ptr[k] = _Tp();
+
+    // Release the previous heap block, if there was one.
+    if( oldPtr != buf )
+        delete[] oldPtr;
+}
+
+template<typename _Tp, size_t fixed_size> inline size_t
+AutoBuffer<_Tp, fixed_size>::size() const
+{
+    // Current logical size, as last set by the constructors, allocate(), resize() or deallocate().
+    return sz;
+}
+
+//! @endcond
+
+
+// Basic Node class for tree building
+template<class OBJECT>
+class CV_EXPORTS Node
+{
+public:
+    //! Creates a parentless node with a default-initialized payload.
+    Node() : m_pParent(0)
+    {
+    }
+    //! Creates a parentless node holding a copy of @p payload.
+    Node(OBJECT& payload) : m_payload(payload), m_pParent(0)
+    {
+    }
+    //! Deletes all children, then removes this node from its parent's child list.
+    ~Node()
+    {
+        removeChilds();
+        if (!m_pParent)
+            return;
+
+        const int pos = m_pParent->findChild(this);
+        if (pos >= 0)
+            m_pParent->m_childs.erase(m_pParent->m_childs.begin() + pos);
+    }
+
+    //! Returns the first direct child whose payload equals @p payload, or NULL if there is none.
+    Node<OBJECT>* findChild(OBJECT& payload) const
+    {
+        for (size_t k = 0; k < m_childs.size(); ++k)
+        {
+            Node<OBJECT>* const candidate = m_childs[k];
+            if (candidate->m_payload == payload)
+                return candidate;
+        }
+        return NULL;
+    }
+
+    //! Returns the position of @p pNode among the direct children, or -1 if it is not one of them.
+    int findChild(Node<OBJECT> *pNode) const
+    {
+        for (size_t k = 0; k < m_childs.size(); ++k)
+        {
+            if (m_childs[k] == pNode)
+                return (int)k;
+        }
+        return -1;
+    }
+
+    //! Appends @p pNode as a child; a null pointer is ignored and the node must not already have a parent.
+    void addChild(Node<OBJECT> *pNode)
+    {
+        if (pNode)
+        {
+            CV_Assert(pNode->m_pParent == 0);
+            pNode->m_pParent = this;
+            m_childs.push_back(pNode);
+        }
+    }
+
+    //! Deletes every child and empties the child list.
+    void removeChilds()
+    {
+        for (size_t k = 0; k < m_childs.size(); ++k)
+        {
+            Node<OBJECT>* const child = m_childs[k];
+            child->m_pParent = 0; // avoid excessive parent vector trimming
+            delete child;
+        }
+        m_childs.clear();
+    }
+
+    //! Returns the number of ancestors between this node and the root (0 for a parentless node).
+    int getDepth()
+    {
+        int depth = 0;
+        for (Node* ancestor = m_pParent; ancestor; ancestor = ancestor->m_pParent)
+            ++depth;
+        return depth;
+    }
+
+public:
+    OBJECT                     m_payload;  // value stored in this node
+    Node<OBJECT>*              m_pParent;  // parent node, or 0 when detached
+    std::vector<Node<OBJECT>*> m_childs;   // child nodes; deleted by removeChilds()
+};
+
+
+namespace samples {
+
+//! @addtogroup core_utils_samples
+// This section describes utility functions for OpenCV samples.
+//
+// @note Implementation of these utilities is not thread-safe.
+//
+//! @{
+
+/** @brief Try to find requested data file
+
+Search directories:
+
+1. Directories passed via `addSamplesDataSearchPath()`
+2. OPENCV_SAMPLES_DATA_PATH_HINT environment variable
+3. OPENCV_SAMPLES_DATA_PATH environment variable
+   If parameter value is not empty and nothing is found then stop searching.
+4. Detects build/install path based on:
+   a. current working directory (CWD)
+   b. and/or binary module location (opencv_core/opencv_world, doesn't work with static linkage)
+5. Scan `<source>/{,data,samples/data}` directories if build directory is detected or the current directory is in source tree.
+6. Scan `<install>/share/OpenCV` directory if install directory is detected.
+
+@see cv::utils::findDataFile
+
+@param relative_path Relative path to data file
+@param required Specify "file not found" handling.
+       If true, function prints information message and raises cv::Exception.
+       If false, function returns empty result
+@param silentMode Disables messages
+@return Returns path (absolute or relative to the current directory) or empty string if file is not found
+*/
+CV_EXPORTS_W cv::String findFile(const cv::String& relative_path, bool required = true, bool silentMode = false);
+
+CV_EXPORTS_W cv::String findFileOrKeep(const cv::String& relative_path, bool silentMode = false);
+
+inline cv::String findFileOrKeep(const cv::String& relative_path, bool silentMode)
+{
+    // Look the file up with required=false; an empty result means it was not found,
+    // in which case the input path is handed back unchanged.
+    const cv::String located = findFile(relative_path, false, silentMode);
+    return located.empty() ? relative_path : located;
+}
+
+/** @brief Override search data path by adding new search location
+
+Use this only to override default behavior
+Passed paths are used in LIFO order.
+
+@param path Path to used samples data
+*/
+CV_EXPORTS_W void addSamplesDataSearchPath(const cv::String& path);
+
+/** @brief Append samples search data sub directory
+
+General usage is to add OpenCV modules name (`<opencv_contrib>/modules/<name>/samples/data` -> `<name>/samples/data` + `modules/<name>/samples/data`).
+Passed subdirectories are used in LIFO order.
+
+@param subdir samples data sub directory
+*/
+CV_EXPORTS_W void addSamplesDataSearchSubDirectory(const cv::String& subdir);
+
+//! @}
+} // namespace samples
+
+namespace utils {
+
+CV_EXPORTS int getThreadID();
+
+} // namespace
+
+} //namespace cv
+
+#ifdef CV_COLLECT_IMPL_DATA
+#include "opencv2/core/utils/instrumentation.hpp"
+#else
+/// Collect implementation data on OpenCV function call. Requires ENABLE_IMPL_COLLECTION build option.
+#define CV_IMPL_ADD(impl)
+#endif
+
+#endif //OPENCV_CORE_UTILITY_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.hpp
new file mode 100644
index 0000000..79e9338
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_HPP
+
+#include "../cvdef.h"
+
+namespace cv { namespace utils {
+
+// Abstract interface for reading allocator statistics counters.
+// The constructor and destructor are protected, so objects are created and
+// destroyed only through derived classes.
+class AllocatorStatisticsInterface
+{
+protected:
+    AllocatorStatisticsInterface() {}
+    virtual ~AllocatorStatisticsInterface() {}
+public:
+    // Read-only counters; every accessor is pure virtual and implemented by the derived class.
+    // NOTE(review): the usage values are presumably byte counts; confirm against the implementation.
+    virtual uint64_t getCurrentUsage() const = 0;
+    virtual uint64_t getTotalUsage() const = 0;
+    virtual uint64_t getNumberOfAllocations() const = 0;
+    virtual uint64_t getPeakUsage() const = 0;
+
+    /** set peak usage = current usage */
+    virtual void resetPeakUsage() = 0;
+};
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.impl.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.impl.hpp
new file mode 100644
index 0000000..eb5ecde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/allocator_stats.impl.hpp
@@ -0,0 +1,158 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+
+#include "./allocator_stats.hpp"
+
+//#define OPENCV_DISABLE_ALLOCATOR_STATS
+
+#ifdef CV_CXX11
+
+#include <atomic>
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#if defined(__GNUC__) && (\
+        (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4) || \
+        (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
+    )
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int
+#endif
+#endif
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
+#endif
+
+#else  // CV_CXX11
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int  // CV_XADD supports int only
+#endif
+
+#endif  // CV_CXX11
+
+namespace cv { namespace utils {
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+namespace {
+#endif
+
+class AllocatorStatistics : public AllocatorStatisticsInterface
+{
+#ifdef OPENCV_DISABLE_ALLOCATOR_STATS
+
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return 0; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return 0; }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE {};
+
+    void onAllocate(size_t /*sz*/) {}
+    void onFree(size_t /*sz*/) {}
+
+#elif defined(CV_CXX11)
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    std::atomic<counter_t> curr, total, total_allocs, peak;
+public:
+    AllocatorStatistics() : curr(0), total(0), total_allocs(0), peak(0) {}  // zero explicitly: before C++20 a default-constructed std::atomic holds an indeterminate value
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr.load(); }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total.load(); }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs.load(); }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak.load(); }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE { peak.store(curr.load()); }
+
+    // Controller interface
+    void onAllocate(size_t sz)  // record an allocation of sz: updates curr, peak, total and total_allocs
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+
+        counter_t new_curr = curr.fetch_add((counter_t)sz) + (counter_t)sz;  // fetch_add returns the previous value, so sz is added again
+
+        // Atomic form of: peak = std::max(peak, new_curr);
+        auto prev_peak = peak.load();
+        while (prev_peak < new_curr)  // nothing to do once the stored peak is >= new_curr
+        {
+            if (peak.compare_exchange_weak(prev_peak, new_curr))  // on failure prev_peak is refreshed with the current peak and the loop re-tests
+                break;
+        }
+        // end of peak = max(...)
+
+        total += (counter_t)sz;  // each counter is updated by its own atomic operation, not as one combined step
+        total_allocs++;
+    }
+    void onFree(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+        curr -= (counter_t)sz;
+    }
+
+#else  // non C++11
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    volatile counter_t curr, total, total_allocs, peak;  // overflow is possible, CV_XADD operates with 'int' only
+public:
+    AllocatorStatistics()
+        : curr(0), total(0), total_allocs(0), peak(0)
+    {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; }
+
+    void resetPeakUsage() CV_OVERRIDE { peak = curr; }
+
+    // Controller interface
+    void onAllocate(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+
+        counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz;
+
+        peak = std::max((counter_t)peak, new_curr);  // non-thread safe
+
+        //CV_XADD(&total, (uint64_t)sz);  // overflow with int, non-reliable...
+        total += sz;
+
+        CV_XADD(&total_allocs, (counter_t)1);
+    }
+    void onFree(size_t sz)  // record a free of sz: passes the negated size to CV_XADD on curr
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+        CV_XADD(&curr, (counter_t)-sz);  // NOTE(review): -sz is negated in unsigned size_t arithmetic first, then narrowed to counter_t
+    }
+#endif
+};
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+} // namespace
+#endif
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/filesystem.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/filesystem.hpp
new file mode 100644
index 0000000..a98d220
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/filesystem.hpp
@@ -0,0 +1,82 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_FILESYSTEM_HPP
+#define OPENCV_UTILS_FILESYSTEM_HPP
+
+namespace cv { namespace utils { namespace fs {
+
+
+CV_EXPORTS bool exists(const cv::String& path);
+CV_EXPORTS bool isDirectory(const cv::String& path);
+
+CV_EXPORTS void remove_all(const cv::String& path);
+
+
+CV_EXPORTS cv::String getcwd();
+
+/** @brief Converts path to a canonical absolute path
+ * Symlinks are processed if there is support for them on running platform.
+ *
+ * @param path input path. Target file/directory should exist.
+ */
+CV_EXPORTS cv::String canonical(const cv::String& path);
+
+/** Join path components */
+CV_EXPORTS cv::String join(const cv::String& base, const cv::String& path);
+
+/** Get parent directory */
+CV_EXPORTS cv::String getParent(const cv::String &path);
+CV_EXPORTS std::wstring getParent(const std::wstring& path);
+
+/**
+ * Generate a list of all files that match the globbing pattern.
+ *
+ * Result entries are prefixed by base directory path.
+ *
+ * @param directory base directory
+ * @param pattern filter pattern (based on '*'/'?' symbols). Use empty string to disable filtering and return all results
+ * @param[out] result result of globbing.
+ * @param recursive scan nested directories too
+ * @param includeDirectories include directories into results list
+ */
+CV_EXPORTS void glob(const cv::String& directory, const cv::String& pattern,
+        CV_OUT std::vector<cv::String>& result,
+        bool recursive = false, bool includeDirectories = false);
+
+/**
+ * Generate a list of all files that match the globbing pattern.
+ *
+ * @param directory base directory
+ * @param pattern filter pattern (based on '*'/'?' symbols). Use empty string to disable filtering and return all results
+ * @param[out] result globbing result with relative paths from base directory
+ * @param recursive scan nested directories too
+ * @param includeDirectories include directories into results list
+ */
+CV_EXPORTS void glob_relative(const cv::String& directory, const cv::String& pattern,
+        CV_OUT std::vector<cv::String>& result,
+        bool recursive = false, bool includeDirectories = false);
+
+
+CV_EXPORTS bool createDirectory(const cv::String& path);
+CV_EXPORTS bool createDirectories(const cv::String& path);
+
+#ifdef __OPENCV_BUILD
+// TODO
+//CV_EXPORTS cv::String getTempDirectory();
+
+/**
+ * @brief Returns directory to store OpenCV cache files
+ * Create sub-directory in common OpenCV cache directory if it doesn't exist.
+ * @param sub_directory_name name of sub-directory. NULL or "" value asks to return root cache directory.
+ * @param configuration_name optional name of a configuration parameter which overrides default behavior.
+ * @return Path to cache directory. Returns empty string if cache directories support is not available. Returns "disabled" if cache disabled by user.
+ */
+CV_EXPORTS cv::String getCacheDirectory(const char* sub_directory_name, const char* configuration_name = NULL);
+
+#endif
+
+}}} // namespace
+
+#endif // OPENCV_UTILS_FILESYSTEM_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/fp_control_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/fp_control_utils.hpp
new file mode 100644
index 0000000..930bc5d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/fp_control_utils.hpp
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_FP_CONTROL_UTILS_HPP
+#define OPENCV_CORE_FP_CONTROL_UTILS_HPP
+
+namespace cv {
+
+namespace details {
+
+struct FPDenormalsModeState
+{
+    uint32_t reserved[16];  // 64-bytes
+};  // FPDenormalsModeState
+
+CV_EXPORTS void setFPDenormalsIgnoreHint(bool ignore, CV_OUT FPDenormalsModeState& state);
+CV_EXPORTS int saveFPDenormalsState(CV_OUT FPDenormalsModeState& state);
+CV_EXPORTS bool restoreFPDenormalsState(const FPDenormalsModeState& state);
+
+class FPDenormalsIgnoreHintScope  // scope helper: the destructor passes saved_state back to restoreFPDenormalsState()
+{
+public:
+    inline explicit FPDenormalsIgnoreHintScope(bool ignore = true)
+    {
+        details::setFPDenormalsIgnoreHint(ignore, saved_state);  // saved_state is the CV_OUT argument of this call
+    }
+
+    inline explicit FPDenormalsIgnoreHintScope(const FPDenormalsModeState& state)
+    {
+        details::saveFPDenormalsState(saved_state);  // fill saved_state (CV_OUT) before touching anything
+        details::restoreFPDenormalsState(state);     // then hand the caller-supplied state to the restore call
+    }
+
+    inline ~FPDenormalsIgnoreHintScope()
+    {
+        details::restoreFPDenormalsState(saved_state);  // the bool result is ignored here
+    }
+
+protected:
+    FPDenormalsModeState saved_state;
+};  // FPDenormalsIgnoreHintScope
+
+class FPDenormalsIgnoreHintScopeNOOP
+{
+public:
+    inline FPDenormalsIgnoreHintScopeNOOP(bool ignore = true) { CV_UNUSED(ignore); }
+    inline FPDenormalsIgnoreHintScopeNOOP(const FPDenormalsModeState& state) { CV_UNUSED(state); }
+    inline ~FPDenormalsIgnoreHintScopeNOOP() { }
+};  // FPDenormalsIgnoreHintScopeNOOP
+
+}  // namespace details
+
+
+// Should depend on target compilation architecture only
+// Note: previously added archs should NOT be removed to preserve ABI compatibility
+#if defined(OPENCV_SUPPORTS_FP_DENORMALS_HINT)
+  // preserve configuration overloading through ports
+#elif defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_X86_)
+typedef details::FPDenormalsIgnoreHintScope FPDenormalsIgnoreHintScope;
+#define OPENCV_SUPPORTS_FP_DENORMALS_HINT 1
+#else
+#define OPENCV_SUPPORTS_FP_DENORMALS_HINT 0
+typedef details::FPDenormalsIgnoreHintScopeNOOP FPDenormalsIgnoreHintScope;
+#endif
+
+}  // namespace cv
+
+#endif // OPENCV_CORE_FP_CONTROL_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/instrumentation.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/instrumentation.hpp
new file mode 100644
index 0000000..3639867
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/instrumentation.hpp
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry corresponds to a function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN  0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL    0x02 // OpenCL implementation
+#define CV_IMPL_IPP    0x04 // IPP implementation
+#define CV_IMPL_MT     0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl)                                                   \
+    if(cv::useCollection())                                                 \
+    {                                                                       \
+        cv::addImpl(impl, CV_Func);                                         \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    // Tick accumulator used through TLSDataAccumulator<NodeDataTls>; starts at zero.
+    NodeDataTls()
+        : m_ticksTotal(0)
+    {}
+    uint64      m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData  // payload of one instrumentation tree node (see the InstrNode typedef)
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);  // NOTE(review): copy constructor takes a non-const reference, so const objects and temporaries cannot be copied
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+
+    volatile int         m_counter;
+    volatile uint64      m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls> m_tls;
+    int                  m_threads;
+
+    // No synchronization: both helpers read the volatile counters directly and convert ticks to milliseconds
+    double getTotalMs()   const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
+    double getMeanMs()    const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }  // divides by m_counter with no zero check
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool       useInstrumentation();
+CV_EXPORTS void       setUseInstrumentation(bool flag);
+CV_EXPORTS void       resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void       setFlags(FLAGS modeFlags);
+static inline void    setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS      getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_INSTR_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.defines.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.defines.hpp
new file mode 100644
index 0000000..7d73f02
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.defines.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_LOGGER_DEFINES_HPP
+#define OPENCV_LOGGER_DEFINES_HPP
+
+//! @addtogroup core_logging
+//! @{
+
+// Supported logging levels and their semantic
+#define CV_LOG_LEVEL_SILENT 0          //!< for using in setLogLevel() call
+#define CV_LOG_LEVEL_FATAL 1           //!< Fatal (critical) error (unrecoverable internal error)
+#define CV_LOG_LEVEL_ERROR 2           //!< Error message
+#define CV_LOG_LEVEL_WARN 3            //!< Warning message
+#define CV_LOG_LEVEL_INFO 4            //!< Info message
+#define CV_LOG_LEVEL_DEBUG 5           //!< Debug message. Disabled in the "Release" build.
+#define CV_LOG_LEVEL_VERBOSE 6         //!< Verbose (trace) messages. Requires verbosity level. Disabled in the "Release" build.
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+//! Supported logging levels and their semantic
+enum LogLevel {
+    LOG_LEVEL_SILENT = 0,              //!< for using in setLogLevel() call
+    LOG_LEVEL_FATAL = 1,               //!< Fatal (critical) error (unrecoverable internal error)
+    LOG_LEVEL_ERROR = 2,               //!< Error message
+    LOG_LEVEL_WARNING = 3,             //!< Warning message
+    LOG_LEVEL_INFO = 4,                //!< Info message
+    LOG_LEVEL_DEBUG = 5,               //!< Debug message. Disabled in the "Release" build.
+    LOG_LEVEL_VERBOSE = 6,             //!< Verbose (trace) messages. Requires verbosity level. Disabled in the "Release" build.
+#ifndef CV_DOXYGEN
+    ENUM_LOG_LEVEL_FORCE_INT = INT_MAX  // NOTE(review): INT_MAX needs <limits.h>, which this header does not include itself
+#endif
+};
+
+}}} // namespace
+
+//! @}
+
+#endif // OPENCV_LOGGER_DEFINES_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.hpp
new file mode 100644
index 0000000..accb860
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logger.hpp
@@ -0,0 +1,218 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_LOGGER_HPP
+#define OPENCV_LOGGER_HPP
+
+#include <iostream>
+#include <sstream>
+#include <limits.h> // INT_MAX
+
+#include "logger.defines.hpp"
+#include "logtag.hpp"
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+//! @addtogroup core_logging
+//! @{
+
+/** Set global logging level
+@return previous logging level
+*/
+CV_EXPORTS LogLevel setLogLevel(LogLevel logLevel);
+/** Get global logging level */
+CV_EXPORTS LogLevel getLogLevel();
+
+CV_EXPORTS void registerLogTag(cv::utils::logging::LogTag* plogtag);
+
+CV_EXPORTS void setLogTagLevel(const char* tag, cv::utils::logging::LogLevel level);
+
+CV_EXPORTS cv::utils::logging::LogLevel getLogTagLevel(const char* tag);
+
+namespace internal {
+
+/** Get global log tag */
+CV_EXPORTS cv::utils::logging::LogTag* getGlobalLogTag();
+
+/** Write log message */
+CV_EXPORTS void writeLogMessage(LogLevel logLevel, const char* message);
+
+/** Write log message */
+CV_EXPORTS void writeLogMessageEx(LogLevel logLevel, const char* tag, const char* file, int line, const char* func, const char* message);
+
+} // namespace
+
+struct LogTagAuto  // LogTag that registers itself on construction
+    : public LogTag
+{
+    inline LogTagAuto(const char* _name, LogLevel _level)
+        : LogTag(_name, _level)
+    {
+        registerLogTag(this);  // hands out `this`; no matching unregister call is made by this struct
+    }
+};
+
+/**
+ * \def CV_LOG_STRIP_LEVEL
+ *
+ * Define CV_LOG_STRIP_LEVEL=CV_LOG_LEVEL_[DEBUG|INFO|WARN|ERROR|FATAL|SILENT] to compile out anything at that and before that logging level
+ */
+#ifndef CV_LOG_STRIP_LEVEL
+# if defined NDEBUG
+#   define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG
+# else
+#   define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_VERBOSE
+# endif
+#endif
+
+#define CV_LOGTAG_PTR_CAST(expr) static_cast<const cv::utils::logging::LogTag*>(expr)
+
+// CV_LOGTAG_EXPAND_NAME is intended to be re-defined (undef and then define again)
+// to allow logging users to use a shorter name argument when calling
+// CV_LOG_WITH_TAG or its related macros such as CV_LOG_INFO.
+//
+// This macro is intended to modify the tag argument as a string (token), via
+// preprocessor token pasting or metaprogramming techniques. A typical usage
+// is to apply a prefix, such as
+// ...... #define CV_LOGTAG_EXPAND_NAME(tag) cv_logtag_##tag
+//
+// It is permitted to re-define to a hard-coded expression, ignoring the tag.
+// This would work identically like the CV_LOGTAG_FALLBACK macro.
+//
+// Important: When the logging macro is called with tag being NULL, a user-defined
+// CV_LOGTAG_EXPAND_NAME may expand it into cv_logtag_0, cv_logtag_NULL, or
+// cv_logtag_nullptr. Use with care. Also be mindful of C++ symbol redefinitions.
+//
+// If there is significant amount of logging code with tag being NULL, it is
+// recommended to use (re-define) CV_LOGTAG_FALLBACK to inject locally a default
+// tag at the beginning of a compilation unit, to minimize lines of code changes.
+//
+#define CV_LOGTAG_EXPAND_NAME(tag) tag
+
+// CV_LOGTAG_FALLBACK is intended to be re-defined (undef and then define again)
+// by any other compilation units to provide a log tag when the logging statement
+// does not specify one. The macro needs to expand into a C++ expression that can
+// be static_cast into (cv::utils::logging::LogTag*). Null (nullptr) is permitted.
+#define CV_LOGTAG_FALLBACK nullptr
+
+// CV_LOGTAG_GLOBAL is the tag used when a log tag is not specified in the logging
+// statement nor the compilation unit. The macro needs to expand into a C++
+// expression that can be static_cast into (cv::utils::logging::LogTag*). Must be
+// non-null. Do not re-define.
+#define CV_LOGTAG_GLOBAL cv::utils::logging::internal::getGlobalLogTag()
+
+#define CV_LOG_WITH_TAG(tag, msgLevel, extra_check0, extra_check1, ...) \
+    for(;;) { \
+        extra_check0; \
+        const auto cv_temp_msglevel = (cv::utils::logging::LogLevel)(msgLevel); \
+        if (cv_temp_msglevel >= (CV_LOG_STRIP_LEVEL)) break; \
+        auto cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_EXPAND_NAME(tag)); \
+        if (!cv_temp_logtagptr) cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_FALLBACK); \
+        if (!cv_temp_logtagptr) cv_temp_logtagptr = CV_LOGTAG_PTR_CAST(CV_LOGTAG_GLOBAL); \
+        if (cv_temp_logtagptr && (cv_temp_msglevel > cv_temp_logtagptr->level)) break; \
+        extra_check1; \
+        std::stringstream cv_temp_logstream; \
+        cv_temp_logstream << __VA_ARGS__; \
+        cv::utils::logging::internal::writeLogMessageEx( \
+            cv_temp_msglevel, \
+            (cv_temp_logtagptr ? cv_temp_logtagptr->name : nullptr), \
+            __FILE__, \
+            __LINE__, \
+            CV_Func, \
+            cv_temp_logstream.str().c_str()); \
+        break; \
+    }
+
+#define CV_LOG_FATAL(tag, ...)   CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_FATAL, , , __VA_ARGS__)
+#define CV_LOG_ERROR(tag, ...)   CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, , , __VA_ARGS__)
+#define CV_LOG_WARNING(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, , , __VA_ARGS__)
+#define CV_LOG_INFO(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, , , __VA_ARGS__)
+#define CV_LOG_DEBUG(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, , , __VA_ARGS__)
+#define CV_LOG_VERBOSE(tag, v, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), , , __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_INFO
+#define CV_LOG_INFO(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_DEBUG
+#define CV_LOG_DEBUG(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_VERBOSE
+#define CV_LOG_VERBOSE(tag, v, ...)
+#endif
+
+//! @cond IGNORED
+#define CV__LOG_ONCE_CHECK_PRE \
+    static bool _cv_log_once_ ## __LINE__ = false; \
+    if (_cv_log_once_ ## __LINE__) break;
+
+#define CV__LOG_ONCE_CHECK_POST \
+    _cv_log_once_ ## __LINE__ = true;
+
+#define CV__LOG_IF_CHECK(logging_cond) \
+    if (!(logging_cond)) break;
+
+//! @endcond
+
+
+// CV_LOG_ONCE_XXX macros
+
+#define CV_LOG_ONCE_ERROR(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_WARNING(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_INFO(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_DEBUG(tag, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+#define CV_LOG_ONCE_VERBOSE(tag, v, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), CV__LOG_ONCE_CHECK_PRE, CV__LOG_ONCE_CHECK_POST, __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_ONCE_INFO
+#define CV_LOG_ONCE_INFO(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_ONCE_DEBUG
+#define CV_LOG_ONCE_DEBUG(tag, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_ONCE_VERBOSE
+#define CV_LOG_ONCE_VERBOSE(tag, v, ...)
+#endif
+
+
+// CV_LOG_IF_XXX macros
+
+#define CV_LOG_IF_FATAL(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_FATAL, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_ERROR(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_ERROR, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_WARNING(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_WARNING, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_INFO(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_INFO, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_DEBUG(tag, logging_cond, ...) CV_LOG_WITH_TAG(tag, cv::utils::logging::LOG_LEVEL_DEBUG, , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+#define CV_LOG_IF_VERBOSE(tag, v, logging_cond, ...) CV_LOG_WITH_TAG(tag, (cv::utils::logging::LOG_LEVEL_VERBOSE + (int)(v)), , CV__LOG_IF_CHECK(logging_cond), __VA_ARGS__)
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_INFO
+#undef CV_LOG_IF_INFO
+#define CV_LOG_IF_INFO(tag, logging_cond, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_DEBUG
+#undef CV_LOG_IF_DEBUG
+#define CV_LOG_IF_DEBUG(tag, logging_cond, ...)
+#endif
+
+#if CV_LOG_STRIP_LEVEL <= CV_LOG_LEVEL_VERBOSE
+#undef CV_LOG_IF_VERBOSE
+#define CV_LOG_IF_VERBOSE(tag, v, logging_cond, ...)
+#endif
+
+
+//! @}
+
+}}} // namespace
+
+#endif // OPENCV_LOGGER_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logtag.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logtag.hpp
new file mode 100644
index 0000000..4089720
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/logtag.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_LOGTAG_HPP
+#define OPENCV_CORE_LOGTAG_HPP
+
+#include "opencv2/core/cvstd.hpp"
+#include "logger.defines.hpp"
+
+namespace cv {
+namespace utils {
+namespace logging {
+
+struct LogTag  // pairs a tag name with the log level compared against it in CV_LOG_WITH_TAG
+{
+    const char* name;  // stored as a raw pointer, not copied: the string must outlive the tag
+    LogLevel level;
+
+    inline LogTag(const char* _name, LogLevel _level)
+        : name(_name)
+        , level(_level)
+    {}
+};
+
+}}}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/tls.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/tls.hpp
new file mode 100644
index 0000000..124caeb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/tls.hpp
@@ -0,0 +1,235 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#ifndef OPENCV_CORE_UTILITY_H
+#error "tls.hpp must be included after opencv2/core/utility.hpp or opencv2/core.hpp"
+#endif
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void  gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void  detachData(std::vector<void*>& data);
+
+    void* getData() const;
+    void  release();
+
+protected:
+    virtual void* createDataInstance() const = 0;
+    virtual void  deleteDataInstance(void* pData) const = 0;
+
+private:
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup(); //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &) = delete;
+    TLSDataContainer& operator =(const TLSDataContainer &) = delete;
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+
+    inline T* get() const   { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Get data associated with key
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }
+    /// Wrapper to release data by template
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;
+    mutable std::vector<T*> dataFromTerminatedThreads;
+    std::vector<T*> detachedData;
+    bool cleanupMode;
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);  // NOTE(review): treats vector<T*> as vector<void*>; relies on both having the same layout
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);  // dataFromTerminatedThreads is also appended to by deleteDataInstance() under this mutex
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);  // pointer is copied only; the entry stays in dataFromTerminatedThreads
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());  // counts new entries only; detachedData is appended to, not cleared, in this method
+            for (typename std::vector<T*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();  // entries were moved into detachedData above
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;  // reference to the member vector, which is later emptied by _cleanupDetachedData()
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;  // makes deleteDataInstance() delete immediately; written before the mutex is taken
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;  // back to collecting instances into dataFromTerminatedThreads
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;  // not reset in this method, unlike cleanup(); written before the mutex is taken
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<T*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void  deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)  // set by cleanup()/release()/cleanupDetachedData(): destroy the instance right away
+        {
+            delete (T*)pData;
+        }
+        else  // otherwise keep the instance so gather()/detachData() can still return it
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/trace.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/trace.hpp
new file mode 100644
index 0000000..ef5d35b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/utils/trace.hpp
@@ -0,0 +1,252 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_TRACE_HPP
+#define OPENCV_TRACE_HPP
+
+#include <opencv2/core/cvdef.h>
+
+namespace cv {
+namespace utils {
+namespace trace {
+
+//! @addtogroup core_logging
+//! @{
+
+//! Macro to trace function (no-op placeholder here; redefined further down unless OPENCV_DISABLE_TRACE is defined)
+#define CV_TRACE_FUNCTION()
+//! Macro to trace function with REGION_FLAG_SKIP_NESTED set (avoid processing of nested regions)
+#define CV_TRACE_FUNCTION_SKIP_NESTED()
+
+//! Trace code scope.
+//! @note Dynamic names are not supported in this macro (on stack or heap). Use string literals here only, like "initialize".
+#define CV_TRACE_REGION(name_as_static_string_literal)
+//! Mark the currently opened region as completed and open a new one.
+//! @note Dynamic names are not supported in this macro (on stack or heap). Use string literals here only, like "step1".
+#define CV_TRACE_REGION_NEXT(name_as_static_string_literal)
+
+//! Macro to trace argument value
+#define CV_TRACE_ARG(arg_id)
+
+//! Macro to trace argument value (expanded version)
+#define CV_TRACE_ARG_VALUE(arg_id, arg_name, value)
+
+//! @cond IGNORED
+#define CV_TRACE_NS cv::utils::trace
+// Emscripten builds get OPENCV_DISABLE_TRACE by default, which leaves the no-op CV_TRACE_* macros in place.
+#if !defined(OPENCV_DISABLE_TRACE) && defined(__EMSCRIPTEN__)
+#define OPENCV_DISABLE_TRACE 1
+#endif
+
+namespace details {
+
+#ifndef __OPENCV_TRACE // default: 1 when __OPENCV_BUILD is defined without __OPENCV_TESTS / __OPENCV_APPS, otherwise 0
+# if defined __OPENCV_BUILD && !defined __OPENCV_TESTS && !defined __OPENCV_APPS
+#   define __OPENCV_TRACE 1
+# else
+#   define __OPENCV_TRACE 0
+# endif
+#endif
+// File name stored in each trace location; defaults to __FILE__ unless already defined.
+#ifndef CV_TRACE_FILENAME
+# define CV_TRACE_FILENAME __FILE__
+#endif
+// Function name stored in function-level trace locations; chosen per compiler, "<unknown>" as the fallback.
+#ifndef CV__TRACE_FUNCTION
+# if defined _MSC_VER
+#   define CV__TRACE_FUNCTION __FUNCSIG__
+# elif defined __GNUC__
+#   define CV__TRACE_FUNCTION __PRETTY_FUNCTION__
+# else
+#   define CV__TRACE_FUNCTION "<unknown>"
+# endif
+#endif
+
+//! Thread-local instance (usually allocated on stack)
+class CV_EXPORTS Region
+{
+public:
+    struct LocationExtraData;          // only declared here; used through pointers
+    struct LocationStaticStorage       // static description of one trace location (see CV__TRACE_DEFINE_LOCATION_)
+    {
+        LocationExtraData** ppExtra;   //!< implementation specific data
+        const char* name;              //!< region name (function name or other custom name)
+        const char* filename;          //!< source code filename
+        int line;                      //!< source code line
+        int flags;                     //!< flags (implementation code path: Plain, IPP, OpenCL)
+    };
+
+    Region(const LocationStaticStorage& location);
+    inline ~Region()
+    {
+        if (implFlags != 0)            // regions with implFlags == 0 skip destroy()
+            destroy();
+        CV_DbgAssert(implFlags == 0);  // debug check: both fields must be cleared by this point
+        CV_DbgAssert(pImpl == NULL);
+    }
+
+    class Impl;
+    Impl* pImpl; // NULL if current region is not active
+    int implFlags; // see RegionFlag, 0 if region is ignored
+
+    bool isActive() const { return pImpl != NULL; }  // true while an Impl is attached
+
+    void destroy();
+private:
+    Region(const Region&); // disabled
+    Region& operator= (const Region&); // disabled
+};
+
+//! Specify region flags
+enum RegionLocationFlag {
+    REGION_FLAG_FUNCTION = (1 << 0),             //!< region is function (=1) / nested named region (=0)
+    REGION_FLAG_APP_CODE = (1 << 1),             //!< region is Application code (=1) / OpenCV library code (=0)
+    REGION_FLAG_SKIP_NESTED = (1 << 2),          //!< avoid processing of nested regions
+
+    REGION_FLAG_IMPL_IPP = (1 << 16),            //!< region is part of IPP code path
+    REGION_FLAG_IMPL_OPENCL = (2 << 16),         //!< region is part of OpenCL code path
+    REGION_FLAG_IMPL_OPENVX = (3 << 16),         //!< region is part of OpenVX code path
+
+    REGION_FLAG_IMPL_MASK = (15 << 16),          //!< bits 16..19; the REGION_FLAG_IMPL_* values above are numbers in this field, not separate bits
+
+    REGION_FLAG_REGION_FORCE = (1 << 30),
+    REGION_FLAG_REGION_NEXT = (1 << 31),         //!< close previous region (see #CV_TRACE_REGION_NEXT macro)
+
+    ENUM_REGION_FLAG_FORCE_INT = INT_MAX         //!< presumably widens the enum's value range to that of int -- TODO confirm
+};
+
+struct CV_EXPORTS TraceArg { // static description of one traced argument (instances come from CV__TRACE_DEFINE_ARG_)
+public:
+    struct ExtraData;
+    ExtraData** ppExtra;  // address of the static ExtraData* slot defined next to the TraceArg
+    const char* name;     // argument name; CV__TRACE_ARG passes the stringized argument expression
+    int flags;            // CV__TRACE_ARG_VALUE always passes 0
+};
+/** @brief Add meta information to current region (function)
+ * See CV_TRACE_ARG macro
+ * @param arg argument information structure (global static cache)
+ * @param value argument value (for strings this can be a dynamic string; static allocation is not required)
+ */
+CV_EXPORTS void traceArg(const TraceArg& arg, const char* value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, int value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, int64 value);
+//! @overload
+CV_EXPORTS void traceArg(const TraceArg& arg, double value);
+// Names of the two per-location statics: a prefix, the location id and __LINE__ joined via CVAUX_CONCAT.
+#define CV__TRACE_LOCATION_VARNAME(loc_id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_trace_location_, loc_id), __LINE__)
+#define CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_trace_location_extra_, loc_id) , __LINE__)
+// Defines the extra-data pointer slot (initially 0) and the static const LocationStaticStorage referring to it.
+#define CV__TRACE_DEFINE_LOCATION_(loc_id, name, flags) \
+    static CV_TRACE_NS::details::Region::LocationExtraData* CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id) = 0; \
+    static const CV_TRACE_NS::details::Region::LocationStaticStorage \
+        CV__TRACE_LOCATION_VARNAME(loc_id) = { &(CV__TRACE_LOCATION_EXTRA_VARNAME(loc_id)), name, CV_TRACE_FILENAME, __LINE__, flags};
+// Function-level location: same as above with REGION_FLAG_FUNCTION OR-ed into flags.
+#define CV__TRACE_DEFINE_LOCATION_FN(name, flags) CV__TRACE_DEFINE_LOCATION_(fn, name, ((flags) | CV_TRACE_NS::details::REGION_FLAG_FUNCTION))
+
+// Function macros: define the location, then open a const Region named __region_fn bound to it.
+#define CV__TRACE_OPENCV_FUNCTION() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, 0); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+// As above, with a caller-supplied name instead of CV__TRACE_FUNCTION.
+#define CV__TRACE_OPENCV_FUNCTION_NAME(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, 0); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+// Application-code variants: the location additionally carries REGION_FLAG_APP_CODE.
+#define CV__TRACE_APP_FUNCTION() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_APP_FUNCTION_NAME(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+// *_SKIP_NESTED variants: the location carries REGION_FLAG_SKIP_NESTED (avoid processing of nested regions).
+#define CV__TRACE_OPENCV_FUNCTION_SKIP_NESTED() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_OPENCV_FUNCTION_NAME_SKIP_NESTED(name) \
+    CV__TRACE_DEFINE_LOCATION_FN(name, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+#define CV__TRACE_APP_FUNCTION_SKIP_NESTED() \
+    CV__TRACE_DEFINE_LOCATION_FN(CV__TRACE_FUNCTION, CV_TRACE_NS::details::REGION_FLAG_SKIP_NESTED | CV_TRACE_NS::details::REGION_FLAG_APP_CODE); \
+    const CV_TRACE_NS::details::Region __region_fn(CV__TRACE_LOCATION_VARNAME(fn));
+
+// Named scope: defines a "region" location and opens a Region whose variable name joins __region_ with __LINE__.
+#define CV__TRACE_REGION_(name_as_static_string_literal, flags) \
+    CV__TRACE_DEFINE_LOCATION_(region, name_as_static_string_literal, flags); \
+    CV_TRACE_NS::details::Region CVAUX_CONCAT(__region_, __LINE__)(CV__TRACE_LOCATION_VARNAME(region));
+// Plain region (flags 0) and the variant flagged REGION_FLAG_REGION_NEXT (close previous region).
+#define CV__TRACE_REGION(name_as_static_string_literal) CV__TRACE_REGION_(name_as_static_string_literal, 0)
+#define CV__TRACE_REGION_NEXT(name_as_static_string_literal) CV__TRACE_REGION_(name_as_static_string_literal, CV_TRACE_NS::details::REGION_FLAG_REGION_NEXT)
+// Names of the per-argument statics: the arg id pasted onto a prefix, then joined with __LINE__.
+#define CV__TRACE_ARG_VARNAME(arg_id) CVAUX_CONCAT(__cv_trace_arg_ ## arg_id, __LINE__)
+#define CV__TRACE_ARG_EXTRA_VARNAME(arg_id) CVAUX_CONCAT(__cv_trace_arg_extra_ ## arg_id, __LINE__)
+// Defines the static ExtraData* slot (initially 0) and the static const TraceArg that points at it.
+#define CV__TRACE_DEFINE_ARG_(arg_id, name, flags) \
+    static CV_TRACE_NS::details::TraceArg::ExtraData* CV__TRACE_ARG_EXTRA_VARNAME(arg_id) = 0; \
+    static const CV_TRACE_NS::details::TraceArg \
+        CV__TRACE_ARG_VARNAME(arg_id) = { &(CV__TRACE_ARG_EXTRA_VARNAME(arg_id)), name, flags };
+// Defines the TraceArg statics (flags 0) and reports the value through traceArg().
+#define CV__TRACE_ARG_VALUE(arg_id, arg_name, value) \
+        CV__TRACE_DEFINE_ARG_(arg_id, arg_name, 0); \
+        CV_TRACE_NS::details::traceArg((CV__TRACE_ARG_VARNAME(arg_id)), value);
+// Note: expands to the public CV_TRACE_ARG_VALUE (single underscore), so its __region_fn.isActive() check applies.
+#define CV__TRACE_ARG(arg_id) CV_TRACE_ARG_VALUE(arg_id, #arg_id, (arg_id))
+
+} // namespace
+
+#ifndef OPENCV_DISABLE_TRACE // tracing enabled: replace the no-op public macros with the real implementations
+#undef CV_TRACE_FUNCTION
+#undef CV_TRACE_FUNCTION_SKIP_NESTED
+#if __OPENCV_TRACE // function regions are defined without REGION_FLAG_APP_CODE
+#define CV_TRACE_FUNCTION CV__TRACE_OPENCV_FUNCTION
+#define CV_TRACE_FUNCTION_SKIP_NESTED CV__TRACE_OPENCV_FUNCTION_SKIP_NESTED
+#else // function regions carry REGION_FLAG_APP_CODE
+#define CV_TRACE_FUNCTION CV__TRACE_APP_FUNCTION
+#define CV_TRACE_FUNCTION_SKIP_NESTED CV__TRACE_APP_FUNCTION_SKIP_NESTED
+#endif
+
+#undef CV_TRACE_REGION
+#define CV_TRACE_REGION CV__TRACE_REGION
+
+#undef CV_TRACE_REGION_NEXT
+#define CV_TRACE_REGION_NEXT CV__TRACE_REGION_NEXT
+// Needs __region_fn (declared by the CV_TRACE_FUNCTION* macros) in scope; expands to a bare if-block, not do { } while (0).
+#undef CV_TRACE_ARG_VALUE
+#define CV_TRACE_ARG_VALUE(arg_id, arg_name, value) \
+        if (__region_fn.isActive()) \
+        { \
+            CV__TRACE_ARG_VALUE(arg_id, arg_name, value); \
+        }
+
+#undef CV_TRACE_ARG
+#define CV_TRACE_ARG CV__TRACE_ARG
+
+#endif // OPENCV_DISABLE_TRACE
+
+#ifdef OPENCV_TRACE_VERBOSE // opt-in: the *_VERBOSE macros forward to the regular CV_TRACE_* macros
+#define CV_TRACE_FUNCTION_VERBOSE CV_TRACE_FUNCTION
+#define CV_TRACE_REGION_VERBOSE CV_TRACE_REGION
+#define CV_TRACE_REGION_NEXT_VERBOSE CV_TRACE_REGION_NEXT
+#define CV_TRACE_ARG_VALUE_VERBOSE CV_TRACE_ARG_VALUE
+#define CV_TRACE_ARG_VERBOSE CV_TRACE_ARG
+#else // default: the *_VERBOSE macros accept any arguments and expand to nothing
+#define CV_TRACE_FUNCTION_VERBOSE(...)
+#define CV_TRACE_REGION_VERBOSE(...)
+#define CV_TRACE_REGION_NEXT_VERBOSE(...)
+#define CV_TRACE_ARG_VALUE_VERBOSE(...)
+#define CV_TRACE_ARG_VERBOSE(...)
+#endif
+
+//! @endcond
+
+//! @}
+
+}}} // namespace
+
+#endif // OPENCV_TRACE_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/va_intel.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/va_intel.hpp
new file mode 100644
index 0000000..b37ce75
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/va_intel.hpp
@@ -0,0 +1,75 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2015, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifndef OPENCV_CORE_VA_INTEL_HPP
+#define OPENCV_CORE_VA_INTEL_HPP
+
+#ifndef __cplusplus
+#  error va_intel.hpp header must be compiled as C++
+#endif
+
+#include "opencv2/core.hpp"
+#include "ocl.hpp"
+
+#if defined(HAVE_VA) // HAVE_VA set: take VADisplay / VASurfaceID from va/va.h
+# include "va/va.h"
+#else  // HAVE_VA
+# if !defined(_VA_H_) // stand-in typedefs for the declarations below; _VA_H_ is presumably va.h's include guard -- TODO confirm
+    typedef void* VADisplay;
+    typedef unsigned int VASurfaceID;
+# endif // !_VA_H_
+#endif // HAVE_VA
+
+namespace cv { namespace va_intel {
+
+/** @addtogroup core_va_intel
+This section describes Intel VA-API/OpenCL (CL-VA) interoperability.
+
+To enable basic VA interoperability build OpenCV with libva library integration enabled: `-DWITH_VA=ON` (corresponding dev package should be installed).
+
+To enable advanced CL-VA interoperability support on Intel HW, enable option: `-DWITH_VA_INTEL=ON` (OpenCL integration should be enabled which is the default setting). Special runtime environment should be set up in order to use this feature: correct combination of [libva](https://github.com/intel/libva), [OpenCL runtime](https://github.com/intel/compute-runtime) and [media driver](https://github.com/intel/media-driver) should be installed.
+
+Check usage example for details: samples/va_intel/va_intel_interop.cpp
+*/
+//! @{
+
+/////////////////// CL-VA Interoperability Functions ///////////////////
+
+namespace ocl {
+using namespace cv::ocl; // presumably where the Context type used below comes from -- TODO confirm
+
+// TODO static functions in the Context class
+/** @brief Creates OpenCL context from VA.
+@param display    - VADisplay for which CL interop should be established.
+@param tryInterop - if true, try to set up interoperability; if false, set up for the slow copy path.
+@return Returns reference to OpenCL Context
+ */
+CV_EXPORTS Context& initializeContextFromVA(VADisplay display, bool tryInterop = true);
+
+} // namespace cv::va_intel::ocl
+
+/** @brief Converts InputArray to VASurfaceID object.
+@param display - VADisplay object.
+@param src     - source InputArray.
+@param surface - destination VASurfaceID object.
+@param size    - size of image represented by VASurfaceID object.
+ */
+CV_EXPORTS void convertToVASurface(VADisplay display, InputArray src, VASurfaceID surface, Size size);
+
+/** @brief Converts VASurfaceID object to OutputArray.
+@param display - VADisplay object.
+@param surface - source VASurfaceID object.
+@param size    - size of image represented by VASurfaceID object.
+@param dst     - destination OutputArray.
+ */
+CV_EXPORTS void convertFromVASurface(VADisplay display, VASurfaceID surface, Size size, OutputArray dst);
+
+//! @}
+
+}} // namespace cv::va_intel
+
+#endif /* OPENCV_CORE_VA_INTEL_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/version.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/version.hpp
new file mode 100644
index 0000000..7b129b6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/version.hpp
@@ -0,0 +1,26 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VERSION_HPP
+#define OPENCV_VERSION_HPP
+
+#define CV_VERSION_MAJOR    4
+#define CV_VERSION_MINOR    6
+#define CV_VERSION_REVISION 0
+#define CV_VERSION_STATUS   ""
+// Two-step stringification: CVAUX_STR macro-expands its argument before CVAUX_STR_EXP applies '#'.
+#define CVAUX_STR_EXP(__A)  #__A
+#define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)
+// Wide-character counterparts: the L prefix is pasted onto the stringized argument.
+#define CVAUX_STRW_EXP(__A)  L ## #__A
+#define CVAUX_STRW(__A)      CVAUX_STRW_EXP(__A)
+// Adjacent string literals concatenate: with the values above this yields "4.6.0".
+#define CV_VERSION          CVAUX_STR(CV_VERSION_MAJOR) "." CVAUX_STR(CV_VERSION_MINOR) "." CVAUX_STR(CV_VERSION_REVISION) CV_VERSION_STATUS
+
+/* old-style version constants */
+#define CV_MAJOR_VERSION    CV_VERSION_MAJOR
+#define CV_MINOR_VERSION    CV_VERSION_MINOR
+#define CV_SUBMINOR_VERSION CV_VERSION_REVISION
+
+#endif // OPENCV_VERSION_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/vsx_utils.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/vsx_utils.hpp
new file mode 100644
index 0000000..79a1074
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/core/vsx_utils.hpp
@@ -0,0 +1,1047 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_VSX_UTILS_HPP
+#define OPENCV_HAL_VSX_UTILS_HPP
+
+#include "opencv2/core/cvdef.h"
+
+#ifndef SKIP_INCLUDES
+#   include <assert.h>
+#endif
+
+//! @addtogroup core_utils_vsx
+//! @{
+#if CV_VSX
+// Brace-initialized values of type c in which v is repeated 16 / 8 / 4 / 2 times.
+#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
+#define __VSX_S8__(c, v)  (c){v, v, v, v, v, v, v, v}
+#define __VSX_S4__(c, v)  (c){v, v, v, v}
+#define __VSX_S2__(c, v)  (c){v, v}
+// Pattern used by this family: _set(...) lists the elements, _sp(c) repeats c in every element, _c(v) casts v, _z is _sp(0).
+typedef __vector unsigned char vec_uchar16;
+#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
+#define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, (unsigned char)c))
+#define vec_uchar16_c(v)     ((vec_uchar16)(v))
+#define vec_uchar16_z        vec_uchar16_sp(0)
+
+typedef __vector signed char vec_char16;
+#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
+#define vec_char16_sp(c)    (__VSX_S16__(vec_char16, (signed char)c))
+#define vec_char16_c(v)     ((vec_char16)(v))
+#define vec_char16_z        vec_char16_sp(0)
+
+typedef __vector unsigned short vec_ushort8;
+#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
+#define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, (unsigned short)c))
+#define vec_ushort8_c(v)     ((vec_ushort8)(v))
+#define vec_ushort8_z        vec_ushort8_sp(0)
+
+typedef __vector signed short vec_short8;
+#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
+#define vec_short8_sp(c)    (__VSX_S8__(vec_short8, (signed short)c))
+#define vec_short8_c(v)     ((vec_short8)(v))
+#define vec_short8_z        vec_short8_sp(0)
+
+typedef __vector unsigned int vec_uint4;
+#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
+#define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, (unsigned int)c))
+#define vec_uint4_c(v)     ((vec_uint4)(v))
+#define vec_uint4_z        vec_uint4_sp(0)
+
+typedef __vector signed int vec_int4;
+#define vec_int4_set(...)  (vec_int4){__VA_ARGS__}
+#define vec_int4_sp(c)     (__VSX_S4__(vec_int4, (signed int)c))
+#define vec_int4_c(v)      ((vec_int4)(v))
+#define vec_int4_z         vec_int4_sp(0)
+
+typedef __vector float vec_float4;
+#define vec_float4_set(...)  (vec_float4){__VA_ARGS__}
+#define vec_float4_sp(c)     (__VSX_S4__(vec_float4, c))
+#define vec_float4_c(v)      ((vec_float4)(v))
+#define vec_float4_z         vec_float4_sp(0)
+
+typedef __vector unsigned long long vec_udword2;
+#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
+#define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, (unsigned long long)c))
+#define vec_udword2_c(v)     ((vec_udword2)(v))
+#define vec_udword2_z        vec_udword2_sp(0)
+
+typedef __vector signed long long vec_dword2;
+#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
+#define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, (signed long long)c))
+#define vec_dword2_c(v)     ((vec_dword2)(v))
+#define vec_dword2_z        vec_dword2_sp(0)
+
+typedef  __vector double vec_double2;
+#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
+#define vec_double2_c(v)     ((vec_double2)(v))
+#define vec_double2_sp(c)    (__VSX_S2__(vec_double2, c))
+#define vec_double2_z        vec_double2_sp(0)
+
+#define vec_bchar16           __vector __bool char
+#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
+#define vec_bchar16_c(v)     ((vec_bchar16)(v))
+
+#define vec_bshort8           __vector __bool short
+#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
+#define vec_bshort8_c(v)     ((vec_bshort8)(v))
+
+#define vec_bint4             __vector __bool int
+#define vec_bint4_set(...)   (vec_bint4){__VA_ARGS__}
+#define vec_bint4_c(v)       ((vec_bint4)(v))
+
+#define vec_bdword2            __vector __bool long long
+#define vec_bdword2_set(...)  (vec_bdword2){__VA_ARGS__}
+#define vec_bdword2_c(v)      ((vec_bdword2)(v))
+// Declaration prefix for the helper functions defined below: extern inline, forced inlining, return type tp.
+#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
+// VSX_REDIRECT_1RG / _2RG: define fnm as a thin wrapper that forwards its one / two arguments of type rg to fn2.
+#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2)   \
+VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
+
+#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2)   \
+VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
+
+/*
+ * GCC VSX compatibility
+**/
+#if defined(__GNUG__) && !defined(__clang__)
+
+// inline asm helper
+#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a)       \
+{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }
+
+#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a)        \
+{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
+
+#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)     \
+VSX_FINLINE(rt) fnm(const rg& a, const rg& b)  \
+{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
+
+#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
+
+#if __GNUG__ < 8
+
+    // Support for int4 -> dword2 expanding multiply was added in GCC 8.
+    #ifdef vec_mule
+        #undef vec_mule
+    #endif
+    #ifdef vec_mulo
+        #undef vec_mulo
+    #endif
+
+    VSX_REDIRECT_2RG(vec_ushort8,  vec_uchar16,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_int4,  vec_short8,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_uint4,  vec_ushort8,  vec_mule, __builtin_vec_mule)
+    VSX_REDIRECT_2RG(vec_ushort8,  vec_uchar16,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_short8,  vec_char16,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_int4,  vec_short8,  vec_mulo, __builtin_vec_mulo)
+    VSX_REDIRECT_2RG(vec_uint4,  vec_ushort8,  vec_mulo, __builtin_vec_mulo)
+    // NOTE(review): vec_mule is bound to the vmulo* opcodes and vec_mulo to the vmule* ones -- presumably an element-numbering (endianness) adjustment; confirm before changing.
+    // dword2 support arrived in ISA 2.07 and GCC 8+
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulosw, vec_mule)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
+    VSX_IMPL_2VRG(vec_dword2,  vec_int4,  vmulesw, vec_mulo)
+    VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
+
+#endif
+
+#if __GNUG__ < 7
+// up to GCC 6, vec_mul only supports float/double precisions and long long
+#   ifdef vec_mul
+#       undef vec_mul
+#   endif
+/*
+ * ISA 2.07 has no direct instruction for 8-bit or 16-bit multiplication;
+ * XLC implements it with the instructions "multiply even", "multiply odd" and "permute"
+**/
+#   define VSX_IMPL_MULH(Tvec, cperm)                                        \
+    VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                  \
+    {                                                                        \
+        static const vec_uchar16 ev_od = {cperm};                            \
+        return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od);  \
+    }
+    #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
+    VSX_IMPL_MULH(vec_char16,  VSX_IMPL_MULH_P16)
+    VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
+    #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
+    VSX_IMPL_MULH(vec_short8,  VSX_IMPL_MULH_P8)
+    VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
+    // vmuluwm can be used for unsigned or signed integers, that's what they said
+    VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
+    VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
+    // redirect to GCC builtin vec_mul, since it already supports precisions and llong
+    VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
+    VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
+#endif // __GNUG__ < 7
+
+#if __GNUG__ < 6
+/*
+ * Instruction "compare greater than or equal" in ISA 2.07 only supports single
+ * and double precision.
+ * XLC and newer versions of GCC implement the integer variants with the "greater than" instruction followed by NOR.
+**/
+#   ifdef vec_cmpge
+#       undef vec_cmpge
+#   endif
+#   ifdef vec_cmple
+#       undef vec_cmple
+#   endif
+#   define vec_cmple(a, b) vec_cmpge(b, a)
+#   define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
+    VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
+
+    VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
+    VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
+
+// redirect to GCC builtin cmpge, since it already supports precisions
+    VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
+    VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
+
+// up to gcc5 vec_nor doesn't support bool long long
+#   undef vec_nor
+    template<typename T>
+    VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
+
+    VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
+    { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
+
+// vec_packs doesn't support double words in gcc4 and old versions of gcc5
+#   undef vec_packs
+    VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
+    VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
+
+    VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
+    VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
+#endif // __GNUG__ < 6
+
+#if __GNUG__ < 5
+// vec_xxpermdi in gcc4 is missing little-endian support, just like clang
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
+// same as vec_xxpermdi
+#   undef vec_vbpermq
+    VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
+    VSX_IMPL_2VRG(vec_dword2,  vec_char16, vbpermq, vec_vbpermq)
+#else
+#   define vec_permi vec_xxpermdi
+#endif // __GNUG__ < 5
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw __builtin_vsx_xxsldwi
+#endif
+
+// vector population count
+VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
+VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
+VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
+VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
+VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
+VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
+VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
+VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
+
+// converts between single and double-precision
+VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
+VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
+
+// converts word and doubleword to double-precision
+#undef vec_ctd
+VSX_IMPL_1RG(vec_double2, vec_int4,    xvcvsxwdp, vec_ctdo)
+VSX_IMPL_1RG(vec_double2, vec_uint4,   xvcvuxwdp, vec_ctdo)
+VSX_IMPL_1RG(vec_double2, vec_dword2,  xvcvsxddp, vec_ctd)
+VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)
+
+// converts word and doubleword to single-precision
+#undef vec_ctf
+VSX_IMPL_1RG(vec_float4, vec_int4,    xvcvsxwsp, vec_ctf)
+VSX_IMPL_1RG(vec_float4, vec_uint4,   xvcvuxwsp, vec_ctf)
+VSX_IMPL_1RG(vec_float4, vec_dword2,  xvcvsxdsp, vec_ctfo)
+VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)
+
+// converts single and double precision to signed word
+#undef vec_cts
+VSX_IMPL_1RG(vec_int4,  vec_double2, xvcvdpsxws, vec_ctso)
+VSX_IMPL_1RG(vec_int4,  vec_float4,  xvcvspsxws, vec_cts)
+
+// converts single and double precision to unsigned word
+#undef vec_ctu
+VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
+VSX_IMPL_1RG(vec_uint4, vec_float4,  xvcvspuxws, vec_ctu)
+
+// converts single and double precision to signed doubleword
+#undef vec_ctsl
+VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
+VSX_IMPL_1RG(vec_dword2, vec_float4,  xvcvspsxds, vec_ctslo)
+
+// converts single and double precision to unsigned doubleword
+#undef vec_ctul
+VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
+VSX_IMPL_1RG(vec_udword2, vec_float4,  xvcvspuxds, vec_ctulo)
+
+// just in case GCC doesn't define it
+#ifndef vec_xl
+#   define vec_xl vec_vsx_ld
+#   define vec_xst vec_vsx_st
+#endif
+
+#endif // GCC VSX compatibility
+
+/*
+ * CLANG VSX compatibility
+**/
+#if defined(__clang__) && !defined(__IBMCPP__)
+
+/*
+ * CLANG doesn't support %x<n> in the inline asm template which fixes register number
+ * when using any of the register constraints wa, wd, wf
+ *
+ * For more explanation, check out "PowerPC and IBM RS6000" in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
+ * Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
+ *
+ * So we cannot use inline asm here; we only use the built-in functions that CLANG supports,
+ * falling back to __builtin_convertvector where clang lacks a vector-conversion built-in
+ *
+ * todo: clang asm template bug is fixed, need to reconsider the current workarounds.
+*/
+
+// convert vector helper
+#define VSX_IMPL_CONVERT(rt, rg, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
+
+#ifndef vec_permi
+#if __clang_major__ < 5
+// implement vec_permi in a dirty way
+#   define VSX_IMPL_CLANG_4_PERMI(Tvec)                                                 \
+    VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c)    \
+    {                                                                                   \
+        switch (c)                                                                      \
+        {                                                                               \
+        case 0:                                                                         \
+            return vec_mergeh(a, b);                                                    \
+        case 1:                                                                         \
+            return vec_mergel(vec_mergeh(a, a), b);                                     \
+        case 2:                                                                         \
+            return vec_mergeh(vec_mergel(a, a), b);                                     \
+        default:                                                                        \
+            return vec_mergel(a, b);                                                    \
+        }                                                                               \
+    }
+    VSX_IMPL_CLANG_4_PERMI(vec_udword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_dword2)
+    VSX_IMPL_CLANG_4_PERMI(vec_double2)
+
+// vec_xxsldwi is missing in clang 4
+#   define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
+#else
+// vec_xxpermdi is missing little-endian supports in clang 4 just like gcc4
+#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
+#endif // __clang_major__ < 5
+#endif
+
+// shift left double by word immediate
+#ifndef vec_sldw
+#   define vec_sldw vec_xxsldwi
+#endif
+
+#if __clang_major__ < 13
+// Implement vec_rsqrt since clang only supports vec_rsqrte
+#ifndef vec_rsqrt
+    VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
+    { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
+
+    VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
+    { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
+#endif
+
+// vec_promote missing support for doubleword
+VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
+{
+    vec_dword2 ret = vec_dword2_z;
+    ret[b & 1] = a;
+    return ret;
+}
+
+VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
+{
+    vec_udword2 ret = vec_udword2_z;
+    ret[b & 1] = a;
+    return ret;
+}
+#endif
+
+// vec_popcnt should return unsigned, but clang disagrees (just like gcc in vec_vpopcnt); signed-input results are cast via ucast
+#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast)   \
+VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a)  \
+{ return ucast(vec_popcnt(a)); }
+VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c) // no ';': the macro already expands to a complete function definition
+VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c)
+VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c)
+VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c)
+// redirect unsigned types
+VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
+VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)
+
+// vec_cvfo: converts between single and double precision via the xvcvdpsp / xvcvspdp builtins
+VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
+VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
+
+// converts word (vec_ctdo) and doubleword (vec_ctd) to double precision
+#ifdef vec_ctd // drop any existing vec_ctd macro before the name is redefined below
+#   undef vec_ctd
+#endif
+VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
+VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
+
+VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
+VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
+
+// converts word (vec_ctf) and doubleword (vec_ctfo) to single precision
+#if __clang_major__ > 4
+#   undef vec_ctf
+#endif
+VSX_IMPL_CONVERT(vec_float4, vec_int4,    vec_ctf)
+VSX_IMPL_CONVERT(vec_float4, vec_uint4,   vec_ctf)
+VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctfo, __builtin_vsx_xvcvsxdsp)
+VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
+
+// converts single (vec_cts) and double (vec_ctso) precision to signed word
+#if __clang_major__ > 4
+#   undef vec_cts
+#endif
+VSX_REDIRECT_1RG(vec_int4,  vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
+VSX_IMPL_CONVERT(vec_int4,  vec_float4,  vec_cts)
+
+// converts single (vec_ctu) and double (vec_ctuo) precision to unsigned word
+#if __clang_major__ > 4
+#   undef vec_ctu
+#endif
+VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
+VSX_IMPL_CONVERT(vec_uint4, vec_float4,  vec_ctu)
+
+// converts double (vec_ctsl) and single (vec_ctslo) precision to signed doubleword
+#ifdef vec_ctsl
+#   undef vec_ctsl
+#endif
+VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
+// __builtin_convertvector cannot do this conversion (xvcvspsxds is missing), so widen to double first
+VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
+{ return vec_ctsl(vec_cvfo(a)); }
+
+// converts double (vec_ctul) and single (vec_ctulo) precision to unsigned doubleword
+#ifdef vec_ctul
+#   undef vec_ctul
+#endif
+VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
+// __builtin_convertvector cannot do this conversion (xvcvspuxds is missing), so widen to double first
+VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
+{ return vec_ctul(vec_cvfo(a)); }
+
+#endif // CLANG VSX compatibility
+
+/*
+ * Common GCC, CLANG compatibility: even-element conversions derived from the o-suffixed variants
+**/
+#if defined(__GNUG__) && !defined(__IBMCPP__)
+
+#ifdef vec_cvf
+#   undef vec_cvf
+#endif
+
+#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{ return fn2(vec_sldw(a, a, 1)); } // shift the words of `a` by one, then reuse the o-variant fn2
+
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf,  vec_cvfo)
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4,   vec_ctd,  vec_ctdo)
+VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4,  vec_ctd,  vec_ctdo)
+
+VSX_IMPL_CONV_EVEN_4_2(vec_dword2,  vec_float4, vec_ctsl, vec_ctslo)
+VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
+
+#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{                                                \
+    rt v4 = fn2(a);                              \
+    return vec_sldw(v4, v4, 3);                  \
+} // here the o-variant runs first and its result is shifted by three words
+
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2,  vec_ctf, vec_ctfo)
+VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
+
+VSX_IMPL_CONV_EVEN_2_4(vec_int4,   vec_double2, vec_cts, vec_ctso)
+VSX_IMPL_CONV_EVEN_2_4(vec_uint4,  vec_double2, vec_ctu, vec_ctuo)
+
+// Only for Eigen!
+/*
+ * Changing the behavior of the conversion intrinsics on gcc affects Eigen,
+ * so the old two-argument behavior is redefined here, only for gcc and clang
+*/
+#if !defined(__clang__) || __clang_major__ > 4
+    // the second argument is ignored (asserted to be 0) since Eigen only truncates toward zero
+#   define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2)     \
+    VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
+    {                                                   \
+        assert(only_truncate == 0);                     \
+        CV_UNUSED(only_truncate);                       \
+        return fn2(a);                                  \
+    }
+    VSX_IMPL_CONV_2VARIANT(vec_int4,   vec_float4,  vec_cts, vec_cts)
+    VSX_IMPL_CONV_2VARIANT(vec_uint4,  vec_float4,  vec_ctu, vec_ctu)
+    VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4,    vec_ctf, vec_ctf)
+    VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4,   vec_ctf, vec_ctf)
+    // define vec_cts for converting double precision to signed doubleword,
+    // which isn't compatible with xlc, but that is okay since Eigen only uses it with gcc
+    VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
+#endif // Eigen
+
+#endif // Common GCC, CLANG compatibility
+
+/*
+ * XLC VSX compatibility: one-argument overloads plus the o-suffixed (odd-element) conversions
+**/
+#if defined(__IBMCPP__)
+
+// vector population count: on XLC vec_popcntu is simply an alias of vec_popcnt
+#define vec_popcntu vec_popcnt
+
+// overload and redirect, setting the second argument to zero,
+// since only conversions without the second argument are supported
+#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
+VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
+
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
+VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
+
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_int4,    vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_uint4,   vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_dword2,  vec_ctf)
+VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_udword2, vec_ctf)
+
+VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_double2, vec_cts)
+VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_float4,  vec_cts)
+
+VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_double2, vec_ctu)
+VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_float4,  vec_ctu)
+
+VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_double2, vec_ctsl)
+VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_float4,  vec_ctsl)
+
+VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
+VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
+
+// FIXME: conversions of odd-numbered elements are implemented in a roundabout way (word shift + even
+// conversion) since xlc doesn't support VSX register operands in inline asm.
+#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
+VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
+
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
+VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
+
+VSX_IMPL_CONV_ODD_4_2(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
+VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
+
+#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2)  \
+VSX_FINLINE(rt) fnm(const rg& a)                 \
+{                                                \
+    rt v4 = fn2(a);                              \
+    return vec_sldw(v4, v4, 1);                  \
+} // here the even conversion fn2 runs first and its result is shifted by one word
+
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2,  vec_ctfo, vec_ctf)
+VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
+
+VSX_IMPL_CONV_ODD_2_4(vec_int4,   vec_double2, vec_ctso, vec_cts)
+VSX_IMPL_CONV_ODD_2_4(vec_uint4,  vec_double2, vec_ctuo, vec_ctu)
+
+#endif // XLC VSX compatibility
+
+// silence the GCC warning caused by -Wunused-but-set-variable in rare cases by tagging the declaration unused
+#if defined(__GNUG__) && !defined(__clang__)
+#   define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
+#else // CLANG, XLC: expands to the plain type
+#   define VSX_UNUSED(Tvec) Tvec
+#endif
+
+// gcc resolves the long int overloads by itself; on XLC and CLANG the call is ambiguous, so forward explicitly
+#if defined(__clang__) || defined(__IBMCPP__)
+    VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
+    { return vec_splats(static_cast<unsigned long long>(v)); }
+
+    VSX_FINLINE(vec_dword2) vec_splats(int64 v)
+    { return vec_splats(static_cast<long long>(v)); }
+
+    VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
+    { return vec_promote(static_cast<unsigned long long>(a), b); }
+
+    VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
+    { return vec_promote(static_cast<long long>(a), b); }
+#endif
+
+/*
+ * vsx_ld(offset, pointer), vsx_st(vector, offset, pointer):
+ * load and store with an offset counted in elements of the pointed-to type
+ *
+ * vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer):
+ * load and store with an offset counted in bytes
+ *
+ * Note: on clang, vec_xl and vec_xst fail to load unaligned addresses,
+ * so vec_vsx_ld and vec_vsx_st are used there instead
+*/
+
+#if defined(__clang__) && !defined(__IBMCPP__)
+#   define vsx_ldf  vec_vsx_ld
+#   define vsx_stf  vec_vsx_st
+#else // GCC , XLC
+#   define vsx_ldf  vec_xl
+#   define vsx_stf  vec_xst
+#endif
+
+#define VSX_OFFSET(o, p) ((o) * sizeof(*(p))) // element count -> byte count for p's element type
+#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
+#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
+
+/*
+ * vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer): load and store doublewords
+ * On GCC, vec_xl and vec_xst map to vec_vsx_ld, vec_vsx_st, which don't support long long,
+ * and on CLANG vec_vsx_ld, vec_vsx_st are used because vec_xl, vec_xst fail to load unaligned addresses
+ *
+ * On XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long
+*/
+#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
+    // GCC, CLANG: go through 32-bit int pointers and reinterpret the vector as doublewords
+    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
+
+    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
+#else // XLC: cast the pointer to (unsigned) long long and load/store directly
+    VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
+    { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
+
+    VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
+    { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
+#endif
+
+// Store the lower 8 bytes of v to p (doubleword 0 of v viewed as vec_udword2); parenthesized to stay one expression
+#define vec_st_l8(v, p) (*((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0))
+
+// Store the higher 8 bytes of v to p (doubleword 1 of v viewed as vec_udword2); parenthesized to stay one expression
+#define vec_st_h8(v, p) (*((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1))
+
+// Load 64 bits of integer data from p into the lower part (vec_promote at index 0), viewed as Tvec
+#define VSX_IMPL_LOAD_L8(Tvec, Tp)                  \
+VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p)            \
+{ return ((Tvec)vec_promote(*((uint64*)p), 0)); } // NOTE(review): the cast drops const; p is only read here
+
+VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
+VSX_IMPL_LOAD_L8(vec_char16,  schar)
+VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
+VSX_IMPL_LOAD_L8(vec_short8,  short)
+VSX_IMPL_LOAD_L8(vec_uint4,   uint)
+VSX_IMPL_LOAD_L8(vec_int4,    int)
+VSX_IMPL_LOAD_L8(vec_float4,  float)
+VSX_IMPL_LOAD_L8(vec_udword2, uint64)
+VSX_IMPL_LOAD_L8(vec_dword2,  int64)
+VSX_IMPL_LOAD_L8(vec_double2, double)
+
+// logical not, expressed as vec_nor(a, a); note the argument appears twice in the expansion
+#define vec_not(a) vec_nor(a, a)
+
+// fallbacks for operations that may already be provided (presumably on Power9 toolchains)
+// not equal: negated vec_cmpeq, defined only when vec_cmpne is not already a macro
+#ifndef vec_cmpne
+#   define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
+#endif
+
+// absolute difference: max(a, b) - min(a, b), replacing any vec_absd when not building for Power9
+#ifndef _ARCH_PWR9
+#   undef vec_absd
+#   define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
+
+/*
+ * vec_unpacklu and vec_unpackhu: merge the input with the `zero` vector and view the result as the wider type,
+ * needed since vec_unpackl, vec_unpackh only support signed integers
+**/
+#define VSX_IMPL_UNPACKU(rt, rg, zero)      \
+VSX_FINLINE(rt) vec_unpacklu(const rg& a)   \
+{ return (rt)(vec_mergel(a, zero)); }       \
+VSX_FINLINE(rt) vec_unpackhu(const rg& a)   \
+{ return (rt)(vec_mergeh(a, zero));  }
+
+VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
+VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
+VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
+
+/*
+ * vec_mergesqe and vec_mergesqo
+ * Merge, in sequence, the even (sqe) or odd (sqo) elements of two vectors using a vec_perm byte table
+*/
+#define VSX_IMPL_PERM(rt, fnm, ...)            \
+VSX_FINLINE(rt) fnm(const rt& a, const rt& b)  \
+{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
+
+// 16 lanes: every other single byte
+#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
+VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
+VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
+// 8 lanes: every other 2-byte group
+#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
+#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
+VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
+VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
+// 4 lanes: every other 4-byte group
+#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
+VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
+VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
+// 2 lanes: no table needed, redirect to vec_mergeh / vec_mergel
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
+
+/*
+ * vec_mergesqh and vec_mergesql
+ * Merge, in sequence, the most (sqh) or least (sql) significant halves of two vectors via a vec_udword2 view
+*/
+#define VSX_IMPL_MERGESQHL(Tvec)                                    \
+VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)        \
+{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); }    \
+VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)        \
+{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
+VSX_IMPL_MERGESQHL(vec_uchar16)
+VSX_IMPL_MERGESQHL(vec_char16)
+VSX_IMPL_MERGESQHL(vec_ushort8)
+VSX_IMPL_MERGESQHL(vec_short8)
+VSX_IMPL_MERGESQHL(vec_uint4)
+VSX_IMPL_MERGESQHL(vec_int4)
+VSX_IMPL_MERGESQHL(vec_float4)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
+VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
+
+
+// 2- and 4-channel interleaved stores for all types except the 2-lane ones; vsx_stf offsets are in bytes
+#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                    \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                           \
+    vsx_stf(vec_mergeh(a, b), 0, ptr);                                      \
+    vsx_stf(vec_mergel(a, b), 16, ptr);                                     \
+}                                                                           \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    Tvec ac = vec_mergeh(a, c);                                             \
+    Tvec bd = vec_mergeh(b, d);                                             \
+    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                    \
+    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                   \
+    ac = vec_mergel(a, c);                                                  \
+    bd = vec_mergel(b, d);                                                  \
+    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                   \
+    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                   \
+}
+VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
+VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
+VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
+VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
+VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
+
+// 2- and 4-channel deinterleaving loads for 16-lane vectors (8-bit elements); vsx_ld offsets count elements
+#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(16, ptr);                                              \
+    Tvec v2 = vsx_ld(32, ptr);                                              \
+    Tvec v3 = vsx_ld(48, ptr);                                              \
+    Tvec m0 = vec_mergesqe(v0, v1);                                         \
+    Tvec m1 = vec_mergesqe(v2, v3);                                         \
+    a = vec_mergesqe(m0, m1);                                               \
+    c = vec_mergesqo(m0, m1);                                               \
+    m0 = vec_mergesqo(v0, v1);                                              \
+    m1 = vec_mergesqo(v2, v3);                                              \
+    b = vec_mergesqe(m0, m1);                                               \
+    d = vec_mergesqo(m0, m1);                                               \
+}
+VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
+VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
+
+// 2- and 4-channel deinterleaving loads for 8-lane vectors (16-bit elements); vsx_ld offsets count elements
+#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    a = vec_mergesqe(v0, v1);                                               \
+    b = vec_mergesqo(v0, v1);                                               \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(8, ptr);                                               \
+    Tvec m0 = vec_mergeh(v0, v1);                                           \
+    Tvec m1 = vec_mergel(v0, v1);                                           \
+    Tvec ab0 = vec_mergeh(m0, m1);                                          \
+    Tvec cd0 = vec_mergel(m0, m1);                                          \
+    v0 = vsx_ld(16, ptr);                                                   \
+    v1 = vsx_ld(24, ptr);                                                   \
+    m0 = vec_mergeh(v0, v1);                                                \
+    m1 = vec_mergel(v0, v1);                                                \
+    Tvec ab1 = vec_mergeh(m0, m1);                                          \
+    Tvec cd1 = vec_mergel(m0, m1);                                          \
+    a = vec_mergesqh(ab0, ab1);                                             \
+    b = vec_mergesql(ab0, ab1);                                             \
+    c = vec_mergesqh(cd0, cd1);                                             \
+    d = vec_mergesql(cd0, cd1);                                             \
+}
+VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
+VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
+
+// 2- and 4-channel deinterleaving loads for 4-lane vectors (32-bit elements); vsx_ld offsets count elements
+#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    a = vsx_ld(0, ptr);                                                     \
+    b = vsx_ld(4, ptr);                                                     \
+    Tvec m0 = vec_mergeh(a, b);                                             \
+    Tvec m1 = vec_mergel(a, b);                                             \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = vsx_ld(0, ptr);                                               \
+    Tvec v1 = vsx_ld(4, ptr);                                               \
+    Tvec v2 = vsx_ld(8, ptr);                                               \
+    Tvec v3 = vsx_ld(12, ptr);                                              \
+    Tvec m0 = vec_mergeh(v0, v2);                                           \
+    Tvec m1 = vec_mergeh(v1, v3);                                           \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+    m0 = vec_mergel(v0, v2);                                                \
+    m1 = vec_mergel(v1, v3);                                                \
+    c = vec_mergeh(m0, m1);                                                 \
+    d = vec_mergel(m0, m1);                                                 \
+}
+VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
+VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
+VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
+
+// 2- and 4-channel interleave and deinterleave for 2-lane vectors; ld_func / st_func offsets count elements
+#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)             \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergel(a, b), 2, ptr);                                      \
+}                                                                           \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
+                                     const Tvec& c, const Tvec& d, Tp* ptr) \
+{                                                                           \
+    st_func(vec_mergeh(a, b), 0, ptr);                                      \
+    st_func(vec_mergeh(c, d), 2, ptr);                                      \
+    st_func(vec_mergel(a, b), 4, ptr);                                      \
+    st_func(vec_mergel(c, d), 6, ptr);                                      \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
+{                                                                           \
+    Tvec m0 = ld_func(0, ptr);                                              \
+    Tvec m1 = ld_func(2, ptr);                                              \
+    a = vec_mergeh(m0, m1);                                                 \
+    b = vec_mergel(m0, m1);                                                 \
+}                                                                           \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
+                                       Tvec& c, Tvec& d)                    \
+{                                                                           \
+    Tvec v0 = ld_func(0, ptr);                                              \
+    Tvec v1 = ld_func(2, ptr);                                              \
+    Tvec v2 = ld_func(4, ptr);                                              \
+    Tvec v3 = ld_func(6, ptr);                                              \
+    a = vec_mergeh(v0, v2);                                                 \
+    b = vec_mergel(v0, v2);                                                 \
+    c = vec_mergeh(v1, v3);                                                 \
+    d = vec_mergel(v1, v3);                                                 \
+}
+VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
+
+/* 3 channels, 16 lanes: each output is two chained vec_perm calls; 0 entries in the *12 tables are placeholders overwritten by the second permute */
+#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
+    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
+    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
+    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
+    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
+}                                                                                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(16, ptr);                                                                    \
+    Tvec v3 = vsx_ld(32, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
+VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) /* 3 channels, 8 lanes: two chained vec_perm per vector */ \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
+                                     const Tvec& c, Tp* ptr)                                      \
+{                                                                                                 \
+    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
+    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
+    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
+    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
+    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
+    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
+    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
+    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
+    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
+}                                                                                                 \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
+{                                                                                                 \
+    Tvec v1 = vsx_ld(0, ptr);                                                                     \
+    Tvec v2 = vsx_ld(8, ptr);                                                                     \
+    Tvec v3 = vsx_ld(16, ptr);                                                                    \
+    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
+    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
+    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
+    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
+    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
+    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
+    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
+    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
+    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
+VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) /* 3 channels, 4 lanes: merges, vec_sld and vec_perm */    \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                  \
+                                     const Tvec& c, Tp* ptr)                                       \
+{                                                                                                  \
+    Tvec hbc = vec_mergeh(b, c);                                                                   \
+    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};      \
+    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                        \
+    Tvec lab = vec_mergel(a, b);                                                                   \
+    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                          \
+    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
+    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                        \
+}                                                                                                  \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                    \
+{                                                                                                  \
+    Tvec v1 = vsx_ld(0, ptr);                                                                      \
+    Tvec v2 = vsx_ld(4, ptr);                                                                      \
+    Tvec v3 = vsx_ld(8, ptr);                                                                      \
+    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};   \
+    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                     \
+    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};  \
+    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                    \
+    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                     \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
+VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
+
+#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) /* 3 channels, 2 lanes */ \
+VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,    \
+                                     const Tvec& c, Tp* ptr)         \
+{                                                                    \
+    st_func(vec_mergeh(a, b), 0, ptr);                               \
+    st_func(vec_permi(c, a, 1), 2, ptr);                             \
+    st_func(vec_mergel(b, c), 4, ptr);                               \
+}                                                                    \
+VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,        \
+                                       Tvec& b, Tvec& c)             \
+{                                                                    \
+    Tvec v1 = ld_func(0, ptr);                                       \
+    Tvec v2 = ld_func(2, ptr);                                       \
+    Tvec v3 = ld_func(4, ptr);                                       \
+    a = vec_permi(v1, v2, 1);                                        \
+    b = vec_permi(v1, v3, 2);                                        \
+    c = vec_permi(v2, v3, 1);                                        \
+}
+VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
+VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
+
+#endif // CV_VSX
+
+//! @}
+
+#endif // OPENCV_HAL_VSX_UTILS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/cvconfig.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/cvconfig.h
new file mode 100644
index 0000000..ceeeb8d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/cvconfig.h
@@ -0,0 +1,149 @@
+#ifndef OPENCV_CVCONFIG_H_INCLUDED
+#define OPENCV_CVCONFIG_H_INCLUDED
+
+/* OpenCV compiled as static or dynamic libs */
+/* #undef BUILD_SHARED_LIBS */
+
+/* OpenCV intrinsics optimized code */
+#define CV_ENABLE_INTRINSICS
+
+/* OpenCV additional optimized code */
+/* #undef CV_DISABLE_OPTIMIZATION */
+
+/* Compile for 'real' NVIDIA GPU architectures */
+#define CUDA_ARCH_BIN ""
+
+/* NVIDIA GPU features are used */
+#define CUDA_ARCH_FEATURES ""
+
+/* Compile for 'virtual' NVIDIA PTX architectures */
+#define CUDA_ARCH_PTX ""
+
+/* AMD's Basic Linear Algebra Subprograms Library*/
+/* #undef HAVE_CLAMDBLAS */
+
+/* AMD's OpenCL Fast Fourier Transform Library*/
+/* #undef HAVE_CLAMDFFT */
+
+/* Clp support */
+/* #undef HAVE_CLP */
+
+/* NVIDIA CUDA Runtime API*/
+/* #undef HAVE_CUDA */
+
+/* NVIDIA CUDA Basic Linear Algebra Subprograms (BLAS) API*/
+/* #undef HAVE_CUBLAS */
+
+/* NVIDIA CUDA Deep Neural Network (cuDNN) API*/
+/* #undef HAVE_CUDNN */
+
+/* NVIDIA CUDA Fast Fourier Transform (FFT) API*/
+/* #undef HAVE_CUFFT */
+
+/* DirectX */
+/* #undef HAVE_DIRECTX */
+/* #undef HAVE_DIRECTX_NV12 */
+/* #undef HAVE_D3D11 */
+/* #undef HAVE_D3D10 */
+/* #undef HAVE_D3D9 */
+
+/* Eigen Matrix & Linear Algebra Library */
+/* #undef HAVE_EIGEN */
+
+/* Geospatial Data Abstraction Library */
+/* #undef HAVE_GDAL */
+
+/* Halide support */
+/* #undef HAVE_HALIDE */
+
+/* Vulkan support */
+/* #undef HAVE_VULKAN */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+/* #undef HAVE_INTTYPES_H */
+
+/* Intel Integrated Performance Primitives */
+/* #undef HAVE_IPP */
+/* #undef HAVE_IPP_ICV */
+/* #undef HAVE_IPP_IW */
+/* #undef HAVE_IPP_IW_LL */
+
+/* JPEG-2000 codec */
+/* #undef HAVE_OPENJPEG */
+/* #undef HAVE_JASPER */
+
+/* IJG JPEG codec */
+/* #undef HAVE_JPEG */
+
+/* libpng/png.h needs to be included */
+/* #undef HAVE_LIBPNG_PNG_H */
+
+/* GDCM DICOM codec */
+/* #undef HAVE_GDCM */
+
+/* NVIDIA Video Decoding API*/
+/* #undef HAVE_NVCUVID */
+/* #undef HAVE_NVCUVID_HEADER */
+/* #undef HAVE_DYNLINK_NVCUVID_HEADER */
+
+/* NVIDIA Video Encoding API*/
+/* #undef HAVE_NVCUVENC */
+
+/* OpenCL Support */
+/* #undef HAVE_OPENCL */
+/* #undef HAVE_OPENCL_STATIC */
+/* #undef HAVE_OPENCL_SVM */
+
+/* NVIDIA OpenCL D3D Extensions support */
+/* #undef HAVE_OPENCL_D3D11_NV */
+
+/* OpenEXR codec */
+/* #undef HAVE_OPENEXR */
+
+/* OpenGL support*/
+/* #undef HAVE_OPENGL */
+
+/* PNG codec */
+/* #undef HAVE_PNG */
+
+/* Posix threads (pthreads) */
+#define HAVE_PTHREAD
+
+/* parallel_for with pthreads */
+/* #undef HAVE_PTHREADS_PF */
+
+/* Intel Threading Building Blocks */
+/* #undef HAVE_TBB */
+
+/* Ste||ar Group High Performance ParallelX */
+/* #undef HAVE_HPX */
+
+/* TIFF codec */
+/* #undef HAVE_TIFF */
+
+/* Define if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). */
+/* #undef WORDS_BIGENDIAN */
+
+/* VA library (libva) */
+/* #undef HAVE_VA */
+
+/* Intel VA-API/OpenCL */
+/* #undef HAVE_VA_INTEL */
+
+/* Lapack */
+/* #undef HAVE_LAPACK */
+
+/* Library was compiled with functions instrumentation */
+/* #undef ENABLE_INSTRUMENTATION */
+
+/* OpenVX */
+/* #undef HAVE_OPENVX */
+
+/* OpenCV trace utilities */
+/* #undef OPENCV_TRACE */
+
+/* Library QR-code decoding */
+/* #undef HAVE_QUIRC */
+
+#endif // OPENCV_CVCONFIG_H_INCLUDED
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d.hpp
new file mode 100644
index 0000000..952d24c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d.hpp
@@ -0,0 +1,1535 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_FEATURES_2D_HPP
+#define OPENCV_FEATURES_2D_HPP
+
+#include "opencv2/opencv_modules.hpp"
+#include "opencv2/core.hpp"
+
+#ifdef HAVE_OPENCV_FLANN
+#include "opencv2/flann/miniflann.hpp"
+#endif
+
+/**
+  @defgroup features2d 2D Features Framework
+  @{
+    @defgroup features2d_main Feature Detection and Description
+    @defgroup features2d_match Descriptor Matchers
+
+Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables you to
+easily switch between different algorithms solving the same problem. This section is devoted to
+matching descriptors that are represented as vectors in a multidimensional space. All objects that
+implement vector descriptor matchers inherit the DescriptorMatcher interface.
+
+    @defgroup features2d_draw Drawing Function of Keypoints and Matches
+    @defgroup features2d_category Object Categorization
+
+This section describes approaches based on local 2D features and used to categorize objects.
+
+    @defgroup feature2d_hal Hardware Acceleration Layer
+    @{
+        @defgroup features2d_hal_interface Interface
+    @}
+  @}
+ */
+
+namespace cv
+{
+
+//! @addtogroup features2d_main
+//! @{
+
+// //! writes vector of keypoints to the file storage
+// CV_EXPORTS void write(FileStorage& fs, const String& name, const std::vector<KeyPoint>& keypoints);
+// //! reads vector of keypoints from the specified file storage node
+// CV_EXPORTS void read(const FileNode& node, CV_OUT std::vector<KeyPoint>& keypoints);
+
+/** @brief A class that filters a vector of keypoints.
+
+ Because it is currently difficult to provide a convenient interface for all usage scenarios of the
+ keypoints filter class, it only offers the few static methods that have been needed so far.
+ */
+class CV_EXPORTS KeyPointsFilter
+{
+public:
+    KeyPointsFilter(){}
+
+    /*
+     * Remove keypoints within borderSize pixels of an image edge.
+     */
+    static void runByImageBorder( std::vector<KeyPoint>& keypoints, Size imageSize, int borderSize );
+    /*
+     * Remove keypoints whose size is outside the range given by minSize and maxSize.
+     */
+    static void runByKeypointSize( std::vector<KeyPoint>& keypoints, float minSize,
+                                   float maxSize=FLT_MAX );
+    /*
+     * Remove keypoints of an image according to a mask defined over the pixels of that image.
+     */
+    static void runByPixelsMask( std::vector<KeyPoint>& keypoints, const Mat& mask );
+    /*
+     * Remove duplicated keypoints.
+     */
+    static void removeDuplicated( std::vector<KeyPoint>& keypoints );
+    /*
+     * Remove duplicated keypoints and sort the remaining keypoints.
+     */
+    static void removeDuplicatedSorted( std::vector<KeyPoint>& keypoints );
+
+    /*
+     * Retain the specified number of the best keypoints (according to the response).
+     */
+    static void retainBest( std::vector<KeyPoint>& keypoints, int npoints );
+};
+
+
+/************************************ Base Classes ************************************/
+
+/** @brief Abstract base class for 2D image feature detectors and descriptor extractors
+*/
+#ifdef __EMSCRIPTEN__
+class CV_EXPORTS_W Feature2D : public Algorithm
+#else
+class CV_EXPORTS_W Feature2D : public virtual Algorithm
+#endif
+{
+public:
+    virtual ~Feature2D();
+
+    /** @brief Detects keypoints in an image (first variant) or image set (second variant).
+
+    @param image Image.
+    @param keypoints The detected keypoints. In the second variant of the method keypoints[i] is a set
+    of keypoints detected in images[i] .
+    @param mask Mask specifying where to look for keypoints (optional). It must be an 8-bit integer
+    matrix with non-zero values in the region of interest.
+     */
+    CV_WRAP virtual void detect( InputArray image,
+                                 CV_OUT std::vector<KeyPoint>& keypoints,
+                                 InputArray mask=noArray() );
+
+    /** @overload
+    @param images Image set.
+    @param keypoints The detected keypoints. In the second variant of the method keypoints[i] is a set
+    of keypoints detected in images[i] .
+    @param masks Masks for each input image specifying where to look for keypoints (optional).
+    masks[i] is a mask for images[i].
+    */
+    CV_WRAP virtual void detect( InputArrayOfArrays images,
+                         CV_OUT std::vector<std::vector<KeyPoint> >& keypoints,
+                         InputArrayOfArrays masks=noArray() );
+
+    /** @brief Computes the descriptors for a set of keypoints detected in an image (first variant) or image set
+    (second variant).
+
+    @param image Image.
+    @param keypoints Input collection of keypoints. Keypoints for which a descriptor cannot be
+    computed are removed. Sometimes new keypoints can be added, for example: SIFT duplicates keypoint
+    with several dominant orientations (for each orientation).
+    @param descriptors Computed descriptors. In the second variant of the method descriptors[i] are
+    descriptors computed for keypoints[i]. Row j of descriptors (or descriptors[i]) is the
+    descriptor for the j-th keypoint.
+     */
+    CV_WRAP virtual void compute( InputArray image,
+                                  CV_OUT CV_IN_OUT std::vector<KeyPoint>& keypoints,
+                                  OutputArray descriptors );
+
+    /** @overload
+
+    @param images Image set.
+    @param keypoints Input collection of keypoints. Keypoints for which a descriptor cannot be
+    computed are removed. Sometimes new keypoints can be added, for example: SIFT duplicates keypoint
+    with several dominant orientations (for each orientation).
+    @param descriptors Computed descriptors. In the second variant of the method descriptors[i] are
+    descriptors computed for keypoints[i]. Row j of descriptors (or descriptors[i]) is the
+    descriptor for the j-th keypoint.
+    */
+    CV_WRAP virtual void compute( InputArrayOfArrays images,
+                          CV_OUT CV_IN_OUT std::vector<std::vector<KeyPoint> >& keypoints,
+                          OutputArrayOfArrays descriptors );
+
+    /** Detects keypoints and computes the descriptors */
+    CV_WRAP virtual void detectAndCompute( InputArray image, InputArray mask,
+                                           CV_OUT std::vector<KeyPoint>& keypoints,
+                                           OutputArray descriptors,
+                                           bool useProvidedKeypoints=false );
+
+    CV_WRAP virtual int descriptorSize() const;
+    CV_WRAP virtual int descriptorType() const;
+    CV_WRAP virtual int defaultNorm() const;
+
+    CV_WRAP void write( const String& fileName ) const;
+
+    CV_WRAP void read( const String& fileName );
+
+    virtual void write( FileStorage&) const CV_OVERRIDE;
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP virtual void read( const FileNode&) CV_OVERRIDE;
+
+    //! Return true if detector object is empty
+    CV_WRAP virtual bool empty() const CV_OVERRIDE;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP inline void write(const Ptr<FileStorage>& fs, const String& name = String()) const { Algorithm::write(fs, name); }
+};
+
+/** Feature detectors in OpenCV have wrappers with a common interface that enables you to easily switch
+between different algorithms solving the same problem. All objects that implement keypoint detectors
+inherit the FeatureDetector interface. */
+typedef Feature2D FeatureDetector;
+
+/** Extractors of keypoint descriptors in OpenCV have wrappers with a common interface that enables you
+to easily switch between different algorithms solving the same problem. This section is devoted to
+computing descriptors represented as vectors in a multidimensional space. All objects that implement
+the vector descriptor extractors inherit the DescriptorExtractor interface.
+ */
+typedef Feature2D DescriptorExtractor;
+
+
+/** @brief Class implementing the wrapper which makes detectors and extractors affine invariant,
+described as ASIFT in @cite YM11 .
+*/
+class CV_EXPORTS_W AffineFeature : public Feature2D
+{
+public:
+    /**
+    @param backend The detector/extractor you want to use as backend.
+    @param maxTilt The highest power index of tilt factor. 5 is used in the paper as tilt sampling range n.
+    @param minTilt The lowest power index of tilt factor. 0 is used in the paper.
+    @param tiltStep Tilt sampling step \f$\delta_t\f$ in Algorithm 1 in the paper.
+    @param rotateStepBase Rotation sampling step factor b in Algorithm 1 in the paper.
+    */
+    CV_WRAP static Ptr<AffineFeature> create(const Ptr<Feature2D>& backend,
+        int maxTilt = 5, int minTilt = 0, float tiltStep = 1.4142135623730951f, float rotateStepBase = 72);
+
+    CV_WRAP virtual void setViewParams(const std::vector<float>& tilts, const std::vector<float>& rolls) = 0;
+    CV_WRAP virtual void getViewParams(std::vector<float>& tilts, std::vector<float>& rolls) const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+typedef AffineFeature AffineFeatureDetector;
+typedef AffineFeature AffineDescriptorExtractor;
+
+
+/** @brief Class for extracting keypoints and computing descriptors using the Scale Invariant Feature Transform
+(SIFT) algorithm by D. Lowe @cite Lowe04 .
+*/
+class CV_EXPORTS_W SIFT : public Feature2D
+{
+public:
+    /**
+    @param nfeatures The number of best features to retain. The features are ranked by their scores
+    (measured in SIFT algorithm as the local contrast)
+
+    @param nOctaveLayers The number of layers in each octave. 3 is the value used in D. Lowe paper. The
+    number of octaves is computed automatically from the image resolution.
+
+    @param contrastThreshold The contrast threshold used to filter out weak features in semi-uniform
+    (low-contrast) regions. The larger the threshold, the fewer features are produced by the detector.
+
+    @note The contrast threshold will be divided by nOctaveLayers when the filtering is applied. When
+    nOctaveLayers is set to default and if you want to use the value used in D. Lowe paper, 0.03, set
+    this argument to 0.09.
+
+    @param edgeThreshold The threshold used to filter out edge-like features. Note that its meaning
+    is different from the contrastThreshold, i.e. the larger the edgeThreshold, the fewer features are
+    filtered out (more features are retained).
+
+    @param sigma The sigma of the Gaussian applied to the input image at the octave \#0. If your image
+    is captured with a weak camera with soft lenses, you might want to reduce the number.
+    */
+    CV_WRAP static Ptr<SIFT> create(int nfeatures = 0, int nOctaveLayers = 3,
+        double contrastThreshold = 0.04, double edgeThreshold = 10,
+        double sigma = 1.6);
+
+    /** @brief Create SIFT with specified descriptorType.
+    @param nfeatures The number of best features to retain. The features are ranked by their scores
+    (measured in SIFT algorithm as the local contrast)
+
+    @param nOctaveLayers The number of layers in each octave. 3 is the value used in D. Lowe paper. The
+    number of octaves is computed automatically from the image resolution.
+
+    @param contrastThreshold The contrast threshold used to filter out weak features in semi-uniform
+    (low-contrast) regions. The larger the threshold, the fewer features are produced by the detector.
+
+    @note The contrast threshold will be divided by nOctaveLayers when the filtering is applied. When
+    nOctaveLayers is set to default and if you want to use the value used in D. Lowe paper, 0.03, set
+    this argument to 0.09.
+
+    @param edgeThreshold The threshold used to filter out edge-like features. Note that its meaning
+    is different from the contrastThreshold, i.e. the larger the edgeThreshold, the fewer features are
+    filtered out (more features are retained).
+
+    @param sigma The sigma of the Gaussian applied to the input image at the octave \#0. If your image
+    is captured with a weak camera with soft lenses, you might want to reduce the number.
+
+    @param descriptorType The type of descriptors. Only CV_32F and CV_8U are supported.
+    */
+    CV_WRAP static Ptr<SIFT> create(int nfeatures, int nOctaveLayers,
+        double contrastThreshold, double edgeThreshold,
+        double sigma, int descriptorType);
+
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+typedef SIFT SiftFeatureDetector;
+typedef SIFT SiftDescriptorExtractor;
+
+
+/** @brief Class implementing the BRISK keypoint detector and descriptor extractor, described in @cite LCS11 .
+ */
+class CV_EXPORTS_W BRISK : public Feature2D
+{
+public:
+    /** @brief The BRISK constructor
+
+    @param thresh AGAST detection threshold score.
+    @param octaves detection octaves. Use 0 to do single scale.
+    @param patternScale apply this scale to the pattern used for sampling the neighbourhood of a
+    keypoint.
+     */
+    CV_WRAP static Ptr<BRISK> create(int thresh=30, int octaves=3, float patternScale=1.0f);
+
+    /** @brief The BRISK constructor for a custom pattern
+
+    @param radiusList defines the radii (in pixels) where the samples around a keypoint are taken (for
+    keypoint scale 1).
+    @param numberList defines the number of sampling points on the sampling circle. Must be the same
+    size as radiusList.
+    @param dMax threshold for the short pairings used for descriptor formation (in pixels for keypoint
+    scale 1).
+    @param dMin threshold for the long pairings used for orientation determination (in pixels for
+    keypoint scale 1).
+    @param indexChange index remapping of the bits. */
+    CV_WRAP static Ptr<BRISK> create(const std::vector<float> &radiusList, const std::vector<int> &numberList,
+        float dMax=5.85f, float dMin=8.2f, const std::vector<int>& indexChange=std::vector<int>());
+
+    /** @brief The BRISK constructor for a custom pattern, detection threshold and octaves
+
+    @param thresh AGAST detection threshold score.
+    @param octaves detection octaves. Use 0 to do single scale.
+    @param radiusList defines the radii (in pixels) where the samples around a keypoint are taken (for
+    keypoint scale 1).
+    @param numberList defines the number of sampling points on the sampling circle. Must be the same
+    size as radiusList.
+    @param dMax threshold for the short pairings used for descriptor formation (in pixels for keypoint
+    scale 1).
+    @param dMin threshold for the long pairings used for orientation determination (in pixels for
+    keypoint scale 1).
+    @param indexChange index remapping of the bits. */
+    CV_WRAP static Ptr<BRISK> create(int thresh, int octaves, const std::vector<float> &radiusList,
+        const std::vector<int> &numberList, float dMax=5.85f, float dMin=8.2f,
+        const std::vector<int>& indexChange=std::vector<int>());
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+
+    /** @brief Set detection threshold.
+    @param threshold AGAST detection threshold score.
+    */
+    CV_WRAP virtual void setThreshold(int threshold) { CV_UNUSED(threshold); return; }
+    CV_WRAP virtual int getThreshold() const { return -1; }
+
+    /** @brief Set detection octaves.
+    @param octaves detection octaves. Use 0 to do single scale.
+    */
+    CV_WRAP virtual void setOctaves(int octaves) { CV_UNUSED(octaves); return; }
+    CV_WRAP virtual int getOctaves() const { return -1; }
+};
+
+/** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor
+
+described in @cite RRKB11 . The algorithm uses FAST in pyramids to detect stable keypoints, selects
+the strongest features using FAST or Harris response, finds their orientation using first-order
+moments and computes the descriptors using BRIEF (where the coordinates of random point pairs (or
+k-tuples) are rotated according to the measured orientation).
+ */
+class CV_EXPORTS_W ORB : public Feature2D
+{
+public:
+    enum ScoreType { HARRIS_SCORE=0, FAST_SCORE=1 };
+    static const int kBytes = 32;
+
+    /** @brief The ORB constructor
+
+    @param nfeatures The maximum number of features to retain.
+    @param scaleFactor Pyramid decimation ratio, greater than 1. scaleFactor==2 means the classical
+    pyramid, where each next level has 4x fewer pixels than the previous, but such a big scale factor
+    will degrade feature matching scores dramatically. On the other hand, a scale factor too close
+    to 1 means that covering a certain scale range needs more pyramid levels, so the speed
+    will suffer.
+    @param nlevels The number of pyramid levels. The smallest level will have linear size equal to
+    input_image_linear_size/pow(scaleFactor, nlevels - firstLevel).
+    @param edgeThreshold This is size of the border where the features are not detected. It should
+    roughly match the patchSize parameter.
+    @param firstLevel The level of pyramid to put source image to. Previous layers are filled
+    with upscaled source image.
+    @param WTA_K The number of points that produce each element of the oriented BRIEF descriptor. The
+    default value 2 means the BRIEF where we take a random point pair and compare their brightnesses,
+    so we get 0/1 response. Other possible values are 3 and 4. For example, 3 means that we take 3
+    random points (of course, those point coordinates are random, but they are generated from the
+    pre-defined seed, so each element of BRIEF descriptor is computed deterministically from the pixel
+    rectangle), find point of maximum brightness and output index of the winner (0, 1 or 2). Such
+    output will occupy 2 bits, and therefore it will need a special variant of Hamming distance,
+    denoted as NORM_HAMMING2 (2 bits per bin). When WTA_K=4, we take 4 random points to compute each
+    bin (that will also occupy 2 bits with possible values 0, 1, 2 or 3).
+    @param scoreType The default HARRIS_SCORE means that Harris algorithm is used to rank features
+    (the score is written to KeyPoint::response and is used to retain best nfeatures features);
+    FAST_SCORE is alternative value of the parameter that produces slightly less stable keypoints,
+    but it is a little faster to compute.
+    @param patchSize size of the patch used by the oriented BRIEF descriptor. Of course, on smaller
+    pyramid layers the perceived image area covered by a feature will be larger.
+    @param fastThreshold the fast threshold
+     */
+    CV_WRAP static Ptr<ORB> create(int nfeatures=500, float scaleFactor=1.2f, int nlevels=8, int edgeThreshold=31,
+        int firstLevel=0, int WTA_K=2, ORB::ScoreType scoreType=ORB::HARRIS_SCORE, int patchSize=31, int fastThreshold=20);
+
+    CV_WRAP virtual void setMaxFeatures(int maxFeatures) = 0;
+    CV_WRAP virtual int getMaxFeatures() const = 0;
+
+    CV_WRAP virtual void setScaleFactor(double scaleFactor) = 0;
+    CV_WRAP virtual double getScaleFactor() const = 0;
+
+    CV_WRAP virtual void setNLevels(int nlevels) = 0;
+    CV_WRAP virtual int getNLevels() const = 0;
+
+    CV_WRAP virtual void setEdgeThreshold(int edgeThreshold) = 0;
+    CV_WRAP virtual int getEdgeThreshold() const = 0;
+
+    CV_WRAP virtual void setFirstLevel(int firstLevel) = 0;
+    CV_WRAP virtual int getFirstLevel() const = 0;
+
+    CV_WRAP virtual void setWTA_K(int wta_k) = 0;
+    CV_WRAP virtual int getWTA_K() const = 0;
+
+    CV_WRAP virtual void setScoreType(ORB::ScoreType scoreType) = 0;
+    CV_WRAP virtual ORB::ScoreType getScoreType() const = 0;
+
+    CV_WRAP virtual void setPatchSize(int patchSize) = 0;
+    CV_WRAP virtual int getPatchSize() const = 0;
+
+    CV_WRAP virtual void setFastThreshold(int fastThreshold) = 0;
+    CV_WRAP virtual int getFastThreshold() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Maximally stable extremal region extractor
+
+The class encapsulates all the parameters of the %MSER extraction algorithm (see [wiki
+article](http://en.wikipedia.org/wiki/Maximally_stable_extremal_regions)).
+
+- there are two different implementations of %MSER: one for grey images, one for color images
+
+- the grey image algorithm is taken from: @cite nister2008linear ;  the paper claims to be faster
+than union-find method; it actually gets 1.5~2m/s on my centrino L7200 1.2GHz laptop.
+
+- the color image algorithm is taken from: @cite forssen2007maximally ; it should be much slower
+than grey image method ( 3~4 times )
+
+- (Python) A complete example showing the use of the %MSER detector can be found at samples/python/mser.py
+*/
+class CV_EXPORTS_W MSER : public Feature2D
+{
+public:
+    /** @brief Full constructor for %MSER detector
+
+    @param delta it compares \f$(size_{i}-size_{i-delta})/size_{i-delta}\f$
+    @param min_area prune areas smaller than min_area
+    @param max_area prune areas bigger than max_area
+    @param max_variation prune areas that have a similar size to their children
+    @param min_diversity for color image, trace back to cut off mser with diversity less than min_diversity
+    @param max_evolution  for color image, the evolution steps
+    @param area_threshold for color image, the area threshold to cause re-initialize
+    @param min_margin for color image, ignore too small margin
+    @param edge_blur_size for color image, the aperture size for edge blur
+     */
+    CV_WRAP static Ptr<MSER> create( int delta=5, int min_area=60, int max_area=14400,
+          double max_variation=0.25, double min_diversity=.2,
+          int max_evolution=200, double area_threshold=1.01,
+          double min_margin=0.003, int edge_blur_size=5 );
+
+    /** @brief Detect %MSER regions
+
+    @param image input image (8UC1, 8UC3 or 8UC4, must be at least 3x3)
+    @param msers resulting list of point sets
+    @param bboxes resulting bounding boxes
+    */
+    CV_WRAP virtual void detectRegions( InputArray image,
+                                        CV_OUT std::vector<std::vector<Point> >& msers,
+                                        CV_OUT std::vector<Rect>& bboxes ) = 0;
+
+    CV_WRAP virtual void setDelta(int delta) = 0;
+    CV_WRAP virtual int getDelta() const = 0;
+
+    CV_WRAP virtual void setMinArea(int minArea) = 0;
+    CV_WRAP virtual int getMinArea() const = 0;
+
+    CV_WRAP virtual void setMaxArea(int maxArea) = 0;
+    CV_WRAP virtual int getMaxArea() const = 0;
+
+    CV_WRAP virtual void setPass2Only(bool f) = 0;
+    CV_WRAP virtual bool getPass2Only() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+//! @} features2d_main
+
+//! @addtogroup features2d_main
+//! @{
+
+/** @brief Wrapping class for feature detection using the FAST method.
+ */
+class CV_EXPORTS_W FastFeatureDetector : public Feature2D
+{
+public:
+    enum DetectorType
+    {
+        TYPE_5_8 = 0, TYPE_7_12 = 1, TYPE_9_16 = 2
+    };
+    enum
+    {
+        THRESHOLD = 10000, NONMAX_SUPPRESSION=10001, FAST_N=10002
+    };
+
+
+    CV_WRAP static Ptr<FastFeatureDetector> create( int threshold=10,
+                                                    bool nonmaxSuppression=true,
+                                                    FastFeatureDetector::DetectorType type=FastFeatureDetector::TYPE_9_16 );
+
+    CV_WRAP virtual void setThreshold(int threshold) = 0;
+    CV_WRAP virtual int getThreshold() const = 0;
+
+    CV_WRAP virtual void setNonmaxSuppression(bool f) = 0;
+    CV_WRAP virtual bool getNonmaxSuppression() const = 0;
+
+    CV_WRAP virtual void setType(FastFeatureDetector::DetectorType type) = 0;
+    CV_WRAP virtual FastFeatureDetector::DetectorType getType() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @overload */
+CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression=true );
+
+/** @brief Detects corners using the FAST algorithm
+
+@param image grayscale image where keypoints (corners) are detected.
+@param keypoints keypoints detected on the image.
+@param threshold threshold on difference between intensity of the central pixel and pixels of a
+circle around this pixel.
+@param nonmaxSuppression if true, non-maximum suppression is applied to detected corners
+(keypoints).
+@param type one of the three neighborhoods as defined in the paper:
+FastFeatureDetector::TYPE_9_16, FastFeatureDetector::TYPE_7_12,
+FastFeatureDetector::TYPE_5_8
+
+Detects corners using the FAST algorithm by @cite Rosten06 .
+
+@note In Python API, types are given as cv.FAST_FEATURE_DETECTOR_TYPE_5_8,
+cv.FAST_FEATURE_DETECTOR_TYPE_7_12 and cv.FAST_FEATURE_DETECTOR_TYPE_9_16. For corner
+detection, use cv.FAST.detect() method.
+ */
+CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression, FastFeatureDetector::DetectorType type );
+
+//! @} features2d_main
+
+//! @addtogroup features2d_main
+//! @{
+
+/** @brief Wrapping class for feature detection using the AGAST method.
+ */
+class CV_EXPORTS_W AgastFeatureDetector : public Feature2D
+{
+public:
+    enum DetectorType
+    {
+        AGAST_5_8 = 0, AGAST_7_12d = 1, AGAST_7_12s = 2, OAST_9_16 = 3,
+    };
+
+    enum
+    {
+        THRESHOLD = 10000, NONMAX_SUPPRESSION = 10001,
+    };
+
+    CV_WRAP static Ptr<AgastFeatureDetector> create( int threshold=10,
+                                                     bool nonmaxSuppression=true,
+                                                     AgastFeatureDetector::DetectorType type = AgastFeatureDetector::OAST_9_16);
+
+    CV_WRAP virtual void setThreshold(int threshold) = 0;
+    CV_WRAP virtual int getThreshold() const = 0;
+
+    CV_WRAP virtual void setNonmaxSuppression(bool f) = 0;
+    CV_WRAP virtual bool getNonmaxSuppression() const = 0;
+
+    CV_WRAP virtual void setType(AgastFeatureDetector::DetectorType type) = 0;
+    CV_WRAP virtual AgastFeatureDetector::DetectorType getType() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @overload Variant without the `type` argument; nonmaxSuppression defaults to true. */
+CV_EXPORTS void AGAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression=true );
+
+/** @brief Detects corners using the AGAST algorithm
+
+@param image grayscale image where keypoints (corners) are detected.
+@param keypoints keypoints detected on the image.
+@param threshold threshold on difference between intensity of the central pixel and pixels of a
+circle around this pixel.
+@param nonmaxSuppression if true, non-maximum suppression is applied to detected corners
+(keypoints).
+@param type one of the four neighborhoods as defined in the paper:
+AgastFeatureDetector::AGAST_5_8, AgastFeatureDetector::AGAST_7_12d,
+AgastFeatureDetector::AGAST_7_12s, AgastFeatureDetector::OAST_9_16
+
+For non-Intel platforms, there is a tree-optimised variant of AGAST with the same numerical results.
+The 32-bit binary tree tables were generated automatically from the original code using a perl script.
+The perl script and examples of tree generation are placed in the features2d/doc folder.
+Detects corners using the AGAST algorithm by @cite mair2010_agast .
+
+ */
+CV_EXPORTS void AGAST( InputArray image, CV_OUT std::vector<KeyPoint>& keypoints,
+                      int threshold, bool nonmaxSuppression, AgastFeatureDetector::DetectorType type );
+
+/** @brief Wrapping class for feature detection using the goodFeaturesToTrack function.
+ */
+class CV_EXPORTS_W GFTTDetector : public Feature2D
+{
+public:
+    CV_WRAP static Ptr<GFTTDetector> create( int maxCorners=1000, double qualityLevel=0.01, double minDistance=1,
+                                             int blockSize=3, bool useHarrisDetector=false, double k=0.04 );
+    CV_WRAP static Ptr<GFTTDetector> create( int maxCorners, double qualityLevel, double minDistance,
+                                             int blockSize, int gradiantSize, bool useHarrisDetector=false, double k=0.04 ); // NOTE(review): "gradiantSize" [sic] - presumably "gradient size"; name left as declared
+    CV_WRAP virtual void setMaxFeatures(int maxFeatures) = 0;
+    CV_WRAP virtual int getMaxFeatures() const = 0;
+
+    CV_WRAP virtual void setQualityLevel(double qlevel) = 0;
+    CV_WRAP virtual double getQualityLevel() const = 0;
+
+    CV_WRAP virtual void setMinDistance(double minDistance) = 0;
+    CV_WRAP virtual double getMinDistance() const = 0;
+
+    CV_WRAP virtual void setBlockSize(int blockSize) = 0;
+    CV_WRAP virtual int getBlockSize() const = 0;
+
+    CV_WRAP virtual void setHarrisDetector(bool val) = 0;
+    CV_WRAP virtual bool getHarrisDetector() const = 0;
+
+    CV_WRAP virtual void setK(double k) = 0;
+    CV_WRAP virtual double getK() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Class for extracting blobs from an image.
+
+The class implements a simple algorithm for extracting blobs from an image:
+
+1.  Convert the source image to binary images by applying thresholding with several thresholds from
+    minThreshold (inclusive) to maxThreshold (exclusive) with distance thresholdStep between
+    neighboring thresholds.
+2.  Extract connected components from every binary image by findContours and calculate their
+    centers.
+3.  Group centers from several binary images by their coordinates. Close centers form one group that
+    corresponds to one blob, which is controlled by the minDistBetweenBlobs parameter.
+4.  From the groups, estimate final centers of blobs and their radii and return as locations and
+    sizes of keypoints.
+
+This class performs several filtrations of returned blobs. You should set filterBy\* to true/false
+to turn on/off corresponding filtration. Available filtrations:
+
+-   **By color**. This filter compares the intensity of a binary image at the center of a blob to
+blobColor. If they differ, the blob is filtered out. Use blobColor = 0 to extract dark blobs
+and blobColor = 255 to extract light blobs.
+-   **By area**. Extracted blobs have an area between minArea (inclusive) and maxArea (exclusive).
+-   **By circularity**. Extracted blobs have circularity
+(\f$\frac{4*\pi*Area}{perimeter * perimeter}\f$) between minCircularity (inclusive) and
+maxCircularity (exclusive).
+-   **By ratio of the minimum inertia to maximum inertia**. Extracted blobs have this ratio
+between minInertiaRatio (inclusive) and maxInertiaRatio (exclusive).
+-   **By convexity**. Extracted blobs have convexity (area / area of blob convex hull) between
+minConvexity (inclusive) and maxConvexity (exclusive).
+
+Default values of parameters are tuned to extract dark circular blobs.
+ */
+class CV_EXPORTS_W SimpleBlobDetector : public Feature2D
+{
+public:
+  struct CV_EXPORTS_W_SIMPLE Params
+  {
+      CV_WRAP Params();
+      CV_PROP_RW float thresholdStep;
+      CV_PROP_RW float minThreshold;
+      CV_PROP_RW float maxThreshold;
+      CV_PROP_RW size_t minRepeatability;
+      CV_PROP_RW float minDistBetweenBlobs;
+
+      CV_PROP_RW bool filterByColor;
+      CV_PROP_RW uchar blobColor;
+
+      CV_PROP_RW bool filterByArea;
+      CV_PROP_RW float minArea, maxArea;
+
+      CV_PROP_RW bool filterByCircularity;
+      CV_PROP_RW float minCircularity, maxCircularity;
+
+      CV_PROP_RW bool filterByInertia;
+      CV_PROP_RW float minInertiaRatio, maxInertiaRatio;
+
+      CV_PROP_RW bool filterByConvexity;
+      CV_PROP_RW float minConvexity, maxConvexity;
+
+      void read( const FileNode& fn );
+      void write( FileStorage& fs ) const;
+  };
+
+  CV_WRAP static Ptr<SimpleBlobDetector>
+    create(const SimpleBlobDetector::Params &parameters = SimpleBlobDetector::Params());
+  CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+//! @} features2d_main
+
+//! @addtogroup features2d_main
+//! @{
+
+/** @brief Class implementing the KAZE keypoint detector and descriptor extractor, described in @cite ABD12 .
+
+@note AKAZE descriptor can only be used with KAZE or AKAZE keypoints. [ABD12] KAZE Features. Pablo
+F. Alcantarilla, Adrien Bartoli and Andrew J. Davison. In European Conference on Computer Vision
+(ECCV), Florence, Italy, October 2012.
+*/
+class CV_EXPORTS_W KAZE : public Feature2D
+{
+public:
+    enum DiffusivityType
+    {
+        DIFF_PM_G1 = 0,
+        DIFF_PM_G2 = 1,
+        DIFF_WEICKERT = 2,
+        DIFF_CHARBONNIER = 3
+    };
+
+    /** @brief The KAZE constructor
+
+    @param extended Set to enable extraction of extended (128-byte) descriptor.
+    @param upright Set to enable use of upright descriptors (non rotation-invariant).
+    @param threshold Detector response threshold to accept point
+    @param nOctaves Maximum octave evolution of the image
+    @param nOctaveLayers Default number of sublevels per scale level
+    @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
+    DIFF_CHARBONNIER
+     */
+    CV_WRAP static Ptr<KAZE> create(bool extended=false, bool upright=false,
+                                    float threshold = 0.001f,
+                                    int nOctaves = 4, int nOctaveLayers = 4,
+                                    KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
+
+    CV_WRAP virtual void setExtended(bool extended) = 0;
+    CV_WRAP virtual bool getExtended() const = 0;
+
+    CV_WRAP virtual void setUpright(bool upright) = 0;
+    CV_WRAP virtual bool getUpright() const = 0;
+
+    CV_WRAP virtual void setThreshold(double threshold) = 0; // note: double here, float in create()
+    CV_WRAP virtual double getThreshold() const = 0;
+
+    CV_WRAP virtual void setNOctaves(int octaves) = 0;
+    CV_WRAP virtual int getNOctaves() const = 0;
+
+    CV_WRAP virtual void setNOctaveLayers(int octaveLayers) = 0;
+    CV_WRAP virtual int getNOctaveLayers() const = 0;
+
+    CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
+    CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+/** @brief Class implementing the AKAZE keypoint detector and descriptor extractor, described in @cite ANB13.
+
+@details AKAZE descriptors can only be used with KAZE or AKAZE keypoints. This class is thread-safe.
+
+@note When you need descriptors, use Feature2D::detectAndCompute, which
+provides better performance. When using Feature2D::detect followed by
+Feature2D::compute, the scale space pyramid is computed twice.
+
+@note AKAZE implements T-API. When an image is passed as UMat, some parts of the algorithm
+will use OpenCL.
+
+@note [ANB13] Fast Explicit Diffusion for Accelerated Features in Nonlinear
+Scale Spaces. Pablo F. Alcantarilla, Jesús Nuevo and Adrien Bartoli. In
+British Machine Vision Conference (BMVC), Bristol, UK, September 2013.
+
+*/
+class CV_EXPORTS_W AKAZE : public Feature2D
+{
+public:
+    // AKAZE descriptor type
+    enum DescriptorType
+    {
+        DESCRIPTOR_KAZE_UPRIGHT = 2, ///< Upright descriptors, not invariant to rotation
+        DESCRIPTOR_KAZE = 3,
+        DESCRIPTOR_MLDB_UPRIGHT = 4, ///< Upright descriptors, not invariant to rotation
+        DESCRIPTOR_MLDB = 5
+    };
+
+    /** @brief The AKAZE constructor
+
+    @param descriptor_type Type of the extracted descriptor: DESCRIPTOR_KAZE,
+    DESCRIPTOR_KAZE_UPRIGHT, DESCRIPTOR_MLDB or DESCRIPTOR_MLDB_UPRIGHT.
+    @param descriptor_size Size of the descriptor in bits. 0 -\> Full size
+    @param descriptor_channels Number of channels in the descriptor (1, 2, 3)
+    @param threshold Detector response threshold to accept point
+    @param nOctaves Maximum octave evolution of the image
+    @param nOctaveLayers Default number of sublevels per scale level
+    @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or
+    DIFF_CHARBONNIER
+     */
+    CV_WRAP static Ptr<AKAZE> create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB,
+                                     int descriptor_size = 0, int descriptor_channels = 3,
+                                     float threshold = 0.001f, int nOctaves = 4,
+                                     int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2);
+
+    CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0;
+    CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0;
+
+    CV_WRAP virtual void setDescriptorSize(int dsize) = 0;
+    CV_WRAP virtual int getDescriptorSize() const = 0;
+
+    CV_WRAP virtual void setDescriptorChannels(int dch) = 0;
+    CV_WRAP virtual int getDescriptorChannels() const = 0;
+
+    CV_WRAP virtual void setThreshold(double threshold) = 0; // note: double here, float in create()
+    CV_WRAP virtual double getThreshold() const = 0;
+
+    CV_WRAP virtual void setNOctaves(int octaves) = 0;
+    CV_WRAP virtual int getNOctaves() const = 0;
+
+    CV_WRAP virtual void setNOctaveLayers(int octaveLayers) = 0;
+    CV_WRAP virtual int getNOctaveLayers() const = 0;
+
+    CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0;
+    CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0;
+    CV_WRAP virtual String getDefaultName() const CV_OVERRIDE;
+};
+
+//! @} features2d_main
+
+/****************************************************************************************\
+*                                      Distance                                          *
+\****************************************************************************************/
+
+//! Accumulator<T>::Type is T, except float for char, unsigned char, short and unsigned short.
+template<typename T> struct CV_EXPORTS Accumulator
+{
+    using Type = T;
+};
+
+template<> struct Accumulator<char>           { using Type = float; };
+template<> struct Accumulator<short>          { using Type = float; };
+template<> struct Accumulator<unsigned char>  { using Type = float; };
+template<> struct Accumulator<unsigned short> { using Type = float; };
+
+/*
+ * Functor returning normL2Sqr (squared Euclidean distance) of two arrays of `length` elements
+ */
+template<class T>
+struct CV_EXPORTS SL2
+{
+    using ValueType = T;
+    using ResultType = typename Accumulator<T>::Type;
+    static const NormTypes normType = NORM_L2SQR;
+
+    ResultType operator()( const T* lhs, const T* rhs, int length ) const
+    {
+        return normL2Sqr<T, ResultType>(lhs, rhs, length);
+    }
+};
+
+/*
+ * Functor returning the Euclidean distance: the square root of normL2Sqr, taken in double
+ */
+template<class T>
+struct L2
+{
+    using ValueType = T;
+    using ResultType = typename Accumulator<T>::Type;
+    static const NormTypes normType = NORM_L2;
+    ResultType operator()( const T* lhs, const T* rhs, int length ) const
+    {
+        const double squared = (double)normL2Sqr<T, ResultType>(lhs, rhs, length);
+        return (ResultType)std::sqrt(squared);
+    }
+};
+
+/*
+ * Functor returning normL1 (Manhattan / city block distance) of two arrays of `length` elements
+ */
+template<class T>
+struct L1
+{
+    using ValueType = T;
+    using ResultType = typename Accumulator<T>::Type;
+    static const NormTypes normType = NORM_L1;
+
+    ResultType operator()( const T* lhs, const T* rhs, int length ) const
+    {
+        return normL1<T, ResultType>(lhs, rhs, length);
+    }
+};
+
+/****************************************************************************************\
+*                                  DescriptorMatcher                                     *
+\****************************************************************************************/
+
+//! @addtogroup features2d_match
+//! @{
+
+/** @brief Abstract base class for matching keypoint descriptors.
+
+It has two groups of match methods: for matching descriptors of an image with another image or with
+an image set.
+ */
+class CV_EXPORTS_W DescriptorMatcher : public Algorithm
+{
+public:
+   enum MatcherType
+    {
+        FLANNBASED            = 1,
+        BRUTEFORCE            = 2,
+        BRUTEFORCE_L1         = 3,
+        BRUTEFORCE_HAMMING    = 4,
+        BRUTEFORCE_HAMMINGLUT = 5,
+        BRUTEFORCE_SL2        = 6
+    };
+
+    virtual ~DescriptorMatcher();
+
+    /** @brief Adds descriptors to train a CPU (trainDescCollection) or GPU (utrainDescCollection) descriptor
+    collection.
+
+    If the collection is not empty, the new descriptors are added to existing train descriptors.
+
+    @param descriptors Descriptors to add. Each descriptors[i] is a set of descriptors from the same
+    train image.
+     */
+    CV_WRAP virtual void add( InputArrayOfArrays descriptors );
+
+    /** @brief Returns a constant reference to the train descriptor collection trainDescCollection.
+     */
+    CV_WRAP const std::vector<Mat>& getTrainDescriptors() const;
+
+    /** @brief Clears the train descriptor collections.
+     */
+    CV_WRAP virtual void clear() CV_OVERRIDE;
+
+    /** @brief Returns true if there are no train descriptors in both collections.
+     */
+    CV_WRAP virtual bool empty() const CV_OVERRIDE;
+
+    /** @brief Returns true if the descriptor matcher supports masking permissible matches.
+     */
+    CV_WRAP virtual bool isMaskSupported() const = 0;
+
+    /** @brief Trains a descriptor matcher
+
+    Trains a descriptor matcher (for example, the flann index). In all methods to match, the method
+    train() is run every time before matching. Some descriptor matchers (for example, BruteForceMatcher)
+    have an empty implementation of this method. Other matchers really train their inner structures (for
+    example, FlannBasedMatcher trains flann::Index ).
+     */
+    CV_WRAP virtual void train();
+
+    /** @brief Finds the best match for each descriptor from a query set.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Matches. If a query descriptor is masked out in mask , no match is added for this
+    descriptor. So, matches size may be smaller than the query descriptors count.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+
+    In the first variant of this method, the train descriptors are passed as an input argument. In the
+    second variant of the method, train descriptors collection that was set by DescriptorMatcher::add is
+    used. Optional mask (or masks) can be passed to specify which query and training descriptors can be
+    matched. Namely, queryDescriptors[i] can be matched with trainDescriptors[j] only if
+    mask.at\<uchar\>(i,j) is non-zero.
+     */
+    CV_WRAP void match( InputArray queryDescriptors, InputArray trainDescriptors,
+                CV_OUT std::vector<DMatch>& matches, InputArray mask=noArray() ) const;
+
+    /** @brief Finds the k best matches for each descriptor from a query set.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+    @param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+
+    These extended variants of DescriptorMatcher::match methods find several best matches for each query
+    descriptor. The matches are returned in the distance increasing order. See DescriptorMatcher::match
+    for the details about query and train descriptors.
+     */
+    CV_WRAP void knnMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                   CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+                   InputArray mask=noArray(), bool compactResult=false ) const;
+
+    /** @brief For each query descriptor, finds the training descriptors not farther than the specified distance.
+
+    @param queryDescriptors Query set of descriptors.
+    @param trainDescriptors Train set of descriptors. This set is not added to the train descriptors
+    collection stored in the class object.
+    @param matches Found matches.
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance means here
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in Pixels)!
+    @param mask Mask specifying permissible matches between an input query and train matrices of
+    descriptors.
+
+    For each query descriptor, the methods find such training descriptors that the distance between the
+    query descriptor and the training descriptor is equal or smaller than maxDistance. Found matches are
+    returned in the distance increasing order.
+     */
+    CV_WRAP void radiusMatch( InputArray queryDescriptors, InputArray trainDescriptors,
+                      CV_OUT std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                      InputArray mask=noArray(), bool compactResult=false ) const;
+
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Matches. If a query descriptor is masked out in mask , no match is added for this
+    descriptor. So, matches size may be smaller than the query descriptors count.
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    */
+    CV_WRAP void match( InputArray queryDescriptors, CV_OUT std::vector<DMatch>& matches,
+                        InputArrayOfArrays masks=noArray() );
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Matches. Each matches[i] is k or less matches for the same query descriptor.
+    @param k Count of best matches found per each query descriptor or less if a query descriptor has
+    less than k possible matches in total.
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    */
+    CV_WRAP void knnMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, int k,
+                           InputArrayOfArrays masks=noArray(), bool compactResult=false );
+    /** @overload
+    @param queryDescriptors Query set of descriptors.
+    @param matches Found matches.
+    @param maxDistance Threshold for the distance between matched descriptors. Distance means here
+    metric distance (e.g. Hamming distance), not the distance between coordinates (which is measured
+    in Pixels)!
+    @param masks Set of masks. Each masks[i] specifies permissible matches between the input query
+    descriptors and stored train descriptors from the i-th image trainDescCollection[i].
+    @param compactResult Parameter used when the mask (or masks) is not empty. If compactResult is
+    false, the matches vector has the same size as queryDescriptors rows. If compactResult is true,
+    the matches vector does not contain matches for fully masked-out query descriptors.
+    */
+    CV_WRAP void radiusMatch( InputArray queryDescriptors, CV_OUT std::vector<std::vector<DMatch> >& matches, float maxDistance,
+                      InputArrayOfArrays masks=noArray(), bool compactResult=false );
+
+    // Opens fileName as a FileStorage in WRITE mode and passes it to write(FileStorage&).
+    CV_WRAP void write( const String& fileName ) const
+    {
+        FileStorage fs(fileName, FileStorage::WRITE);
+        write(fs);
+    }
+    // Opens fileName in READ mode and passes the storage root node to read(const FileNode&).
+    CV_WRAP void read( const String& fileName )
+    {
+        FileStorage fs(fileName, FileStorage::READ);
+        read(fs.root());
+    }
+    // Reads matcher object from a file node
+    // see corresponding cv::Algorithm method
+    CV_WRAP virtual void read( const FileNode& ) CV_OVERRIDE;
+    // Writes matcher object to a file storage
+    virtual void write( FileStorage& ) const CV_OVERRIDE;
+
+    /** @brief Clones the matcher.
+
+    @param emptyTrainData If emptyTrainData is false, the method creates a deep copy of the object,
+    that is, copies both parameters and train data. If emptyTrainData is true, the method creates an
+    object copy with the current parameters but with empty train data.
+     */
+    CV_WRAP CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const = 0;
+
+    /** @brief Creates a descriptor matcher of a given type with the default parameters (using default
+    constructor).
+
+    @param descriptorMatcherType Descriptor matcher type. Now the following matcher types are
+    supported:
+    -   `BruteForce` (it uses L2 )
+    -   `BruteForce-L1`
+    -   `BruteForce-Hamming`
+    -   `BruteForce-Hamming(2)`
+    -   `FlannBased`
+     */
+    CV_WRAP static Ptr<DescriptorMatcher> create( const String& descriptorMatcherType );
+
+    CV_WRAP static Ptr<DescriptorMatcher> create( const DescriptorMatcher::MatcherType& matcherType );
+
+
+    // see corresponding cv::Algorithm method
+    CV_WRAP inline void write(const Ptr<FileStorage>& fs, const String& name = String()) const { Algorithm::write(fs, name); }
+
+protected:
+    /**
+     * Class to work with descriptors from several images as with one merged matrix.
+     * It is used e.g. in FlannBasedMatcher.
+     */
+    class CV_EXPORTS DescriptorCollection
+    {
+    public:
+        DescriptorCollection();
+        DescriptorCollection( const DescriptorCollection& collection );
+        virtual ~DescriptorCollection();
+
+        // Vector of matrices "descriptors" will be merged to one matrix "mergedDescriptors" here.
+        void set( const std::vector<Mat>& descriptors );
+        virtual void clear();
+
+        const Mat& getDescriptors() const;
+        Mat getDescriptor( int imgIdx, int localDescIdx ) const;
+        Mat getDescriptor( int globalDescIdx ) const;
+        void getLocalIdx( int globalDescIdx, int& imgIdx, int& localDescIdx ) const;
+
+        int size() const;
+
+    protected:
+        Mat mergedDescriptors;
+        std::vector<int> startIdxs;
+    };
+
+    //! In fact the matching is implemented only by the following two methods. These methods suppose
+    //! that the class object has been trained already. Public match methods call these methods
+    //! after calling train().
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) = 0;
+
+    static bool isPossibleMatch( InputArray mask, int queryIdx, int trainIdx );
+    static bool isMaskedOut( InputArrayOfArrays masks, int queryIdx );
+
+    CV_NODISCARD_STD static Mat clone_op( Mat m ) { return m.clone(); }
+    void checkMasks( InputArrayOfArrays masks, int queryDescriptorsCount ) const;
+
+    //! Collection of descriptors from train images.
+    std::vector<Mat> trainDescCollection;
+    std::vector<UMat> utrainDescCollection;
+};
+
+/** @brief Brute-force descriptor matcher.
+
+For each descriptor in the first set, this matcher finds the closest descriptor in the second set
+by trying each one. This descriptor matcher supports masking permissible matches of descriptor
+sets.
+ */
+class CV_EXPORTS_W BFMatcher : public DescriptorMatcher
+{
+public:
+    /** @brief Brute-force matcher constructor (obsolete). Please use BFMatcher::create() instead.
+     *
+     * The parameters have the same names and defaults as those of create().
+    */
+    CV_WRAP BFMatcher( int normType=NORM_L2, bool crossCheck=false );
+
+    virtual ~BFMatcher() {}
+
+    virtual bool isMaskSupported() const CV_OVERRIDE { return true; }
+
+    /** @brief Brute-force matcher create method.
+    @param normType One of NORM_L1, NORM_L2, NORM_HAMMING, NORM_HAMMING2. L1 and L2 norms are
+    preferable choices for SIFT and SURF descriptors, NORM_HAMMING should be used with ORB, BRISK and
+    BRIEF, NORM_HAMMING2 should be used with ORB when WTA_K==3 or 4 (see ORB::ORB constructor
+    description).
+    @param crossCheck If it is false, this will be the default BFMatcher behaviour when it finds the k
+    nearest neighbors for each query descriptor. If crossCheck==true, then the knnMatch() method with
+    k=1 will only return pairs (i,j) such that for i-th query descriptor the j-th descriptor in the
+    matcher's collection is the nearest and vice versa, i.e. the BFMatcher will only return consistent
+    pairs. Such a technique usually produces the best results with a minimal number of outliers when
+    there are enough matches. This is an alternative to the ratio test, used by D. Lowe in SIFT paper.
+     */
+    CV_WRAP static Ptr<BFMatcher> create( int normType=NORM_L2, bool crossCheck=false ) ;
+
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+protected:
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+
+    int normType;
+    bool crossCheck;
+};
+
+#if defined(HAVE_OPENCV_FLANN) || defined(CV_DOXYGEN)
+
+/** @brief Flann-based descriptor matcher.
+
+This matcher trains cv::flann::Index on a train descriptor collection and calls its nearest search
+methods to find the best matches. So, this matcher may be faster when matching a large train
+collection than the brute force matcher. FlannBasedMatcher does not support masking permissible
+matches of descriptor sets because flann::Index does not support this.
+ */
+class CV_EXPORTS_W FlannBasedMatcher : public DescriptorMatcher
+{
+public:
+    CV_WRAP FlannBasedMatcher( const Ptr<flann::IndexParams>& indexParams=makePtr<flann::KDTreeIndexParams>(),
+                       const Ptr<flann::SearchParams>& searchParams=makePtr<flann::SearchParams>() );
+
+    virtual void add( InputArrayOfArrays descriptors ) CV_OVERRIDE;
+    virtual void clear() CV_OVERRIDE;
+
+    // Reads matcher object from a file node
+    virtual void read( const FileNode& ) CV_OVERRIDE;
+    // Writes matcher object to a file storage
+    virtual void write( FileStorage& ) const CV_OVERRIDE;
+
+    virtual void train() CV_OVERRIDE;
+    virtual bool isMaskSupported() const CV_OVERRIDE;
+
+    CV_WRAP static Ptr<FlannBasedMatcher> create();
+
+    CV_NODISCARD_STD virtual Ptr<DescriptorMatcher> clone( bool emptyTrainData=false ) const CV_OVERRIDE;
+protected:
+    static void convertToDMatches( const DescriptorCollection& descriptors,
+                                   const Mat& indices, const Mat& distances,
+                                   std::vector<std::vector<DMatch> >& matches );
+
+    virtual void knnMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, int k,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+    virtual void radiusMatchImpl( InputArray queryDescriptors, std::vector<std::vector<DMatch> >& matches, float maxDistance,
+        InputArrayOfArrays masks=noArray(), bool compactResult=false ) CV_OVERRIDE;
+
+    Ptr<flann::IndexParams> indexParams;
+    Ptr<flann::SearchParams> searchParams;
+    Ptr<flann::Index> flannIndex;
+
+    DescriptorCollection mergedDescriptors;
+    int addedDescCount;
+};
+
+#endif
+
+//! @} features2d_match
+
+/****************************************************************************************\
+*                                   Drawing functions                                    *
+\****************************************************************************************/
+
+//! @addtogroup features2d_draw
+//! @{
+
+enum struct DrawMatchesFlags
+{
+  DEFAULT = 0, //!< Output image matrix will be created (Mat::create),
+               //!< i.e. existing memory of output image may be reused.
+               //!< Two source images, matches and single keypoints will be drawn.
+               //!< For each keypoint only the center point will be drawn (without
+               //!< the circle around keypoint with keypoint size and orientation).
+  DRAW_OVER_OUTIMG = 1, //!< Output image matrix will not be created (Mat::create).
+                        //!< Matches will be drawn on existing content of output image.
+  NOT_DRAW_SINGLE_POINTS = 2, //!< Single keypoints will not be drawn.
+  DRAW_RICH_KEYPOINTS = 4 //!< For each keypoint the circle around keypoint with keypoint size and
+                          //!< orientation will be drawn.
+};
+CV_ENUM_FLAGS(DrawMatchesFlags)
+
+/** @brief Draws keypoints.
+
+@param image Source image.
+@param keypoints Keypoints from the source image.
+@param outImage Output image. Its content depends on the flags value defining what is drawn in the
+output image. See DrawMatchesFlags for the possible flag bit values.
+@param color Color of keypoints.
+@param flags Flags setting drawing features. Possible flags bit values are defined by
+DrawMatchesFlags. See also drawMatches.
+
+@note
+For Python API, flags are modified as cv.DRAW_MATCHES_FLAGS_DEFAULT,
+cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS, cv.DRAW_MATCHES_FLAGS_DRAW_OVER_OUTIMG,
+cv.DRAW_MATCHES_FLAGS_NOT_DRAW_SINGLE_POINTS
+ */
+CV_EXPORTS_W void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, InputOutputArray outImage,
+                               const Scalar& color=Scalar::all(-1), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+/** @brief Draws the found matches of keypoints from two images.
+
+@param img1 First source image.
+@param keypoints1 Keypoints from the first source image.
+@param img2 Second source image.
+@param keypoints2 Keypoints from the second source image.
+@param matches1to2 Matches from the first image to the second one: each entry pairs a keypoint
+from keypoints1 with its corresponding keypoint in keypoints2.
+@param outImg Output image. Its content depends on the flags value defining what is drawn in the
+output image. See DrawMatchesFlags for the possible flag bit values.
+@param matchColor Color of matches (lines and connected keypoints). If matchColor==Scalar::all(-1),
+the color is generated randomly.
+@param singlePointColor Color of single keypoints (circles), i.e. keypoints that have no
+matches. If singlePointColor==Scalar::all(-1), the color is generated randomly.
+@param matchesMask Mask determining which matches are drawn. If the mask is empty, all matches are
+drawn.
+@param flags Flags setting drawing features. Possible flags bit values are defined by
+DrawMatchesFlags.
+
+This function draws matches of keypoints from two images in the output image. A match is a line
+connecting two keypoints (circles). See cv::DrawMatchesFlags.
+ */
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                             const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
+                             const std::vector<char>& matchesMask=std::vector<char>(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+/** @overload */
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                             const int matchesThickness, const Scalar& matchColor=Scalar::all(-1), // matchesThickness is the only extra argument and has no default
+                             const Scalar& singlePointColor=Scalar::all(-1), const std::vector<char>& matchesMask=std::vector<char>(),
+                             DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
+CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
+                             const Scalar& matchColor=Scalar::all(-1), const Scalar& singlePointColor=Scalar::all(-1),
+                             const std::vector<std::vector<char> >& matchesMask=std::vector<std::vector<char> >(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT ); // exported as drawMatchesKnn; matches and mask are nested vectors
+
+//! @} features2d_draw
+
+//! @} features2d_draw
+
+/****************************************************************************************\
+*   Functions to evaluate the feature detectors and [generic] descriptor extractors      *
+\****************************************************************************************/
+
+CV_EXPORTS void evaluateFeatureDetector( const Mat& img1, const Mat& img2, const Mat& H1to2,
+                                         std::vector<KeyPoint>* keypoints1, std::vector<KeyPoint>* keypoints2,
+                                         float& repeatability, int& correspCount, // non-const references: presumably outputs -- TODO confirm
+                                         const Ptr<FeatureDetector>& fdetector=Ptr<FeatureDetector>() ); // detector argument is optional (defaults to an empty Ptr)
+
+CV_EXPORTS void computeRecallPrecisionCurve( const std::vector<std::vector<DMatch> >& matches1to2,
+                                             const std::vector<std::vector<uchar> >& correctMatches1to2Mask,
+                                             std::vector<Point2f>& recallPrecisionCurve ); // the only non-const argument: presumably receives the curve -- TODO confirm
+
+CV_EXPORTS float getRecall( const std::vector<Point2f>& recallPrecisionCurve, float l_precision ); // NOTE(review): undocumented upstream; the point layout of the curve is not stated in this header
+CV_EXPORTS int getNearestPoint( const std::vector<Point2f>& recallPrecisionCurve, float l_precision ); // NOTE(review): undocumented upstream; presumably returns an index into the curve -- confirm
+
+/****************************************************************************************\
+*                                     Bag of visual words                                *
+\****************************************************************************************/
+
+//! @addtogroup features2d_category
+//! @{
+
+/** @brief Abstract base class for training the *bag of visual words* vocabulary from a set of descriptors.
+
+For details, see, for example, *Visual Categorization with Bags of Keypoints* by Gabriella Csurka,
+Christopher R. Dance, Lixin Fan, Jutta Willamowski, Cedric Bray, 2004.
+ */
+class CV_EXPORTS_W BOWTrainer
+{
+public:
+    BOWTrainer();
+    virtual ~BOWTrainer();
+
+    /** @brief Adds descriptors to a training set.
+
+    @param descriptors Descriptors to add to a training set. Each row of the descriptors matrix is a
+    descriptor.
+
+    The training set is clustered using the cluster() method to construct the vocabulary.
+     */
+    CV_WRAP void add( const Mat& descriptors );
+
+    /** @brief Returns a training set of descriptors.
+    */
+    CV_WRAP const std::vector<Mat>& getDescriptors() const;
+
+    /** @brief Returns the count of all descriptors stored in the training set.
+    */
+    CV_WRAP int descriptorsCount() const;
+
+    CV_WRAP virtual void clear(); // NOTE(review): undocumented upstream; presumably empties the training set -- confirm
+
+    /** @overload */
+    CV_WRAP virtual Mat cluster() const = 0;
+
+    /** @brief Clusters train descriptors.
+
+    @param descriptors Descriptors to cluster. Each row of the descriptors matrix is a descriptor.
+    Descriptors are not added to the inner train descriptor set.
+
+    The vocabulary consists of cluster centers. So, this method returns the vocabulary. In the first
+    variant of the method, train descriptors stored in the object are clustered. In the second variant,
+    input descriptors are clustered.
+     */
+    CV_WRAP virtual Mat cluster( const Mat& descriptors ) const = 0;
+
+protected:
+    std::vector<Mat> descriptors; // NOTE(review): presumably the training set exposed by getDescriptors() -- confirm
+    int size; // NOTE(review): presumably the running total reported by descriptorsCount() -- confirm
+};
+
+/** @brief kmeans-based class to train visual vocabulary using the *bag of visual words* approach.
+ */
+class CV_EXPORTS_W BOWKMeansTrainer : public BOWTrainer
+{
+public:
+    /** @brief The constructor.
+
+    @see cv::kmeans
+    */
+    CV_WRAP BOWKMeansTrainer( int clusterCount, const TermCriteria& termcrit=TermCriteria(),
+                      int attempts=3, int flags=KMEANS_PP_CENTERS );
+    virtual ~BOWKMeansTrainer();
+
+    // Returns the trained vocabulary (i.e. the cluster centers).
+    CV_WRAP virtual Mat cluster() const CV_OVERRIDE;
+    CV_WRAP virtual Mat cluster( const Mat& descriptors ) const CV_OVERRIDE;
+
+protected:
+
+    int clusterCount; // the four members share their names with the constructor arguments
+    TermCriteria termcrit;
+    int attempts;
+    int flags;
+};
+
+/** @brief Class to compute an image descriptor using the *bag of visual words*.
+
+Such a computation consists of the following steps:
+
+1.  Compute descriptors for a given image and its keypoints set.
+2.  Find the nearest visual words from the vocabulary for each keypoint descriptor.
+3.  Compute the bag-of-words image descriptor as a normalized histogram of vocabulary words
+encountered in the image. The i-th bin of the histogram is the frequency of the i-th word of the
+vocabulary in the given image.
+ */
+class CV_EXPORTS_W BOWImgDescriptorExtractor
+{
+public:
+    /** @brief The constructor.
+
+    @param dextractor Descriptor extractor that is used to compute descriptors for an input image and
+    its keypoints.
+    @param dmatcher Descriptor matcher that is used to find the nearest word of the trained vocabulary
+    for each keypoint descriptor of the image.
+     */
+    CV_WRAP BOWImgDescriptorExtractor( const Ptr<DescriptorExtractor>& dextractor,
+                               const Ptr<DescriptorMatcher>& dmatcher );
+    /** @overload */
+    BOWImgDescriptorExtractor( const Ptr<DescriptorMatcher>& dmatcher );
+    virtual ~BOWImgDescriptorExtractor();
+
+    /** @brief Sets a visual vocabulary.
+
+    @param vocabulary Vocabulary (can be trained using the inheritor of BOWTrainer ). Each row of the
+    vocabulary is a visual word (cluster center).
+     */
+    CV_WRAP void setVocabulary( const Mat& vocabulary );
+
+    /** @brief Returns the set vocabulary.
+    */
+    CV_WRAP const Mat& getVocabulary() const;
+
+    /** @brief Computes an image descriptor using the set visual vocabulary.
+
+    @param image Image, for which the descriptor is computed.
+    @param keypoints Keypoints detected in the input image.
+    @param imgDescriptor Computed output image descriptor.
+    @param pointIdxsOfClusters Indices of keypoints that belong to the cluster. This means that
+    pointIdxsOfClusters[i] are keypoint indices that belong to the i-th cluster (word of vocabulary).
+    Returned only if the pointer is non-null.
+    @param descriptors Descriptors of the image keypoints; returned only if the pointer is non-null.
+     */
+    void compute( InputArray image, std::vector<KeyPoint>& keypoints, OutputArray imgDescriptor,
+                  std::vector<std::vector<int> >* pointIdxsOfClusters=0, Mat* descriptors=0 );
+    /** @overload
+    @param keypointDescriptors Computed descriptors to match with vocabulary.
+    @param imgDescriptor Computed output image descriptor.
+    @param pointIdxsOfClusters Indices of keypoints that belong to the cluster. This means that
+    pointIdxsOfClusters[i] are keypoint indices that belong to the i-th cluster (word of vocabulary).
+    Returned only if the pointer is non-null.
+    */
+    void compute( InputArray keypointDescriptors, OutputArray imgDescriptor,
+                  std::vector<std::vector<int> >* pointIdxsOfClusters=0 );
+    // compute() is not constant because DescriptorMatcher::match is not constant
+
+    CV_WRAP_AS(compute) void compute2( const Mat& image, std::vector<KeyPoint>& keypoints, CV_OUT Mat& imgDescriptor )
+    { compute(image,keypoints,imgDescriptor); } // forwards to compute(), leaving its optional pointer outputs at their 0 defaults
+
+    /** @brief Returns an image descriptor size if the vocabulary is set. Otherwise, it returns 0.
+    */
+    CV_WRAP int descriptorSize() const;
+
+    /** @brief Returns an image descriptor type.
+     */
+    CV_WRAP int descriptorType() const;
+
+protected:
+    Mat vocabulary;
+    Ptr<DescriptorExtractor> dextractor;
+    Ptr<DescriptorMatcher> dmatcher;
+};
+
+//! @} features2d_category
+
+//! @} features2d
+
+} /* namespace cv */
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/features2d.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/features2d.hpp
new file mode 100644
index 0000000..e81df0a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/features2d.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/features2d.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/hal/interface.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/hal/interface.h
new file mode 100644
index 0000000..bc3b084
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/features2d/hal/interface.h
@@ -0,0 +1,33 @@
+#ifndef OPENCV_FEATURE2D_HAL_INTERFACE_H
+#define OPENCV_FEATURE2D_HAL_INTERFACE_H
+
+#include "opencv2/core/cvdef.h"
+//! @addtogroup features2d_hal_interface
+//! @{
+
+//! @name Fast feature detector types
+//! @sa cv::FastFeatureDetector
+//! @{
+#define CV_HAL_TYPE_5_8  0 // NOTE(review): names suggest the 5-of-8, 7-of-12 and 9-of-16 pixel patterns -- confirm against cv::FastFeatureDetector
+#define CV_HAL_TYPE_7_12 1
+#define CV_HAL_TYPE_9_16 2
+//! @}
+
+//! @name Key point
+//! @sa cv::KeyPoint
+//! @{
+struct CV_EXPORTS cvhalKeyPoint // plain struct of five floats and two ints; NOTE(review): presumably mirrors the fields of cv::KeyPoint -- confirm
+{
+    float x;
+    float y;
+    float size;
+    float angle;
+    float response;
+    int octave;
+    int class_id;
+};
+//! @}
+
+//! @}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui.hpp
new file mode 100644
index 0000000..21560ee
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright (C) 2021 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//         http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "opencv2/highgui/highgui.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui/highgui.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui/highgui.hpp
new file mode 100644
index 0000000..b337fb6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/highgui/highgui.hpp
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2021 nihui
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//         http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef OPENCV_HIGHGUI_HPP
+#define OPENCV_HIGHGUI_HPP
+
+#include "opencv2/core.hpp"
+
+enum // legacy C-style names at global scope; values equal cv::IMREAD_UNCHANGED / IMREAD_GRAYSCALE / IMREAD_COLOR
+{
+    CV_LOAD_IMAGE_UNCHANGED     = -1,
+    CV_LOAD_IMAGE_GRAYSCALE     = 0,
+    CV_LOAD_IMAGE_COLOR         = 1,
+};
+
+enum // legacy C-style name at global scope; value equals cv::IMWRITE_JPEG_QUALITY
+{
+    CV_IMWRITE_JPEG_QUALITY     = 1
+};
+
+namespace cv {
+
+enum ImreadModes // image read modes; IMREAD_COLOR is the default of imread's flags argument
+{
+    IMREAD_UNCHANGED            = -1,
+    IMREAD_GRAYSCALE            = 0,
+    IMREAD_COLOR                = 1
+};
+
+enum ImwriteFlags // NOTE(review): presumably keys for the params vector of imwrite/imencode -- confirm
+{
+    IMWRITE_JPEG_QUALITY        = 1
+};
+
+CV_EXPORTS_W Mat imread(const String& filename, int flags = IMREAD_COLOR); // flags is optional
+
+CV_EXPORTS_W bool imwrite(const String& filename, InputArray img, const std::vector<int>& params = std::vector<int>()); // params is optional (empty by default)
+
+CV_EXPORTS_W Mat imdecode(InputArray buf, int flags); // unlike imread, flags has no default here
+
+CV_EXPORTS_W bool imencode(const String& ext, InputArray img, CV_OUT std::vector<uchar>& buf, const std::vector<int>& params = std::vector<int>()); // buf is marked CV_OUT; params is optional
+
+CV_EXPORTS_W void imshow(const String& winname, InputArray mat); // NOTE(review): behaviour in this reduced highgui is not visible from the header -- confirm before relying on it
+
+CV_EXPORTS_W int waitKey(int delay = 0); // NOTE(review): see the imshow note; delay is optional
+
+} // namespace cv
+
+#endif // OPENCV_HIGHGUI_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc.hpp
new file mode 100644
index 0000000..3e0180a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc.hpp
@@ -0,0 +1,5005 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_HPP
+#define OPENCV_IMGPROC_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup imgproc Image Processing
+
+This module includes image-processing functions.
+
+  @{
+    @defgroup imgproc_filter Image Filtering
+
+Functions and classes described in this section are used to perform various linear or non-linear
+filtering operations on 2D images (represented as Mat's). It means that for each pixel location
+\f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to
+compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of
+morphological operations, it is the minimum or maximum values, and so on. The computed response is
+stored in the destination image at the same location \f$(x,y)\f$. It means that the output image
+will be of the same size as the input image. Normally, the functions support multi-channel arrays,
+in which case every channel is processed independently. Therefore, the output image will also have
+the same number of channels as the input one.
+
+Another common feature of the functions and classes described in this section is that, unlike
+simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For
+example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when
+processing the left-most pixels in each row, you need pixels to the left of them, that is, outside
+of the image. You can let these pixels be the same as the left-most image pixels ("replicated
+border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant
+border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method.
+For details, see #BorderTypes
+
+@anchor filter_depths
+### Depth combinations
+Input depth (src.depth()) | Output depth (ddepth)
+--------------------------|----------------------
+CV_8U                     | -1/CV_16S/CV_32F/CV_64F
+CV_16U/CV_16S             | -1/CV_32F/CV_64F
+CV_32F                    | -1/CV_32F/CV_64F
+CV_64F                    | -1/CV_64F
+
+@note when ddepth=-1, the output image will have the same depth as the source.
+
+    @defgroup imgproc_transform Geometric Image Transformations
+
+The functions in this section perform various geometrical transformations of 2D images. They do not
+change the image content but deform the pixel grid and map this deformed grid to the destination
+image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from
+destination to the source. That is, for each pixel \f$(x, y)\f$ of the destination image, the
+functions compute coordinates of the corresponding "donor" pixel in the source image and copy the
+pixel value:
+
+\f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f]
+
+In case when you specify the forward mapping \f$\left<g_x, g_y\right>: \texttt{src} \rightarrow
+\texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping
+\f$\left<f_x, f_y\right>: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula.
+
+The actual implementations of the geometrical transformations, from the most generic remap to
+the simplest and fastest resize, need to solve two main problems with the above formula:
+
+- Extrapolation of non-existing pixels. Similarly to the filtering functions described in the
+previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both
+of them may fall outside of the image. In this case, an extrapolation method needs to be used.
+OpenCV provides the same selection of extrapolation methods as in the filtering functions. In
+addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in
+the destination image will not be modified at all.
+
+- Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point
+numbers. This means that \f$\left<f_x, f_y\right>\f$ can be either an affine or perspective
+transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional
+coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the
+nearest integer coordinates and the corresponding pixel can be used. This is called a
+nearest-neighbor interpolation. However, a better result can be achieved by using more
+sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) ,
+where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y),
+f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the
+interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See
+#resize for details.
+
+@note The geometrical transformations do not work with `CV_8S` or `CV_32S` images.
+
+    @defgroup imgproc_misc Miscellaneous Image Transformations
+    @defgroup imgproc_draw Drawing Functions
+
+Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be
+rendered with antialiasing (implemented only for 8-bit images for now). All the functions include
+the parameter color that uses an RGB value (that may be constructed with the Scalar constructor )
+for color images and brightness for grayscale images. For color images, the channel ordering is
+normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. So, if you form a
+color using the Scalar constructor, it should look like:
+
+\f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f]
+
+If you are using your own image rendering and I/O functions, you can use any channel ordering. The
+drawing functions process each channel independently and do not depend on the channel order or even
+on the used color space. The whole image can be converted from BGR to RGB or to a different color
+space using cvtColor .
+
+If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also,
+many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means
+that the coordinates can be passed as fixed-point numbers encoded as integers. The number of
+fractional bits is specified by the shift parameter and the real point coordinates are calculated as
+\f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is
+especially effective when rendering antialiased shapes.
+
+@note The functions do not support alpha-transparency when the target image is 4-channel. In this
+case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint
+semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main
+image.
+
+    @defgroup imgproc_color_conversions Color Space Conversions
+    @defgroup imgproc_colormap ColorMaps in OpenCV
+
+The human perception isn't built for observing fine changes in grayscale images. Human eyes are more
+sensitive to observing changes between colors, so you often need to recolor your grayscale images to
+get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your
+computer vision application.
+
+In OpenCV you only need applyColorMap to apply a colormap on a given image. The following sample
+code reads the path to an image from command line, applies a Jet colormap on it and shows the
+result:
+
+@include snippets/imgproc_applyColorMap.cpp
+
+@see #ColormapTypes
+
+    @defgroup imgproc_subdiv2d Planar Subdivision
+
+The Subdiv2D class described in this section is used to perform various planar subdivision on
+a set of 2D points (represented as a vector of Point2f). OpenCV subdivides a plane into triangles
+using Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram.
+In the figure below, the Delaunay triangulation is marked with black lines and the Voronoi
+diagram with red lines.
+
+![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png)
+
+The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast
+location of points on the plane, building special graphs (such as NNG,RNG), and so forth.
+
+    @defgroup imgproc_hist Histograms
+    @defgroup imgproc_shape Structural Analysis and Shape Descriptors
+    @defgroup imgproc_motion Motion Analysis and Object Tracking
+    @defgroup imgproc_feature Feature Detection
+    @defgroup imgproc_object Object Detection
+    @defgroup imgproc_segmentation Image Segmentation
+    @defgroup imgproc_c C API
+    @defgroup imgproc_hal Hardware Acceleration Layer
+    @{
+        @defgroup imgproc_hal_functions Functions
+        @defgroup imgproc_hal_interface Interface
+    @}
+  @}
+*/
+
+namespace cv
+{
+
+/** @addtogroup imgproc
+@{
+*/
+
+//! @addtogroup imgproc_filter
+//! @{
+
+enum SpecialFilter { // NOTE(review): undocumented upstream
+    FILTER_SCHARR = -1 // NOTE(review): presumably a sentinel kernel-size value that selects the Scharr filter -- confirm
+};
+
+//! type of morphological operation
+enum MorphTypes{
+    MORPH_ERODE    = 0, //!< see #erode
+    MORPH_DILATE   = 1, //!< see #dilate
+    MORPH_OPEN     = 2, //!< an opening operation
+                        //!< \f[\texttt{dst} = \mathrm{open} ( \texttt{src} , \texttt{element} )= \mathrm{dilate} ( \mathrm{erode} ( \texttt{src} , \texttt{element} ))\f]
+    MORPH_CLOSE    = 3, //!< a closing operation
+                        //!< \f[\texttt{dst} = \mathrm{close} ( \texttt{src} , \texttt{element} )= \mathrm{erode} ( \mathrm{dilate} ( \texttt{src} , \texttt{element} ))\f]
+    MORPH_GRADIENT = 4, //!< a morphological gradient
+                        //!< \f[\texttt{dst} = \mathrm{morph\_grad} ( \texttt{src} , \texttt{element} )= \mathrm{dilate} ( \texttt{src} , \texttt{element} )- \mathrm{erode} ( \texttt{src} , \texttt{element} )\f]
+    MORPH_TOPHAT   = 5, //!< "top hat"
+                        //!< \f[\texttt{dst} = \mathrm{tophat} ( \texttt{src} , \texttt{element} )= \texttt{src} - \mathrm{open} ( \texttt{src} , \texttt{element} )\f]
+    MORPH_BLACKHAT = 6, //!< "black hat"
+                        //!< \f[\texttt{dst} = \mathrm{blackhat} ( \texttt{src} , \texttt{element} )= \mathrm{close} ( \texttt{src} , \texttt{element} )- \texttt{src}\f]
+    MORPH_HITMISS  = 7  //!< "hit or miss".
+                        //!< Only supported for CV_8UC1 binary images. A tutorial can be found in the documentation
+};
+
+//! shape of the structuring element
+enum MorphShapes {
+    MORPH_RECT    = 0, //!< a rectangular structuring element:  \f[E_{ij}=1\f]
+    MORPH_CROSS   = 1, //!< a cross-shaped structuring element:
+                       //!< \f[E_{ij} = \begin{cases} 1 & \texttt{if } {i=\texttt{anchor.y } {or } {j=\texttt{anchor.x}}} \\0 & \texttt{otherwise} \end{cases}\f]
+    MORPH_ELLIPSE = 2 //!< an elliptic structuring element, that is, a filled ellipse inscribed
+                      //!< into the rectangle Rect(0, 0, esize.width, esize.height)
+};
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_transform
+//! @{
+
+//! interpolation algorithm codes (values 0-6), their mask, and two warp flag bits (8 and 16)
+enum InterpolationFlags{
+    /** nearest neighbor interpolation */
+    INTER_NEAREST        = 0,
+    /** bilinear interpolation */
+    INTER_LINEAR         = 1,
+    /** bicubic interpolation */
+    INTER_CUBIC          = 2,
+    /** resampling using pixel area relation. It may be a preferred method for image decimation, as
+    it gives moire-free results. But when the image is zoomed, it is similar to the INTER_NEAREST
+    method. */
+    INTER_AREA           = 3,
+    /** Lanczos interpolation over 8x8 neighborhood */
+    INTER_LANCZOS4       = 4,
+    /** Bit exact bilinear interpolation */
+    INTER_LINEAR_EXACT = 5,
+    /** Bit exact nearest neighbor interpolation. This will produce the same results as
+    the nearest neighbor method in PIL, scikit-image or Matlab. */
+    INTER_NEAREST_EXACT  = 6,
+    /** mask for interpolation codes (the low three bits, covering the codes 0-6 above) */
+    INTER_MAX            = 7,
+    /** flag, fills all of the destination image pixels. If some of them correspond to outliers in the
+    source image, they are set to zero */
+    WARP_FILL_OUTLIERS   = 8,
+    /** flag, inverse transformation
+
+    For example, #linearPolar or #logPolar transforms:
+    - flag is __not__ set: \f$dst( \rho , \phi ) = src(x,y)\f$
+    - flag is set: \f$dst(x,y) = src( \rho , \phi )\f$
+    */
+    WARP_INVERSE_MAP     = 16
+};
+
+/** \brief Specifies the polar mapping mode
+@sa warpPolar
+*/
+enum WarpPolarMode
+{
+    WARP_POLAR_LINEAR = 0, ///< Remaps an image to/from polar space.
+    WARP_POLAR_LOG = 256   ///< Remaps an image to/from semilog-polar space (256 is bit 8).
+};
+
+enum InterpolationMasks {
+       INTER_BITS      = 5, //!< base bit count from which the constants below are derived
+       INTER_BITS2     = INTER_BITS * 2, //!< twice INTER_BITS, i.e. 10
+       INTER_TAB_SIZE  = 1 << INTER_BITS, //!< 2^INTER_BITS, i.e. 32
+       INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE //!< INTER_TAB_SIZE squared, i.e. 1024
+     };
+
+//! @} imgproc_transform
+
+//! @addtogroup imgproc_misc
+//! @{
+
+//! Distance types for Distance Transform and M-estimators
+//! @see distanceTransform, fitLine
+enum DistanceTypes {
+    DIST_USER    = -1,  //!< User defined distance
+    DIST_L1      = 1,   //!< distance = |x1-x2| + |y1-y2|
+    DIST_L2      = 2,   //!< the simple euclidean distance
+    DIST_C       = 3,   //!< distance = max(|x1-x2|,|y1-y2|)
+    DIST_L12     = 4,   //!< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1)
+    DIST_FAIR    = 5,   //!< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998
+    DIST_WELSCH  = 6,   //!< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846
+    DIST_HUBER   = 7    //!< distance = |x|<c ? x^2/2 : c(|x|-c/2), c=1.345
+};
+
+//! Mask size for distance transform
+enum DistanceTransformMasks {
+    DIST_MASK_3       = 3, //!< mask=3
+    DIST_MASK_5       = 5, //!< mask=5
+    DIST_MASK_PRECISE = 0  //!< mask=0; NOTE(review): presumably requests the precise computation - confirm against distanceTransform
+};
+
+//! type of the threshold operation
+//! ![threshold types](pics/threshold.png)
+enum ThresholdTypes {
+    THRESH_BINARY     = 0, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{maxval}}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{0}{otherwise}\f]
+    THRESH_BINARY_INV = 1, //!< \f[\texttt{dst} (x,y) =  \fork{0}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{maxval}}{otherwise}\f]
+    THRESH_TRUNC      = 2, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{thresh}}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{src}(x,y)}{otherwise}\f]
+    THRESH_TOZERO     = 3, //!< \f[\texttt{dst} (x,y) =  \fork{\texttt{src}(x,y)}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{0}{otherwise}\f]
+    THRESH_TOZERO_INV = 4, //!< \f[\texttt{dst} (x,y) =  \fork{0}{if \(\texttt{src}(x,y) > \texttt{thresh}\)}{\texttt{src}(x,y)}{otherwise}\f]
+    THRESH_MASK       = 7, //!< mask covering the basic type codes above (three low bits); the flags below lie outside it
+    THRESH_OTSU       = 8, //!< flag, use Otsu algorithm to choose the optimal threshold value
+    THRESH_TRIANGLE   = 16 //!< flag, use Triangle algorithm to choose the optimal threshold value
+};
+
+//! adaptive threshold algorithm
+//! @see adaptiveThreshold
+enum AdaptiveThresholdTypes {
+    /** the threshold value \f$T(x,y)\f$ is the mean of the \f$\texttt{blockSize} \times
+    \texttt{blockSize}\f$ neighborhood of \f$(x, y)\f$ minus C */
+    ADAPTIVE_THRESH_MEAN_C     = 0,
+    /** the threshold value \f$T(x, y)\f$ is a weighted sum (cross-correlation with a Gaussian
+    window) of the \f$\texttt{blockSize} \times \texttt{blockSize}\f$ neighborhood of \f$(x, y)\f$
+    minus C. The default sigma (standard deviation) is used for the specified blockSize. See
+    #getGaussianKernel */
+    ADAPTIVE_THRESH_GAUSSIAN_C = 1
+};
+
+//! class of the pixel in GrabCut algorithm
+enum GrabCutClasses {
+    GC_BGD    = 0,  //!< an obvious background pixel
+    GC_FGD    = 1,  //!< an obvious foreground (object) pixel
+    GC_PR_BGD = 2,  //!< a possible background pixel
+    GC_PR_FGD = 3   //!< a possible foreground pixel
+};
+
+//! GrabCut algorithm flags
+enum GrabCutModes {
+    /** The function initializes the state and the mask using the provided rectangle. After that it
+    runs iterCount iterations of the algorithm. */
+    GC_INIT_WITH_RECT  = 0,
+    /** The function initializes the state using the provided mask. Note that GC_INIT_WITH_RECT
+    and GC_INIT_WITH_MASK can be combined. Then, all the pixels outside of the ROI are
+    automatically initialized with GC_BGD. */
+    GC_INIT_WITH_MASK  = 1,
+    /** The value means that the algorithm should just resume. */
+    GC_EVAL            = 2,
+    /** The value means that the algorithm should just run a single iteration of grabCut with the fixed model. */
+    GC_EVAL_FREEZE_MODEL = 3
+};
+
+//! distanceTransform algorithm flags: how labels are assigned
+enum DistanceTransformLabelTypes {
+    /** each connected component of zeros in src (as well as all the non-zero pixels closest to the
+    connected component) will be assigned the same label */
+    DIST_LABEL_CCOMP = 0,
+    /** each zero pixel (and all the non-zero pixels closest to it) gets its own label. */
+    DIST_LABEL_PIXEL = 1
+};
+
+//! floodfill algorithm flags (bits 16 and 17 of the flags word)
+enum FloodFillFlags {
+    /** If set, the difference between the current pixel and seed pixel is considered. Otherwise,
+    the difference between neighbor pixels is considered (that is, the range is floating). */
+    FLOODFILL_FIXED_RANGE = 1 << 16,
+    /** If set, the function does not change the image ( newVal is ignored), and only fills the
+    mask with the value specified in bits 8-16 of flags. This option only makes
+    sense in function variants that have the mask parameter. */
+    FLOODFILL_MASK_ONLY   = 1 << 17
+};
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_shape
+//! @{
+
+//! connected components statistics (CC_STAT_MAX is hidden from Doxygen by the CV_DOXYGEN guard)
+enum ConnectedComponentsTypes {
+    CC_STAT_LEFT   = 0, //!< The leftmost (x) coordinate which is the inclusive start of the bounding
+                        //!< box in the horizontal direction.
+    CC_STAT_TOP    = 1, //!< The topmost (y) coordinate which is the inclusive start of the bounding
+                        //!< box in the vertical direction.
+    CC_STAT_WIDTH  = 2, //!< The horizontal size of the bounding box
+    CC_STAT_HEIGHT = 3, //!< The vertical size of the bounding box
+    CC_STAT_AREA   = 4, //!< The total area (in pixels) of the connected component
+#ifndef CV_DOXYGEN
+    CC_STAT_MAX    = 5 //!< Max enumeration value. Used internally only, for memory allocation
+#endif
+};
+
+//! connected components algorithm
+enum ConnectedComponentsAlgorithmsTypes {
+    CCL_DEFAULT   = -1, //!< Spaghetti @cite Bolelli2019 algorithm for 8-way connectivity, Spaghetti4C @cite Bolelli2021 algorithm for 4-way connectivity.
+    CCL_WU        = 0,  //!< SAUF @cite Wu2009 algorithm for 8-way connectivity, SAUF algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for SAUF.
+    CCL_GRANA     = 1,  //!< BBDT @cite Grana2010 algorithm for 8-way connectivity, SAUF algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for both BBDT and SAUF.
+    CCL_BOLELLI   = 2,  //!< Spaghetti @cite Bolelli2019 algorithm for 8-way connectivity, Spaghetti4C @cite Bolelli2021 algorithm for 4-way connectivity. The parallel implementation described in @cite Bolelli2017 is available for both Spaghetti and Spaghetti4C.
+    CCL_SAUF      = 3,  //!< Same algorithm as CCL_WU (distinct enumerator value). It is preferable to use the flag with the name of the algorithm (CCL_SAUF) rather than the one with the name of the first author (CCL_WU).
+    CCL_BBDT      = 4,  //!< Same algorithm as CCL_GRANA (distinct enumerator value). It is preferable to use the flag with the name of the algorithm (CCL_BBDT) rather than the one with the name of the first author (CCL_GRANA).
+    CCL_SPAGHETTI = 5,  //!< Same algorithm as CCL_BOLELLI (distinct enumerator value). It is preferable to use the flag with the name of the algorithm (CCL_SPAGHETTI) rather than the one with the name of the first author (CCL_BOLELLI).
+};
+
+//! mode of the contour retrieval algorithm
+enum RetrievalModes {
+    /** retrieves only the extreme outer contours. It sets `hierarchy[i][2]=hierarchy[i][3]=-1` for
+    all the contours. */
+    RETR_EXTERNAL  = 0,
+    /** retrieves all of the contours without establishing any hierarchical relationships. */
+    RETR_LIST      = 1,
+    /** retrieves all of the contours and organizes them into a two-level hierarchy. At the top
+    level, there are external boundaries of the components. At the second level, there are
+    boundaries of the holes. If there is another contour inside a hole of a connected component, it
+    is still put at the top level. */
+    RETR_CCOMP     = 2,
+    /** retrieves all of the contours and reconstructs a full hierarchy of nested contours. */
+    RETR_TREE      = 3,
+    RETR_FLOODFILL = 4 //!< NOTE(review): semantics are not documented in this header - confirm against findContours
+};
+
+//! the contour approximation algorithm
+enum ContourApproximationModes {
+    /** stores absolutely all the contour points. That is, any 2 subsequent points (x1,y1) and
+    (x2,y2) of the contour will be either horizontal, vertical or diagonal neighbors, that is,
+    max(abs(x1-x2),abs(y2-y1))==1. */
+    CHAIN_APPROX_NONE      = 1,
+    /** compresses horizontal, vertical, and diagonal segments and leaves only their end points.
+    For example, an up-right rectangular contour is encoded with 4 points. */
+    CHAIN_APPROX_SIMPLE    = 2,
+    /** applies the TC89_L1 flavor of the Teh-Chin chain approximation algorithm @cite TehChin89 */
+    CHAIN_APPROX_TC89_L1   = 3,
+    /** applies the TC89_KCOS flavor of the Teh-Chin chain approximation algorithm @cite TehChin89 */
+    CHAIN_APPROX_TC89_KCOS = 4
+};
+
+/** @brief Shape matching methods
+
+\f$A\f$ denotes object1, \f$B\f$ denotes object2
+
+\f$\begin{array}{l} m^A_i =  \mathrm{sign} (h^A_i)  \cdot \log{h^A_i} \\ m^B_i =  \mathrm{sign} (h^B_i)  \cdot \log{h^B_i} \end{array}\f$
+
+and \f$h^A_i, h^B_i\f$ are the Hu moments of \f$A\f$ and \f$B\f$, respectively.
+*/
+enum ShapeMatchModes {
+    CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_feature
+//! @{
+
+//! Variants of a Hough transform
+enum HoughModes {
+
+    /** classical or standard Hough transform. Every line is represented by two floating-point
+    numbers \f$(\rho, \theta)\f$, where \f$\rho\f$ is the distance between the (0,0) point and the line,
+    and \f$\theta\f$ is the angle between the x-axis and the normal to the line. Thus, the matrix must
+    be (the created sequence will be) of CV_32FC2 type */
+    HOUGH_STANDARD      = 0,
+    /** probabilistic Hough transform (more efficient if the picture contains a few long
+    linear segments). It returns line segments rather than the whole line. Each segment is
+    represented by starting and ending points, and the matrix must be (the created sequence will
+    be) of the CV_32SC4 type. */
+    HOUGH_PROBABILISTIC = 1,
+    /** multi-scale variant of the classical Hough transform. The lines are encoded the same way as
+    HOUGH_STANDARD. */
+    HOUGH_MULTI_SCALE   = 2,
+    HOUGH_GRADIENT      = 3, //!< basically *21HT*, described in @cite Yuen90
+    HOUGH_GRADIENT_ALT  = 4, //!< variation of HOUGH_GRADIENT to get better accuracy
+};
+
+//! Variants of Line Segment %Detector
+enum LineSegmentDetectorModes {
+    LSD_REFINE_NONE = 0, //!< No refinement applied
+    LSD_REFINE_STD  = 1, //!< Standard refinement is applied. E.g. breaking arcs into smaller straighter line approximations.
+    LSD_REFINE_ADV  = 2  //!< Advanced refinement. Number of false alarms is calculated, lines are
+                         //!< refined through increase of precision, decrement in size, etc.
+};
+
+//! @} imgproc_feature
+
+/** Histogram comparison methods
+  @ingroup imgproc_hist
+*/
+enum HistCompMethods {
+    /** Correlation
+    \f[d(H_1,H_2) =  \frac{\sum_I (H_1(I) - \bar{H_1}) (H_2(I) - \bar{H_2})}{\sqrt{\sum_I(H_1(I) - \bar{H_1})^2 \sum_I(H_2(I) - \bar{H_2})^2}}\f]
+    where
+    \f[\bar{H_k} =  \frac{1}{N} \sum _J H_k(J)\f]
+    and \f$N\f$ is the total number of histogram bins. */
+    HISTCMP_CORREL        = 0,
+    /** Chi-Square
+    \f[d(H_1,H_2) =  \sum _I  \frac{\left(H_1(I)-H_2(I)\right)^2}{H_1(I)}\f] */
+    HISTCMP_CHISQR        = 1,
+    /** Intersection
+    \f[d(H_1,H_2) =  \sum _I  \min (H_1(I), H_2(I))\f] */
+    HISTCMP_INTERSECT     = 2,
+    /** Bhattacharyya distance
+    (In fact, OpenCV computes the Hellinger distance, which is related to the Bhattacharyya coefficient.)
+    \f[d(H_1,H_2) =  \sqrt{1 - \frac{1}{\sqrt{\bar{H_1} \bar{H_2} N^2}} \sum_I \sqrt{H_1(I) \cdot H_2(I)}}\f] */
+    HISTCMP_BHATTACHARYYA = 3,
+    HISTCMP_HELLINGER     = HISTCMP_BHATTACHARYYA, //!< Synonym for HISTCMP_BHATTACHARYYA
+    /** Alternative Chi-Square
+    \f[d(H_1,H_2) =  2 * \sum _I  \frac{\left(H_1(I)-H_2(I)\right)^2}{H_1(I)+H_2(I)}\f]
+    This alternative formula is regularly used for texture comparison. See e.g. @cite Puzicha1997 */
+    HISTCMP_CHISQR_ALT    = 4,
+    /** Kullback-Leibler divergence
+    \f[d(H_1,H_2) = \sum _I H_1(I) \log \left(\frac{H_1(I)}{H_2(I)}\right)\f] */
+    HISTCMP_KL_DIV        = 5
+};
+
+/** The color conversion codes
+@see @ref imgproc_color_conversions
+@ingroup imgproc_color_conversions
+ */
+enum ColorConversionCodes {
+    COLOR_BGR2BGRA     = 0, //!< add alpha channel to RGB or BGR image
+    COLOR_RGB2RGBA     = COLOR_BGR2BGRA,
+
+    COLOR_BGRA2BGR     = 1, //!< remove alpha channel from RGB or BGR image
+    COLOR_RGBA2RGB     = COLOR_BGRA2BGR,
+
+    COLOR_BGR2RGBA     = 2, //!< convert between RGB and BGR color spaces (with or without alpha channel)
+    COLOR_RGB2BGRA     = COLOR_BGR2RGBA,
+
+    COLOR_RGBA2BGR     = 3,
+    COLOR_BGRA2RGB     = COLOR_RGBA2BGR,
+
+    COLOR_BGR2RGB      = 4,
+    COLOR_RGB2BGR      = COLOR_BGR2RGB,
+
+    COLOR_BGRA2RGBA    = 5,
+    COLOR_RGBA2BGRA    = COLOR_BGRA2RGBA,
+
+    COLOR_BGR2GRAY     = 6, //!< convert between RGB/BGR and grayscale, @ref color_convert_rgb_gray "color conversions"
+    COLOR_RGB2GRAY     = 7,
+    COLOR_GRAY2BGR     = 8,
+    COLOR_GRAY2RGB     = COLOR_GRAY2BGR,
+    COLOR_GRAY2BGRA    = 9,
+    COLOR_GRAY2RGBA    = COLOR_GRAY2BGRA,
+    COLOR_BGRA2GRAY    = 10,
+    COLOR_RGBA2GRAY    = 11,
+
+    COLOR_BGR2BGR565   = 12, //!< convert between RGB/BGR and BGR565 (16-bit images)
+    COLOR_RGB2BGR565   = 13,
+    COLOR_BGR5652BGR   = 14,
+    COLOR_BGR5652RGB   = 15,
+    COLOR_BGRA2BGR565  = 16,
+    COLOR_RGBA2BGR565  = 17,
+    COLOR_BGR5652BGRA  = 18,
+    COLOR_BGR5652RGBA  = 19,
+
+    COLOR_GRAY2BGR565  = 20, //!< convert between grayscale and BGR565 (16-bit images)
+    COLOR_BGR5652GRAY  = 21,
+
+    COLOR_BGR2BGR555   = 22,  //!< convert between RGB/BGR and BGR555 (16-bit images)
+    COLOR_RGB2BGR555   = 23,
+    COLOR_BGR5552BGR   = 24,
+    COLOR_BGR5552RGB   = 25,
+    COLOR_BGRA2BGR555  = 26,
+    COLOR_RGBA2BGR555  = 27,
+    COLOR_BGR5552BGRA  = 28,
+    COLOR_BGR5552RGBA  = 29,
+
+    COLOR_GRAY2BGR555  = 30, //!< convert between grayscale and BGR555 (16-bit images)
+    COLOR_BGR5552GRAY  = 31,
+
+    COLOR_BGR2XYZ      = 32, //!< convert RGB/BGR to CIE XYZ, @ref color_convert_rgb_xyz "color conversions"
+    COLOR_RGB2XYZ      = 33,
+    COLOR_XYZ2BGR      = 34,
+    COLOR_XYZ2RGB      = 35,
+
+    COLOR_BGR2YCrCb    = 36, //!< convert RGB/BGR to luma-chroma (aka YCC), @ref color_convert_rgb_ycrcb "color conversions"
+    COLOR_RGB2YCrCb    = 37,
+    COLOR_YCrCb2BGR    = 38,
+    COLOR_YCrCb2RGB    = 39,
+
+    COLOR_BGR2HSV      = 40, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
+    COLOR_RGB2HSV      = 41,
+
+    COLOR_BGR2Lab      = 44, //!< convert RGB/BGR to CIE Lab, @ref color_convert_rgb_lab "color conversions"
+    COLOR_RGB2Lab      = 45,
+
+    COLOR_BGR2Luv      = 50, //!< convert RGB/BGR to CIE Luv, @ref color_convert_rgb_luv "color conversions"
+    COLOR_RGB2Luv      = 51,
+    COLOR_BGR2HLS      = 52, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..180 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
+    COLOR_RGB2HLS      = 53,
+
+    COLOR_HSV2BGR      = 54, //!< backward conversions HSV to RGB/BGR with H range 0..180 if 8 bit image
+    COLOR_HSV2RGB      = 55,
+
+    COLOR_Lab2BGR      = 56,
+    COLOR_Lab2RGB      = 57,
+    COLOR_Luv2BGR      = 58,
+    COLOR_Luv2RGB      = 59,
+    COLOR_HLS2BGR      = 60, //!< backward conversions HLS to RGB/BGR with H range 0..180 if 8 bit image
+    COLOR_HLS2RGB      = 61,
+
+    COLOR_BGR2HSV_FULL = 66, //!< convert RGB/BGR to HSV (hue saturation value) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hsv "color conversions"
+    COLOR_RGB2HSV_FULL = 67,
+    COLOR_BGR2HLS_FULL = 68, //!< convert RGB/BGR to HLS (hue lightness saturation) with H range 0..255 if 8 bit image, @ref color_convert_rgb_hls "color conversions"
+    COLOR_RGB2HLS_FULL = 69,
+
+    COLOR_HSV2BGR_FULL = 70, //!< backward conversions HSV to RGB/BGR with H range 0..255 if 8 bit image
+    COLOR_HSV2RGB_FULL = 71,
+    COLOR_HLS2BGR_FULL = 72, //!< backward conversions HLS to RGB/BGR with H range 0..255 if 8 bit image
+    COLOR_HLS2RGB_FULL = 73,
+
+    COLOR_LBGR2Lab     = 74,
+    COLOR_LRGB2Lab     = 75,
+    COLOR_LBGR2Luv     = 76,
+    COLOR_LRGB2Luv     = 77,
+
+    COLOR_Lab2LBGR     = 78,
+    COLOR_Lab2LRGB     = 79,
+    COLOR_Luv2LBGR     = 80,
+    COLOR_Luv2LRGB     = 81,
+
+    COLOR_BGR2YUV      = 82, //!< convert between RGB/BGR and YUV
+    COLOR_RGB2YUV      = 83,
+    COLOR_YUV2BGR      = 84,
+    COLOR_YUV2RGB      = 85,
+
+    //! YUV 4:2:0 family to RGB
+    COLOR_YUV2RGB_NV12  = 90,
+    COLOR_YUV2BGR_NV12  = 91,
+    COLOR_YUV2RGB_NV21  = 92,
+    COLOR_YUV2BGR_NV21  = 93,
+    COLOR_YUV420sp2RGB  = COLOR_YUV2RGB_NV21,
+    COLOR_YUV420sp2BGR  = COLOR_YUV2BGR_NV21,
+
+    COLOR_YUV2RGBA_NV12 = 94,
+    COLOR_YUV2BGRA_NV12 = 95,
+    COLOR_YUV2RGBA_NV21 = 96,
+    COLOR_YUV2BGRA_NV21 = 97,
+    COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21,
+    COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21,
+
+    COLOR_YUV2RGB_YV12  = 98,
+    COLOR_YUV2BGR_YV12  = 99,
+    COLOR_YUV2RGB_IYUV  = 100,
+    COLOR_YUV2BGR_IYUV  = 101,
+    COLOR_YUV2RGB_I420  = COLOR_YUV2RGB_IYUV,
+    COLOR_YUV2BGR_I420  = COLOR_YUV2BGR_IYUV,
+    COLOR_YUV420p2RGB   = COLOR_YUV2RGB_YV12,
+    COLOR_YUV420p2BGR   = COLOR_YUV2BGR_YV12,
+
+    COLOR_YUV2RGBA_YV12 = 102,
+    COLOR_YUV2BGRA_YV12 = 103,
+    COLOR_YUV2RGBA_IYUV = 104,
+    COLOR_YUV2BGRA_IYUV = 105,
+    COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV,
+    COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV,
+    COLOR_YUV420p2RGBA  = COLOR_YUV2RGBA_YV12,
+    COLOR_YUV420p2BGRA  = COLOR_YUV2BGRA_YV12,
+
+    COLOR_YUV2GRAY_420  = 106,
+    COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420,
+    COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420,
+    COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420,
+    COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420,
+    COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420,
+    COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420,
+    COLOR_YUV420p2GRAY  = COLOR_YUV2GRAY_420,
+
+    //! YUV 4:2:2 family to RGB
+    COLOR_YUV2RGB_UYVY = 107,
+    COLOR_YUV2BGR_UYVY = 108,
+    //COLOR_YUV2RGB_VYUY = 109,
+    //COLOR_YUV2BGR_VYUY = 110,
+    COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY,
+    COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY,
+    COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY,
+    COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY,
+
+    COLOR_YUV2RGBA_UYVY = 111,
+    COLOR_YUV2BGRA_UYVY = 112,
+    //COLOR_YUV2RGBA_VYUY = 113,
+    //COLOR_YUV2BGRA_VYUY = 114,
+    COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY,
+    COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY,
+    COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY,
+    COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY,
+
+    COLOR_YUV2RGB_YUY2 = 115,
+    COLOR_YUV2BGR_YUY2 = 116,
+    COLOR_YUV2RGB_YVYU = 117,
+    COLOR_YUV2BGR_YVYU = 118,
+    COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2,
+    COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2,
+    COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2,
+    COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2,
+
+    COLOR_YUV2RGBA_YUY2 = 119,
+    COLOR_YUV2BGRA_YUY2 = 120,
+    COLOR_YUV2RGBA_YVYU = 121,
+    COLOR_YUV2BGRA_YVYU = 122,
+    COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2,
+    COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2,
+    COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2,
+    COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2,
+
+    COLOR_YUV2GRAY_UYVY = 123,
+    COLOR_YUV2GRAY_YUY2 = 124,
+    //CV_YUV2GRAY_VYUY    = CV_YUV2GRAY_UYVY,
+    COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY,
+    COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY,
+    COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2,
+    COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2,
+    COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2,
+
+    //! alpha premultiplication
+    COLOR_RGBA2mRGBA    = 125,
+    COLOR_mRGBA2RGBA    = 126,
+
+    //! RGB to YUV 4:2:0 family
+    COLOR_RGB2YUV_I420  = 127,
+    COLOR_BGR2YUV_I420  = 128,
+    COLOR_RGB2YUV_IYUV  = COLOR_RGB2YUV_I420,
+    COLOR_BGR2YUV_IYUV  = COLOR_BGR2YUV_I420,
+
+    COLOR_RGBA2YUV_I420 = 129,
+    COLOR_BGRA2YUV_I420 = 130,
+    COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420,
+    COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420,
+    COLOR_RGB2YUV_YV12  = 131,
+    COLOR_BGR2YUV_YV12  = 132,
+    COLOR_RGBA2YUV_YV12 = 133,
+    COLOR_BGRA2YUV_YV12 = 134,
+
+    //! Demosaicing, see @ref color_convert_bayer "color conversions" for additional information (codes 46-49, listed out of numeric order)
+    COLOR_BayerBG2BGR = 46, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR = 47, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR = 48, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR = 49, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR = COLOR_BayerBG2BGR,
+    COLOR_BayerGRBG2BGR = COLOR_BayerGB2BGR,
+    COLOR_BayerBGGR2BGR = COLOR_BayerRG2BGR,
+    COLOR_BayerGBRG2BGR = COLOR_BayerGR2BGR,
+
+    COLOR_BayerRGGB2RGB = COLOR_BayerBGGR2BGR,
+    COLOR_BayerGRBG2RGB = COLOR_BayerGBRG2BGR,
+    COLOR_BayerBGGR2RGB = COLOR_BayerRGGB2BGR,
+    COLOR_BayerGBRG2RGB = COLOR_BayerGRBG2BGR,
+
+    COLOR_BayerBG2RGB = COLOR_BayerRG2BGR, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB = COLOR_BayerGR2BGR, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB = COLOR_BayerBG2BGR, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB = COLOR_BayerGB2BGR, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerBG2GRAY = 86, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2GRAY = 87, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2GRAY = 88, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2GRAY = 89, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2GRAY = COLOR_BayerBG2GRAY,
+    COLOR_BayerGRBG2GRAY = COLOR_BayerGB2GRAY,
+    COLOR_BayerBGGR2GRAY = COLOR_BayerRG2GRAY,
+    COLOR_BayerGBRG2GRAY = COLOR_BayerGR2GRAY,
+
+    //! Demosaicing using Variable Number of Gradients (codes 62-65, listed out of numeric order)
+    COLOR_BayerBG2BGR_VNG = 62, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR_VNG = 63, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR_VNG = 64, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR_VNG = 65, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR_VNG = COLOR_BayerBG2BGR_VNG,
+    COLOR_BayerGRBG2BGR_VNG = COLOR_BayerGB2BGR_VNG,
+    COLOR_BayerBGGR2BGR_VNG = COLOR_BayerRG2BGR_VNG,
+    COLOR_BayerGBRG2BGR_VNG = COLOR_BayerGR2BGR_VNG,
+
+    COLOR_BayerRGGB2RGB_VNG = COLOR_BayerBGGR2BGR_VNG,
+    COLOR_BayerGRBG2RGB_VNG = COLOR_BayerGBRG2BGR_VNG,
+    COLOR_BayerBGGR2RGB_VNG = COLOR_BayerRGGB2BGR_VNG,
+    COLOR_BayerGBRG2RGB_VNG = COLOR_BayerGRBG2BGR_VNG,
+
+    COLOR_BayerBG2RGB_VNG = COLOR_BayerRG2BGR_VNG, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB_VNG = COLOR_BayerGR2BGR_VNG, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB_VNG = COLOR_BayerBG2BGR_VNG, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB_VNG = COLOR_BayerGB2BGR_VNG, //!< equivalent to GBRG Bayer pattern
+
+    //! Edge-Aware Demosaicing
+    COLOR_BayerBG2BGR_EA  = 135, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGR_EA  = 136, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGR_EA  = 137, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGR_EA  = 138, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGR_EA  = COLOR_BayerBG2BGR_EA,
+    COLOR_BayerGRBG2BGR_EA  = COLOR_BayerGB2BGR_EA,
+    COLOR_BayerBGGR2BGR_EA  = COLOR_BayerRG2BGR_EA,
+    COLOR_BayerGBRG2BGR_EA  = COLOR_BayerGR2BGR_EA,
+
+    COLOR_BayerRGGB2RGB_EA  = COLOR_BayerBGGR2BGR_EA,
+    COLOR_BayerGRBG2RGB_EA  = COLOR_BayerGBRG2BGR_EA,
+    COLOR_BayerBGGR2RGB_EA  = COLOR_BayerRGGB2BGR_EA,
+    COLOR_BayerGBRG2RGB_EA  = COLOR_BayerGRBG2BGR_EA,
+
+    COLOR_BayerBG2RGB_EA  = COLOR_BayerRG2BGR_EA, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGB_EA  = COLOR_BayerGR2BGR_EA, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGB_EA  = COLOR_BayerBG2BGR_EA, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGB_EA  = COLOR_BayerGB2BGR_EA, //!< equivalent to GBRG Bayer pattern
+
+    //! Demosaicing with alpha channel
+    COLOR_BayerBG2BGRA = 139, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2BGRA = 140, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2BGRA = 141, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2BGRA = 142, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_BayerRGGB2BGRA = COLOR_BayerBG2BGRA,
+    COLOR_BayerGRBG2BGRA = COLOR_BayerGB2BGRA,
+    COLOR_BayerBGGR2BGRA = COLOR_BayerRG2BGRA,
+    COLOR_BayerGBRG2BGRA = COLOR_BayerGR2BGRA,
+
+    COLOR_BayerRGGB2RGBA = COLOR_BayerBGGR2BGRA,
+    COLOR_BayerGRBG2RGBA = COLOR_BayerGBRG2BGRA,
+    COLOR_BayerBGGR2RGBA = COLOR_BayerRGGB2BGRA,
+    COLOR_BayerGBRG2RGBA = COLOR_BayerGRBG2BGRA,
+
+    COLOR_BayerBG2RGBA = COLOR_BayerRG2BGRA, //!< equivalent to RGGB Bayer pattern
+    COLOR_BayerGB2RGBA = COLOR_BayerGR2BGRA, //!< equivalent to GRBG Bayer pattern
+    COLOR_BayerRG2RGBA = COLOR_BayerBG2BGRA, //!< equivalent to BGGR Bayer pattern
+    COLOR_BayerGR2RGBA = COLOR_BayerGB2BGRA, //!< equivalent to GBRG Bayer pattern
+
+    COLOR_COLORCVT_MAX  = 143 //!< one past the largest conversion code defined above (142)
+};
+
+//! @addtogroup imgproc_shape
+//! @{
+
+//! types of intersection between rectangles
+enum RectanglesIntersectTypes {
+    INTERSECT_NONE = 0, //!< No intersection
+    INTERSECT_PARTIAL  = 1, //!< There is a partial intersection
+    INTERSECT_FULL  = 2 //!< One of the rectangles is fully enclosed in the other
+};
+
+/** types of line
+@ingroup imgproc_draw
+*/
+enum LineTypes {
+    FILLED  = -1, //!< NOTE(review): presumably requests a filled shape rather than an outline - confirm against the drawing functions
+    LINE_4  = 4, //!< 4-connected line
+    LINE_8  = 8, //!< 8-connected line
+    LINE_AA = 16 //!< antialiased line
+};
+
+/** Only a subset of Hershey fonts <https://en.wikipedia.org/wiki/Hershey_fonts> is supported
+@ingroup imgproc_draw
+*/
+enum HersheyFonts {
+    FONT_HERSHEY_SIMPLEX        = 0, //!< normal size sans-serif font
+    FONT_HERSHEY_PLAIN          = 1, //!< small size sans-serif font
+    FONT_HERSHEY_DUPLEX         = 2, //!< normal size sans-serif font (more complex than FONT_HERSHEY_SIMPLEX)
+    FONT_HERSHEY_COMPLEX        = 3, //!< normal size serif font
+    FONT_HERSHEY_TRIPLEX        = 4, //!< normal size serif font (more complex than FONT_HERSHEY_COMPLEX)
+    FONT_HERSHEY_COMPLEX_SMALL  = 5, //!< smaller version of FONT_HERSHEY_COMPLEX
+    FONT_HERSHEY_SCRIPT_SIMPLEX = 6, //!< hand-writing style font
+    FONT_HERSHEY_SCRIPT_COMPLEX = 7, //!< more complex variant of FONT_HERSHEY_SCRIPT_SIMPLEX
+    FONT_ITALIC                 = 16 //!< flag for italic font (bit value 16, outside the 0..7 font ids above)
+};
+
+/** Set of marker types used by the cv::drawMarker function
+@ingroup imgproc_draw
+*/
+enum MarkerTypes
+{
+    MARKER_CROSS = 0,           //!< A crosshair marker shape
+    MARKER_TILTED_CROSS = 1,    //!< A 45 degree tilted crosshair marker shape
+    MARKER_STAR = 2,            //!< A star marker shape, combination of cross and tilted cross
+    MARKER_DIAMOND = 3,         //!< A diamond marker shape
+    MARKER_SQUARE = 4,          //!< A square marker shape
+    MARKER_TRIANGLE_UP = 5,     //!< An upwards pointing triangle marker shape
+    MARKER_TRIANGLE_DOWN = 6    //!< A downwards pointing triangle marker shape
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform
+*/
+class CV_EXPORTS_W GeneralizedHough : public Algorithm
+{
+public:
+    //! Set the template to search for (from an image, or from precomputed edges and gradients).
+    CV_WRAP virtual void setTemplate(InputArray templ, Point templCenter = Point(-1, -1)) = 0;
+    CV_WRAP virtual void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1)) = 0;
+
+    //! Find the template in an image (given as an image, or as precomputed edges and gradients).
+    CV_WRAP virtual void detect(InputArray image, OutputArray positions, OutputArray votes = noArray()) = 0;
+    CV_WRAP virtual void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes = noArray()) = 0;
+
+    //! Canny low threshold.
+    CV_WRAP virtual void setCannyLowThresh(int cannyLowThresh) = 0;
+    CV_WRAP virtual int getCannyLowThresh() const = 0;
+
+    //! Canny high threshold.
+    CV_WRAP virtual void setCannyHighThresh(int cannyHighThresh) = 0;
+    CV_WRAP virtual int getCannyHighThresh() const = 0;
+
+    //! Minimum distance between the centers of the detected objects.
+    CV_WRAP virtual void setMinDist(double minDist) = 0;
+    CV_WRAP virtual double getMinDist() const = 0;
+
+    //! Inverse ratio of the accumulator resolution to the image resolution.
+    CV_WRAP virtual void setDp(double dp) = 0;
+    CV_WRAP virtual double getDp() const = 0;
+
+    //! Maximal size of inner buffers.
+    CV_WRAP virtual void setMaxBufferSize(int maxBufferSize) = 0;
+    CV_WRAP virtual int getMaxBufferSize() const = 0;
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform
+
+Detects position only, without scale and rotation @cite Ballard1981 .
+*/
+class CV_EXPORTS_W GeneralizedHoughBallard : public GeneralizedHough
+{
+public:
+    //! R-Table levels.
+    CV_WRAP virtual void setLevels(int levels) = 0;
+    CV_WRAP virtual int getLevels() const = 0;
+
+    //! The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.
+    CV_WRAP virtual void setVotesThreshold(int votesThreshold) = 0;
+    CV_WRAP virtual int getVotesThreshold() const = 0;
+};
+
+/** @brief Finds an arbitrary template in a grayscale image using the Generalized Hough Transform
+
+Detects position, scale and rotation @cite Guil1999 .
+*/
+class CV_EXPORTS_W GeneralizedHoughGuil : public GeneralizedHough
+{
+public:
+    //! Angle difference in degrees between two points in feature.
+    CV_WRAP virtual void setXi(double xi) = 0;
+    CV_WRAP virtual double getXi() const = 0;
+
+    //! Feature table levels.
+    CV_WRAP virtual void setLevels(int levels) = 0;
+    CV_WRAP virtual int getLevels() const = 0;
+
+    //! Maximal difference between angles that are treated as equal.
+    CV_WRAP virtual void setAngleEpsilon(double angleEpsilon) = 0;
+    CV_WRAP virtual double getAngleEpsilon() const = 0;
+
+    //! Minimal rotation angle to detect in degrees.
+    CV_WRAP virtual void setMinAngle(double minAngle) = 0;
+    CV_WRAP virtual double getMinAngle() const = 0;
+
+    //! Maximal rotation angle to detect in degrees.
+    CV_WRAP virtual void setMaxAngle(double maxAngle) = 0;
+    CV_WRAP virtual double getMaxAngle() const = 0;
+
+    //! Angle step in degrees.
+    CV_WRAP virtual void setAngleStep(double angleStep) = 0;
+    CV_WRAP virtual double getAngleStep() const = 0;
+
+    //! Angle votes threshold.
+    CV_WRAP virtual void setAngleThresh(int angleThresh) = 0;
+    CV_WRAP virtual int getAngleThresh() const = 0;
+
+    //! Minimal scale to detect.
+    CV_WRAP virtual void setMinScale(double minScale) = 0;
+    CV_WRAP virtual double getMinScale() const = 0;
+
+    //! Maximal scale to detect.
+    CV_WRAP virtual void setMaxScale(double maxScale) = 0;
+    CV_WRAP virtual double getMaxScale() const = 0;
+
+    //! Scale step.
+    CV_WRAP virtual void setScaleStep(double scaleStep) = 0;
+    CV_WRAP virtual double getScaleStep() const = 0;
+
+    //! Scale votes threshold.
+    CV_WRAP virtual void setScaleThresh(int scaleThresh) = 0;
+    CV_WRAP virtual int getScaleThresh() const = 0;
+
+    //! Position votes threshold.
+    CV_WRAP virtual void setPosThresh(int posThresh) = 0;
+    CV_WRAP virtual int getPosThresh() const = 0;
+};
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_hist
+//! @{
+
+/** @brief Base class for Contrast Limited Adaptive Histogram Equalization.
+*/
+class CV_EXPORTS_W CLAHE : public Algorithm
+{
+public:
+    /** @brief Equalizes the histogram of a grayscale image using Contrast Limited Adaptive Histogram Equalization.
+
+    @param src Source image of type CV_8UC1 or CV_16UC1.
+    @param dst Destination image.
+     */
+    CV_WRAP virtual void apply(InputArray src, OutputArray dst) = 0;
+
+    /** @brief Sets threshold for contrast limiting.
+
+    @param clipLimit threshold value.
+    */
+    CV_WRAP virtual void setClipLimit(double clipLimit) = 0;
+
+    //! Returns threshold value for contrast limiting.
+    CV_WRAP virtual double getClipLimit() const = 0;
+
+    /** @brief Sets size of grid for histogram equalization. Input image will be divided into
+    equally sized rectangular tiles.
+
+    @param tileGridSize defines the number of tiles in row and column.
+    */
+    CV_WRAP virtual void setTilesGridSize(Size tileGridSize) = 0;
+
+    //! Returns the Size that defines the number of tiles in row and column.
+    CV_WRAP virtual Size getTilesGridSize() const = 0;
+
+    CV_WRAP virtual void collectGarbage() = 0;
+};
+
+//! @} imgproc_hist
+
+//! @addtogroup imgproc_subdiv2d
+//! @{
+
+class CV_EXPORTS_W Subdiv2D
+{
+public:
+    /** Subdiv2D point location cases */
+    enum { PTLOC_ERROR        = -2, //!< Point location error
+           PTLOC_OUTSIDE_RECT = -1, //!< Point outside the subdivision bounding rect
+           PTLOC_INSIDE       = 0, //!< Point inside some facet
+           PTLOC_VERTEX       = 1, //!< Point coincides with one of the subdivision vertices
+           PTLOC_ON_EDGE      = 2  //!< Point on some edge
+         };
+
+    /** Subdiv2D edge type navigation (see: getEdge()) */
+    enum { NEXT_AROUND_ORG   = 0x00,
+           NEXT_AROUND_DST   = 0x22,
+           PREV_AROUND_ORG   = 0x11,
+           PREV_AROUND_DST   = 0x33,
+           NEXT_AROUND_LEFT  = 0x13,
+           NEXT_AROUND_RIGHT = 0x31,
+           PREV_AROUND_LEFT  = 0x20,
+           PREV_AROUND_RIGHT = 0x02
+         };
+
+    /** Creates an empty Subdiv2D object.
+    To create a new empty Delaunay subdivision you need to use the #initDelaunay function.
+     */
+    CV_WRAP Subdiv2D();
+
+    /** @overload
+
+    @param rect Rectangle that includes all of the 2D points that are to be added to the subdivision.
+
+    The function creates an empty Delaunay subdivision where 2D points can be added using the function
+    insert() . All of the points to be added must be within the specified rectangle, otherwise a runtime
+    error is raised.
+     */
+    CV_WRAP Subdiv2D(Rect rect);
+
+    /** @brief Creates a new empty Delaunay subdivision
+
+    @param rect Rectangle that includes all of the 2D points that are to be added to the subdivision.
+
+     */
+    CV_WRAP void initDelaunay(Rect rect);
+
+    /** @brief Insert a single point into a Delaunay triangulation.
+
+    @param pt Point to insert.
+
+    The function inserts a single point into a subdivision and modifies the subdivision topology
+    appropriately. If a point with the same coordinates exists already, no new point is added.
+    @returns the ID of the point.
+
+    @note If the point is outside of the triangulation specified rect a runtime error is raised.
+     */
+    CV_WRAP int insert(Point2f pt);
+
+    /** @brief Insert multiple points into a Delaunay triangulation.
+
+    @param ptvec Points to insert.
+
+    The function inserts a vector of points into a subdivision and modifies the subdivision topology
+    appropriately.
+     */
+    CV_WRAP void insert(const std::vector<Point2f>& ptvec);
+
+    /** @brief Returns the location of a point within a Delaunay triangulation.
+
+    @param pt Point to locate.
+    @param edge Output edge that the point belongs to or is located to the right of it.
+    @param vertex Optional output vertex the input point coincides with.
+
+    The function locates the input point within the subdivision and gives one of the triangle edges
+    or vertices.
+
+    @returns an integer which specifies one of the following five cases for point location:
+    -  The point falls into some facet. The function returns #PTLOC_INSIDE and edge will contain one of
+       edges of the facet.
+    -  The point falls onto the edge. The function returns #PTLOC_ON_EDGE and edge will contain this edge.
+    -  The point coincides with one of the subdivision vertices. The function returns #PTLOC_VERTEX and
+       vertex will contain a pointer to the vertex.
+    -  The point is outside the subdivision reference rectangle. The function returns #PTLOC_OUTSIDE_RECT
+       and no pointers are filled.
+    -  One of input arguments is invalid. A runtime error is raised or, if silent or "parent" error
+       processing mode is selected, #PTLOC_ERROR is returned.
+     */
+    CV_WRAP int locate(Point2f pt, CV_OUT int& edge, CV_OUT int& vertex);
+
+    /** @brief Finds the subdivision vertex closest to the given point.
+
+    @param pt Input point.
+    @param nearestPt Output subdivision vertex point.
+
+    This is another function that locates the input point within the subdivision. It finds the
+    subdivision vertex that is the closest to the input point. It is not necessarily one of vertices
+    of the facet containing the input point, though the facet (located using locate() ) is used as a
+    starting point.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int findNearest(Point2f pt, CV_OUT Point2f* nearestPt = 0);
+
+    /** @brief Returns a list of all edges.
+
+    @param edgeList Output vector.
+
+    The function gives each edge as a vector of 4 numbers, where each pair is one of the edge
+    vertices, i.e. org_x = v[0], org_y = v[1], dst_x = v[2], dst_y = v[3].
+     */
+    CV_WRAP void getEdgeList(CV_OUT std::vector<Vec4f>& edgeList) const;
+
+    /** @brief Returns a list of the leading edge ID connected to each triangle.
+
+    @param leadingEdgeList Output vector.
+
+    The function gives one edge ID for each triangle.
+     */
+    CV_WRAP void getLeadingEdgeList(CV_OUT std::vector<int>& leadingEdgeList) const;
+
+    /** @brief Returns a list of all triangles.
+
+    @param triangleList Output vector.
+
+    The function gives each triangle as a vector of 6 numbers, where each pair is one of the triangle
+    vertices, i.e. p1_x = v[0], p1_y = v[1], p2_x = v[2], p2_y = v[3], p3_x = v[4], p3_y = v[5].
+     */
+    CV_WRAP void getTriangleList(CV_OUT std::vector<Vec6f>& triangleList) const;
+
+    /** @brief Returns a list of all Voronoi facets.
+
+    @param idx Vector of vertex IDs to consider. For all vertices you can pass an empty vector.
+    @param facetList Output vector of the Voronoi facets.
+    @param facetCenters Output vector of the Voronoi facets center points.
+
+     */
+    CV_WRAP void getVoronoiFacetList(const std::vector<int>& idx, CV_OUT std::vector<std::vector<Point2f> >& facetList,
+                                     CV_OUT std::vector<Point2f>& facetCenters);
+
+    /** @brief Returns vertex location from vertex ID.
+
+    @param vertex vertex ID.
+    @param firstEdge Optional. The first edge ID which is connected to the vertex.
+    @returns vertex (x,y)
+
+     */
+    CV_WRAP Point2f getVertex(int vertex, CV_OUT int* firstEdge = 0) const;
+
+    /** @brief Returns one of the edges related to the given edge.
+
+    @param edge Subdivision edge ID.
+    @param nextEdgeType Parameter specifying which of the related edges to return.
+    The following values are possible:
+    -   NEXT_AROUND_ORG next around the edge origin ( eOnext on the picture below if e is the input edge)
+    -   NEXT_AROUND_DST next around the edge destination ( eDnext )
+    -   PREV_AROUND_ORG previous around the edge origin (reversed eRnext )
+    -   PREV_AROUND_DST previous around the edge destination (reversed eLnext )
+    -   NEXT_AROUND_LEFT next around the left facet ( eLnext )
+    -   NEXT_AROUND_RIGHT next around the right facet ( eRnext )
+    -   PREV_AROUND_LEFT previous around the left facet (reversed eOnext )
+    -   PREV_AROUND_RIGHT previous around the right facet (reversed eDnext )
+
+    ![sample output](pics/quadedge.png)
+
+    @returns edge ID related to the input edge.
+     */
+    CV_WRAP int getEdge( int edge, int nextEdgeType ) const;
+
+    /** @brief Returns next edge around the edge origin.
+
+    @param edge Subdivision edge ID.
+
+    @returns an integer which is the next edge ID around the edge origin ( eOnext on the
+    picture above if e is the input edge).
+     */
+    CV_WRAP int nextEdge(int edge) const;
+
+    /** @brief Returns another edge of the same quad-edge.
+
+    @param edge Subdivision edge ID.
+    @param rotate Parameter specifying which of the edges of the same quad-edge as the input
+    one to return. The following values are possible:
+    -   0 - the input edge ( e on the picture below if e is the input edge)
+    -   1 - the rotated edge ( eRot )
+    -   2 - the reversed edge (reversed e (in green))
+    -   3 - the reversed rotated edge (reversed eRot (in green))
+
+    @returns one of the edges ID of the same quad-edge as the input edge.
+     */
+    CV_WRAP int rotateEdge(int edge, int rotate) const;
+    CV_WRAP int symEdge(int edge) const;
+
+    /** @brief Returns the edge origin.
+
+    @param edge Subdivision edge ID.
+    @param orgpt Output vertex location.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int edgeOrg(int edge, CV_OUT Point2f* orgpt = 0) const;
+
+    /** @brief Returns the edge destination.
+
+    @param edge Subdivision edge ID.
+    @param dstpt Output vertex location.
+
+    @returns vertex ID.
+     */
+    CV_WRAP int edgeDst(int edge, CV_OUT Point2f* dstpt = 0) const;
+
+protected:
+    int newEdge();
+    void deleteEdge(int edge);
+    int newPoint(Point2f pt, bool isvirtual, int firstEdge = 0);
+    void deletePoint(int vtx);
+    void setEdgePoints( int edge, int orgPt, int dstPt );
+    void splice( int edgeA, int edgeB );
+    int connectEdges( int edgeA, int edgeB );
+    void swapEdges( int edge );
+    int isRightOf(Point2f pt, int edge) const;
+    void calcVoronoi();
+    void clearVoronoi();
+    void checkSubdiv() const;
+
+    struct CV_EXPORTS Vertex
+    {
+        Vertex();
+        Vertex(Point2f pt, bool isvirtual, int firstEdge=0);
+        bool isvirtual() const;
+        bool isfree() const;
+
+        int firstEdge;
+        int type;
+        Point2f pt;
+    };
+
+    struct CV_EXPORTS QuadEdge
+    {
+        QuadEdge();
+        QuadEdge(int edgeidx);
+        bool isfree() const;
+
+        int next[4];
+        int pt[4];
+    };
+
+    //! All of the vertices
+    std::vector<Vertex> vtx;
+    //! All of the edges
+    std::vector<QuadEdge> qedges;
+    int freeQEdge;
+    int freePoint;
+    bool validGeometry;
+
+    int recentEdge;
+    //! Top left corner of the bounding rect
+    Point2f topLeft;
+    //! Bottom right corner of the bounding rect
+    Point2f bottomRight;
+};
+
+//! @} imgproc_subdiv2d
+
+//! @addtogroup imgproc_feature
+//! @{
+
+/** @example samples/cpp/lsd_lines.cpp
+An example using the LineSegmentDetector
+\image html building_lsd.png "Sample output image" width=434 height=300
+*/
+
+/** @brief Line segment detector class
+
+following the algorithm described at @cite Rafael12 .
+
+@note Implementation has been removed from OpenCV version 3.4.6 to 3.4.15 and version 4.1.0 to 4.5.3 due to original code license conflict.
+It was restored after the [Computation of a NFA](https://github.com/rafael-grompone-von-gioi/binomial_nfa) code was published under the MIT license.
+*/
+class CV_EXPORTS_W LineSegmentDetector : public Algorithm
+{
+public:
+
+    /** @brief Finds lines in the input image.
+
+    This is the output of the default parameters of the algorithm on the above shown image.
+
+    ![image](pics/building_lsd.png)
+
+    @param image A grayscale (CV_8UC1) input image. If only a roi needs to be selected, use:
+    `lsd_ptr-\>detect(image(roi), lines, ...); lines += Scalar(roi.x, roi.y, roi.x, roi.y);`
+    @param lines A vector of Vec4f elements specifying the beginning and ending point of a line. Where
+    Vec4f is (x1, y1, x2, y2), point 1 is the start, point 2 - end. Returned lines are strictly
+    oriented depending on the gradient.
+    @param width Vector of widths of the regions, where the lines are found. E.g. Width of line.
+    @param prec Vector of precisions with which the lines are found.
+    @param nfa Vector containing number of false alarms in the line region, with precision of 10%. The
+    bigger the value, logarithmically better the detection.
+    - -1 corresponds to 10 mean false alarms
+    - 0 corresponds to 1 mean false alarm
+    - 1 corresponds to 0.1 mean false alarms
+    This vector will be calculated only when the object's type is #LSD_REFINE_ADV.
+    */
+    CV_WRAP virtual void detect(InputArray image, OutputArray lines,
+                        OutputArray width = noArray(), OutputArray prec = noArray(),
+                        OutputArray nfa = noArray()) = 0;
+
+    /** @brief Draws the line segments on a given image.
+    @param image The image, where the lines will be drawn. Should be bigger or equal to the image,
+    where the lines were found.
+    @param lines A vector of the lines that needed to be drawn.
+     */
+    CV_WRAP virtual void drawSegments(InputOutputArray image, InputArray lines) = 0;
+
+    /** @brief Draws two groups of lines in blue and red, counting the non overlapping (mismatching) pixels.
+
+    @param size The size of the image, where lines1 and lines2 were found.
+    @param lines1 The first group of lines that needs to be drawn. It is visualized in blue color.
+    @param lines2 The second group of lines. It is visualized in red color.
+    @param image Optional image, where the lines will be drawn. The image should be color(3-channel)
+    in order for lines1 and lines2 to be drawn in the above mentioned colors.
+     */
+    CV_WRAP virtual int compareSegments(const Size& size, InputArray lines1, InputArray lines2, InputOutputArray image = noArray()) = 0;
+
+    virtual ~LineSegmentDetector() { }
+};
+
+/** @brief Creates a smart pointer to a LineSegmentDetector object and initializes it.
+
+The LineSegmentDetector algorithm is defined using the standard values. Only advanced users may want
+to change them in order to tailor the detector to their own application.
+
+@param refine The way found lines will be refined, see #LineSegmentDetectorModes
+@param scale The scale of the image that will be used to find the lines. Range (0..1].
+@param sigma_scale Sigma for Gaussian filter. It is computed as sigma = sigma_scale/scale.
+@param quant Bound to the quantization error on the gradient norm.
+@param ang_th Gradient angle tolerance in degrees.
+@param log_eps Detection threshold: -log10(NFA) \> log_eps. Used only when advanced refinement is chosen.
+@param density_th Minimal density of aligned region points in the enclosing rectangle.
+@param n_bins Number of bins in pseudo-ordering of gradient modulus.
+ */
+CV_EXPORTS_W Ptr<LineSegmentDetector> createLineSegmentDetector(
+    int refine = LSD_REFINE_STD, double scale = 0.8,
+    double sigma_scale = 0.6, double quant = 2.0, double ang_th = 22.5,
+    double log_eps = 0, double density_th = 0.7, int n_bins = 1024);
+
+//! @} imgproc_feature
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @brief Returns Gaussian filter coefficients.
+
+The function computes and returns the \f$\texttt{ksize} \times 1\f$ matrix of Gaussian filter
+coefficients:
+
+\f[G_i= \alpha *e^{-(i-( \texttt{ksize} -1)/2)^2/(2* \texttt{sigma}^2)},\f]
+
+where \f$i=0..\texttt{ksize}-1\f$ and \f$\alpha\f$ is the scale factor chosen so that \f$\sum_i G_i=1\f$.
+
+Two such generated kernels can be passed to sepFilter2D. Those functions automatically recognize
+smoothing kernels (a symmetrical kernel with sum of weights equal to 1) and handle them accordingly.
+You may also use the higher-level GaussianBlur.
+@param ksize Aperture size. It should be odd ( \f$\texttt{ksize} \mod 2 = 1\f$ ) and positive.
+@param sigma Gaussian standard deviation. If it is non-positive, it is computed from ksize as
+`sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`.
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+@sa  sepFilter2D, getDerivKernels, getStructuringElement, GaussianBlur
+ */
+CV_EXPORTS_W Mat getGaussianKernel( int ksize, double sigma, int ktype = CV_64F );
+
+/** @brief Returns filter coefficients for computing spatial image derivatives.
+
+The function computes and returns the filter coefficients for spatial image derivatives. When
+`ksize=FILTER_SCHARR`, the Scharr \f$3 \times 3\f$ kernels are generated (see #Scharr). Otherwise, Sobel
+kernels are generated (see #Sobel). The filters are normally passed to #sepFilter2D.
+
+@param kx Output matrix of row filter coefficients. It has the type ktype .
+@param ky Output matrix of column filter coefficients. It has the type ktype .
+@param dx Derivative order in respect of x.
+@param dy Derivative order in respect of y.
+@param ksize Aperture size. It can be FILTER_SCHARR, 1, 3, 5, or 7.
+@param normalize Flag indicating whether to normalize (scale down) the filter coefficients or not.
+Theoretically, the coefficients should have the denominator \f$=2^{ksize*2-dx-dy-2}\f$. If you are
+going to filter floating-point images, you are likely to use the normalized kernels. But if you
+compute derivatives of an 8-bit image, store the results in a 16-bit image, and wish to preserve
+all the fractional bits, you may want to set normalize=false .
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+ */
+CV_EXPORTS_W void getDerivKernels( OutputArray kx, OutputArray ky,
+                                   int dx, int dy, int ksize,
+                                   bool normalize = false, int ktype = CV_32F );
+
+/** @brief Returns Gabor filter coefficients.
+
+For more details about Gabor filter equations and parameters, see: [Gabor
+Filter](http://en.wikipedia.org/wiki/Gabor_filter).
+
+@param ksize Size of the filter returned.
+@param sigma Standard deviation of the Gaussian envelope.
+@param theta Orientation of the normal to the parallel stripes of a Gabor function.
+@param lambd Wavelength of the sinusoidal factor.
+@param gamma Spatial aspect ratio.
+@param psi Phase offset.
+@param ktype Type of filter coefficients. It can be CV_32F or CV_64F .
+ */
+CV_EXPORTS_W Mat getGaborKernel( Size ksize, double sigma, double theta, double lambd,
+                                 double gamma, double psi = CV_PI*0.5, int ktype = CV_64F );
+
+//! Returns the "magic" border value (Scalar::all(DBL_MAX)) for erosion and dilation. It is automatically transformed to Scalar::all(-DBL_MAX) for dilation.
+static inline Scalar morphologyDefaultBorderValue() { return Scalar::all(DBL_MAX); }
+
+/** @brief Returns a structuring element of the specified size and shape for morphological operations.
+
+The function constructs and returns the structuring element that can be further passed to #erode,
+#dilate or #morphologyEx. But you can also construct an arbitrary binary mask yourself and use it as
+the structuring element.
+
+@param shape Element shape; one of #MorphShapes.
+@param ksize Size of the structuring element.
+@param anchor Anchor position within the element. The default value \f$(-1, -1)\f$ means that the
+anchor is at the center. Note that only the shape of a cross-shaped element depends on the anchor
+position. In other cases the anchor just regulates how much the result of the morphological
+operation is shifted.
+ */
+CV_EXPORTS_W Mat getStructuringElement(int shape, Size ksize, Point anchor = Point(-1,-1));
+
+/** @example samples/cpp/tutorial_code/ImgProc/Smoothing/Smoothing.cpp
+Sample code for simple filters
+![Sample screenshot](Smoothing_Tutorial_Result_Median_Filter.jpg)
+Check @ref tutorial_gausian_median_blur_bilateral_filter "the corresponding tutorial" for more details
+ */
+
+/** @brief Blurs an image using the median filter.
+
+The function smooths an image using the median filter with the \f$\texttt{ksize} \times
+\texttt{ksize}\f$ aperture. Each channel of a multi-channel image is processed independently.
+In-place operation is supported.
+
+@note The median filter uses #BORDER_REPLICATE internally to cope with border pixels, see #BorderTypes
+
+@param src input 1-, 3-, or 4-channel image; when ksize is 3 or 5, the image depth should be
+CV_8U, CV_16U, or CV_32F, for larger aperture sizes, it can only be CV_8U.
+@param dst destination array of the same size and type as src.
+@param ksize aperture linear size; it must be odd and greater than 1, for example: 3, 5, 7 ...
+@sa  bilateralFilter, blur, boxFilter, GaussianBlur
+ */
+CV_EXPORTS_W void medianBlur( InputArray src, OutputArray dst, int ksize );
+
+/** @brief Blurs an image using a Gaussian filter.
+
+The function convolves the source image with the specified Gaussian kernel. In-place filtering is
+supported.
+
+@param src input image; the image can have any number of channels, which are processed
+independently, but the depth should be CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param ksize Gaussian kernel size. ksize.width and ksize.height can differ but they both must be
+positive and odd. Or, they can be zeros, and then they are computed from sigma.
+@param sigmaX Gaussian kernel standard deviation in X direction.
+@param sigmaY Gaussian kernel standard deviation in Y direction; if sigmaY is zero, it is set to be
+equal to sigmaX, if both sigmas are zeros, they are computed from ksize.width and ksize.height,
+respectively (see #getGaussianKernel for details); to fully control the result regardless of
+possible future modifications of all these semantics, it is recommended to specify all of ksize,
+sigmaX, and sigmaY.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+
+@sa  sepFilter2D, filter2D, blur, boxFilter, bilateralFilter, medianBlur
+ */
+CV_EXPORTS_W void GaussianBlur( InputArray src, OutputArray dst, Size ksize,
+                                double sigmaX, double sigmaY = 0,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Applies the bilateral filter to an image.
+
+The function applies bilateral filtering to the input image, as described in
+http://www.dai.ed.ac.uk/CVonline/LOCAL_COPIES/MANDUCHI1/Bilateral_Filtering.html
+bilateralFilter can reduce unwanted noise very well while keeping edges fairly sharp. However, it is
+very slow compared to most filters.
+
+_Sigma values_: For simplicity, you can set the 2 sigma values to be the same. If they are small (\<
+10), the filter will not have much effect, whereas if they are large (\> 150), they will have a very
+strong effect, making the image look "cartoonish".
+
+_Filter size_: Large filters (d \> 5) are very slow, so it is recommended to use d=5 for real-time
+applications, and perhaps d=9 for offline applications that need heavy noise filtering.
+
+This filter does not work in place.
+@param src Source 8-bit or floating-point, 1-channel or 3-channel image.
+@param dst Destination image of the same size and type as src .
+@param d Diameter of each pixel neighborhood that is used during filtering. If it is non-positive,
+it is computed from sigmaSpace.
+@param sigmaColor Filter sigma in the color space. A larger value of the parameter means that
+farther colors within the pixel neighborhood (see sigmaSpace) will be mixed together, resulting
+in larger areas of semi-equal color.
+@param sigmaSpace Filter sigma in the coordinate space. A larger value of the parameter means that
+farther pixels will influence each other as long as their colors are close enough (see sigmaColor
+). When d\>0, it specifies the neighborhood size regardless of sigmaSpace. Otherwise, d is
+proportional to sigmaSpace.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes
+ */
+CV_EXPORTS_W void bilateralFilter( InputArray src, OutputArray dst, int d,
+                                   double sigmaColor, double sigmaSpace,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Blurs an image using the box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \alpha \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1 \end{bmatrix}\f]
+
+where
+
+\f[\alpha = \begin{cases} \frac{1}{\texttt{ksize.width*ksize.height}} & \texttt{when } \texttt{normalize=true}  \\1 & \texttt{otherwise}\end{cases}\f]
+
+Unnormalized box filter is useful for computing various integral characteristics over each pixel
+neighborhood, such as covariance matrices of image derivatives (used in dense optical flow
+algorithms, and so on). If you need to compute pixel sums over variable-size windows, use #integral.
+
+@param src input image.
+@param dst output image of the same size and number of channels as src; its depth is set by ddepth.
+@param ddepth the output image depth (-1 to use src.depth()).
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param normalize flag, specifying whether the kernel is normalized by its area or not.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  blur, bilateralFilter, GaussianBlur, medianBlur, integral
+ */
+CV_EXPORTS_W void boxFilter( InputArray src, OutputArray dst, int ddepth,
+                             Size ksize, Point anchor = Point(-1,-1),
+                             bool normalize = true,
+                             int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the normalized sum of squares of the pixel values overlapping the filter.
+
+For every pixel \f$ (x, y) \f$ in the source image, the function calculates the sum of squares of those neighboring
+pixel values which overlap the filter placed over the pixel \f$ (x, y) \f$.
+
+The unnormalized square box filter can be useful in computing local image statistics such as the local
+variance and standard deviation around the neighborhood of a pixel.
+
+@param src input image
+@param dst output image of the same size and number of channels as src; its depth is set by ddepth
+@param ddepth the output image depth (-1 to use src.depth())
+@param ksize kernel size
+@param anchor kernel anchor point. The default value of Point(-1, -1) denotes that the anchor is at the kernel
+center.
+@param normalize flag, specifying whether the kernel is to be normalized by its area or not.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa boxFilter
+*/
+CV_EXPORTS_W void sqrBoxFilter( InputArray src, OutputArray dst, int ddepth,
+                                Size ksize, Point anchor = Point(-1, -1),
+                                bool normalize = true,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Blurs an image using the normalized box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \frac{1}{\texttt{ksize.width*ksize.height}} \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \end{bmatrix}\f]
+
+The call `blur(src, dst, ksize, anchor, borderType)` is equivalent to `boxFilter(src, dst, -1, ksize,
+anchor, true, borderType)`, where a ddepth of -1 keeps the depth of src.
+
+@param src input image; it can have any number of channels, which are processed independently, but
+the depth should be CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  boxFilter, bilateralFilter, GaussianBlur, medianBlur
+ */
+CV_EXPORTS_W void blur( InputArray src, OutputArray dst,
+                        Size ksize, Point anchor = Point(-1,-1),
+                        int borderType = BORDER_DEFAULT );
+
+/** @brief Convolves an image with the kernel.
+
+The function applies an arbitrary linear filter to an image. In-place operation is supported. When
+the aperture is partially outside the image, the function extrapolates outlier pixel values
+according to the specified border mode.
+
+The function actually computes correlation, not convolution:
+
+\f[\texttt{dst} (x,y) =  \sum _{ \substack{0\leq x' < \texttt{kernel.cols}\\{0\leq y' < \texttt{kernel.rows}}}}  \texttt{kernel} (x',y')* \texttt{src} (x+x'- \texttt{anchor.x} ,y+y'- \texttt{anchor.y} )\f]
+
+That is, the kernel is not mirrored around the anchor point. If you need a real convolution, flip
+the kernel using #flip and set the new anchor to `(kernel.cols - anchor.x - 1, kernel.rows -
+anchor.y - 1)`.
+
+The function uses the DFT-based algorithm in case of sufficiently large kernels (~`11 x 11` or
+larger) and the direct algorithm for small kernels.
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param ddepth desired depth of the destination image, see @ref filter_depths "combinations"
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+@param delta optional value added to the filtered pixels before storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  sepFilter2D, dft, matchTemplate
+ */
+CV_EXPORTS_W void filter2D( InputArray src, OutputArray dst, int ddepth,
+                            InputArray kernel, Point anchor = Point(-1,-1),
+                            double delta = 0, int borderType = BORDER_DEFAULT );
+
+/** @brief Applies a separable linear filter to an image.
+
+The function applies a separable linear filter to the image. That is, first, every row of src is
+filtered with the 1D kernel kernelX. Then, every column of the result is filtered with the 1D
+kernel kernelY. The final result, shifted by delta, is stored in dst.
+
+@param src Source image.
+@param dst Destination image of the same size and the same number of channels as src .
+@param ddepth Destination image depth, see @ref filter_depths "combinations"
+@param kernelX Coefficients for filtering each row.
+@param kernelY Coefficients for filtering each column.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param delta Value added to the filtered results before storing them.
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  filter2D, Sobel, GaussianBlur, boxFilter, blur
+ */
+CV_EXPORTS_W void sepFilter2D( InputArray src, OutputArray dst, int ddepth,
+                               InputArray kernelX, InputArray kernelY,
+                               Point anchor = Point(-1,-1),
+                               double delta = 0, int borderType = BORDER_DEFAULT );
+
+/** @example samples/cpp/tutorial_code/ImgTrans/Sobel_Demo.cpp
+Sample code using Sobel and/or Scharr OpenCV functions to make a simple Edge Detector
+![Sample screenshot](Sobel_Derivatives_Tutorial_Result.jpg)
+Check @ref tutorial_sobel_derivatives "the corresponding tutorial" for more details
+*/
+
+/** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator.
+
+In all cases except one, the \f$\texttt{ksize} \times \texttt{ksize}\f$ separable kernel is used to
+calculate the derivative. When \f$\texttt{ksize = 1}\f$, the \f$3 \times 1\f$ or \f$1 \times 3\f$
+kernel is used (that is, no Gaussian smoothing is done). `ksize = 1` can only be used for the first
+or the second x- or y- derivatives.
+
+There is also the special value `ksize = #FILTER_SCHARR (-1)` that corresponds to the \f$3\times3\f$ Scharr
+filter that may give more accurate results than the \f$3\times3\f$ Sobel. The Scharr aperture is
+
+\f[\vecthreethree{-3}{0}{3}{-10}{0}{10}{-3}{0}{3}\f]
+
+for the x-derivative, or transposed for the y-derivative.
+
+The function calculates an image derivative by convolving the image with the appropriate kernel:
+
+\f[\texttt{dst} =  \frac{\partial^{xorder+yorder} \texttt{src}}{\partial x^{xorder} \partial y^{yorder}}\f]
+
+The Sobel operators combine Gaussian smoothing and differentiation, so the result is more or less
+resistant to the noise. Most often, the function is called with ( xorder = 1, yorder = 0, ksize = 3)
+or ( xorder = 0, yorder = 1, ksize = 3) to calculate the first x- or y- image derivative. The first
+case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{0}{1}{-2}{0}{2}{-1}{0}{1}\f]
+
+The second case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src .
+@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
+    8-bit input images it will result in truncated derivatives.
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param ksize size of the extended Sobel kernel; it must be 1, 3, 5, or 7.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see #getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  Scharr, Laplacian, sepFilter2D, filter2D, GaussianBlur, cartToPolar
+ */
+CV_EXPORTS_W void Sobel( InputArray src, OutputArray dst, int ddepth,
+                         int dx, int dy, int ksize = 3,
+                         double scale = 1, double delta = 0,
+                         int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the first order image derivative in both x and y using a Sobel operator
+
+Equivalent to calling:
+
+@code
+Sobel( src, dx, CV_16SC1, 1, 0, 3 );
+Sobel( src, dy, CV_16SC1, 0, 1, 3 );
+@endcode
+
+@param src input image.
+@param dx output image with first-order derivative in x.
+@param dy output image with first-order derivative in y.
+@param ksize size of Sobel kernel. It must be 3.
+@param borderType pixel extrapolation method, see #BorderTypes.
+                  Only #BORDER_DEFAULT=#BORDER_REFLECT_101 and #BORDER_REPLICATE are supported.
+
+@sa Sobel
+ */
+
+CV_EXPORTS_W void spatialGradient( InputArray src, OutputArray dx,
+                                   OutputArray dy, int ksize = 3,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates the first x- or y- image derivative using Scharr operator.
+
+The function computes the first x- or y- spatial image derivative using the Scharr operator. The
+call
+
+\f[\texttt{Scharr(src, dst, ddepth, dx, dy, scale, delta, borderType)}\f]
+
+is equivalent to
+
+\f[\texttt{Sobel(src, dst, ddepth, dx, dy, FILTER_SCHARR, scale, delta, borderType)} .\f]
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param ddepth output image depth, see @ref filter_depths "combinations"
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see #getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  cartToPolar
+ */
+CV_EXPORTS_W void Scharr( InputArray src, OutputArray dst, int ddepth,
+                          int dx, int dy, double scale = 1, double delta = 0,
+                          int borderType = BORDER_DEFAULT );
+
+/** @example samples/cpp/laplace.cpp
+An example using Laplace transformations for edge detection
+*/
+
+/** @brief Calculates the Laplacian of an image.
+
+The function calculates the Laplacian of the source image by adding up the second x and y
+derivatives calculated using the Sobel operator:
+
+\f[\texttt{dst} =  \Delta \texttt{src} =  \frac{\partial^2 \texttt{src}}{\partial x^2} +  \frac{\partial^2 \texttt{src}}{\partial y^2}\f]
+
+This is done when `ksize > 1`. When `ksize == 1`, the Laplacian is computed by filtering the image
+with the following \f$3 \times 3\f$ aperture:
+
+\f[\vecthreethree {0}{1}{0}{1}{-4}{1}{0}{1}{0}\f]
+
+@param src Source image.
+@param dst Destination image of the same size and the same number of channels as src .
+@param ddepth Desired depth of the destination image.
+@param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for
+details. The size must be positive and odd.
+@param scale Optional scale factor for the computed Laplacian values. By default, no scaling is
+applied. See #getDerivKernels for details.
+@param delta Optional delta value that is added to the results prior to storing them in dst .
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@sa  Sobel, Scharr
+ */
+CV_EXPORTS_W void Laplacian( InputArray src, OutputArray dst, int ddepth,
+                             int ksize = 1, double scale = 1, double delta = 0,
+                             int borderType = BORDER_DEFAULT );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_feature
+//! @{
+
+/** @example samples/cpp/edge.cpp
+This program demonstrates usage of the Canny edge detector
+
+Check @ref tutorial_canny_detector "the corresponding tutorial" for more details
+*/
+
+/** @brief Finds edges in an image using the Canny algorithm @cite Canny86 .
+
+The function finds edges in the input image and marks them in the output map edges using the
+Canny algorithm. The smaller of threshold1 and threshold2 is used for edge linking. The
+larger value is used to find initial segments of strong edges. See
+<http://en.wikipedia.org/wiki/Canny_edge_detector>
+
+@param image 8-bit input image.
+@param edges output edge map; single-channel 8-bit image, which has the same size as image .
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param apertureSize aperture size for the Sobel operator.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+CV_EXPORTS_W void Canny( InputArray image, OutputArray edges,
+                         double threshold1, double threshold2,
+                         int apertureSize = 3, bool L2gradient = false );
+
+/** \overload
+
+Finds edges in an image using the Canny algorithm with custom image gradient.
+
+@param dx 16-bit x derivative of input image (CV_16SC1 or CV_16SC3).
+@param dy 16-bit y derivative of input image (same type as dx).
+@param edges output edge map; single-channel 8-bit image, which has the same size as image .
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+CV_EXPORTS_W void Canny( InputArray dx, InputArray dy,
+                         OutputArray edges,
+                         double threshold1, double threshold2,
+                         bool L2gradient = false );
+
+/** @brief Calculates the minimal eigenvalue of gradient matrices for corner detection.
+
+The function is similar to cornerEigenValsAndVecs but it calculates and stores only the minimal
+eigenvalue of the covariance matrix of derivatives, that is, \f$\min(\lambda_1, \lambda_2)\f$ in terms
+of the formulae in the cornerEigenValsAndVecs description.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the minimal eigenvalues. It has the type CV_32FC1 and the same size as
+src .
+@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
+@param ksize Aperture parameter for the Sobel operator.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void cornerMinEigenVal( InputArray src, OutputArray dst,
+                                     int blockSize, int ksize = 3,
+                                     int borderType = BORDER_DEFAULT );
+
+/** @brief Harris corner detector.
+
+The function runs the Harris corner detector on the image. Similarly to cornerMinEigenVal and
+cornerEigenValsAndVecs , for each pixel \f$(x, y)\f$ it calculates a \f$2\times2\f$ gradient covariance
+matrix \f$M^{(x,y)}\f$ over a \f$\texttt{blockSize} \times \texttt{blockSize}\f$ neighborhood. Then, it
+computes the following characteristic:
+
+\f[\texttt{dst} (x,y) =  \mathrm{det} M^{(x,y)} - k  \cdot \left ( \mathrm{tr} M^{(x,y)} \right )^2\f]
+
+Corners in the image can be found as the local maxima of this response map.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the Harris detector responses. It has the type CV_32FC1 and the same
+size as src .
+@param blockSize Neighborhood size (see the details on #cornerEigenValsAndVecs ).
+@param ksize Aperture parameter for the Sobel operator.
+@param k Harris detector free parameter. See the formula above.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void cornerHarris( InputArray src, OutputArray dst, int blockSize,
+                                int ksize, double k,
+                                int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates eigenvalues and eigenvectors of image blocks for corner detection.
+
+For every pixel \f$p\f$ , the function cornerEigenValsAndVecs considers a blockSize \f$\times\f$ blockSize
+neighborhood \f$S(p)\f$ . It calculates the covariation matrix of derivatives over the neighborhood as:
+
+\f[M =  \begin{bmatrix} \sum _{S(p)}(dI/dx)^2 &  \sum _{S(p)}dI/dx dI/dy  \\ \sum _{S(p)}dI/dx dI/dy &  \sum _{S(p)}(dI/dy)^2 \end{bmatrix}\f]
+
+where the derivatives are computed using the Sobel operator.
+
+After that, it finds eigenvectors and eigenvalues of \f$M\f$ and stores them in the destination image as
+\f$(\lambda_1, \lambda_2, x_1, y_1, x_2, y_2)\f$ where
+
+-   \f$\lambda_1, \lambda_2\f$ are the non-sorted eigenvalues of \f$M\f$
+-   \f$x_1, y_1\f$ are the components of the eigenvector corresponding to \f$\lambda_1\f$
+-   \f$x_2, y_2\f$ are the components of the eigenvector corresponding to \f$\lambda_2\f$
+
+The output of the function can be used for robust edge or corner detection.
+
+@param src Input single-channel 8-bit or floating-point image.
+@param dst Image to store the results. It has the same size as src and the type CV_32FC(6) .
+@param blockSize Neighborhood size (see details above).
+@param ksize Aperture parameter for the Sobel operator.
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+
+@sa  cornerMinEigenVal, cornerHarris, preCornerDetect
+ */
+CV_EXPORTS_W void cornerEigenValsAndVecs( InputArray src, OutputArray dst,
+                                          int blockSize, int ksize,
+                                          int borderType = BORDER_DEFAULT );
+
+/** @brief Calculates a feature map for corner detection.
+
+The function calculates the complex spatial derivative-based function of the source image
+
+\f[\texttt{dst} = (D_x  \texttt{src} )^2  \cdot D_{yy}  \texttt{src} + (D_y  \texttt{src} )^2  \cdot D_{xx}  \texttt{src} - 2 D_x  \texttt{src} \cdot D_y  \texttt{src} \cdot D_{xy}  \texttt{src}\f]
+
+where \f$D_x\f$,\f$D_y\f$ are the first image derivatives, \f$D_{xx}\f$,\f$D_{yy}\f$ are the second image
+derivatives, and \f$D_{xy}\f$ is the mixed derivative.
+
+The corners can be found as local maximums of the functions, as shown below:
+@code
+    Mat corners, dilated_corners;
+    preCornerDetect(image, corners, 3);
+    // dilation with 3x3 rectangular structuring element
+    dilate(corners, dilated_corners, Mat(), 1);
+    Mat corner_mask = corners == dilated_corners;
+@endcode
+
+@param src Source single-channel 8-bit or floating-point image.
+@param dst Output image that has the type CV_32F and the same size as src .
+@param ksize %Aperture size of the Sobel .
+@param borderType Pixel extrapolation method. See #BorderTypes. #BORDER_WRAP is not supported.
+ */
+CV_EXPORTS_W void preCornerDetect( InputArray src, OutputArray dst, int ksize,
+                                   int borderType = BORDER_DEFAULT );
+
+/** @brief Refines the corner locations.
+
+The function iterates to find the sub-pixel accurate location of corners or radial saddle
+points as described in @cite forstner1987fast, and as shown on the figure below.
+
+![image](pics/cornersubpix.png)
+
+Sub-pixel accurate corner locator is based on the observation that every vector from the center \f$q\f$
+to a point \f$p\f$ located within a neighborhood of \f$q\f$ is orthogonal to the image gradient at \f$p\f$
+subject to image and measurement noise. Consider the expression:
+
+\f[\epsilon _i = {DI_{p_i}}^T  \cdot (q - p_i)\f]
+
+where \f${DI_{p_i}}\f$ is an image gradient at one of the points \f$p_i\f$ in a neighborhood of \f$q\f$ . The
+value of \f$q\f$ is to be found so that \f$\epsilon_i\f$ is minimized. A system of equations may be set up
+with \f$\epsilon_i\f$ set to zero:
+
+\f[\sum _i(DI_{p_i}  \cdot {DI_{p_i}}^T) \cdot q -  \sum _i(DI_{p_i}  \cdot {DI_{p_i}}^T  \cdot p_i) = 0\f]
+
+where the gradients are summed within a neighborhood ("search window") of \f$q\f$ . Calling the first
+gradient term \f$G\f$ and the second gradient term \f$b\f$ gives:
+
+\f[q = G^{-1}  \cdot b\f]
+
+The algorithm sets the center of the neighborhood window at this new center \f$q\f$ and then iterates
+until the center stays within a set threshold.
+
+@param image Input single-channel, 8-bit or float image.
+@param corners Initial coordinates of the input corners and refined coordinates provided for
+output.
+@param winSize Half of the side length of the search window. For example, if winSize=Size(5,5) ,
+then a \f$(5*2+1) \times (5*2+1) = 11 \times 11\f$ search window is used.
+@param zeroZone Half of the size of the dead region in the middle of the search zone over which
+the summation in the formula above is not done. It is used sometimes to avoid possible
+singularities of the autocorrelation matrix. The value of (-1,-1) indicates that there is no such
+size.
+@param criteria Criteria for termination of the iterative process of corner refinement. That is,
+the process of corner position refinement stops either after criteria.maxCount iterations or when
+the corner position moves by less than criteria.epsilon on some iteration.
+ */
+CV_EXPORTS_W void cornerSubPix( InputArray image, InputOutputArray corners,
+                                Size winSize, Size zeroZone,
+                                TermCriteria criteria );
+
+/** @brief Determines strong corners on an image.
+
+The function finds the most prominent corners in the image or in the specified image region, as
+described in @cite Shi94
+
+-   Function calculates the corner quality measure at every source image pixel using the
+    #cornerMinEigenVal or #cornerHarris .
+-   Function performs a non-maximum suppression (the local maximums in *3 x 3* neighborhood are
+    retained).
+-   The corners with the minimal eigenvalue less than
+    \f$\texttt{qualityLevel} \cdot \max_{x,y} qualityMeasureMap(x,y)\f$ are rejected.
+-   The remaining corners are sorted by the quality measure in the descending order.
+-   Function throws away each corner for which there is a stronger corner at a distance less than
+    maxDistance.
+
+The function can be used to initialize a point-based tracker of an object.
+
+@note If the function is called with different values A and B of the parameter qualityLevel , and
+A \> B, the vector of returned corners with qualityLevel=A will be the prefix of the output vector
+with qualityLevel=B .
+
+@param image Input 8-bit or floating-point 32-bit, single-channel image.
+@param corners Output vector of detected corners.
+@param maxCorners Maximum number of corners to return. If more corners are found than maxCorners,
+only the strongest of them are returned. `maxCorners <= 0` implies that no limit on the maximum is set
+and all detected corners are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see #cornerMinEigenVal ) or the Harris function response (see #cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param mask Optional region of interest. If the mask is not empty (it needs to have the type
+CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see #cornerHarris)
+or #cornerMinEigenVal.
+@param k Free parameter of the Harris detector.
+
+@sa  cornerMinEigenVal, cornerHarris, calcOpticalFlowPyrLK, estimateRigidTransform
+ */
+
+CV_EXPORTS_W void goodFeaturesToTrack( InputArray image, OutputArray corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray mask = noArray(), int blockSize = 3,
+                                     bool useHarrisDetector = false, double k = 0.04 );
+/** @overload This variant additionally takes gradientSize; see the @p gradientSize description on the cornersQuality overload below. */
+CV_EXPORTS_W void goodFeaturesToTrack( InputArray image, OutputArray corners,
+                                     int maxCorners, double qualityLevel, double minDistance,
+                                     InputArray mask, int blockSize,
+                                     int gradientSize, bool useHarrisDetector = false,
+                                     double k = 0.04 );
+
+/** @brief Same as above, but also returns the quality measure of the detected corners.
+
+@param image Input 8-bit or floating-point 32-bit, single-channel image.
+@param corners Output vector of detected corners.
+@param maxCorners Maximum number of corners to return. If more corners are found than maxCorners,
+only the strongest of them are returned. `maxCorners <= 0` implies that no limit on the maximum is set
+and all detected corners are returned.
+@param qualityLevel Parameter characterizing the minimal accepted quality of image corners. The
+parameter value is multiplied by the best corner quality measure, which is the minimal eigenvalue
+(see #cornerMinEigenVal ) or the Harris function response (see #cornerHarris ). The corners with the
+quality measure less than the product are rejected. For example, if the best corner has the
+quality measure = 1500, and the qualityLevel=0.01 , then all the corners with the quality measure
+less than 15 are rejected.
+@param minDistance Minimum possible Euclidean distance between the returned corners.
+@param mask Region of interest. If the mask is not empty (it needs to have the type
+CV_8UC1 and the same size as image ), it specifies the region in which the corners are detected.
+@param cornersQuality Output vector of quality measure of the detected corners.
+@param blockSize Size of an average block for computing a derivative covariation matrix over each
+pixel neighborhood. See cornerEigenValsAndVecs .
+@param gradientSize Aperture parameter for the Sobel operator used for derivatives computation.
+See cornerEigenValsAndVecs .
+@param useHarrisDetector Parameter indicating whether to use a Harris detector (see #cornerHarris)
+or #cornerMinEigenVal.
+@param k Free parameter of the Harris detector.
+ */
+CV_EXPORTS CV_WRAP_AS(goodFeaturesToTrackWithQuality) void goodFeaturesToTrack(
+        InputArray image, OutputArray corners,
+        int maxCorners, double qualityLevel, double minDistance,
+        InputArray mask, OutputArray cornersQuality, int blockSize = 3,
+        int gradientSize = 3, bool useHarrisDetector = false, double k = 0.04);
+
+/** @example samples/cpp/tutorial_code/ImgTrans/houghlines.cpp
+An example using the Hough line detector
+![Sample input image](Hough_Lines_Tutorial_Original_Image.jpg) ![Output image](Hough_Lines_Tutorial_Result.jpg)
+*/
+
+/** @brief Finds lines in a binary image using the standard Hough transform.
+
+The function implements the standard or standard multi-scale Hough transform algorithm for line
+detection. See <http://homepages.inf.ed.ac.uk/rbf/HIPR2/hough.htm> for a good explanation of Hough
+transform.
+
+@param image 8-bit, single-channel binary source image. The image may be modified by the function.
+@param lines Output vector of lines. Each line is represented by a 2 or 3 element vector
+\f$(\rho, \theta)\f$ or \f$(\rho, \theta, \textrm{votes})\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of
+the image). \f$\theta\f$ is the line rotation angle in radians (
+\f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ).
+\f$\textrm{votes}\f$ is the value of accumulator.
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param threshold Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param srn For the multi-scale Hough transform, it is a divisor for the distance resolution rho .
+The coarse accumulator distance resolution is rho and the accurate accumulator resolution is
+rho/srn . If both srn=0 and stn=0 , the classical Hough transform is used. Otherwise, both these
+parameters should be positive.
+@param stn For the multi-scale Hough transform, it is a divisor for the angle resolution theta.
+@param min_theta For standard and multi-scale Hough transform, minimum angle to check for lines.
+Must fall between 0 and max_theta.
+@param max_theta For standard and multi-scale Hough transform, maximum angle to check for lines.
+Must fall between min_theta and CV_PI.
+ */
+CV_EXPORTS_W void HoughLines( InputArray image, OutputArray lines,
+                              double rho, double theta, int threshold,
+                              double srn = 0, double stn = 0,
+                              double min_theta = 0, double max_theta = CV_PI );
+
+/** @brief Finds line segments in a binary image using the probabilistic Hough transform.
+
+The function implements the probabilistic Hough transform algorithm for line detection, described
+in @cite Matas00
+
+See the line detection example below:
+@include snippets/imgproc_HoughLinesP.cpp
+This is a sample picture the function parameters have been tuned for:
+
+![image](pics/building.jpg)
+
+And this is the output of the above program in case of the probabilistic Hough transform:
+
+![image](pics/houghp.png)
+
+@param image 8-bit, single-channel binary source image. The image may be modified by the function.
+@param lines Output vector of lines. Each line is represented by a 4-element vector
+\f$(x_1, y_1, x_2, y_2)\f$ , where \f$(x_1,y_1)\f$ and \f$(x_2, y_2)\f$ are the ending points of each detected
+line segment.
+@param rho Distance resolution of the accumulator in pixels.
+@param theta Angle resolution of the accumulator in radians.
+@param threshold Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param minLineLength Minimum line length. Line segments shorter than that are rejected.
+@param maxLineGap Maximum allowed gap between points on the same line to link them.
+
+@sa LineSegmentDetector
+ */
+CV_EXPORTS_W void HoughLinesP( InputArray image, OutputArray lines,
+                               double rho, double theta, int threshold,
+                               double minLineLength = 0, double maxLineGap = 0 );
+
+/** @brief Finds lines in a set of points using the standard Hough transform.
+
+The function finds lines in a set of points using a modification of the Hough transform.
+@include snippets/imgproc_HoughLinesPointSet.cpp
+@param point Input vector of points. Each vector must be encoded as a Point vector \f$(x,y)\f$. Type must be CV_32FC2 or CV_32SC2.
+@param lines Output vector of found lines. Each vector is encoded as a vector<Vec3d> \f$(votes, rho, theta)\f$.
+The larger the value of 'votes', the higher the reliability of the Hough line.
+@param lines_max Max count of Hough lines.
+@param threshold Accumulator threshold parameter. Only those lines are returned that get enough
+votes ( \f$>\texttt{threshold}\f$ ).
+@param min_rho Minimum value for \f$\rho\f$ for the accumulator (Note: \f$\rho\f$ can be negative. The absolute value \f$|\rho|\f$ is the distance of a line to the origin.).
+@param max_rho Maximum value for \f$\rho\f$ for the accumulator.
+@param rho_step Distance resolution of the accumulator.
+@param min_theta Minimum angle value of the accumulator in radians.
+@param max_theta Maximum angle value of the accumulator in radians.
+@param theta_step Angle resolution of the accumulator in radians.
+ */
+CV_EXPORTS_W void HoughLinesPointSet( InputArray point, OutputArray lines, int lines_max, int threshold,
+                                      double min_rho, double max_rho, double rho_step,
+                                      double min_theta, double max_theta, double theta_step );
+
+/** @example samples/cpp/tutorial_code/ImgTrans/houghcircles.cpp
+An example using the Hough circle detector
+*/
+
+/** @brief Finds circles in a grayscale image using the Hough transform.
+
+The function finds circles in a grayscale image using a modification of the Hough transform.
+
+Example:
+@include snippets/imgproc_HoughLinesCircles.cpp
+
+@note Usually the function detects the centers of circles well. However, it may fail to find correct
+radii. You can assist the function by specifying the radius range ( minRadius and maxRadius ) if
+you know it. Or, in the case of #HOUGH_GRADIENT method you may set maxRadius to a negative number
+to return centers only without radius search, and find the correct radius using an additional procedure.
+
+It also helps to smooth the image a bit unless it's already soft. For example,
+GaussianBlur() with 7x7 kernel and 1.5x1.5 sigma or similar blurring may help.
+
+@param image 8-bit, single-channel, grayscale input image.
+@param circles Output vector of found circles. Each vector is encoded as  3 or 4 element
+floating-point vector \f$(x, y, radius)\f$ or \f$(x, y, radius, votes)\f$ .
+@param method Detection method, see #HoughModes. The available methods are #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT.
+@param dp Inverse ratio of the accumulator resolution to the image resolution. For example, if
+dp=1 , the accumulator has the same resolution as the input image. If dp=2 , the accumulator has
+half as big width and height. For #HOUGH_GRADIENT_ALT the recommended value is dp=1.5,
+unless some very small circles need to be detected.
+@param minDist Minimum distance between the centers of the detected circles. If the parameter is
+too small, multiple neighbor circles may be falsely detected in addition to a true one. If it is
+too large, some circles may be missed.
+@param param1 First method-specific parameter. In case of #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT,
+it is the higher threshold of the two passed to the Canny edge detector (the lower one is half as large).
+Note that #HOUGH_GRADIENT_ALT uses #Scharr algorithm to compute image derivatives, so the threshold value
+should normally be higher, such as 300 for normally exposed and contrasty images.
+@param param2 Second method-specific parameter. In case of #HOUGH_GRADIENT, it is the
+accumulator threshold for the circle centers at the detection stage. The smaller it is, the more
+false circles may be detected. Circles, corresponding to the larger accumulator values, will be
+returned first. In the case of #HOUGH_GRADIENT_ALT algorithm, this is the circle "perfectness" measure.
+The closer it is to 1, the better shaped circles the algorithm selects. In most cases 0.9 should be fine.
+If you want to get better detection of small circles, you may decrease it to 0.85, 0.8 or even less.
+But then also try to limit the search range [minRadius, maxRadius] to avoid many false circles.
+@param minRadius Minimum circle radius.
+@param maxRadius Maximum circle radius. If <= 0, uses the maximum image dimension. If < 0, #HOUGH_GRADIENT returns
+centers without finding the radius. #HOUGH_GRADIENT_ALT always computes circle radii.
+
+@sa fitEllipse, minEnclosingCircle
+ */
+CV_EXPORTS_W void HoughCircles( InputArray image, OutputArray circles,
+                               int method, double dp, double minDist,
+                               double param1 = 100, double param2 = 100,
+                               int minRadius = 0, int maxRadius = 0 );
+
+//! @} imgproc_feature
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @example samples/cpp/tutorial_code/ImgProc/Morphology_2.cpp
+Advanced morphology Transformations sample code
+![Sample screenshot](Morphology_2_Tutorial_Result.jpg)
+Check @ref tutorial_opening_closing_hats "the corresponding tutorial" for more details
+*/
+
+/** @brief Erodes an image by using a specific structuring element.
+
+The function erodes the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the minimum is taken:
+
+\f[\texttt{dst} (x,y) =  \min _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+The function supports the in-place mode. Erosion can be applied several ( iterations ) times. In
+case of multi-channel images, each channel is processed independently.
+
+@param src input image; the number of channels can be arbitrary, but the depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param kernel structuring element used for erosion; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using #getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue border value in case of a constant border
+@sa  dilate, morphologyEx, getStructuringElement
+ */
+CV_EXPORTS_W void erode( InputArray src, OutputArray dst, InputArray kernel,
+                         Point anchor = Point(-1,-1), int iterations = 1,
+                         int borderType = BORDER_CONSTANT,
+                         const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+/** @example samples/cpp/tutorial_code/ImgProc/Morphology_1.cpp
+Erosion and Dilation sample code
+![Sample Screenshot-Erosion](Morphology_1_Tutorial_Erosion_Result.jpg)![Sample Screenshot-Dilation](Morphology_1_Tutorial_Dilation_Result.jpg)
+Check @ref tutorial_erosion_dilatation "the corresponding tutorial" for more details
+*/
+
+/** @brief Dilates an image by using a specific structuring element.
+
+The function dilates the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+The function supports the in-place mode. Dilation can be applied several ( iterations ) times. In
+case of multi-channel images, each channel is processed independently.
+
+@param src input image; the number of channels can be arbitrary, but the depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst output image of the same size and type as src.
+@param kernel structuring element used for dilation; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using #getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue border value in case of a constant border
+@sa  erode, morphologyEx, getStructuringElement
+ */
+CV_EXPORTS_W void dilate( InputArray src, OutputArray dst, InputArray kernel,
+                          Point anchor = Point(-1,-1), int iterations = 1,
+                          int borderType = BORDER_CONSTANT,
+                          const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+/** @brief Performs advanced morphological transformations.
+
+The function cv::morphologyEx can perform advanced morphological transformations using an erosion and dilation as
+basic operations.
+
+Any of the operations can be done in-place. In case of multi-channel images, each channel is
+processed independently.
+
+@param src Source image. The number of channels can be arbitrary. The depth should be one of
+CV_8U, CV_16U, CV_16S, CV_32F or CV_64F.
+@param dst Destination image of the same size and type as source image.
+@param op Type of a morphological operation, see #MorphTypes
+@param kernel Structuring element. It can be created using #getStructuringElement.
+@param anchor Anchor position within the kernel. Negative values mean that the anchor is at the
+kernel center.
+@param iterations Number of times erosion and dilation are applied.
+@param borderType Pixel extrapolation method, see #BorderTypes. #BORDER_WRAP is not supported.
+@param borderValue Border value in case of a constant border. The default value has a special
+meaning.
+@sa  dilate, erode, getStructuringElement
+@note The number of iterations is the number of times the erosion or dilation operation will be applied.
+For instance, an opening operation (#MORPH_OPEN) with two iterations is equivalent to applying
+successively: erode -> erode -> dilate -> dilate (and not erode -> dilate -> erode -> dilate).
+ */
+CV_EXPORTS_W void morphologyEx( InputArray src, OutputArray dst,
+                                int op, InputArray kernel,
+                                Point anchor = Point(-1,-1), int iterations = 1,
+                                int borderType = BORDER_CONSTANT,
+                                const Scalar& borderValue = morphologyDefaultBorderValue() );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_transform
+//! @{
+
+/** @brief Resizes an image.
+
+The function resize resizes the image src down to or up to the specified size. Note that the
+initial dst type or size are not taken into account. Instead, the size and type are derived from
+the `src`,`dsize`,`fx`, and `fy`. If you want to resize src so that it fits the pre-created dst,
+you may call the function as follows:
+@code
+    // explicitly specify dsize=dst.size(); fx and fy will be computed from that.
+    resize(src, dst, dst.size(), 0, 0, interpolation);
+@endcode
+If you want to decimate the image by factor of 2 in each direction, you can call the function this
+way:
+@code
+    // specify fx and fy and let the function compute the destination image size.
+    resize(src, dst, Size(), 0.5, 0.5, interpolation);
+@endcode
+To shrink an image, it will generally look best with #INTER_AREA interpolation, whereas to
+enlarge an image, it will generally look best with #INTER_CUBIC (slow) or #INTER_LINEAR
+(faster but still looks OK).
+
+@param src input image.
+@param dst output image; it has the size dsize (when it is non-zero) or the size computed from
+src.size(), fx, and fy; the type of dst is the same as of src.
+@param dsize output image size; if it equals zero (`None` in Python), it is computed as:
+ \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+ Either dsize or both fx and fy must be non-zero.
+@param fx scale factor along the horizontal axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.width/src.cols}\f]
+@param fy scale factor along the vertical axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.height/src.rows}\f]
+@param interpolation interpolation method, see #InterpolationFlags
+
+@sa  warpAffine, warpPerspective, remap
+ */
+CV_EXPORTS_W void resize( InputArray src, OutputArray dst,
+                          Size dsize, double fx = 0, double fy = 0,
+                          int interpolation = INTER_LINEAR );
+
+/** @brief Applies an affine transformation to an image.
+
+The function warpAffine transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} ( \texttt{M} _{11} x +  \texttt{M} _{12} y +  \texttt{M} _{13}, \texttt{M} _{21} x +  \texttt{M} _{22} y +  \texttt{M} _{23})\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted
+with #invertAffineTransform and then put in the formula above instead of M. The function cannot
+operate in-place.
+
+@param src input image.
+@param dst output image that has the size dsize and the same type as src .
+@param M \f$2\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (see #InterpolationFlags) and the optional
+flag #WARP_INVERSE_MAP that means that M is the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (see #BorderTypes); when
+borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image corresponding to
+the "outliers" in the source image are not modified by the function.
+@param borderValue value used in case of a constant border; by default, it is 0.
+
+@sa  warpPerspective, resize, remap, getRectSubPix, transform
+ */
+CV_EXPORTS_W void warpAffine( InputArray src, OutputArray dst,
+                              InputArray M, Size dsize,
+                              int flags = INTER_LINEAR,
+                              int borderMode = BORDER_CONSTANT,
+                              const Scalar& borderValue = Scalar());
+
+/** @example samples/cpp/warpPerspective_demo.cpp
+An example program shows using cv::getPerspectiveTransform and cv::warpPerspective for image warping
+*/
+
+/** @brief Applies a perspective transformation to an image.
+
+The function warpPerspective transforms the source image using the specified matrix:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} \left ( \frac{M_{11} x + M_{12} y + M_{13}}{M_{31} x + M_{32} y + M_{33}} ,
+     \frac{M_{21} x + M_{22} y + M_{23}}{M_{31} x + M_{32} y + M_{33}} \right )\f]
+
+when the flag #WARP_INVERSE_MAP is set. Otherwise, the transformation is first inverted with invert
+and then put in the formula above instead of M. The function cannot operate in-place.
+
+@param src input image.
+@param dst output image that has the size dsize and the same type as src .
+@param M \f$3\times 3\f$ transformation matrix.
+@param dsize size of the output image.
+@param flags combination of interpolation methods (#INTER_LINEAR or #INTER_NEAREST) and the
+optional flag #WARP_INVERSE_MAP, that sets M as the inverse transformation (
+\f$\texttt{dst}\rightarrow\texttt{src}\f$ ).
+@param borderMode pixel extrapolation method (#BORDER_CONSTANT or #BORDER_REPLICATE).
+@param borderValue value used in case of a constant border; by default, it equals 0.
+
+@sa  warpAffine, resize, remap, getRectSubPix, perspectiveTransform
+ */
+CV_EXPORTS_W void warpPerspective( InputArray src, OutputArray dst,
+                                   InputArray M, Size dsize,
+                                   int flags = INTER_LINEAR,
+                                   int borderMode = BORDER_CONSTANT,
+                                   const Scalar& borderValue = Scalar());
+
+/** @brief Applies a generic geometrical transformation to an image.
+
+The function remap transforms the source image using the specified map:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} (map_x(x,y),map_y(x,y))\f]
+
+where values of pixels with non-integer coordinates are computed using one of available
+interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
+in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in
+\f$map_1\f$, or fixed-point maps created by using #convertMaps. The reason you might want to
+convert from floating to fixed-point representations of a map is that they can yield much faster
+(\~2x) remapping operations. In the converted case, \f$map_1\f$ contains pairs (cvFloor(x),
+cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients.
+
+This function cannot operate in-place.
+
+@param src Source image.
+@param dst Destination image. It has the same size as map1 and the same type as src .
+@param map1 The first map of either (x,y) points or just x values having the type CV_16SC2 ,
+CV_32FC1, or CV_32FC2. See #convertMaps for details on converting a floating point
+representation to fixed-point for speed.
+@param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
+if map1 is (x,y) points), respectively.
+@param interpolation Interpolation method (see #InterpolationFlags). The methods #INTER_AREA
+and #INTER_LINEAR_EXACT are not supported by this function.
+@param borderMode Pixel extrapolation method (see #BorderTypes). When
+borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image that
+correspond to the "outliers" in the source image are not modified by the function.
+@param borderValue Value used in case of a constant border. By default, it is 0.
+@note
+Due to current implementation limitations, the sizes of the input and output images should be less than 32767x32767.
+ */
+CV_EXPORTS_W void remap( InputArray src, OutputArray dst,
+                         InputArray map1, InputArray map2,
+                         int interpolation, int borderMode = BORDER_CONSTANT,
+                         const Scalar& borderValue = Scalar());
+
+/** @brief Converts image transformation maps from one representation to another.
+
+The function converts a pair of maps for remap from one representation to another. The following
+options ( (map1.type(), map2.type()) \f$\rightarrow\f$ (dstmap1.type(), dstmap2.type()) ) are
+supported:
+
+- \f$\texttt{(CV_32FC1, CV_32FC1)} \rightarrow \texttt{(CV_16SC2, CV_16UC1)}\f$. This is the
+most frequently used conversion operation, in which the original floating-point maps (see #remap)
+are converted to a more compact and much faster fixed-point representation. The first output array
+contains the rounded coordinates and the second array (created only when nninterpolation=false )
+contains indices in the interpolation tables.
+
+- \f$\texttt{(CV_32FC2)} \rightarrow \texttt{(CV_16SC2, CV_16UC1)}\f$. The same as above but
+the original maps are stored in one 2-channel matrix.
+
+- Reverse conversion. Obviously, the reconstructed floating-point maps will not be exactly the same
+as the originals.
+
+@param map1 The first input map of type CV_16SC2, CV_32FC1, or CV_32FC2 .
+@param map2 The second input map of type CV_16UC1, CV_32FC1, or none (empty matrix),
+respectively.
+@param dstmap1 The first output map that has the type dstmap1type and the same size as src .
+@param dstmap2 The second output map.
+@param dstmap1type Type of the first output map that should be CV_16SC2, CV_32FC1, or
+CV_32FC2 .
+@param nninterpolation Flag indicating whether the fixed-point maps are used for the
+nearest-neighbor or for a more complex interpolation.
+
+@sa  remap, undistort, initUndistortRectifyMap
+ */
+CV_EXPORTS_W void convertMaps( InputArray map1, InputArray map2,
+                               OutputArray dstmap1, OutputArray dstmap2,
+                               int dstmap1type, bool nninterpolation = false );
+
+/** @brief Calculates an affine matrix of 2D rotation.
+
+The function calculates the following matrix:
+
+\f[\begin{bmatrix} \alpha &  \beta & (1- \alpha )  \cdot \texttt{center.x} -  \beta \cdot \texttt{center.y} \\ - \beta &  \alpha &  \beta \cdot \texttt{center.x} + (1- \alpha )  \cdot \texttt{center.y} \end{bmatrix}\f]
+
+where
+
+\f[\begin{array}{l} \alpha =  \texttt{scale} \cdot \cos \texttt{angle} , \\ \beta =  \texttt{scale} \cdot \sin \texttt{angle} \end{array}\f]
+
+The transformation maps the rotation center to itself. If this is not the target, adjust the shift.
+
+@param center Center of the rotation in the source image.
+@param angle Rotation angle in degrees. Positive values mean counter-clockwise rotation (the
+coordinate origin is assumed to be the top-left corner).
+@param scale Isotropic scale factor.
+
+@sa  getAffineTransform, warpAffine, transform
+ */
+CV_EXPORTS_W Mat getRotationMatrix2D(Point2f center, double angle, double scale);
+
+/** @sa getRotationMatrix2D */
+CV_EXPORTS Matx23d getRotationMatrix2D_(Point2f center, double angle, double scale);
+
+inline // thin wrapper: forwards to getRotationMatrix2D_ (declared above, returns Matx23d)
+Mat getRotationMatrix2D(Point2f center, double angle, double scale)
+{
+    return Mat(getRotationMatrix2D_(center, angle, scale), true); // wrap in a Mat; `true` is presumably Mat's copyData flag -- confirm against cv::Mat(const Matx&, bool)
+}
+
+/** @brief Calculates an affine transform from three pairs of the corresponding points.
+
+The function calculates the \f$2 \times 3\f$ matrix of an affine transform so that:
+
+\f[\begin{bmatrix} x'_i \\ y'_i \end{bmatrix} = \texttt{map_matrix} \cdot \begin{bmatrix} x_i \\ y_i \\ 1 \end{bmatrix}\f]
+
+where
+
+\f[dst(i)=(x'_i,y'_i), src(i)=(x_i, y_i), i=0,1,2\f]
+
+@param src Coordinates of triangle vertices in the source image.
+@param dst Coordinates of the corresponding triangle vertices in the destination image.
+
+@sa  warpAffine, transform
+ */
+CV_EXPORTS Mat getAffineTransform( const Point2f src[], const Point2f dst[] );
+
+/** @brief Inverts an affine transformation.
+
+The function computes an inverse affine transformation represented by \f$2 \times 3\f$ matrix M:
+
+\f[\begin{bmatrix} a_{11} & a_{12} & b_1  \\ a_{21} & a_{22} & b_2 \end{bmatrix}\f]
+
+The result is also a \f$2 \times 3\f$ matrix of the same type as M.
+
+@param M Original affine transformation.
+@param iM Output reverse affine transformation.
+ */
+CV_EXPORTS_W void invertAffineTransform( InputArray M, OutputArray iM );
+
+/** @brief Calculates a perspective transform from four pairs of the corresponding points.
+
+The function calculates the \f$3 \times 3\f$ matrix of a perspective transform so that:
+
+\f[\begin{bmatrix} t_i x'_i \\ t_i y'_i \\ t_i \end{bmatrix} = \texttt{map_matrix} \cdot \begin{bmatrix} x_i \\ y_i \\ 1 \end{bmatrix}\f]
+
+where
+
+\f[dst(i)=(x'_i,y'_i), src(i)=(x_i, y_i), i=0,1,2,3\f]
+
+@param src Coordinates of quadrangle vertices in the source image.
+@param dst Coordinates of the corresponding quadrangle vertices in the destination image.
+@param solveMethod method passed to cv::solve (#DecompTypes)
+
+@sa  findHomography, warpPerspective, perspectiveTransform
+ */
+CV_EXPORTS_W Mat getPerspectiveTransform(InputArray src, InputArray dst, int solveMethod = DECOMP_LU);
+
+/** @overload */
+CV_EXPORTS Mat getPerspectiveTransform(const Point2f src[], const Point2f dst[], int solveMethod = DECOMP_LU);
+
+
+CV_EXPORTS_W Mat getAffineTransform( InputArray src, InputArray dst );
+
+/** @brief Retrieves a pixel rectangle from an image with sub-pixel accuracy.
+
+The function getRectSubPix extracts pixels from src:
+
+\f[patch(x, y) = src(x +  \texttt{center.x} - ( \texttt{dst.cols} -1)*0.5, y +  \texttt{center.y} - ( \texttt{dst.rows} -1)*0.5)\f]
+
+where the values of the pixels at non-integer coordinates are retrieved using bilinear
+interpolation. Every channel of multi-channel images is processed independently. Also
+the image should be a single channel or three channel image. While the center of the
+rectangle must be inside the image, parts of the rectangle may be outside.
+
+@param image Source image.
+@param patchSize Size of the extracted patch.
+@param center Floating point coordinates of the center of the extracted rectangle within the
+source image. The center must be inside the image.
+@param patch Extracted patch that has the size patchSize and the same number of channels as src .
+@param patchType Depth of the extracted pixels. By default, they have the same depth as src .
+
+@sa  warpAffine, warpPerspective
+ */
+CV_EXPORTS_W void getRectSubPix( InputArray image, Size patchSize,
+                                 Point2f center, OutputArray patch, int patchType = -1 );
+
+/** @example samples/cpp/polar_transforms.cpp
+An example using the cv::linearPolar and cv::logPolar operations
+*/
+
+/** @brief Remaps an image to semilog-polar coordinates space.
+
+@deprecated This function produces same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags+WARP_POLAR_LOG);
+
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image d)"):
+\f[\begin{array}{l}
+  dst( \rho , \phi ) = src(x,y) \\
+  dst.size() \leftarrow src.size()
+\end{array}\f]
+
+where
+\f[\begin{array}{l}
+  I = (dx,dy) = (x - center.x,y - center.y) \\
+  \rho = M \cdot log_e(\texttt{magnitude} (I)) ,\\
+  \phi = Kangle \cdot \texttt{angle} (I) \\
+\end{array}\f]
+
+and
+\f[\begin{array}{l}
+  M = src.cols / log_e(maxRadius) \\
+  Kangle = src.rows / 2\pi \\
+\end{array}\f]
+
+The function emulates the human "foveal" vision and can be used for fast scale and
+rotation-invariant template matching, for object tracking and so forth.
+@param src Source image
+@param dst Destination image. It will have same size and type as src.
+@param center The transformation center; where the output precision is maximal
+@param M Magnitude scale parameter. It determines the radius of the bounding circle to transform too.
+@param flags A combination of interpolation methods, see #InterpolationFlags
+
+@note
+-   The function can not operate in-place.
+-   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+
+@sa cv::linearPolar
+@endinternal
+*/
+CV_EXPORTS_W void logPolar( InputArray src, OutputArray dst,
+                            Point2f center, double M, int flags );
+
+/** @brief Remaps an image to polar coordinates space.
+
+@deprecated This function produces same result as cv::warpPolar(src, dst, src.size(), center, maxRadius, flags)
+
+@internal
+Transform the source image using the following transformation (See @ref polar_remaps_reference_image "Polar remaps reference image c)"):
+\f[\begin{array}{l}
+  dst( \rho , \phi ) = src(x,y) \\
+  dst.size() \leftarrow src.size()
+\end{array}\f]
+
+where
+\f[\begin{array}{l}
+  I = (dx,dy) = (x - center.x,y - center.y) \\
+  \rho = Kmag \cdot \texttt{magnitude} (I) ,\\
+  \phi = Kangle \cdot \texttt{angle} (I)
+\end{array}\f]
+
+and
+\f[\begin{array}{l}
+  Kmag = src.cols / maxRadius \\
+  Kangle = src.rows / 2\pi
+\end{array}\f]
+
+
+@param src Source image
+@param dst Destination image. It will have same size and type as src.
+@param center The transformation center;
+@param maxRadius The radius of the bounding circle to transform. It determines the inverse magnitude scale parameter too.
+@param flags A combination of interpolation methods, see #InterpolationFlags
+
+@note
+-   The function can not operate in-place.
+-   To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+
+@sa cv::logPolar
+@endinternal
+*/
+CV_EXPORTS_W void linearPolar( InputArray src, OutputArray dst,
+                               Point2f center, double maxRadius, int flags );
+
+
+/** \brief Remaps an image to polar or semilog-polar coordinates space
+
+@anchor polar_remaps_reference_image
+![Polar remaps reference](pics/polar_remap_doc.png)
+
+Transform the source image using the following transformation:
+\f[
+dst(\rho , \phi ) = src(x,y)
+\f]
+
+where
+\f[
+\begin{array}{l}
+\vec{I} = (x - center.x, \;y - center.y) \\
+\phi = Kangle \cdot \texttt{angle} (\vec{I}) \\
+\rho = \left\{\begin{matrix}
+Klin \cdot \texttt{magnitude} (\vec{I}) & default \\
+Klog \cdot log_e(\texttt{magnitude} (\vec{I})) & if \; semilog \\
+\end{matrix}\right.
+\end{array}
+\f]
+
+and
+\f[
+\begin{array}{l}
+Kangle = dsize.height / 2\pi \\
+Klin = dsize.width / maxRadius \\
+Klog = dsize.width / log_e(maxRadius) \\
+\end{array}
+\f]
+
+
+\par Linear vs semilog mapping
+
+Polar mapping can be linear or semi-log. Add one of #WarpPolarMode to `flags` to specify the polar mapping mode.
+
+Linear is the default mode.
+
+The semilog mapping emulates the human "foveal" vision that permits very high acuity on the line of sight (central vision)
+in contrast to peripheral vision where acuity is minor.
+
+\par Option on `dsize`:
+
+- if both values in `dsize <=0 ` (default),
+the destination image will have (almost) same area of source bounding circle:
+\f[\begin{array}{l}
+dsize.area  \leftarrow (maxRadius^2 \cdot \pi) \\
+dsize.width = \texttt{cvRound}(maxRadius) \\
+dsize.height = \texttt{cvRound}(maxRadius \cdot \pi) \\
+\end{array}\f]
+
+
+- if only `dsize.height <= 0`,
+the destination image area will be proportional to the bounding circle area but scaled by `Kx * Kx`:
+\f[\begin{array}{l}
+dsize.height = \texttt{cvRound}(dsize.width \cdot \pi) \\
+\end{array}
+\f]
+
+- if both values in `dsize > 0 `,
+the destination image will have the given size therefore the area of the bounding circle will be scaled to `dsize`.
+
+
+\par Reverse mapping
+
+You can get reverse mapping adding #WARP_INVERSE_MAP to `flags`
+\snippet polar_transforms.cpp InverseMap
+
+In addition, to calculate the original coordinate from a polar mapped coordinate \f$(rho, phi)->(x, y)\f$:
+\snippet polar_transforms.cpp InverseCoordinate
+
+@param src Source image.
+@param dst Destination image. It will have same type as src.
+@param dsize The destination image size (see description for valid options).
+@param center The transformation center.
+@param maxRadius The radius of the bounding circle to transform. It determines the inverse magnitude scale parameter too.
+@param flags A combination of interpolation methods, #InterpolationFlags + #WarpPolarMode.
+            - Add #WARP_POLAR_LINEAR to select linear polar mapping (default)
+            - Add #WARP_POLAR_LOG to select semilog polar mapping
+            - Add #WARP_INVERSE_MAP for reverse mapping.
+@note
+-  The function can not operate in-place.
+-  To calculate magnitude and angle in degrees #cartToPolar is used internally thus angles are measured from 0 to 360 with accuracy about 0.3 degrees.
+-  This function uses #remap. Due to current implementation limitations the size of an input and output images should be less than 32767x32767.
+
+@sa cv::remap
+*/
+CV_EXPORTS_W void warpPolar(InputArray src, OutputArray dst, Size dsize,
+                            Point2f center, double maxRadius, int flags);
+
+
+//! @} imgproc_transform
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @brief Calculates the integral of an image.
+
+The function calculates one or more integral images for the source image as follows:
+
+\f[\texttt{sum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)\f]
+
+\f[\texttt{sqsum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)^2\f]
+
+\f[\texttt{tilted} (X,Y) =  \sum _{y<Y,abs(x-X+1) \leq Y-y-1}  \texttt{image} (x,y)\f]
+
+Using these integral images, you can calculate sum, mean, and standard deviation over a specific
+up-right or rotated rectangular region of the image in a constant time, for example:
+
+\f[\sum _{x_1 \leq x < x_2,  \, y_1  \leq y < y_2}  \texttt{image} (x,y) =  \texttt{sum} (x_2,y_2)- \texttt{sum} (x_1,y_2)- \texttt{sum} (x_2,y_1)+ \texttt{sum} (x_1,y_1)\f]
+
+This makes it possible to do fast blurring or fast block correlation with a variable window size, for
+example. In case of multi-channel images, sums for each channel are accumulated independently.
+
+As a practical example, the next figure shows the calculation of the integral of a straight
+rectangle Rect(3,3,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the
+original image are shown, as well as the relative pixels in the integral images sum and tilted .
+
+![integral calculation example](pics/integral.png)
+
+@param src input image as \f$W \times H\f$, 8-bit or floating-point (32f or 64f).
+@param sum integral image as \f$(W+1)\times (H+1)\f$ , 32-bit integer or floating-point (32f or 64f).
+@param sqsum integral image for squared pixel values; it is \f$(W+1)\times (H+1)\f$, double-precision
+floating-point (64f) array.
+@param tilted integral for the image rotated by 45 degrees; it is \f$(W+1)\times (H+1)\f$ array with
+the same data type as sum.
+@param sdepth desired depth of the integral and the tilted integral images, CV_32S, CV_32F, or
+CV_64F.
+@param sqdepth desired depth of the integral image of squared pixel values, CV_32F or CV_64F.
+ */
+CV_EXPORTS_AS(integral3) void integral( InputArray src, OutputArray sum,
+                                        OutputArray sqsum, OutputArray tilted,
+                                        int sdepth = -1, int sqdepth = -1 );
+
+/** @overload */
+CV_EXPORTS_W void integral( InputArray src, OutputArray sum, int sdepth = -1 );
+
+/** @overload */
+CV_EXPORTS_AS(integral2) void integral( InputArray src, OutputArray sum,
+                                        OutputArray sqsum, int sdepth = -1, int sqdepth = -1 );
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_motion
+//! @{
+
+/** @brief Adds an image to the accumulator image.
+
+The function adds src or some of its elements to dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+The function cv::accumulate can be used, for example, to collect statistics of a scene background
+viewed by a still camera and for the further foreground-background segmentation.
+
+@param src Input image of type CV_8UC(n), CV_16UC(n), CV_32FC(n) or CV_64FC(n), where n is a positive integer.
+@param dst %Accumulator image with the same number of channels as input image, and a depth of CV_32F or CV_64F.
+@param mask Optional operation mask.
+
+@sa  accumulateSquare, accumulateProduct, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulate( InputArray src, InputOutputArray dst,
+                              InputArray mask = noArray() );
+
+/** @brief Adds the square of a source image to the accumulator image.
+
+The function adds the input image src or its selected region, raised to a power of 2, to the
+accumulator dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src} (x,y)^2  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
+@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
+floating-point.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateProduct, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulateSquare( InputArray src, InputOutputArray dst,
+                                    InputArray mask = noArray() );
+
+/** @brief Adds the per-element product of two input images to the accumulator image.
+
+The function adds the product of two images or their selected regions to the accumulator dst :
+
+\f[\texttt{dst} (x,y)  \leftarrow \texttt{dst} (x,y) +  \texttt{src1} (x,y)  \cdot \texttt{src2} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src1 First input image, 1- or 3-channel, 8-bit or 32-bit floating point.
+@param src2 Second input image of the same type and the same size as src1 .
+@param dst %Accumulator image with the same number of channels as input images, 32-bit or 64-bit
+floating-point.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateSquare, accumulateWeighted
+ */
+CV_EXPORTS_W void accumulateProduct( InputArray src1, InputArray src2,
+                                     InputOutputArray dst, InputArray mask=noArray() );
+
+/** @brief Updates a running average.
+
+The function calculates the weighted sum of the input image src and the accumulator dst so that dst
+becomes a running average of a frame sequence:
+
+\f[\texttt{dst} (x,y)  \leftarrow (1- \texttt{alpha} )  \cdot \texttt{dst} (x,y) +  \texttt{alpha} \cdot \texttt{src} (x,y)  \quad \text{if} \quad \texttt{mask} (x,y)  \ne 0\f]
+
+That is, alpha regulates the update speed (how fast the accumulator "forgets" about earlier images).
+The function supports multi-channel images. Each channel is processed independently.
+
+@param src Input image as 1- or 3-channel, 8-bit or 32-bit floating point.
+@param dst %Accumulator image with the same number of channels as input image, 32-bit or 64-bit
+floating-point.
+@param alpha Weight of the input image.
+@param mask Optional operation mask.
+
+@sa  accumulate, accumulateSquare, accumulateProduct
+ */
+CV_EXPORTS_W void accumulateWeighted( InputArray src, InputOutputArray dst,
+                                      double alpha, InputArray mask = noArray() );
+
+/** @brief The function is used to detect translational shifts that occur between two images.
+
+The operation takes advantage of the Fourier shift theorem for detecting the translational shift in
+the frequency domain. It can be used for fast image registration as well as motion estimation. For
+more information please see <http://en.wikipedia.org/wiki/Phase_correlation>
+
+Calculates the cross-power spectrum of two supplied source arrays. The arrays are padded if needed
+with getOptimalDFTSize.
+
+The function performs the following equations:
+- First it applies a Hanning window (see <http://en.wikipedia.org/wiki/Hann_function>) to each
+image to remove possible edge effects. This window is cached until the array size changes to speed
+up processing time.
+- Next it computes the forward DFTs of each source array:
+\f[\mathbf{G}_a = \mathcal{F}\{src_1\}, \; \mathbf{G}_b = \mathcal{F}\{src_2\}\f]
+where \f$\mathcal{F}\f$ is the forward DFT.
+- It then computes the cross-power spectrum of each frequency domain array:
+\f[R = \frac{ \mathbf{G}_a \mathbf{G}_b^*}{|\mathbf{G}_a \mathbf{G}_b^*|}\f]
+- Next the cross-correlation is converted back into the time domain via the inverse DFT:
+\f[r = \mathcal{F}^{-1}\{R\}\f]
+- Finally, it computes the peak location and computes a 5x5 weighted centroid around the peak to
+achieve sub-pixel accuracy.
+\f[(\Delta x, \Delta y) = \texttt{weightedCentroid} \{\arg \max_{(x, y)}\{r\}\}\f]
+- If non-zero, the response parameter is computed as the sum of the elements of r within the 5x5
+centroid around the peak location. It is normalized to a maximum of 1 (meaning there is a single
+peak) and will be smaller when there are multiple peaks.
+
+@param src1 Source floating point array (CV_32FC1 or CV_64FC1)
+@param src2 Source floating point array (CV_32FC1 or CV_64FC1)
+@param window Floating point array with windowing coefficients to reduce edge effects (optional).
+@param response Signal power within the 5x5 centroid around the peak, between 0 and 1 (optional).
+@returns detected phase shift (sub-pixel) between the two arrays.
+
+@sa dft, getOptimalDFTSize, idft, mulSpectrums, createHanningWindow
+ */
+CV_EXPORTS_W Point2d phaseCorrelate(InputArray src1, InputArray src2,
+                                    InputArray window = noArray(), CV_OUT double* response = 0);
+
+/** @brief This function computes Hanning window coefficients in two dimensions.
+
+See (http://en.wikipedia.org/wiki/Hann_function) and (http://en.wikipedia.org/wiki/Window_function)
+for more information.
+
+An example is shown below:
+@code
+    // create hanning window of size 100x100 and type CV_32F
+    Mat hann;
+    createHanningWindow(hann, Size(100, 100), CV_32F);
+@endcode
+@param dst Destination array to place Hann coefficients in
+@param winSize The window size specifications (both width and height must be > 1)
+@param type Created array type
+ */
+CV_EXPORTS_W void createHanningWindow(OutputArray dst, Size winSize, int type);
+
+/** @brief Performs the per-element division of the first Fourier spectrum by the second Fourier spectrum.
+
+The function cv::divSpectrums performs the per-element division of the first array by the second array.
+The arrays are CCS-packed or complex matrices that are results of a real or complex Fourier transform.
+
+@param a first input array.
+@param b second input array of the same size and type as a .
+@param c output array of the same size and type as a .
+@param flags operation flags; currently, the only supported flag is cv::DFT_ROWS, which indicates that
+each row of a and b is an independent 1D Fourier spectrum. If you do not want to use this flag, then simply add a `0` as value.
+@param conjB optional flag that conjugates the second input array before the division (true)
+or not (false).
+*/
+CV_EXPORTS_W void divSpectrums(InputArray a, InputArray b, OutputArray c,
+                               int flags, bool conjB = false);
+
+//! @} imgproc_motion
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @brief Applies a fixed-level threshold to each array element.
+
+The function applies fixed-level thresholding to a multiple-channel array. The function is typically
+used to get a bi-level (binary) image out of a grayscale image ( #compare could be also used for
+this purpose) or for removing noise, that is, filtering out pixels with too small or too large
+values. There are several types of thresholding supported by the function. They are determined by
+type parameter.
+
+Also, the special values #THRESH_OTSU or #THRESH_TRIANGLE may be combined with one of the
+above values. In these cases, the function determines the optimal threshold value using the Otsu's
+or Triangle algorithm and uses it instead of the specified thresh.
+
+@note Currently, the Otsu's and Triangle methods are implemented only for 8-bit single-channel images.
+
+@param src input array (multiple-channel, 8-bit or 32-bit floating point).
+@param dst output array of the same size and type and the same number of channels as src.
+@param thresh threshold value.
+@param maxval maximum value to use with the #THRESH_BINARY and #THRESH_BINARY_INV thresholding
+types.
+@param type thresholding type (see #ThresholdTypes).
+@return the computed threshold value if the Otsu's or Triangle method is used.
+
+@sa  adaptiveThreshold, findContours, compare, min, max
+ */
+CV_EXPORTS_W double threshold( InputArray src, OutputArray dst,
+                               double thresh, double maxval, int type );
+
+
+/** @brief Applies an adaptive threshold to an array.
+
+The function transforms a grayscale image to a binary image according to the formulae:
+-   **THRESH_BINARY**
+    \f[dst(x,y) =  \fork{\texttt{maxValue}}{if \(src(x,y) > T(x,y)\)}{0}{otherwise}\f]
+-   **THRESH_BINARY_INV**
+    \f[dst(x,y) =  \fork{0}{if \(src(x,y) > T(x,y)\)}{\texttt{maxValue}}{otherwise}\f]
+where \f$T(x,y)\f$ is a threshold calculated individually for each pixel (see adaptiveMethod parameter).
+
+The function can process the image in-place.
+
+@param src Source 8-bit single-channel image.
+@param dst Destination image of the same size and the same type as src.
+@param maxValue Non-zero value assigned to the pixels for which the condition is satisfied
+@param adaptiveMethod Adaptive thresholding algorithm to use, see #AdaptiveThresholdTypes.
+The #BORDER_REPLICATE | #BORDER_ISOLATED is used to process boundaries.
+@param thresholdType Thresholding type that must be either #THRESH_BINARY or #THRESH_BINARY_INV,
+see #ThresholdTypes.
+@param blockSize Size of a pixel neighborhood that is used to calculate a threshold value for the
+pixel: 3, 5, 7, and so on.
+@param C Constant subtracted from the mean or weighted mean (see the details below). Normally, it
+is positive but may be zero or negative as well.
+
+@sa  threshold, blur, GaussianBlur
+ */
+CV_EXPORTS_W void adaptiveThreshold( InputArray src, OutputArray dst,
+                                     double maxValue, int adaptiveMethod,
+                                     int thresholdType, int blockSize, double C );
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @example samples/cpp/tutorial_code/ImgProc/Pyramids/Pyramids.cpp
+An example using pyrDown and pyrUp functions
+*/
+
+/** @brief Blurs an image and downsamples it.
+
+By default, size of the output image is computed as `Size((src.cols+1)/2, (src.rows+1)/2)`, but in
+any case, the following conditions should be satisfied:
+
+\f[\begin{array}{l} | \texttt{dstsize.width} *2-src.cols| \leq 2 \\ | \texttt{dstsize.height} *2-src.rows| \leq 2 \end{array}\f]
+
+The function performs the downsampling step of the Gaussian pyramid construction. First, it
+convolves the source image with the kernel:
+
+\f[\frac{1}{256} \begin{bmatrix} 1 & 4 & 6 & 4 & 1  \\ 4 & 16 & 24 & 16 & 4  \\ 6 & 24 & 36 & 24 & 6  \\ 4 & 16 & 24 & 16 & 4  \\ 1 & 4 & 6 & 4 & 1 \end{bmatrix}\f]
+
+Then, it downsamples the image by rejecting even rows and columns.
+
+@param src input image.
+@param dst output image; it has the specified size and the same type as src.
+@param dstsize size of the output image.
+@param borderType Pixel extrapolation method, see #BorderTypes (#BORDER_CONSTANT isn't supported)
+ */
+CV_EXPORTS_W void pyrDown( InputArray src, OutputArray dst,
+                           const Size& dstsize = Size(), int borderType = BORDER_DEFAULT );
+
+/** @brief Upsamples an image and then blurs it.
+
+By default, size of the output image is computed as `Size(src.cols\*2, src.rows\*2)`, but in any
+case, the following conditions should be satisfied:
+
+\f[\begin{array}{l} | \texttt{dstsize.width} -src.cols*2| \leq  ( \texttt{dstsize.width}   \mod  2)  \\ | \texttt{dstsize.height} -src.rows*2| \leq  ( \texttt{dstsize.height}   \mod  2) \end{array}\f]
+
+The function performs the upsampling step of the Gaussian pyramid construction, though it can
+actually be used to construct the Laplacian pyramid. First, it upsamples the source image by
+injecting even zero rows and columns and then convolves the result with the same kernel as in
+pyrDown multiplied by 4.
+
+@param src input image.
+@param dst output image. It has the specified size and the same type as src .
+@param dstsize size of the output image.
+@param borderType Pixel extrapolation method, see #BorderTypes (only #BORDER_DEFAULT is supported)
+ */
+CV_EXPORTS_W void pyrUp( InputArray src, OutputArray dst,
+                         const Size& dstsize = Size(), int borderType = BORDER_DEFAULT );
+
+/** @brief Constructs the Gaussian pyramid for an image.
+
+The function constructs a vector of images and builds the Gaussian pyramid by recursively applying
+pyrDown to the previously built pyramid layers, starting from `dst[0]==src`.
+
+@param src Source image. Check pyrDown for the list of supported types.
+@param dst Destination vector of maxlevel+1 images of the same type as src. dst[0] will be the
+same as src. dst[1] is the next pyramid layer, a smoothed and down-sized src, and so on.
+@param maxlevel 0-based index of the last (the smallest) pyramid layer. It must be non-negative.
+@param borderType Pixel extrapolation method, see #BorderTypes (#BORDER_CONSTANT isn't supported)
+ */
+CV_EXPORTS void buildPyramid( InputArray src, OutputArrayOfArrays dst,
+                              int maxlevel, int borderType = BORDER_DEFAULT );
+
+//! @} imgproc_filter
+
+//! @addtogroup imgproc_hist
+//! @{
+
+/** @example samples/cpp/demhist.cpp
+An example for creating histograms of an image
+*/
+
+/** @brief Calculates a histogram of a set of arrays.
+
+The function cv::calcHist calculates the histogram of one or more arrays. The elements of a tuple used
+to increment a histogram bin are taken from the corresponding input arrays at the same location. The
+sample below shows how to compute a 2D Hue-Saturation histogram for a color image:
+@include snippets/imgproc_calcHist.cpp
+
+@param images Source arrays. They all should have the same depth, CV_8U, CV_16U or CV_32F , and the same
+size. Each of them can have an arbitrary number of channels.
+@param nimages Number of source images.
+@param channels List of the dims channels used to compute the histogram. The first array channels
+are numerated from 0 to images[0].channels()-1 , the second array channels are counted from
+images[0].channels() to images[0].channels() + images[1].channels()-1, and so on.
+@param mask Optional mask. If the matrix is not empty, it must be an 8-bit array of the same size
+as images[i] . The non-zero mask elements mark the array elements counted in the histogram.
+@param hist Output histogram, which is a dense or sparse dims -dimensional array.
+@param dims Histogram dimensionality that must be positive and not greater than CV_MAX_DIMS
+(equal to 32 in the current OpenCV version).
+@param histSize Array of histogram sizes in each dimension.
+@param ranges Array of the dims arrays of the histogram bin boundaries in each dimension. When the
+histogram is uniform ( uniform =true), then for each dimension i it is enough to specify the lower
+(inclusive) boundary \f$L_0\f$ of the 0-th histogram bin and the upper (exclusive) boundary
+\f$U_{\texttt{histSize}[i]-1}\f$ for the last histogram bin histSize[i]-1 . That is, in case of a
+uniform histogram each of ranges[i] is an array of 2 elements. When the histogram is not uniform (
+uniform=false ), then each of ranges[i] contains histSize[i]+1 elements:
+\f$L_0, U_0=L_1, U_1=L_2, ..., U_{\texttt{histSize[i]}-2}=L_{\texttt{histSize[i]}-1}, U_{\texttt{histSize[i]}-1}\f$
+. The array elements, that are not between \f$L_0\f$ and \f$U_{\texttt{histSize[i]}-1}\f$ , are not
+counted in the histogram.
+@param uniform Flag indicating whether the histogram is uniform or not (see above).
+@param accumulate Accumulation flag. If it is set, the histogram is not cleared in the beginning
+when it is allocated. This feature enables you to compute a single histogram from several sets of
+arrays, or to update the histogram in time.
+*/
+CV_EXPORTS void calcHist( const Mat* images, int nimages,
+                          const int* channels, InputArray mask,
+                          OutputArray hist, int dims, const int* histSize,
+                          const float** ranges, bool uniform = true, bool accumulate = false );
+
+/** @overload
+
+this variant uses %SparseMat for output
+*/
+CV_EXPORTS void calcHist( const Mat* images, int nimages,
+                          const int* channels, InputArray mask,
+                          SparseMat& hist, int dims,
+                          const int* histSize, const float** ranges,
+                          bool uniform = true, bool accumulate = false );
+
+/** @overload */
+CV_EXPORTS_W void calcHist( InputArrayOfArrays images,
+                            const std::vector<int>& channels,
+                            InputArray mask, OutputArray hist,
+                            const std::vector<int>& histSize,
+                            const std::vector<float>& ranges,
+                            bool accumulate = false );
+
+/** @brief Calculates the back projection of a histogram.
+
+The function cv::calcBackProject calculates the back projection of the histogram. That is, similarly to
+#calcHist , at each location (x, y) the function collects the values from the selected channels
+in the input images and finds the corresponding histogram bin. But instead of incrementing it, the
+function reads the bin value, scales it by scale , and stores in backProject(x,y) . In terms of
+statistics, the function computes probability of each element value with respect to the empirical
+probability distribution represented by the histogram. See how, for example, you can find and track
+a bright-colored object in a scene:
+
+- Before tracking, show the object to the camera so that it covers almost the whole frame.
+Calculate a hue histogram. The histogram may have strong maximums, corresponding to the dominant
+colors in the object.
+
+- When tracking, calculate a back projection of a hue plane of each input video frame using that
+pre-computed histogram. Threshold the back projection to suppress weak colors. It may also make
+sense to suppress pixels with non-sufficient color saturation and too dark or too bright pixels.
+
+- Find connected components in the resulting picture and choose, for example, the largest
+component.
+
+This is an approximate algorithm of the CamShift color object tracker.
+
+@param images Source arrays. They all should have the same depth, CV_8U, CV_16U or CV_32F , and the same
+size. Each of them can have an arbitrary number of channels.
+@param nimages Number of source images.
+@param channels The list of channels used to compute the back projection. The number of channels
+must match the histogram dimensionality. The first array channels are numerated from 0 to
+images[0].channels()-1 , the second array channels are counted from images[0].channels() to
+images[0].channels() + images[1].channels()-1, and so on.
+@param hist Input histogram that can be dense or sparse.
+@param backProject Destination back projection array that is a single-channel array of the same
+size and depth as images[0] .
+@param ranges Array of arrays of the histogram bin boundaries in each dimension. See #calcHist .
+@param scale Optional scale factor for the output back projection.
+@param uniform Flag indicating whether the histogram is uniform or not (see #calcHist).
+
+@sa calcHist, compareHist
+ */
+CV_EXPORTS void calcBackProject( const Mat* images, int nimages,
+                                 const int* channels, InputArray hist,
+                                 OutputArray backProject, const float** ranges,
+                                 double scale = 1, bool uniform = true );
+
+/** @overload */
+CV_EXPORTS void calcBackProject( const Mat* images, int nimages,
+                                 const int* channels, const SparseMat& hist,
+                                 OutputArray backProject, const float** ranges,
+                                 double scale = 1, bool uniform = true );
+
+/** @overload */
+CV_EXPORTS_W void calcBackProject( InputArrayOfArrays images, const std::vector<int>& channels,
+                                   InputArray hist, OutputArray dst,
+                                   const std::vector<float>& ranges,
+                                   double scale );
+
+/** @brief Compares two histograms.
+
+The function cv::compareHist compares two dense or two sparse histograms using the specified method.
+
+The function returns \f$d(H_1, H_2)\f$ .
+
+While the function works well with 1-, 2-, 3-dimensional dense histograms, it may not be suitable
+for high-dimensional sparse histograms. In such histograms, because of aliasing and sampling
+problems, the coordinates of non-zero histogram bins can slightly shift. To compare such histograms
+or more general sparse configurations of weighted points, consider using the #EMD function.
+
+@param H1 First compared histogram.
+@param H2 Second compared histogram of the same size as H1 .
+@param method Comparison method, see #HistCompMethods
+ */
+CV_EXPORTS_W double compareHist( InputArray H1, InputArray H2, int method );
+
+/** @overload */
+CV_EXPORTS double compareHist( const SparseMat& H1, const SparseMat& H2, int method );
+
+/** @brief Equalizes the histogram of a grayscale image.
+
+The function equalizes the histogram of the input image using the following algorithm:
+
+- Calculate the histogram \f$H\f$ for src .
+- Normalize the histogram so that the sum of histogram bins is 255.
+- Compute the integral of the histogram:
+\f[H'_i =  \sum _{0  \le j < i} H(j)\f]
+- Transform the image using \f$H'\f$ as a look-up table: \f$\texttt{dst}(x,y) = H'(\texttt{src}(x,y))\f$
+
+The algorithm normalizes the brightness and increases the contrast of the image.
+
+@param src Source 8-bit single channel image.
+@param dst Destination image of the same size and type as src .
+ */
+CV_EXPORTS_W void equalizeHist( InputArray src, OutputArray dst );
+
+/** @brief Creates a smart pointer to a cv::CLAHE class and initializes it.
+
+@param clipLimit Threshold for contrast limiting.
+@param tileGridSize Size of grid for histogram equalization. Input image will be divided into
+equally sized rectangular tiles. tileGridSize defines the number of tiles in row and column.
+ */
+CV_EXPORTS_W Ptr<CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
+
+/** @brief Computes the "minimal work" distance between two weighted point configurations.
+
+The function computes the earth mover distance and/or a lower boundary of the distance between the
+two weighted point configurations. One of the applications described in @cite RubnerSept98,
+@cite Rubner2000 is multi-dimensional histogram comparison for image retrieval. EMD is a transportation
+problem that is solved using some modification of a simplex algorithm, thus the complexity is
+exponential in the worst case, though, on average it is much faster. In the case of a real metric
+the lower boundary can be calculated even faster (using linear-time algorithm) and it can be used
+to determine roughly whether the two signatures are far enough so that they cannot relate to the
+same object.
+
+@param signature1 First signature, a \f$\texttt{size1}\times (\texttt{dims}+1)\f$ floating-point matrix.
+Each row stores the point weight followed by the point coordinates. The matrix is allowed to have
+a single column (weights only) if the user-defined cost matrix is used. The weights must be
+non-negative and have at least one non-zero value.
+@param signature2 Second signature of the same format as signature1 , though the number of rows
+may be different. The total weights may be different. In this case an extra "dummy" point is added
+to either signature1 or signature2. The weights must be non-negative and have at least one non-zero
+value.
+@param distType Used metric. See #DistanceTypes.
+@param cost User-defined \f$\texttt{size1}\times \texttt{size2}\f$ cost matrix. Also, if a cost matrix
+is used, lower boundary lowerBound cannot be calculated because it needs a metric function.
+@param lowerBound Optional input/output parameter: lower boundary of a distance between the two
+signatures that is a distance between mass centers. The lower boundary may not be calculated if
+the user-defined cost matrix is used, the total weights of point configurations are not equal, or
+if the signatures consist of weights only (the signature matrices have a single column). You
+**must** initialize \*lowerBound . If the calculated distance between mass centers is greater or
+equal to \*lowerBound (it means that the signatures are far enough), the function does not
+calculate EMD. In any case \*lowerBound is set to the calculated distance between mass centers on
+return. Thus, if you want to calculate both distance between mass centers and EMD, \*lowerBound
+should be set to 0.
+@param flow Resultant \f$\texttt{size1} \times \texttt{size2}\f$ flow matrix: \f$\texttt{flow}_{i,j}\f$ is
+a flow from \f$i\f$ -th point of signature1 to \f$j\f$ -th point of signature2 .
+ */
+CV_EXPORTS float EMD( InputArray signature1, InputArray signature2,
+                      int distType, InputArray cost=noArray(),
+                      float* lowerBound = 0, OutputArray flow = noArray() );
+
+CV_EXPORTS_AS(EMD) float wrapperEMD( InputArray signature1, InputArray signature2,
+                      int distType, InputArray cost=noArray(),
+                      CV_IN_OUT Ptr<float> lowerBound = Ptr<float>(), OutputArray flow = noArray() );
+
+//! @} imgproc_hist
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+/** @example samples/cpp/watershed.cpp
+An example using the watershed algorithm
+*/
+
+/** @brief Performs a marker-based image segmentation using the watershed algorithm.
+
+The function implements one of the variants of watershed, non-parametric marker-based segmentation
+algorithm, described in @cite Meyer92 .
+
+Before passing the image to the function, you have to roughly outline the desired regions in the
+image markers with positive (\>0) indices. So, every region is represented as one or more connected
+components with the pixel values 1, 2, 3, and so on. Such markers can be retrieved from a binary
+mask using #findContours and #drawContours (see the watershed.cpp demo). The markers are "seeds" of
+the future image regions. All the other pixels in markers , whose relation to the outlined regions
+is not known and should be defined by the algorithm, should be set to 0's. In the function output,
+each pixel in markers is set to a value of the "seed" components or to -1 at boundaries between the
+regions.
+
+@note Any two neighbor connected components are not necessarily separated by a watershed boundary
+(-1's pixels); for example, they can touch each other in the initial marker image passed to the
+function.
+
+@param image Input 8-bit 3-channel image.
+@param markers Input/output 32-bit single-channel image (map) of markers. It should have the same
+size as image .
+
+@sa findContours
+ */
+CV_EXPORTS_W void watershed( InputArray image, InputOutputArray markers );
+
+//! @} imgproc_segmentation
+
+//! @addtogroup imgproc_filter
+//! @{
+
+/** @brief Performs initial step of meanshift segmentation of an image.
+
+The function implements the filtering stage of meanshift segmentation, that is, the output of the
+function is the filtered "posterized" image with color gradients and fine-grain texture flattened.
+At every pixel (X,Y) of the input image (or down-sized input image, see below) the function executes
+meanshift iterations, that is, the pixel (X,Y) neighborhood in the joint space-color hyperspace is
+considered:
+
+\f[(x,y): X- \texttt{sp} \le x  \le X+ \texttt{sp} , Y- \texttt{sp} \le y  \le Y+ \texttt{sp} , ||(R,G,B)-(r,g,b)||   \le \texttt{sr}\f]
+
+where (R,G,B) and (r,g,b) are the vectors of color components at (X,Y) and (x,y), respectively
+(though, the algorithm does not depend on the color space used, so any 3-component color space can
+be used instead). Over the neighborhood the average spatial value (X',Y') and average color vector
+(R',G',B') are found and they act as the neighborhood center on the next iteration:
+
+\f[(X,Y)~(X',Y'), (R,G,B)~(R',G',B').\f]
+
+After the iterations are over, the color components of the initial pixel (that is, the pixel from where
+the iterations started) are set to the final value (average color at the last iteration):
+
+\f[I(X,Y) \leftarrow (R*,G*,B*)\f]
+
+When maxLevel \> 0, the gaussian pyramid of maxLevel+1 levels is built, and the above procedure is
+run on the smallest layer first. After that, the results are propagated to the larger layer and the
+iterations are run again only on those pixels where the layer colors differ by more than sr from the
+lower-resolution layer of the pyramid. That makes boundaries of color regions sharper. Note that the
+results will be actually different from the ones obtained by running the meanshift procedure on the
+whole original image (i.e. when maxLevel==0).
+
+@param src The source 8-bit, 3-channel image.
+@param dst The destination image of the same format and the same size as the source.
+@param sp The spatial window radius.
+@param sr The color window radius.
+@param maxLevel Maximum level of the pyramid for the segmentation.
+@param termcrit Termination criteria: when to stop meanshift iterations.
+ */
+CV_EXPORTS_W void pyrMeanShiftFiltering( InputArray src, OutputArray dst,
+                                         double sp, double sr, int maxLevel = 1,
+                                         TermCriteria termcrit=TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS,5,1) );
+
+//! @}
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+/** @example samples/cpp/grabcut.cpp
+An example using the GrabCut algorithm
+![Sample Screenshot](grabcut_output1.jpg)
+*/
+
+/** @brief Runs the GrabCut algorithm.
+
+The function implements the [GrabCut image segmentation algorithm](http://en.wikipedia.org/wiki/GrabCut).
+
+@param img Input 8-bit 3-channel image.
+@param mask Input/output 8-bit single-channel mask. The mask is initialized by the function when
+mode is set to #GC_INIT_WITH_RECT. Its elements may have one of the #GrabCutClasses.
+@param rect ROI containing a segmented object. The pixels outside of the ROI are marked as
+"obvious background". The parameter is only used when mode==#GC_INIT_WITH_RECT .
+@param bgdModel Temporary array for the background model. Do not modify it while you are
+processing the same image.
+@param fgdModel Temporary array for the foreground model. Do not modify it while you are
+processing the same image.
+@param iterCount Number of iterations the algorithm should make before returning the result. Note
+that the result can be refined with further calls with mode==#GC_INIT_WITH_MASK or
+mode==#GC_EVAL .
+@param mode Operation mode that could be one of the #GrabCutModes
+ */
+CV_EXPORTS_W void grabCut( InputArray img, InputOutputArray mask, Rect rect,
+                           InputOutputArray bgdModel, InputOutputArray fgdModel,
+                           int iterCount, int mode = GC_EVAL );
+
+//! @} imgproc_segmentation
+
+//! @addtogroup imgproc_misc
+//! @{
+
+/** @example samples/cpp/distrans.cpp
+An example on using the distance transform
+*/
+
+/** @brief Calculates the distance to the closest zero pixel for each pixel of the source image.
+
+The function cv::distanceTransform calculates the approximate or precise distance from every binary
+image pixel to the nearest zero pixel. For zero image pixels, the distance will obviously be zero.
+
+When maskSize == #DIST_MASK_PRECISE and distanceType == #DIST_L2 , the function runs the
+algorithm described in @cite Felzenszwalb04 . This algorithm is parallelized with the TBB library.
+
+In other cases, the algorithm @cite Borgefors86 is used. This means that for a pixel the function
+finds the shortest path to the nearest zero pixel consisting of basic shifts: horizontal, vertical,
+diagonal, or knight's move (the latter is available for a \f$5\times 5\f$ mask). The overall
+distance is calculated as a sum of these basic distances. Since the distance function should be
+symmetric, all of the horizontal and vertical shifts must have the same cost (denoted as `a`), all
+the diagonal shifts must have the same cost (denoted as `b`), and all knight's moves must have the
+same cost (denoted as `c`). For the #DIST_C and #DIST_L1 types, the distance is calculated
+precisely, whereas for #DIST_L2 (Euclidean distance) the distance can be calculated only with a
+relative error (a \f$5\times 5\f$ mask gives more accurate results). For `a`,`b`, and `c`, OpenCV
+uses the values suggested in the original paper:
+- DIST_L1: `a = 1, b = 2`
+- DIST_L2:
+    - `3 x 3`: `a=0.955, b=1.3693`
+    - `5 x 5`: `a=1, b=1.4, c=2.1969`
+- DIST_C: `a = 1, b = 1`
+
+Typically, for a fast, coarse distance estimation #DIST_L2, a \f$3\times 3\f$ mask is used. For a
+more accurate distance estimation #DIST_L2, a \f$5\times 5\f$ mask or the precise algorithm is used.
+Note that both the precise and the approximate algorithms are linear in the number of pixels.
+
+This variant of the function does not only compute the minimum distance for each pixel \f$(x, y)\f$
+but also identifies the nearest connected component consisting of zero pixels
+(labelType==#DIST_LABEL_CCOMP) or the nearest zero pixel (labelType==#DIST_LABEL_PIXEL). Index of the
+component/pixel is stored in `labels(x, y)`. When labelType==#DIST_LABEL_CCOMP, the function
+automatically finds connected components of zero pixels in the input image and marks them with
+distinct labels. When labelType==#DIST_LABEL_PIXEL, the function scans through the input image and
+marks all the zero pixels with distinct labels.
+
+In this mode, the complexity is still linear. That is, the function provides a very fast way to
+compute the Voronoi diagram for a binary image. Currently, the second variant can use only the
+approximate distance transform algorithm, i.e. maskSize=#DIST_MASK_PRECISE is not supported
+yet.
+
+@param src 8-bit, single-channel (binary) source image.
+@param dst Output image with calculated distances. It is an 8-bit or 32-bit floating-point,
+single-channel image of the same size as src.
+@param labels Output 2D array of labels (the discrete Voronoi diagram). It has the type
+CV_32SC1 and the same size as src.
+@param distanceType Type of distance, see #DistanceTypes
+@param maskSize Size of the distance transform mask, see #DistanceTransformMasks.
+#DIST_MASK_PRECISE is not supported by this variant. In case of the #DIST_L1 or #DIST_C distance type,
+the parameter is forced to 3 because a \f$3\times 3\f$ mask gives the same result as \f$5\times
+5\f$ or any larger aperture.
+@param labelType Type of the label array to build, see #DistanceTransformLabelTypes.
+ */
+CV_EXPORTS_AS(distanceTransformWithLabels) void distanceTransform( InputArray src, OutputArray dst,
+                                     OutputArray labels, int distanceType, int maskSize,
+                                     int labelType = DIST_LABEL_CCOMP );
+
+/** @overload
+@param src 8-bit, single-channel (binary) source image.
+@param dst Output image with calculated distances. It is an 8-bit or 32-bit floating-point,
+single-channel image of the same size as src.
+@param distanceType Type of distance, see #DistanceTypes
+@param maskSize Size of the distance transform mask, see #DistanceTransformMasks. In case of the
+#DIST_L1 or #DIST_C distance type, the parameter is forced to 3 because a \f$3\times 3\f$ mask gives
+the same result as \f$5\times 5\f$ or any larger aperture.
+@param dstType Type of output image. It can be CV_8U or CV_32F. Type CV_8U can be used only for
+the first variant of the function and distanceType == #DIST_L1.
+*/
+CV_EXPORTS_W void distanceTransform( InputArray src, OutputArray dst,
+                                     int distanceType, int maskSize, int dstType=CV_32F);
+
+/** @brief Fills a connected component with the given color.
+
+The function cv::floodFill fills a connected component starting from the seed point with the specified
+color. The connectivity is determined by the color/brightness closeness of the neighbor pixels. The
+pixel at \f$(x,y)\f$ is considered to belong to the repainted domain if:
+
+- in case of a grayscale image and floating range
+\f[\texttt{src} (x',y')- \texttt{loDiff} \leq \texttt{src} (x,y)  \leq \texttt{src} (x',y')+ \texttt{upDiff}\f]
+
+
+- in case of a grayscale image and fixed range
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)- \texttt{loDiff} \leq \texttt{src} (x,y)  \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)+ \texttt{upDiff}\f]
+
+
+- in case of a color image and floating range
+\f[\texttt{src} (x',y')_r- \texttt{loDiff} _r \leq \texttt{src} (x,y)_r \leq \texttt{src} (x',y')_r+ \texttt{upDiff} _r,\f]
+\f[\texttt{src} (x',y')_g- \texttt{loDiff} _g \leq \texttt{src} (x,y)_g \leq \texttt{src} (x',y')_g+ \texttt{upDiff} _g\f]
+and
+\f[\texttt{src} (x',y')_b- \texttt{loDiff} _b \leq \texttt{src} (x,y)_b \leq \texttt{src} (x',y')_b+ \texttt{upDiff} _b\f]
+
+
+- in case of a color image and fixed range
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_r- \texttt{loDiff} _r \leq \texttt{src} (x,y)_r \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_r+ \texttt{upDiff} _r,\f]
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_g- \texttt{loDiff} _g \leq \texttt{src} (x,y)_g \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_g+ \texttt{upDiff} _g\f]
+and
+\f[\texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_b- \texttt{loDiff} _b \leq \texttt{src} (x,y)_b \leq \texttt{src} ( \texttt{seedPoint} .x, \texttt{seedPoint} .y)_b+ \texttt{upDiff} _b\f]
+
+
+where \f$src(x',y')\f$ is the value of one of pixel neighbors that is already known to belong to the
+component. That is, to be added to the connected component, a color/brightness of the pixel should
+be close enough to:
+- Color/brightness of one of its neighbors that already belong to the connected component in case
+of a floating range.
+- Color/brightness of the seed point in case of a fixed range.
+
+Use these functions to either mark a connected component with the specified color in-place, or build
+a mask and then extract the contour, or copy the region to another image, and so on.
+
+@param image Input/output 1- or 3-channel, 8-bit, or floating-point image. It is modified by the
+function unless the #FLOODFILL_MASK_ONLY flag is set in the second variant of the function. See
+the details below.
+@param mask Operation mask that should be a single-channel 8-bit image, 2 pixels wider and 2 pixels
+taller than image. If an empty Mat is passed it will be created automatically. Since this is both an
+input and output parameter, you must take responsibility of initializing it.
+Flood-filling cannot go across non-zero pixels in the input mask. For example,
+an edge detector output can be used as a mask to stop filling at edges. On output, pixels in the
+mask corresponding to filled pixels in the image are set to 1 or to the specified value in flags
+as described below. Additionally, the function fills the border of the mask with ones to simplify
+internal processing. It is therefore possible to use the same mask in multiple calls to the function
+to make sure the filled areas do not overlap.
+@param seedPoint Starting point.
+@param newVal New value of the repainted domain pixels.
+@param loDiff Maximal lower brightness/color difference between the currently observed pixel and
+one of its neighbors belonging to the component, or a seed pixel being added to the component.
+@param upDiff Maximal upper brightness/color difference between the currently observed pixel and
+one of its neighbors belonging to the component, or a seed pixel being added to the component.
+@param rect Optional output parameter set by the function to the minimum bounding rectangle of the
+repainted domain.
+@param flags Operation flags. The first 8 bits contain a connectivity value. The default value of
+4 means that only the four nearest neighbor pixels (those that share an edge) are considered. A
+connectivity value of 8 means that the eight nearest neighbor pixels (those that share a corner)
+will be considered. The next 8 bits (8-16) contain a value between 1 and 255 with which to fill
+the mask (the default value is 1). For example, 4 | ( 255 \<\< 8 ) will consider 4 nearest
+neighbours and fill the mask with a value of 255. The following additional options occupy higher
+bits and therefore may be further combined with the connectivity and mask fill values using
+bit-wise or (|), see #FloodFillFlags.
+
+@note Since the mask is larger than the filled image, a pixel \f$(x, y)\f$ in image corresponds to the
+pixel \f$(x+1, y+1)\f$ in the mask .
+
+@sa findContours
+ */
+CV_EXPORTS_W int floodFill( InputOutputArray image, InputOutputArray mask,
+                            Point seedPoint, Scalar newVal, CV_OUT Rect* rect=0,
+                            Scalar loDiff = Scalar(), Scalar upDiff = Scalar(),
+                            int flags = 4 );
+
+/** @example samples/cpp/ffilldemo.cpp
+An example using the FloodFill technique
+*/
+
+/** @overload
+
+variant without `mask` parameter
+*/
+CV_EXPORTS int floodFill( InputOutputArray image,
+                          Point seedPoint, Scalar newVal, CV_OUT Rect* rect = 0,
+                          Scalar loDiff = Scalar(), Scalar upDiff = Scalar(),
+                          int flags = 4 );
+
+//! Performs linear blending of two images:
+//! \f[ \texttt{dst}(i,j) = \texttt{weights1}(i,j)*\texttt{src1}(i,j) + \texttt{weights2}(i,j)*\texttt{src2}(i,j) \f]
+//! @param src1 It has a type of CV_8UC(n) or CV_32FC(n), where n is a positive integer.
+//! @param src2 It has the same type and size as src1.
+//! @param weights1 It has a type of CV_32FC1 and the same size with src1.
+//! @param weights2 It has a type of CV_32FC1 and the same size with src1.
+//! @param dst It is created if it does not have the same size and type with src1.
+CV_EXPORTS_W void blendLinear(InputArray src1, InputArray src2, InputArray weights1, InputArray weights2, OutputArray dst);
+
+//! @} imgproc_misc
+
+//! @addtogroup imgproc_color_conversions
+//! @{
+
+/** @brief Converts an image from one color space to another.
+
+The function converts an input image from one color space to another. In case of a transformation
+to-from RGB color space, the order of the channels should be specified explicitly (RGB or BGR). Note
+that the default color format in OpenCV is often referred to as RGB but it is actually BGR (the
+bytes are reversed). So the first byte in a standard (24-bit) color image will be an 8-bit Blue
+component, the second byte will be Green, and the third byte will be Red. The fourth, fifth, and
+sixth bytes would then be the second pixel (Blue, then Green, then Red), and so on.
+
+The conventional ranges for R, G, and B channel values are:
+-   0 to 255 for CV_8U images
+-   0 to 65535 for CV_16U images
+-   0 to 1 for CV_32F images
+
+In case of linear transformations, the range does not matter. But in case of a non-linear
+transformation, an input RGB image should be normalized to the proper value range to get the correct
+results, for example, for RGB \f$\rightarrow\f$ L\*u\*v\* transformation. For example, if you have a
+32-bit floating-point image directly converted from an 8-bit image without any scaling, then it will
+have the 0..255 value range instead of 0..1 assumed by the function. So, before calling #cvtColor ,
+you need first to scale the image down:
+@code
+    img *= 1./255;
+    cvtColor(img, img, COLOR_BGR2Luv);
+@endcode
+If you use #cvtColor with 8-bit images, the conversion will have some information lost. For many
+applications, this will not be noticeable but it is recommended to use 32-bit images in applications
+that need the full range of colors or that convert an image before an operation and then convert
+back.
+
+If conversion adds the alpha channel, its value will be set to the maximum of corresponding channel
+range: 255 for CV_8U, 65535 for CV_16U, 1 for CV_32F.
+
+@param src input image: 8-bit unsigned, 16-bit unsigned ( CV_16UC... ), or single-precision
+floating-point.
+@param dst output image of the same size and depth as src.
+@param code color space conversion code (see #ColorConversionCodes).
+@param dstCn number of channels in the destination image; if the parameter is 0, the number of the
+channels is derived automatically from src and code.
+
+@see @ref imgproc_color_conversions
+ */
+CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 );
+
+/** @brief Converts an image from one color space to another where the source image is
+stored in two planes.
+
+This function only supports YUV420 to RGB conversion as of now.
+
+@param src1 8-bit image (#CV_8U) of the Y plane.
+@param src2 image containing interleaved U/V plane.
+@param dst output image.
+@param code Specifies the type of conversion. It can take any of the following values:
+- #COLOR_YUV2BGR_NV12
+- #COLOR_YUV2RGB_NV12
+- #COLOR_YUV2BGRA_NV12
+- #COLOR_YUV2RGBA_NV12
+- #COLOR_YUV2BGR_NV21
+- #COLOR_YUV2RGB_NV21
+- #COLOR_YUV2BGRA_NV21
+- #COLOR_YUV2RGBA_NV21
+*/
+CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code );
+
+/** @brief main function for all demosaicing processes
+
+@param src input image: 8-bit unsigned or 16-bit unsigned.
+@param dst output image of the same size and depth as src.
+@param code Color space conversion code (see the description below).
+@param dstCn number of channels in the destination image; if the parameter is 0, the number of the
+channels is derived automatically from src and code.
+
+The function can do the following transformations:
+
+-   Demosaicing using bilinear interpolation
+
+    #COLOR_BayerBG2BGR , #COLOR_BayerGB2BGR , #COLOR_BayerRG2BGR , #COLOR_BayerGR2BGR
+
+    #COLOR_BayerBG2GRAY , #COLOR_BayerGB2GRAY , #COLOR_BayerRG2GRAY , #COLOR_BayerGR2GRAY
+
+-   Demosaicing using Variable Number of Gradients.
+
+    #COLOR_BayerBG2BGR_VNG , #COLOR_BayerGB2BGR_VNG , #COLOR_BayerRG2BGR_VNG , #COLOR_BayerGR2BGR_VNG
+
+-   Edge-Aware Demosaicing.
+
+    #COLOR_BayerBG2BGR_EA , #COLOR_BayerGB2BGR_EA , #COLOR_BayerRG2BGR_EA , #COLOR_BayerGR2BGR_EA
+
+-   Demosaicing with alpha channel
+
+    #COLOR_BayerBG2BGRA , #COLOR_BayerGB2BGRA , #COLOR_BayerRG2BGRA , #COLOR_BayerGR2BGRA
+
+@sa cvtColor
+*/
+CV_EXPORTS_W void demosaicing(InputArray src, OutputArray dst, int code, int dstCn = 0);
+
+//! @} imgproc_color_conversions
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @brief Calculates all of the moments up to the third order of a polygon or rasterized shape.
+
+The function computes moments, up to the 3rd order, of a vector shape or a rasterized shape. The
+results are returned in the structure cv::Moments.
+
+@param array Raster image (single-channel, 8-bit or floating-point 2D array) or an array (
+\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f ).
+@param binaryImage If it is true, all non-zero image pixels are treated as 1's. The parameter is
+used for images only.
+@returns moments.
+
+@note Only applicable to contour moments calculations from Python bindings: Note that the numpy
+type for the input array should be either np.int32 or np.float32.
+
+@sa  contourArea, arcLength
+ */
+CV_EXPORTS_W Moments moments( InputArray array, bool binaryImage = false );
+
+/** @brief Calculates seven Hu invariants.
+
+The function calculates seven Hu invariants (introduced in @cite Hu62; see also
+<http://en.wikipedia.org/wiki/Image_moment>) defined as:
+
+\f[\begin{array}{l} hu[0]= \eta _{20}+ \eta _{02} \\ hu[1]=( \eta _{20}- \eta _{02})^{2}+4 \eta _{11}^{2} \\ hu[2]=( \eta _{30}-3 \eta _{12})^{2}+ (3 \eta _{21}- \eta _{03})^{2} \\ hu[3]=( \eta _{30}+ \eta _{12})^{2}+ ( \eta _{21}+ \eta _{03})^{2} \\ hu[4]=( \eta _{30}-3 \eta _{12})( \eta _{30}+ \eta _{12})[( \eta _{30}+ \eta _{12})^{2}-3( \eta _{21}+ \eta _{03})^{2}]+(3 \eta _{21}- \eta _{03})( \eta _{21}+ \eta _{03})[3( \eta _{30}+ \eta _{12})^{2}-( \eta _{21}+ \eta _{03})^{2}] \\ hu[5]=( \eta _{20}- \eta _{02})[( \eta _{30}+ \eta _{12})^{2}- ( \eta _{21}+ \eta _{03})^{2}]+4 \eta _{11}( \eta _{30}+ \eta _{12})( \eta _{21}+ \eta _{03}) \\ hu[6]=(3 \eta _{21}- \eta _{03})( \eta _{21}+ \eta _{03})[3( \eta _{30}+ \eta _{12})^{2}-( \eta _{21}+ \eta _{03})^{2}]-( \eta _{30}-3 \eta _{12})( \eta _{21}+ \eta _{03})[3( \eta _{30}+ \eta _{12})^{2}-( \eta _{21}+ \eta _{03})^{2}] \\ \end{array}\f]
+
+where \f$\eta_{ji}\f$ stands for \f$\texttt{Moments::nu}_{ji}\f$ .
+
+These values are proved to be invariants to the image scale, rotation, and reflection except the
+seventh one, whose sign is changed by reflection. This invariance is proved with the assumption of
+infinite image resolution. In case of raster images, the computed Hu invariants for the original and
+transformed images are a bit different.
+
+@param moments Input moments computed with moments .
+@param hu Output Hu invariants.
+
+@sa matchShapes
+ */
+CV_EXPORTS void HuMoments( const Moments& moments, double hu[7] );
+
+/** @overload */
+CV_EXPORTS_W void HuMoments( const Moments& m, OutputArray hu );
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_object
+//! @{
+
+//! Type of the template matching operation; in the formulas \f$I\f$ is the image, \f$T\f$ the template, \f$R\f$ the result and \f$M\f$ the optional mask.
+enum TemplateMatchModes {
+    TM_SQDIFF        = 0, /*!< \f[R(x,y)= \sum _{x',y'} (T(x',y')-I(x+x',y+y'))^2\f]
+                               with mask:
+                               \f[R(x,y)= \sum _{x',y'} \left( (T(x',y')-I(x+x',y+y')) \cdot
+                                  M(x',y') \right)^2\f] */
+    TM_SQDIFF_NORMED = 1, /*!< \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y')-I(x+x',y+y'))^2}{\sqrt{\sum_{
+                                  x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f]
+                               with mask:
+                               \f[R(x,y)= \frac{\sum _{x',y'} \left( (T(x',y')-I(x+x',y+y')) \cdot
+                                  M(x',y') \right)^2}{\sqrt{\sum_{x',y'} \left( T(x',y') \cdot
+                                  M(x',y') \right)^2 \cdot \sum_{x',y'} \left( I(x+x',y+y') \cdot
+                                  M(x',y') \right)^2}}\f] */
+    TM_CCORR         = 2, /*!< \f[R(x,y)= \sum _{x',y'} (T(x',y') \cdot I(x+x',y+y'))\f]
+                               with mask:
+                               \f[R(x,y)= \sum _{x',y'} (T(x',y') \cdot I(x+x',y+y') \cdot M(x',y')
+                                  ^2)\f] */
+    TM_CCORR_NORMED  = 3, /*!< \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y'))}{\sqrt{
+                                  \sum_{x',y'}T(x',y')^2 \cdot \sum_{x',y'} I(x+x',y+y')^2}}\f]
+                               with mask:
+                               \f[R(x,y)= \frac{\sum_{x',y'} (T(x',y') \cdot I(x+x',y+y') \cdot
+                                  M(x',y')^2)}{\sqrt{\sum_{x',y'} \left( T(x',y') \cdot M(x',y')
+                                  \right)^2 \cdot \sum_{x',y'} \left( I(x+x',y+y') \cdot M(x',y')
+                                  \right)^2}}\f] */
+    TM_CCOEFF        = 4, /*!< \f[R(x,y)= \sum _{x',y'} (T'(x',y') \cdot I'(x+x',y+y'))\f]
+                               where
+                               \f[\begin{array}{l} T'(x',y')=T(x',y') - 1/(w \cdot h) \cdot \sum _{
+                                  x'',y''} T(x'',y'') \\ I'(x+x',y+y')=I(x+x',y+y') - 1/(w \cdot h)
+                                  \cdot \sum _{x'',y''} I(x+x'',y+y'') \end{array}\f]
+                               with mask:
+                               \f[\begin{array}{l} T'(x',y')=M(x',y') \cdot \left( T(x',y') -
+                                  \frac{1}{\sum _{x'',y''} M(x'',y'')} \cdot \sum _{x'',y''}
+                                  (T(x'',y'') \cdot M(x'',y'')) \right) \\ I'(x+x',y+y')=M(x',y')
+                                  \cdot \left( I(x+x',y+y') - \frac{1}{\sum _{x'',y''} M(x'',y'')}
+                                  \cdot \sum _{x'',y''} (I(x+x'',y+y'') \cdot M(x'',y'')) \right)
+                                  \end{array} \f] */
+    TM_CCOEFF_NORMED = 5  /*!< \f[R(x,y)= \frac{ \sum_{x',y'} (T'(x',y') \cdot I'(x+x',y+y')) }{
+                                  \sqrt{\sum_{x',y'}T'(x',y')^2 \cdot \sum_{x',y'} I'(x+x',y+y')^2}
+                                  }\f] */
+};
+
+/** @example samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp
+An example using Template Matching algorithm
+*/
+
+/** @brief Compares a template against overlapped image regions.
+
+The function slides through image , compares the overlapped patches of size \f$w \times h\f$ against
+templ using the specified method and stores the comparison results in result . #TemplateMatchModes
+describes the formulae for the available comparison methods ( \f$I\f$ denotes image, \f$T\f$
+template, \f$R\f$ result, \f$M\f$ the optional mask ). The summation is done over template and/or
+the image patch: \f$x' = 0...w-1, y' = 0...h-1\f$
+
+After the function finishes the comparison, the best matches can be found as global minimums (when
+#TM_SQDIFF was used) or maximums (when #TM_CCORR or #TM_CCOEFF was used) using the
+#minMaxLoc function. In case of a color image, template summation in the numerator and each sum in
+the denominator is done over all of the channels and separate mean values are used for each channel.
+That is, the function can take a color template and a color image. The result will still be a
+single-channel image, which is easier to analyze.
+
+@param image Image where the search is running. It must be 8-bit or 32-bit floating-point.
+@param templ Searched template. It must be not greater than the source image and have the same
+data type.
+@param result Map of comparison results. It must be single-channel 32-bit floating-point. If image
+is \f$W \times H\f$ and templ is \f$w \times h\f$ , then result is \f$(W-w+1) \times (H-h+1)\f$ .
+@param method Parameter specifying the comparison method, see #TemplateMatchModes
+@param mask Optional mask. It must have the same size as templ. It must either have the same number
+            of channels as template or only one channel, which is then used for all template and
+            image channels. If the data type is #CV_8U, the mask is interpreted as a binary mask,
+            meaning only elements where mask is nonzero are used and are kept unchanged independent
+            of the actual mask value (weight equals 1). For data type #CV_32F, the mask values are
+            used as weights. The exact formulas are documented in #TemplateMatchModes.
+ */
+CV_EXPORTS_W void matchTemplate( InputArray image, InputArray templ,
+                                 OutputArray result, int method, InputArray mask = noArray() );
+
+//! @}
+
+//! @addtogroup imgproc_shape
+//! @{
+
+/** @example samples/cpp/connected_components.cpp
+This program demonstrates connected components and use of the trackbar
+*/
+
+/** @brief computes the connected components labeled image of boolean image
+
+image with 4 or 8 way connectivity - returns N, the total number of labels [0, N-1] where 0
+represents the background label. ltype specifies the output label image type, an important
+consideration based on the total number of labels or alternatively the total number of pixels in
+the source image. ccltype specifies the connected components labeling algorithm to use, currently
+Bolelli (Spaghetti) @cite Bolelli2019, Grana (BBDT) @cite Grana2010 and Wu's (SAUF) @cite Wu2009 algorithms
+are supported, see the #ConnectedComponentsAlgorithmsTypes for details. Note that SAUF algorithm forces
+a row major ordering of labels while Spaghetti and BBDT do not.
+This function uses parallel version of the algorithms if at least one allowed
+parallel framework is enabled and if the rows of the image are at least twice the number returned by #getNumberOfCPUs.
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+@param ccltype connected components algorithm type (see the #ConnectedComponentsAlgorithmsTypes).
+*/
+CV_EXPORTS_AS(connectedComponentsWithAlgorithm) int connectedComponents(InputArray image, OutputArray labels,
+                                                                        int connectivity, int ltype, int ccltype);
+
+
+/** @overload
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+*/
+CV_EXPORTS_W int connectedComponents(InputArray image, OutputArray labels,
+                                     int connectivity = 8, int ltype = CV_32S);
+
+
+/** @brief computes the connected components labeled image of boolean image and also produces a statistics output for each label
+
+image with 4 or 8 way connectivity - returns N, the total number of labels [0, N-1] where 0
+represents the background label. ltype specifies the output label image type, an important
+consideration based on the total number of labels or alternatively the total number of pixels in
+the source image. ccltype specifies the connected components labeling algorithm to use, currently
+Bolelli (Spaghetti) @cite Bolelli2019, Grana (BBDT) @cite Grana2010 and Wu's (SAUF) @cite Wu2009 algorithms
+are supported, see the #ConnectedComponentsAlgorithmsTypes for details. Note that SAUF algorithm forces
+a row major ordering of labels while Spaghetti and BBDT do not.
+This function uses parallel version of the algorithms (statistics included) if at least one allowed
+parallel framework is enabled and if the rows of the image are at least twice the number returned by #getNumberOfCPUs.
+
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param stats statistics output for each label, including the background label.
+Statistics are accessed via stats(label, COLUMN) where COLUMN is one of
+#ConnectedComponentsTypes, selecting the statistic. The data type is CV_32S.
+@param centroids centroid output for each label, including the background label. Centroids are
+accessed via centroids(label, 0) for x and centroids(label, 1) for y. The data type is CV_64F.
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+@param ccltype connected components algorithm type (see #ConnectedComponentsAlgorithmsTypes).
+*/
+CV_EXPORTS_AS(connectedComponentsWithStatsWithAlgorithm) int connectedComponentsWithStats(InputArray image, OutputArray labels,
+                                                                                          OutputArray stats, OutputArray centroids,
+                                                                                          int connectivity, int ltype, int ccltype);
+
+/** @overload
+@param image the 8-bit single-channel image to be labeled
+@param labels destination labeled image
+@param stats statistics output for each label, including the background label.
+Statistics are accessed via stats(label, COLUMN) where COLUMN is one of
+#ConnectedComponentsTypes, selecting the statistic. The data type is CV_32S.
+@param centroids centroid output for each label, including the background label. Centroids are
+accessed via centroids(label, 0) for x and centroids(label, 1) for y. The data type is CV_64F.
+@param connectivity 8 or 4 for 8-way or 4-way connectivity respectively
+@param ltype output image label type. Currently CV_32S and CV_16U are supported.
+*/
+CV_EXPORTS_W int connectedComponentsWithStats(InputArray image, OutputArray labels,
+                                              OutputArray stats, OutputArray centroids,
+                                              int connectivity = 8, int ltype = CV_32S);
+
+
+/** @brief Finds contours in a binary image.
+
+The function retrieves contours from the binary image using the algorithm @cite Suzuki85 . The contours
+are a useful tool for shape analysis and object detection and recognition. See squares.cpp in the
+OpenCV sample directory.
+@note Since OpenCV 3.2, the source image is not modified by this function.
+
+@param image Source, an 8-bit single-channel image. Non-zero pixels are treated as 1's. Zero
+pixels remain 0's, so the image is treated as binary . You can use #compare, #inRange, #threshold ,
+#adaptiveThreshold, #Canny, and others to create a binary image out of a grayscale or color one.
+If mode equals #RETR_CCOMP or #RETR_FLOODFILL, the input can also be a 32-bit integer image of labels (CV_32SC1).
+@param contours Detected contours. Each contour is stored as a vector of points (e.g.
+std::vector<std::vector<cv::Point> >).
+@param hierarchy Optional output vector (e.g. std::vector<cv::Vec4i>), containing information about the image topology. It has
+as many elements as the number of contours. For each i-th contour contours[i], the elements
+hierarchy[i][0] , hierarchy[i][1] , hierarchy[i][2] , and hierarchy[i][3] are set to 0-based indices
+in contours of the next and previous contours at the same hierarchical level, the first child
+contour and the parent contour, respectively. If for the contour i there are no next, previous,
+parent, or nested contours, the corresponding elements of hierarchy[i] will be negative.
+@note In Python, hierarchy is nested inside a top level array. Use hierarchy[0][i] to access hierarchical elements of i-th contour.
+@param mode Contour retrieval mode, see #RetrievalModes
+@param method Contour approximation method, see #ContourApproximationModes
+@param offset Optional offset by which every contour point is shifted. This is useful if the
+contours are extracted from the image ROI and then they should be analyzed in the whole image
+context.
+ */
+CV_EXPORTS_W void findContours( InputArray image, OutputArrayOfArrays contours,
+                              OutputArray hierarchy, int mode,
+                              int method, Point offset = Point());
+
+/** @overload */
+CV_EXPORTS void findContours( InputArray image, OutputArrayOfArrays contours,
+                              int mode, int method, Point offset = Point());
+
+/** @example samples/cpp/squares.cpp
+A program using pyramid scaling, Canny, contours and contour simplification to find
+squares in a list of images (pic1-6.png). Returns sequence of squares detected on the image.
+*/
+
+/** @example samples/tapi/squares.cpp
+A program using pyramid scaling, Canny, contours and contour simplification to find
+squares in the input image.
+*/
+
+/** @brief Approximates a polygonal curve(s) with the specified precision.
+
+The function cv::approxPolyDP approximates a curve or a polygon with another curve/polygon with fewer
+vertices so that the distance between them is less than or equal to the specified precision. It uses the
+Douglas-Peucker algorithm <http://en.wikipedia.org/wiki/Ramer-Douglas-Peucker_algorithm>.
+
+@param curve Input vector of a 2D point stored in std::vector or Mat
+@param approxCurve Result of the approximation. The type should match the type of the input curve.
+@param epsilon Parameter specifying the approximation accuracy. This is the maximum distance
+between the original curve and its approximation.
+@param closed If true, the approximated curve is closed (its first and last vertices are
+connected). Otherwise, it is not closed.
+ */
+CV_EXPORTS_W void approxPolyDP( InputArray curve,
+                                OutputArray approxCurve,
+                                double epsilon, bool closed );
+
+/** @brief Calculates a contour perimeter or a curve length.
+
+The function computes a curve length or a closed contour perimeter.
+
+@param curve Input vector of 2D points, stored in std::vector or Mat.
+@param closed Flag indicating whether the curve is closed or not.
+ */
+CV_EXPORTS_W double arcLength( InputArray curve, bool closed );
+
+/** @brief Calculates the up-right bounding rectangle of a point set or non-zero pixels of gray-scale image.
+
+The function calculates and returns the minimal up-right bounding rectangle for the specified point set or
+non-zero pixels of gray-scale image.
+
+@param array Input gray-scale image or 2D point set, stored in std::vector or Mat.
+ */
+CV_EXPORTS_W Rect boundingRect( InputArray array );
+
+/** @brief Calculates a contour area.
+
+The function computes a contour area. Similarly to moments , the area is computed using the Green
+formula. Thus, the returned area and the number of non-zero pixels, if you draw the contour using
+#drawContours or #fillPoly , can be different. Also, the function will most certainly give wrong
+results for contours with self-intersections.
+
+Example:
+@code
+    vector<Point> contour;
+    contour.push_back(Point2f(0, 0));
+    contour.push_back(Point2f(10, 0));
+    contour.push_back(Point2f(10, 10));
+    contour.push_back(Point2f(5, 4));
+
+    double area0 = contourArea(contour);
+    vector<Point> approx;
+    approxPolyDP(contour, approx, 5, true);
+    double area1 = contourArea(approx);
+
+    cout << "area0 =" << area0 << endl <<
+            "area1 =" << area1 << endl <<
+            "approx poly vertices" << approx.size() << endl;
+@endcode
+@param contour Input vector of 2D points (contour vertices), stored in std::vector or Mat.
+@param oriented Oriented area flag. If it is true, the function returns a signed area value,
+depending on the contour orientation (clockwise or counter-clockwise). Using this feature you can
+determine orientation of a contour by taking the sign of an area. By default, the parameter is
+false, which means that the absolute value is returned.
+ */
+CV_EXPORTS_W double contourArea( InputArray contour, bool oriented = false );
+
+/** @brief Finds a rotated rectangle of the minimum area enclosing the input 2D point set.
+
+The function calculates and returns the minimum-area bounding rectangle (possibly rotated) for a
+specified point set. Developer should keep in mind that the returned RotatedRect can contain negative
+indices when data is close to the containing Mat element boundary.
+
+@param points Input vector of 2D points, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect minAreaRect( InputArray points );
+
+/** @brief Finds the four vertices of a rotated rect. Useful to draw the rotated rectangle.
+
+The function finds the four vertices of a rotated rectangle. This function is useful to draw the
+rectangle. In C++, instead of using this function, you can directly use RotatedRect::points method. Please
+visit the @ref tutorial_bounding_rotated_ellipses "tutorial on Creating Bounding rotated boxes and ellipses for contours" for more information.
+
+@param box The input rotated rectangle. It may be the output of #minAreaRect.
+@param points The output array of four vertices of rectangles.
+ */
+CV_EXPORTS_W void boxPoints(RotatedRect box, OutputArray points);
+
+/** @brief Finds a circle of the minimum area enclosing a 2D point set.
+
+The function finds the minimal enclosing circle of a 2D point set using an iterative algorithm.
+
+@param points Input vector of 2D points, stored in std::vector\<\> or Mat
+@param center Output center of the circle.
+@param radius Output radius of the circle.
+ */
+CV_EXPORTS_W void minEnclosingCircle( InputArray points,
+                                      CV_OUT Point2f& center, CV_OUT float& radius );
+
+/** @example samples/cpp/minarea.cpp
+*/
+
+/** @brief Finds a triangle of minimum area enclosing a 2D point set and returns its area.
+
+The function finds a triangle of minimum area enclosing the given set of 2D points and returns its
+area. The output for a given 2D point set is shown in the image below. 2D points are depicted in
+*red* and the enclosing triangle in *yellow*.
+
+![Sample output of the minimum enclosing triangle function](pics/minenclosingtriangle.png)
+
+The implementation of the algorithm is based on O'Rourke's @cite ORourke86 and Klee and Laskowski's
+@cite KleeLaskowski85 papers. O'Rourke provides a \f$\Theta(n)\f$ algorithm for finding the minimal
+enclosing triangle of a 2D convex polygon with n vertices. Since the #minEnclosingTriangle function
+takes a 2D point set as input, an additional preprocessing step of computing the convex hull of the
+2D point set is required. The complexity of the #convexHull function is \f$O(n \log(n))\f$ which is higher
+than \f$\Theta(n)\f$. Thus the overall complexity of the function is \f$O(n \log(n))\f$.
+
+@param points Input vector of 2D points with depth CV_32S or CV_32F, stored in std::vector\<\> or Mat
+@param triangle Output vector of three 2D points defining the vertices of the triangle. The depth
+of the OutputArray must be CV_32F.
+ */
+CV_EXPORTS_W double minEnclosingTriangle( InputArray points, CV_OUT OutputArray triangle );
+
+/** @brief Compares two shapes.
+
+The function compares two shapes. All three implemented methods use the Hu invariants (see #HuMoments).
+
+@param contour1 First contour or grayscale image.
+@param contour2 Second contour or grayscale image.
+@param method Comparison method, see #ShapeMatchModes
+@param parameter Method-specific parameter (not supported now).
+ */
+CV_EXPORTS_W double matchShapes( InputArray contour1, InputArray contour2,
+                                 int method, double parameter );
+
+/** @example samples/cpp/convexhull.cpp
+An example using the convexHull functionality
+*/
+
+/** @brief Finds the convex hull of a point set.
+
+The function cv::convexHull finds the convex hull of a 2D point set using Sklansky's algorithm @cite Sklansky82
+that has *O(N logN)* complexity in the current implementation.
+
+@param points Input 2D point set, stored in std::vector or Mat.
+@param hull Output convex hull. It is either an integer vector of indices or vector of points. In
+the first case, the hull elements are 0-based indices of the convex hull points in the original
+array (since the set of convex hull points is a subset of the original point set). In the second
+case, hull elements are the convex hull points themselves.
+@param clockwise Orientation flag. If it is true, the output convex hull is oriented clockwise.
+Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing
+to the right, and its Y axis pointing upwards.
+@param returnPoints Operation flag. In case of a matrix, when the flag is true, the function
+returns convex hull points. Otherwise, it returns indices of the convex hull points. When the
+output array is std::vector, the flag is ignored, and the output depends on the type of the
+vector: std::vector\<int\> implies returnPoints=false, std::vector\<Point\> implies
+returnPoints=true.
+
+@note `points` and `hull` should be different arrays, inplace processing isn't supported.
+
+Check @ref tutorial_hull "the corresponding tutorial" for more details.
+
+useful links:
+
+https://www.learnopencv.com/convex-hull-using-opencv-in-python-and-c/
+ */
+CV_EXPORTS_W void convexHull( InputArray points, OutputArray hull,
+                              bool clockwise = false, bool returnPoints = true );
+
+/** @brief Finds the convexity defects of a contour.
+
+The figure below displays convexity defects of a hand contour:
+
+![image](pics/defects.png)
+
+@param contour Input contour.
+@param convexhull Convex hull obtained using convexHull that should contain indices of the contour
+points that make the hull.
+@param convexityDefects The output vector of convexity defects. In C++ and the new Python/Java
+interface each convexity defect is represented as 4-element integer vector (a.k.a. #Vec4i):
+(start_index, end_index, farthest_pt_index, fixpt_depth), where indices are 0-based indices
+in the original contour of the convexity defect beginning, end and the farthest point, and
+fixpt_depth is fixed-point approximation (with 8 fractional bits) of the distance between the
+farthest contour point and the hull. That is, the floating-point value of the depth is
+fixpt_depth/256.0.
+ */
+CV_EXPORTS_W void convexityDefects( InputArray contour, InputArray convexhull, OutputArray convexityDefects );
+
+/** @brief Tests a contour convexity.
+
+The function tests whether the input contour is convex or not. The contour must be simple, that is,
+without self-intersections. Otherwise, the function output is undefined.
+
+@param contour Input vector of 2D points, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W bool isContourConvex( InputArray contour );
+
+/** @example samples/cpp/intersectExample.cpp
+Examples of how intersectConvexConvex works
+*/
+
+/** @brief Finds intersection of two convex polygons
+
+@param p1 First polygon
+@param p2 Second polygon
+@param p12 Output polygon describing the intersecting area
+@param handleNested When true, an intersection is found if one of the polygons is fully enclosed in the other.
+When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge
+of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested.
+
+@returns Absolute value of area of intersecting polygon
+
+@note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't.
+ */
+CV_EXPORTS_W float intersectConvexConvex( InputArray p1, InputArray p2,
+                                          OutputArray p12, bool handleNested = true );
+
+/** @example samples/cpp/fitellipse.cpp
+An example using the fitEllipse technique
+*/
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+The function calculates the ellipse that fits (in a least-squares sense) a set of 2D points best of
+all. It returns the rotated rectangle in which the ellipse is inscribed. The first algorithm described by @cite Fitzgibbon95
+is used. Developer should keep in mind that it is possible that the returned
+ellipse/rotatedRect data contains negative indices, due to the data points being close to the
+border of the containing Mat element.
+
+@param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipse( InputArray points );
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+ The function calculates the ellipse that fits a set of 2D points.
+ It returns the rotated rectangle in which the ellipse is inscribed.
+ The Approximate Mean Square (AMS) proposed by @cite Taubin1991 is used.
+
+ For an ellipse, this basis set is \f$ \chi= \left(x^2, x y, y^2, x, y, 1\right) \f$,
+ which is a set of six free coefficients \f$ A^T=\left\{A_{\text{xx}},A_{\text{xy}},A_{\text{yy}},A_x,A_y,A_0\right\} \f$.
+ However, to specify an ellipse, all that is needed is five numbers; the major and minor axes lengths \f$ (a,b) \f$,
+ the position \f$ (x_0,y_0) \f$, and the orientation \f$ \theta \f$. This is because the basis set includes lines,
+ quadratics, parabolic and hyperbolic functions as well as elliptical functions as possible fits.
+ If the fit is found to be a parabolic or hyperbolic function then the standard #fitEllipse method is used.
+ The AMS method restricts the fit to parabolic, hyperbolic and elliptical curves
+ by imposing the condition that \f$ A^T ( D_x^T D_x  +   D_y^T D_y) A = 1 \f$ where
+ the matrices \f$ D_x \f$ and \f$ D_y \f$ are the partial derivatives of the design matrix \f$ D \f$ with
+ respect to x and y. The matrices are formed row by row applying the following to
+ each of the points in the set:
+ \f{align*}{
+ D(i,:)&=\left\{x_i^2, x_i y_i, y_i^2, x_i, y_i, 1\right\} &
+ D_x(i,:)&=\left\{2 x_i,y_i,0,1,0,0\right\} &
+ D_y(i,:)&=\left\{0,x_i,2 y_i,0,1,0\right\}
+ \f}
+ The AMS method minimizes the cost function
+ \f{equation*}{
+ \epsilon ^2=\frac{ A^T D^T D A }{ A^T (D_x^T D_x +  D_y^T D_y) A }
+ \f}
+
+ The minimum cost is found by solving the generalized eigenvalue problem.
+
+ \f{equation*}{
+ D^T D A = \lambda  \left( D_x^T D_x +  D_y^T D_y\right) A
+ \f}
+
+ @param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipseAMS( InputArray points );
+
+
+/** @brief Fits an ellipse around a set of 2D points.
+
+ The function calculates the ellipse that fits a set of 2D points.
+ It returns the rotated rectangle in which the ellipse is inscribed.
+ The Direct least square (Direct) method by @cite Fitzgibbon1999 is used.
+
+ For an ellipse, this basis set is \f$ \chi= \left(x^2, x y, y^2, x, y, 1\right) \f$,
+ which is a set of six free coefficients \f$ A^T=\left\{A_{\text{xx}},A_{\text{xy}},A_{\text{yy}},A_x,A_y,A_0\right\} \f$.
+ However, to specify an ellipse, all that is needed is five numbers; the major and minor axes lengths \f$ (a,b) \f$,
+ the position \f$ (x_0,y_0) \f$, and the orientation \f$ \theta \f$. This is because the basis set includes lines,
+ quadratics, parabolic and hyperbolic functions as well as elliptical functions as possible fits.
+ The Direct method confines the fit to ellipses by ensuring that \f$ 4 A_{xx} A_{yy}- A_{xy}^2 > 0 \f$.
+ The condition imposed is that \f$ 4 A_{xx} A_{yy}- A_{xy}^2=1 \f$ which satisfies the inequality
+ and as the coefficients can be arbitrarily scaled is not overly restrictive.
+
+ \f{equation*}{
+ \epsilon ^2= A^T D^T D A \quad \text{with} \quad A^T C A =1 \quad \text{and} \quad C=\left(\begin{matrix}
+ 0 & 0  & 2  & 0  & 0  &  0  \\
+ 0 & -1  & 0  & 0  & 0  &  0 \\
+ 2 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0 \\
+ 0 & 0  & 0  & 0  & 0  &  0
+ \end{matrix} \right)
+ \f}
+
+ The minimum cost is found by solving the generalized eigenvalue problem.
+
+ \f{equation*}{
+ D^T D A = \lambda  \left( C\right) A
+ \f}
+
+ The system produces only one positive eigenvalue \f$ \lambda\f$ which is chosen as the solution
+ with its eigenvector \f$\mathbf{u}\f$. These are used to find the coefficients
+
+ \f{equation*}{
+ A = \sqrt{\frac{1}{\mathbf{u}^T C \mathbf{u}}}  \mathbf{u}
+ \f}
+ The scaling factor guarantees that  \f$A^T C A =1\f$.
+
+ @param points Input 2D point set, stored in std::vector\<\> or Mat
+ */
+CV_EXPORTS_W RotatedRect fitEllipseDirect( InputArray points );
+
+/** @brief Fits a line to a 2D or 3D point set.
+
+The function fitLine fits a line to a 2D or 3D point set by minimizing \f$\sum_i \rho(r_i)\f$ where
+\f$r_i\f$ is the distance between the \f$i^{th}\f$ point and the line, and \f$\rho(r)\f$ is a distance
+function, one of the following:
+-  DIST_L2
+\f[\rho (r) = r^2/2  \quad \text{(the simplest and the fastest least-squares method)}\f]
+- DIST_L1
+\f[\rho (r) = r\f]
+- DIST_L12
+\f[\rho (r) = 2  \cdot ( \sqrt{1 + \frac{r^2}{2}} - 1)\f]
+- DIST_FAIR
+\f[\rho \left (r \right ) = C^2  \cdot \left (  \frac{r}{C} -  \log{\left(1 + \frac{r}{C}\right)} \right )  \quad \text{where} \quad C=1.3998\f]
+- DIST_WELSCH
+\f[\rho \left (r \right ) =  \frac{C^2}{2} \cdot \left ( 1 -  \exp{\left(-\left(\frac{r}{C}\right)^2\right)} \right )  \quad \text{where} \quad C=2.9846\f]
+- DIST_HUBER
+\f[\rho (r) =  \fork{r^2/2}{if \(r < C\)}{C \cdot (r-C/2)}{otherwise} \quad \text{where} \quad C=1.345\f]
+
+The algorithm is based on the M-estimator ( <http://en.wikipedia.org/wiki/M-estimator> ) technique
+that iteratively fits the line using the weighted least-squares algorithm. After each iteration the
+weights \f$w_i\f$ are adjusted to be inversely proportional to \f$\rho(r_i)\f$ .
+
+@param points Input vector of 2D or 3D points, stored in std::vector\<\> or Mat.
+@param line Output line parameters. In case of 2D fitting, it should be a vector of 4 elements
+(like Vec4f) - (vx, vy, x0, y0), where (vx, vy) is a normalized vector collinear to the line and
+(x0, y0) is a point on the line. In case of 3D fitting, it should be a vector of 6 elements (like
+Vec6f) - (vx, vy, vz, x0, y0, z0), where (vx, vy, vz) is a normalized vector collinear to the line
+and (x0, y0, z0) is a point on the line.
+@param distType Distance used by the M-estimator, see #DistanceTypes
+@param param Numerical parameter ( C ) for some types of distances. If it is 0, an optimal value
+is chosen.
+@param reps Sufficient accuracy for the radius (distance between the coordinate origin and the line).
+@param aeps Sufficient accuracy for the angle. 0.01 would be a good default value for reps and aeps.
+ */
+CV_EXPORTS_W void fitLine( InputArray points, OutputArray line, int distType,
+                           double param, double reps, double aeps );
+
+/** @brief Performs a point-in-contour test.
+
+The function determines whether the point is inside a contour, outside, or lies on an edge (or
+coincides with a vertex). It returns positive (inside), negative (outside), or zero (on an edge)
+value, correspondingly. When measureDist=false , the return value is +1, -1, and 0, respectively.
+Otherwise, the return value is a signed distance between the point and the nearest contour edge.
+
+See below a sample output of the function where each image pixel is tested against the contour:
+
+![sample output](pics/pointpolygon.png)
+
+@param contour Input contour.
+@param pt Point tested against the contour.
+@param measureDist If true, the function estimates the signed distance from the point to the
+nearest contour edge. Otherwise, the function only checks if the point is inside a contour or not.
+ */
+CV_EXPORTS_W double pointPolygonTest( InputArray contour, Point2f pt, bool measureDist );
+
+/** @brief Finds out if there is any intersection between two rotated rectangles.
+
+If there is then the vertices of the intersecting region are returned as well.
+
+Below are some examples of intersection configurations. The hatched pattern indicates the
+intersecting region and the red vertices are returned by the function.
+
+![intersection examples](pics/intersection.png)
+
+@param rect1 First rectangle
+@param rect2 Second rectangle
+@param intersectingRegion The output array of the vertices of the intersecting region. It returns
+at most 8 vertices. Stored as std::vector\<cv::Point2f\> or cv::Mat as Mx1 of type CV_32FC2.
+@returns One of #RectanglesIntersectTypes
+ */
+CV_EXPORTS_W int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& rect2, OutputArray intersectingRegion  );
+
+/** @brief Creates a smart pointer to a cv::GeneralizedHoughBallard class and initializes it.
+*/
+CV_EXPORTS_W Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
+
+/** @brief Creates a smart pointer to a cv::GeneralizedHoughGuil class and initializes it.
+*/
+CV_EXPORTS_W Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
+
+//! @} imgproc_shape
+
+//! @addtogroup imgproc_colormap
+//! @{
+
+//! Colormap identifiers (GNU Octave/MATLAB equivalent colormaps); pass one as the `colormap` argument of #applyColorMap.
+enum ColormapTypes
+{
+    COLORMAP_AUTUMN = 0, //!< ![autumn](pics/colormaps/colorscale_autumn.jpg)
+    COLORMAP_BONE = 1, //!< ![bone](pics/colormaps/colorscale_bone.jpg)
+    COLORMAP_JET = 2, //!< ![jet](pics/colormaps/colorscale_jet.jpg)
+    COLORMAP_WINTER = 3, //!< ![winter](pics/colormaps/colorscale_winter.jpg)
+    COLORMAP_RAINBOW = 4, //!< ![rainbow](pics/colormaps/colorscale_rainbow.jpg)
+    COLORMAP_OCEAN = 5, //!< ![ocean](pics/colormaps/colorscale_ocean.jpg)
+    COLORMAP_SUMMER = 6, //!< ![summer](pics/colormaps/colorscale_summer.jpg)
+    COLORMAP_SPRING = 7, //!< ![spring](pics/colormaps/colorscale_spring.jpg)
+    COLORMAP_COOL = 8, //!< ![cool](pics/colormaps/colorscale_cool.jpg)
+    COLORMAP_HSV = 9, //!< ![HSV](pics/colormaps/colorscale_hsv.jpg)
+    COLORMAP_PINK = 10, //!< ![pink](pics/colormaps/colorscale_pink.jpg)
+    COLORMAP_HOT = 11, //!< ![hot](pics/colormaps/colorscale_hot.jpg)
+    COLORMAP_PARULA = 12, //!< ![parula](pics/colormaps/colorscale_parula.jpg)
+    COLORMAP_MAGMA = 13, //!< ![magma](pics/colormaps/colorscale_magma.jpg)
+    COLORMAP_INFERNO = 14, //!< ![inferno](pics/colormaps/colorscale_inferno.jpg)
+    COLORMAP_PLASMA = 15, //!< ![plasma](pics/colormaps/colorscale_plasma.jpg)
+    COLORMAP_VIRIDIS = 16, //!< ![viridis](pics/colormaps/colorscale_viridis.jpg)
+    COLORMAP_CIVIDIS = 17, //!< ![cividis](pics/colormaps/colorscale_cividis.jpg)
+    COLORMAP_TWILIGHT = 18, //!< ![twilight](pics/colormaps/colorscale_twilight.jpg)
+    COLORMAP_TWILIGHT_SHIFTED = 19, //!< ![twilight shifted](pics/colormaps/colorscale_twilight_shifted.jpg)
+    COLORMAP_TURBO = 20, //!< ![turbo](pics/colormaps/colorscale_turbo.jpg)
+    COLORMAP_DEEPGREEN = 21  //!< ![deepgreen](pics/colormaps/colorscale_deepgreen.jpg)
+};
+
+/** @example samples/cpp/falsecolor.cpp
+An example using applyColorMap function
+*/
+
+/** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image.
+
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
+@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
+@param colormap The colormap to apply, see #ColormapTypes
+*/
+CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap);
+
+/** @brief Applies a user colormap on a given image.
+
+@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
+@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
+@param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256
+*/
+CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor);
+
+//! @} imgproc_colormap
+
+//! @addtogroup imgproc_draw
+//! @{
+
+
+/** Builds a cv::Scalar from (r, g, b) arguments, reordered to OpenCV's BGR[A] channel order; the fourth component is set to 0. */
+#define CV_RGB(r, g, b)  cv::Scalar((b), (g), (r), 0)
+
+/** @brief Draws a line segment connecting two points.
+
+The function line draws the line segment between pt1 and pt2 points in the image. The line is
+clipped by the image boundaries. For non-antialiased lines with integer coordinates, the 8-connected
+or 4-connected Bresenham algorithm is used. Thick lines are drawn with rounded endings. Antialiased
+lines are drawn using Gaussian filtering.
+
+@param img Image.
+@param pt1 First point of the line segment.
+@param pt2 Second point of the line segment.
+@param color Line color.
+@param thickness Line thickness.
+@param lineType Type of the line. See #LineTypes.
+@param shift Number of fractional bits in the point coordinates.
+ */
+CV_EXPORTS_W void line(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+                     int thickness = 1, int lineType = LINE_8, int shift = 0);
+
+/** @brief Draws an arrow segment pointing from the first point to the second one.
+
+The function cv::arrowedLine draws an arrow between pt1 and pt2 points in the image. See also #line.
+
+@param img Image.
+@param pt1 The point the arrow starts from.
+@param pt2 The point the arrow points to.
+@param color Line color.
+@param thickness Line thickness.
+@param line_type Type of the line. See #LineTypes
+@param shift Number of fractional bits in the point coordinates.
+@param tipLength The length of the arrow tip in relation to the arrow length
+ */
+CV_EXPORTS_W void arrowedLine(InputOutputArray img, Point pt1, Point pt2, const Scalar& color,
+                     int thickness=1, int line_type=8, int shift=0, double tipLength=0.1);
+
+/** @brief Draws a simple, thick, or filled up-right rectangle.
+
+The function cv::rectangle draws a rectangle outline or a filled rectangle whose two opposite corners
+are pt1 and pt2.
+
+@param img Image.
+@param pt1 Vertex of the rectangle.
+@param pt2 Vertex of the rectangle opposite to pt1 .
+@param color Rectangle color or brightness (grayscale image).
+@param thickness Thickness of lines that make up the rectangle. Negative values, like #FILLED,
+mean that the function has to draw a filled rectangle.
+@param lineType Type of the line. See #LineTypes
+@param shift Number of fractional bits in the point coordinates.
+ */
+CV_EXPORTS_W void rectangle(InputOutputArray img, Point pt1, Point pt2,
+                          const Scalar& color, int thickness = 1,
+                          int lineType = LINE_8, int shift = 0);
+
+/** @overload
+
+use `rec` parameter as alternative specification of the drawn rectangle: `rec.tl()` and
+`rec.br()-Point(1,1)` are opposite corners
+*/
+CV_EXPORTS_W void rectangle(InputOutputArray img, Rect rec,
+                          const Scalar& color, int thickness = 1,
+                          int lineType = LINE_8, int shift = 0);
+
+/** @example samples/cpp/tutorial_code/ImgProc/basic_drawing/Drawing_2.cpp
+An example using drawing functions
+*/
+
+/** @brief Draws a circle.
+
+The function cv::circle draws a simple or filled circle with a given center and radius.
+@param img Image where the circle is drawn.
+@param center Center of the circle.
+@param radius Radius of the circle.
+@param color Circle color.
+@param thickness Thickness of the circle outline, if positive. Negative values, like #FILLED,
+mean that a filled circle is to be drawn.
+@param lineType Type of the circle boundary. See #LineTypes
+@param shift Number of fractional bits in the coordinates of the center and in the radius value.
+ */
+CV_EXPORTS_W void circle(InputOutputArray img, Point center, int radius,
+                       const Scalar& color, int thickness = 1,
+                       int lineType = LINE_8, int shift = 0);
+
+/** @brief Draws a simple or thick elliptic arc or fills an ellipse sector.
+
+The function cv::ellipse with more parameters draws an ellipse outline, a filled ellipse, an elliptic
+arc, or a filled ellipse sector. The drawing code uses general parametric form.
+A piecewise-linear curve is used to approximate the elliptic arc
+boundary. If you need more control of the ellipse rendering, you can retrieve the curve using
+#ellipse2Poly and then render it with #polylines or fill it with #fillPoly. If you use the first
+variant of the function and want to draw the whole ellipse, not an arc, pass `startAngle=0` and
+`endAngle=360`. If `startAngle` is greater than `endAngle`, they are swapped. The figure below explains
+the meaning of the parameters to draw the blue arc.
+
+![Parameters of Elliptic Arc](pics/ellipse.svg)
+
+@param img Image.
+@param center Center of the ellipse.
+@param axes Half of the size of the ellipse main axes.
+@param angle Ellipse rotation angle in degrees.
+@param startAngle Starting angle of the elliptic arc in degrees.
+@param endAngle Ending angle of the elliptic arc in degrees.
+@param color Ellipse color.
+@param thickness Thickness of the ellipse arc outline, if positive. Otherwise, this indicates that
+a filled ellipse sector is to be drawn.
+@param lineType Type of the ellipse boundary. See #LineTypes
+@param shift Number of fractional bits in the coordinates of the center and values of axes.
+ */
+CV_EXPORTS_W void ellipse(InputOutputArray img, Point center, Size axes,
+                        double angle, double startAngle, double endAngle,
+                        const Scalar& color, int thickness = 1,
+                        int lineType = LINE_8, int shift = 0);
+
+/** @overload
+@param img Image.
+@param box Alternative ellipse representation via RotatedRect. This means that the function draws
+an ellipse inscribed in the rotated rectangle.
+@param color Ellipse color.
+@param thickness Thickness of the ellipse arc outline, if positive. Otherwise, this indicates that
+a filled ellipse sector is to be drawn.
+@param lineType Type of the ellipse boundary. See #LineTypes
+*/
+CV_EXPORTS_W void ellipse(InputOutputArray img, const RotatedRect& box, const Scalar& color,
+                        int thickness = 1, int lineType = LINE_8);
+
+/* ----------------------------------------------------------------------------------------- */
+/* ADDING A SET OF PREDEFINED MARKERS WHICH COULD BE USED TO HIGHLIGHT POSITIONS IN AN IMAGE */
+/* ----------------------------------------------------------------------------------------- */
+
+/** @brief Draws a marker on a predefined position in an image.
+
+The function cv::drawMarker draws a marker on a given position in the image. For the moment several
+marker types are supported, see #MarkerTypes for more information.
+
+@param img Image.
+@param position The point where the crosshair is positioned.
+@param color Line color.
+@param markerType The specific type of marker you want to use, see #MarkerTypes
+@param thickness Line thickness.
+@param line_type Type of the line, See #LineTypes
+@param markerSize The length of the marker axis [default = 20 pixels]
+ */
+CV_EXPORTS_W void drawMarker(InputOutputArray img, Point position, const Scalar& color,
+                             int markerType = MARKER_CROSS, int markerSize=20, int thickness=1,
+                             int line_type=8);
+
+/* ----------------------------------------------------------------------------------------- */
+/* END OF MARKER SECTION */
+/* ----------------------------------------------------------------------------------------- */
+
+/** @brief Fills a convex polygon.
+
+The function cv::fillConvexPoly draws a filled convex polygon. This function is much faster than the
+function #fillPoly . It can fill not only convex polygons but any monotonic polygon without
+self-intersections, that is, a polygon whose contour intersects every horizontal line (scan line)
+twice at the most (though, its top-most and/or the bottom edge could be horizontal).
+
+@param img Image.
+@param points Polygon vertices.
+@param color Polygon color.
+@param lineType Type of the polygon boundaries. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+ */
+CV_EXPORTS_W void fillConvexPoly(InputOutputArray img, InputArray points,
+                                 const Scalar& color, int lineType = LINE_8,
+                                 int shift = 0);
+
+/** @overload */
+CV_EXPORTS void fillConvexPoly(InputOutputArray img, const Point* pts, int npts,
+                               const Scalar& color, int lineType = LINE_8,
+                               int shift = 0);
+
+/** @example samples/cpp/tutorial_code/ImgProc/basic_drawing/Drawing_1.cpp
+An example using drawing functions
+Check @ref tutorial_random_generator_and_text "the corresponding tutorial" for more details
+*/
+
+/** @brief Fills the area bounded by one or more polygons.
+
+The function cv::fillPoly fills an area bounded by several polygonal contours. The function can fill
+complex areas, for example, areas with holes, contours with self-intersections (some of their
+parts), and so forth.
+
+@param img Image.
+@param pts Array of polygons where each polygon is represented as an array of points.
+@param color Polygon color.
+@param lineType Type of the polygon boundaries. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+@param offset Optional offset of all points of the contours.
+ */
+CV_EXPORTS_W void fillPoly(InputOutputArray img, InputArrayOfArrays pts,
+                           const Scalar& color, int lineType = LINE_8, int shift = 0,
+                           Point offset = Point() );
+
+/** @overload */
+CV_EXPORTS void fillPoly(InputOutputArray img, const Point** pts,
+                         const int* npts, int ncontours,
+                         const Scalar& color, int lineType = LINE_8, int shift = 0,
+                         Point offset = Point() );
+
+/** @brief Draws several polygonal curves.
+
+@param img Image.
+@param pts Array of polygonal curves.
+@param isClosed Flag indicating whether the drawn polylines are closed or not. If they are closed,
+the function draws a line from the last vertex of each curve to its first vertex.
+@param color Polyline color.
+@param thickness Thickness of the polyline edges.
+@param lineType Type of the line segments. See #LineTypes
+@param shift Number of fractional bits in the vertex coordinates.
+
+The function cv::polylines draws one or more polygonal curves.
+ */
+CV_EXPORTS_W void polylines(InputOutputArray img, InputArrayOfArrays pts,
+                            bool isClosed, const Scalar& color,
+                            int thickness = 1, int lineType = LINE_8, int shift = 0 );
+
+/** @overload */
+CV_EXPORTS void polylines(InputOutputArray img, const Point* const* pts, const int* npts,
+                          int ncontours, bool isClosed, const Scalar& color,
+                          int thickness = 1, int lineType = LINE_8, int shift = 0 );
+
+/** @example samples/cpp/contours2.cpp
+An example program illustrating the use of cv::findContours and cv::drawContours
+\image html WindowsQtContoursOutput.png "Screenshot of the program"
+*/
+
+/** @example samples/cpp/segment_objects.cpp
+An example using drawContours to clean up a background segmentation result
+*/
+
+/** @brief Draws contours outlines or filled contours.
+
+The function draws contour outlines in the image if \f$\texttt{thickness} \ge 0\f$ or fills the area
+bounded by the contours if \f$\texttt{thickness}<0\f$ . The example below shows how to retrieve
+connected components from the binary image and label them:
+@include snippets/imgproc_drawContours.cpp
+
+@param image Destination image.
+@param contours All the input contours. Each contour is stored as a point vector.
+@param contourIdx Parameter indicating a contour to draw. If it is negative, all the contours are drawn.
+@param color Color of the contours.
+@param thickness Thickness of lines the contours are drawn with. If it is negative (for example,
+thickness=#FILLED ), the contour interiors are drawn.
+@param lineType Line connectivity. See #LineTypes
+@param hierarchy Optional information about hierarchy. It is only needed if you want to draw only
+some of the contours (see maxLevel ).
+@param maxLevel Maximal level for drawn contours. If it is 0, only the specified contour is drawn.
+If it is 1, the function draws the contour(s) and all the nested contours. If it is 2, the function
+draws the contours, all the nested contours, all the nested-to-nested contours, and so on. This
+parameter is only taken into account when there is hierarchy available.
+@param offset Optional contour shift parameter. Shift all the drawn contours by the specified
+\f$\texttt{offset}=(dx,dy)\f$ .
+@note When thickness=#FILLED, the function is designed to handle connected components with holes correctly
+even when no hierarchy data is provided. This is done by analyzing all the outlines together
+using even-odd rule. This may give incorrect results if you have a joint collection of separately retrieved
+contours. In order to solve this problem, you need to call #drawContours separately for each sub-group
+of contours, or iterate over the collection using contourIdx parameter.
+ */
+CV_EXPORTS_W void drawContours( InputOutputArray image, InputArrayOfArrays contours,
+                              int contourIdx, const Scalar& color,
+                              int thickness = 1, int lineType = LINE_8,
+                              InputArray hierarchy = noArray(),
+                              int maxLevel = INT_MAX, Point offset = Point() );
+
+/** @brief Clips the line against the image rectangle.
+
+The function cv::clipLine calculates a part of the line segment that is entirely within the specified
+rectangle. It returns false if the line segment is completely outside the rectangle. Otherwise,
+it returns true .
+@param imgSize Image size. The image rectangle is Rect(0, 0, imgSize.width, imgSize.height) .
+@param pt1 First line point.
+@param pt2 Second line point.
+ */
+CV_EXPORTS bool clipLine(Size imgSize, CV_IN_OUT Point& pt1, CV_IN_OUT Point& pt2);
+
+/** @overload
+@param imgSize Image size. The image rectangle is Rect(0, 0, imgSize.width, imgSize.height) .
+@param pt1 First line point.
+@param pt2 Second line point.
+*/
+CV_EXPORTS bool clipLine(Size2l imgSize, CV_IN_OUT Point2l& pt1, CV_IN_OUT Point2l& pt2);
+
+/** @overload
+@param imgRect Image rectangle.
+@param pt1 First line point.
+@param pt2 Second line point.
+*/
+CV_EXPORTS_W bool clipLine(Rect imgRect, CV_OUT CV_IN_OUT Point& pt1, CV_OUT CV_IN_OUT Point& pt2);
+
+/** @brief Approximates an elliptic arc with a polyline.
+
+The function ellipse2Poly computes the vertices of a polyline that approximates the specified
+elliptic arc. It is used by #ellipse. If `arcStart` is greater than `arcEnd`, they are swapped.
+
+@param center Center of the arc.
+@param axes Half of the size of the ellipse main axes. See #ellipse for details.
+@param angle Rotation angle of the ellipse in degrees. See #ellipse for details.
+@param arcStart Starting angle of the elliptic arc in degrees.
+@param arcEnd Ending angle of the elliptic arc in degrees.
+@param delta Angle between the subsequent polyline vertices. It defines the approximation
+accuracy.
+@param pts Output vector of polyline vertices.
+ */
+CV_EXPORTS_W void ellipse2Poly( Point center, Size axes, int angle,
+                                int arcStart, int arcEnd, int delta,
+                                CV_OUT std::vector<Point>& pts );
+
+/** @overload
+@param center Center of the arc.
+@param axes Half of the size of the ellipse main axes. See #ellipse for details.
+@param angle Rotation angle of the ellipse in degrees. See #ellipse for details.
+@param arcStart Starting angle of the elliptic arc in degrees.
+@param arcEnd Ending angle of the elliptic arc in degrees.
+@param delta Angle between the subsequent polyline vertices. It defines the approximation accuracy.
+@param pts Output vector of polyline vertices.
+*/
+CV_EXPORTS void ellipse2Poly(Point2d center, Size2d axes, int angle,
+                             int arcStart, int arcEnd, int delta,
+                             CV_OUT std::vector<Point2d>& pts);
+
+/** @brief Draws a text string.
+
+The function cv::putText renders the specified text string in the image. Symbols that cannot be rendered
+using the specified font are replaced by question marks. See #getTextSize for a text rendering code
+example.
+
+@param img Image.
+@param text Text string to be drawn.
+@param org Bottom-left corner of the text string in the image.
+@param fontFace Font type, see #HersheyFonts.
+@param fontScale Font scale factor that is multiplied by the font-specific base size.
+@param color Text color.
+@param thickness Thickness of the lines used to draw a text.
+@param lineType Line type. See #LineTypes
+@param bottomLeftOrigin When true, the image data origin is at the bottom-left corner. Otherwise,
+it is at the top-left corner.
+ */
+CV_EXPORTS_W void putText( InputOutputArray img, const String& text, Point org,
+                         int fontFace, double fontScale, Scalar color,
+                         int thickness = 1, int lineType = LINE_8,
+                         bool bottomLeftOrigin = false );
+
+/** @brief Calculates the width and height of a text string.
+
+The function cv::getTextSize calculates and returns the size of a box that contains the specified text.
+That is, the following code renders some text, the tight box surrounding it, and the baseline:
+@code
+    String text = "Funny text inside the box";
+    int fontFace = FONT_HERSHEY_SCRIPT_SIMPLEX;
+    double fontScale = 2;
+    int thickness = 3;
+
+    Mat img(600, 800, CV_8UC3, Scalar::all(0));
+
+    int baseline=0;
+    Size textSize = getTextSize(text, fontFace,
+                                fontScale, thickness, &baseline);
+    baseline += thickness;
+
+    // center the text
+    Point textOrg((img.cols - textSize.width)/2,
+                  (img.rows + textSize.height)/2);
+
+    // draw the box
+    rectangle(img, textOrg + Point(0, baseline),
+              textOrg + Point(textSize.width, -textSize.height),
+              Scalar(0,0,255));
+    // ... and the baseline first
+    line(img, textOrg + Point(0, thickness),
+         textOrg + Point(textSize.width, thickness),
+         Scalar(0, 0, 255));
+
+    // then put the text itself
+    putText(img, text, textOrg, fontFace, fontScale,
+            Scalar::all(255), thickness, 8);
+@endcode
+
+@param text Input text string.
+@param fontFace Font to use, see #HersheyFonts.
+@param fontScale Font scale factor that is multiplied by the font-specific base size.
+@param thickness Thickness of lines used to render the text. See #putText for details.
+@param[out] baseLine y-coordinate of the baseline relative to the bottom-most text
+point.
+@return The size of a box that contains the specified text.
+
+@see putText
+ */
+CV_EXPORTS_W Size getTextSize(const String& text, int fontFace,
+                            double fontScale, int thickness,
+                            CV_OUT int* baseLine);
+
+
+/** @brief Calculates the font-specific size to use to achieve a given height in pixels.
+
+@param fontFace Font to use, see cv::HersheyFonts.
+@param pixelHeight Pixel height to compute the fontScale for
+@param thickness Thickness of lines used to render the text. See putText for details.
+@return The fontScale to use for cv::putText
+
+@see cv::putText
+*/
+CV_EXPORTS_W double getFontScaleFromHeight(const int fontFace,
+                                           const int pixelHeight,
+                                           const int thickness = 1);
+
+/** @brief Class for iterating over all pixels on a raster line segment.
+
+The class LineIterator is used to get each pixel of a raster line connecting
+two specified points.
+It can be treated as a versatile implementation of the Bresenham algorithm
+where you can stop at each pixel and do some extra processing, for
+example, grab pixel values along the line or draw a line with an effect
+(for example, with XOR operation).
+
+The number of pixels along the line is stored in LineIterator::count.
+The method LineIterator::pos returns the current position in the image:
+
+@code{.cpp}
+// grabs pixels along the line (pt1, pt2)
+// from 8-bit 3-channel image to the buffer
+LineIterator it(img, pt1, pt2, 8);
+LineIterator it2 = it;
+vector<Vec3b> buf(it.count);
+
+for(int i = 0; i < it.count; i++, ++it)
+    buf[i] = *(const Vec3b*)*it;
+
+// alternative way of iterating through the line
+for(int i = 0; i < it2.count; i++, ++it2)
+{
+    Vec3b val = img.at<Vec3b>(it2.pos());
+    CV_Assert(buf[i] == val);
+}
+@endcode
+*/
+class CV_EXPORTS LineIterator
+{
+public:
+    /** @brief Initializes iterator object for the given line and image.
+
+    The returned iterator can be used to traverse all pixels on a line that
+    connects the given two points.
+    The line will be clipped on the image boundaries.
+
+    @param img Underlying image.
+    @param pt1 First endpoint of the line.
+    @param pt2 The other endpoint of the line.
+    @param connectivity Pixel connectivity of the iterator. Valid values are 4 (iterator can move
+    up, down, left and right) and 8 (iterator can also move diagonally).
+    @param leftToRight If true, the line is traversed from the leftmost endpoint to the rightmost
+    endpoint. Otherwise, the line is traversed from \p pt1 to \p pt2.
+    */
+    LineIterator( const Mat& img, Point pt1, Point pt2,
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(&img, Rect(0, 0, img.cols, img.rows), pt1, pt2, connectivity, leftToRight);
+        ptmode = false; // image mode: operator* returns ptr
+    }
+    LineIterator( Point pt1, Point pt2, // image-less overload: bounds are the tight box spanning pt1 and pt2
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, Rect(std::min(pt1.x, pt2.x),
+                     std::min(pt1.y, pt2.y),
+                     std::max(pt1.x, pt2.x) - std::min(pt1.x, pt2.x) + 1,
+                     std::max(pt1.y, pt2.y) - std::min(pt1.y, pt2.y) + 1),
+             pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point-only mode: operator* returns 0, pos() returns p
+    }
+    LineIterator( Size boundingAreaSize, Point pt1, Point pt2, // image-less overload: bounds are Rect(0, 0, width, height)
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, Rect(0, 0, boundingAreaSize.width, boundingAreaSize.height),
+             pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point-only mode
+    }
+    LineIterator( Rect boundingAreaRect, Point pt1, Point pt2, // image-less overload: caller supplies the bounding rectangle
+                  int connectivity = 8, bool leftToRight = false )
+    {
+        init(0, boundingAreaRect, pt1, pt2, connectivity, leftToRight);
+        ptmode = true; // point-only mode
+    }
+    void init(const Mat* img, Rect boundingAreaRect, Point pt1, Point pt2, int connectivity, bool leftToRight); // shared setup used by every constructor; defined out of line
+
+    /** @brief Returns pointer to the current pixel.
+    */
+    uchar* operator *();
+
+    /** @brief Moves iterator to the next pixel on the line.
+
+    This is the prefix version (++it).
+    */
+    LineIterator& operator ++();
+
+    /** @brief Moves iterator to the next pixel on the line.
+
+    This is the postfix version (it++).
+    */
+    LineIterator operator ++(int);
+
+    /** @brief Returns coordinates of the current pixel.
+    */
+    Point pos() const;
+
+    uchar* ptr;                 // current pixel pointer; advanced by operator++ in image mode
+    const uchar* ptr0;          // base pointer; pos() measures the byte offset of ptr from it
+    int step, elemSize;         // pos() divides the byte offset by step (-> y) and the remainder by elemSize (-> x)
+    int err, count;             // err: running error term, its sign selects the step in operator++; count: number of pixels on the line
+    int minusDelta, plusDelta;  // increments applied to err by operator++ (plusDelta only when err < 0)
+    int minusStep, plusStep;    // increments applied to ptr (image mode) or p.y (point-only mode)
+    int minusShift, plusShift;  // increments applied to p.x in point-only mode
+    Point p;                    // current position; updated by operator++ only in point-only mode
+    bool ptmode;                // true when constructed without an image
+};
+
+//! @cond IGNORED
+
+// === LineIterator implementation ===
+
+inline
+uchar* LineIterator::operator *()
+{
+    return !ptmode ? ptr : 0; // point-only mode has no pixel to point at
+}
+
+inline
+LineIterator& LineIterator::operator ++()
+{
+    // all-ones while the error term is negative, zero otherwise
+    const int negMask = err < 0 ? -1 : 0;
+    err += minusDelta + (plusDelta & negMask);
+    const int stepDelta = minusStep + (plusStep & negMask);
+    if(ptmode)
+    {
+        p.x += minusShift + (plusShift & negMask);
+        p.y += stepDelta;
+    }
+    else
+        ptr += stepDelta;
+    return *this;
+}
+
+inline
+LineIterator LineIterator::operator ++(int)
+{
+    LineIterator snapshot(*this); // state handed back to the caller
+    this->operator ++();
+    return snapshot;
+}
+
+inline
+Point LineIterator::pos() const
+{
+    // point-only mode tracks the position directly
+    if(ptmode)
+        return p;
+    // otherwise recover (x, y) from the byte distance between ptr and ptr0
+    const size_t byteOffset = (size_t)(ptr - ptr0);
+    const int row = (int)(byteOffset/step);
+    const int col = (int)((byteOffset - (size_t)row*step)/elemSize);
+    return Point(col, row);
+}
+
+//! @endcond
+
+//! @} imgproc_draw
+
+//! @} imgproc
+
+} // cv
+
+
+#include "./imgproc/segmentation.hpp"
+
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/bindings.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/bindings.hpp
new file mode 100644
index 0000000..c69527a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/bindings.hpp
@@ -0,0 +1,34 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_IMGPROC_BINDINGS_HPP
+#define OPENCV_IMGPROC_BINDINGS_HPP
+
+// This file contains special overloads for OpenCV bindings
+// No need to use these functions in C++ code.
+
+namespace cv {
+
+/** @brief Finds lines in a binary image using the standard Hough transform and get accumulator.
+ *
+ * @note This function is for bindings use only. Use original function in C++ code
+ *
+ * @sa HoughLines
+ */
+CV_WRAP static inline
+void HoughLinesWithAccumulator(
+        InputArray image, OutputArray lines,
+        double rho, double theta, int threshold,
+        double srn = 0, double stn = 0,
+        double min_theta = 0, double max_theta = CV_PI
+)
+{
+    std::vector<Vec3f> accLines; // three floats per detected line
+    HoughLines(image, accLines, rho, theta, threshold, srn, stn, min_theta, max_theta);
+    Mat(accLines).copyTo(lines); // hand the result to the caller's output array
+}
+
+}  // namespace
+
+#endif  // OPENCV_IMGPROC_BINDINGS_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/detail/gcgraph.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/detail/gcgraph.hpp
new file mode 100644
index 0000000..f17c6e7
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/detail/gcgraph.hpp
@@ -0,0 +1,395 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
+#define OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
+
+//! @cond IGNORED
+
+namespace cv { namespace detail {
+template <class TWeight> class GCGraph // graph on which maxFlow() is computed; TWeight is the edge/terminal weight type
+{
+public:
+    GCGraph(); // empty graph, flow = 0
+    GCGraph( unsigned int vtxCount, unsigned int edgeCount ); // forwards to create()
+    ~GCGraph();
+    void create( unsigned int vtxCount, unsigned int edgeCount ); // reserves storage and resets flow; does not clear existing vertices/edges
+    int addVtx(); // appends a zero-filled vertex and returns its index
+    void addEdges( int i, int j, TWeight w, TWeight revw ); // adds edge i->j (weight w) and its paired edge j->i (weight revw)
+    void addTermWeights( int i, TWeight sourceW, TWeight sinkW ); // folds source/sink weights into vertex i's signed weight and into flow
+    TWeight maxFlow(); // runs the search/augment/restore loop and returns the accumulated flow
+    bool inSourceSegment( int i ); // true when vertex i has t == 0
+private:
+    class Vtx
+    {
+    public:
+        Vtx *next; // initialized and used in maxFlow() only
+        int parent; // edge index leading to the parent; 0 = none, negative = TERMINAL/ORPHAN sentinels of maxFlow()
+        int first; // index of the first edge in this vertex's edge list, 0 = none
+        int ts; // timestamp compared against curr_ts in maxFlow()
+        int dist; // distance to the tree root as maintained by maxFlow()
+        TWeight weight; // signed terminal weight: sourceW - sinkW (see addTermWeights)
+        uchar t; // tree flag; maxFlow() sets it to (weight < 0), 0 is reported as source by inSourceSegment()
+    };
+    class Edge
+    {
+    public:
+        int dst; // index of the destination vertex
+        int next; // index of the next edge in the owning vertex's list, 0 = end of list
+        TWeight weight; // remaining weight; decreased/increased along augmenting paths in maxFlow()
+    };
+
+    std::vector<Vtx> vtcs; // all vertices, indexed by the value addVtx() returned
+    std::vector<Edge> edges; // all edges; addEdges() keeps slots 0 and 1 unused and appends edges in pairs
+    TWeight flow; // running total returned by maxFlow()
+};
+
+template <class TWeight>
+GCGraph<TWeight>::GCGraph()
+    : flow(0)
+{
+}
+template <class TWeight>
+GCGraph<TWeight>::GCGraph( unsigned int vtxCount, unsigned int edgeCount )
+{
+    this->create( vtxCount, edgeCount ); // reserve storage and zero the flow
+}
+template <class TWeight>
+GCGraph<TWeight>::~GCGraph()
+{
+}
+template <class TWeight>
+void GCGraph<TWeight>::create( unsigned int vtxCount, unsigned int edgeCount )
+{
+    vtcs.reserve( vtxCount );
+    edges.reserve( 2 + edgeCount ); // two extra slots for the placeholder entries addEdges() inserts
+    flow = 0;
+}
+
+template <class TWeight>
+int GCGraph<TWeight>::addVtx()
+{
+    Vtx blank;
+    memset( &blank, 0, sizeof(blank) ); // every field of the new vertex starts at zero
+    vtcs.push_back(blank);
+    return static_cast<int>(vtcs.size()) - 1; // index of the vertex just appended
+}
+
+template <class TWeight>
+void GCGraph<TWeight>::addEdges( int i, int j, TWeight w, TWeight revw )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+    CV_Assert( j>=0 && j<(int)vtcs.size() );
+    CV_Assert( w>=0 && revw>=0 );
+    CV_Assert( i != j );
+
+    if( edges.empty() )
+        edges.resize( 2 ); // slots 0/1 stay unused: index 0 ends edge lists, and pairs start at an even index
+
+    Edge forward, backward;
+    forward.dst = j;
+    forward.weight = w;
+    forward.next = vtcs[i].first; // push onto the front of i's edge list
+    vtcs[i].first = (int)edges.size();
+    edges.push_back( forward );
+
+    backward.dst = i;
+    backward.weight = revw;
+    backward.next = vtcs[j].first; // push onto the front of j's edge list
+    vtcs[j].first = (int)edges.size();
+    edges.push_back( backward );
+}
+
+template <class TWeight>
+void GCGraph<TWeight>::addTermWeights( int i, TWeight sourceW, TWeight sinkW )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+
+    Vtx& vtx = vtcs[i];
+    if( vtx.weight > 0 )
+        sourceW += vtx.weight; // merge the previously stored positive balance
+    else
+        sinkW -= vtx.weight; // merge the previously stored non-positive balance
+    flow += (sinkW > sourceW) ? sourceW : sinkW;
+    vtx.weight = sourceW - sinkW;
+}
+
+template <class TWeight>
+TWeight GCGraph<TWeight>::maxFlow()
+{
+    CV_Assert(!vtcs.empty());
+    CV_Assert(!edges.empty());
+    const int TERMINAL = -1, ORPHAN = -2; // negative sentinels stored in Vtx::parent
+    Vtx stub, *nilNode = &stub, *first = nilNode, *last = nilNode; // stub terminates the active list linked through Vtx::next
+    int curr_ts = 0;
+    stub.next = nilNode;
+    Vtx *vtxPtr = &vtcs[0];
+    Edge *edgePtr = &edges[0];
+
+    std::vector<Vtx*> orphans;
+
+    // initialize the active queue and the graph vertices
+    for( int i = 0; i < (int)vtcs.size(); i++ )
+    {
+        Vtx* v = vtxPtr + i;
+        v->ts = 0;
+        if( v->weight != 0 )
+        {
+            last = last->next = v;
+            v->dist = 1;
+            v->parent = TERMINAL;
+            v->t = v->weight < 0; // t == 0 is what inSourceSegment() reports as source
+        }
+        else
+            v->parent = 0; // 0 = no parent yet
+    }
+    first = first->next;
+    last->next = nilNode;
+    nilNode->next = 0;
+
+    // run the search-path -> augment-graph -> restore-trees loop
+    for(;;)
+    {
+        Vtx* v, *u;
+        int e0 = -1, ei = 0, ej = 0;
+        TWeight minWeight, weight;
+        uchar vt;
+
+        // grow S & T search trees, find an edge connecting them
+        while( first != nilNode )
+        {
+            v = first;
+            if( v->parent )
+            {
+                vt = v->t;
+                for( ei = v->first; ei != 0; ei = edgePtr[ei].next )
+                {
+                    if( edgePtr[ei^vt].weight == 0 ) // checks edge ei when t == 0, its pair ei^1 when t == 1
+                        continue;
+                    u = vtxPtr+edgePtr[ei].dst;
+                    if( !u->parent )
+                    {
+                        u->t = vt;
+                        u->parent = ei ^ 1; // paired edge, whose dst is v
+                        u->ts = v->ts;
+                        u->dist = v->dist + 1;
+                        if( !u->next )
+                        {
+                            u->next = nilNode;
+                            last = last->next = u;
+                        }
+                        continue;
+                    }
+
+                    if( u->t != vt )
+                    {
+                        e0 = ei ^ vt;
+                        break;
+                    }
+
+                    if( u->dist > v->dist+1 && u->ts <= v->ts )
+                    {
+                        // reassign the parent
+                        u->parent = ei ^ 1;
+                        u->ts = v->ts;
+                        u->dist = v->dist + 1;
+                    }
+                }
+                if( e0 > 0 )
+                    break;
+            }
+            // exclude the vertex from the active list
+            first = first->next;
+            v->next = 0;
+        }
+
+        if( e0 <= 0 ) // active list exhausted without finding a connecting edge: finished
+            break;
+
+        // find the minimum edge weight along the path
+        minWeight = edgePtr[e0].weight;
+        CV_Assert( minWeight > 0 );
+        // k = 1: source tree, k = 0: destination tree
+        for( int k = 1; k >= 0; k-- )
+        {
+            for( v = vtxPtr+edgePtr[e0^k].dst;; v = vtxPtr+edgePtr[ei].dst )
+            {
+                if( (ei = v->parent) < 0 ) // negative parent (sentinel) ends the walk
+                    break;
+                weight = edgePtr[ei^k].weight;
+                minWeight = MIN(minWeight, weight);
+                CV_Assert( minWeight > 0 );
+            }
+            weight = fabs(v->weight);
+            minWeight = MIN(minWeight, weight);
+            CV_Assert( minWeight > 0 );
+        }
+
+        // modify weights of the edges along the path and collect orphans
+        edgePtr[e0].weight -= minWeight;
+        edgePtr[e0^1].weight += minWeight;
+        flow += minWeight;
+
+        // k = 1: source tree, k = 0: destination tree
+        for( int k = 1; k >= 0; k-- )
+        {
+            for( v = vtxPtr+edgePtr[e0^k].dst;; v = vtxPtr+edgePtr[ei].dst )
+            {
+                if( (ei = v->parent) < 0 )
+                    break;
+                edgePtr[ei^(k^1)].weight += minWeight;
+                if( (edgePtr[ei^k].weight -= minWeight) == 0 )
+                {
+                    orphans.push_back(v);
+                    v->parent = ORPHAN;
+                }
+            }
+
+            v->weight = v->weight + minWeight*(1-k*2); // k = 1 subtracts minWeight, k = 0 adds it
+            if( v->weight == 0 )
+            {
+               orphans.push_back(v);
+               v->parent = ORPHAN;
+            }
+        }
+
+        // restore the search trees by finding new parents for the orphans
+        curr_ts++;
+        while( !orphans.empty() )
+        {
+            Vtx* v2 = orphans.back();
+            orphans.pop_back();
+
+            int d, minDist = INT_MAX;
+            e0 = 0;
+            vt = v2->t;
+
+            for( ei = v2->first; ei != 0; ei = edgePtr[ei].next )
+            {
+                if( edgePtr[ei^(vt^1)].weight == 0 )
+                    continue;
+                u = vtxPtr+edgePtr[ei].dst;
+                if( u->t != vt || u->parent == 0 )
+                    continue;
+                // compute the distance to the tree root
+                for( d = 0;; )
+                {
+                    if( u->ts == curr_ts )
+                    {
+                        d += u->dist;
+                        break;
+                    }
+                    ej = u->parent;
+                    d++;
+                    if( ej < 0 )
+                    {
+                        if( ej == ORPHAN )
+                            d = INT_MAX-1; // ++d below then reaches INT_MAX and this candidate is rejected
+                        else
+                        {
+                            u->ts = curr_ts;
+                            u->dist = 1;
+                        }
+                        break;
+                    }
+                    u = vtxPtr+edgePtr[ej].dst;
+                }
+
+                // update the distance
+                if( ++d < INT_MAX )
+                {
+                    if( d < minDist )
+                    {
+                        minDist = d;
+                        e0 = ei;
+                    }
+                    for( u = vtxPtr+edgePtr[ei].dst; u->ts != curr_ts; u = vtxPtr+edgePtr[u->parent].dst )
+                    {
+                        u->ts = curr_ts;
+                        u->dist = --d;
+                    }
+                }
+            }
+
+            if( (v2->parent = e0) > 0 ) // a new parent edge was found: v2 is no longer an orphan
+            {
+                v2->ts = curr_ts;
+                v2->dist = minDist;
+                continue;
+            }
+
+            /* no parent is found */
+            v2->ts = 0;
+            for( ei = v2->first; ei != 0; ei = edgePtr[ei].next )
+            {
+                u = vtxPtr+edgePtr[ei].dst;
+                ej = u->parent;
+                if( u->t != vt || !ej )
+                    continue;
+                if( edgePtr[ei^(vt^1)].weight && !u->next )
+                {
+                    u->next = nilNode; // put u back on the active list
+                    last = last->next = u;
+                }
+                if( ej > 0 && vtxPtr+edgePtr[ej].dst == v2 ) // u's parent edge leads to v2, so u becomes an orphan too
+                {
+                    orphans.push_back(u);
+                    u->parent = ORPHAN;
+                }
+            }
+        }
+    }
+    return flow;
+}
+
+template <class TWeight>
+bool GCGraph<TWeight>::inSourceSegment( int i )
+{
+    CV_Assert( i>=0 && i<(int)vtcs.size() );
+    return !vtcs[i].t; // a zero tree flag means the source side
+}
+
+}} // namespace detail, cv
+
+
+//! @endcond
+
+#endif  // OPENCV_IMGPROC_DETAIL_GCGRAPH_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/hal.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/hal.hpp
new file mode 100644
index 0000000..f129012
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/hal.hpp
@@ -0,0 +1,246 @@
+#ifndef CV_IMGPROC_HAL_HPP
+#define CV_IMGPROC_HAL_HPP
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/cvstd.hpp"
+#include "opencv2/core/hal/interface.h"
+
+namespace cv { namespace hal {
+
+//! @addtogroup imgproc_hal_functions
+//! @{
+
+//---------------------------
+//! @cond IGNORED
+
+struct CV_EXPORTS Filter2D // abstract interface: apply() is pure virtual, create() is marked CV_DEPRECATED
+{
+    CV_DEPRECATED static Ptr<hal::Filter2D> create(uchar * , size_t , int ,
+                                     int , int ,
+                                     int , int ,
+                                     int , int ,
+                                     int , double ,
+                                     int , int ,
+                                     bool , bool ); // parameters are unnamed in this header
+    virtual void apply(uchar * , size_t ,
+                       uchar * , size_t ,
+                       int , int ,
+                       int , int ,
+                       int , int ) = 0; // must be provided by the concrete implementation
+    virtual ~Filter2D() {} // virtual, empty destructor
+};
+
+struct CV_EXPORTS SepFilter2D // abstract interface: apply() is pure virtual, create() is marked CV_DEPRECATED
+{
+    CV_DEPRECATED static Ptr<hal::SepFilter2D> create(int , int , int ,
+                                        uchar * , int ,
+                                        uchar * , int ,
+                                        int , int ,
+                                        double , int ); // parameters are unnamed in this header
+    virtual void apply(uchar * , size_t ,
+                       uchar * , size_t ,
+                       int , int ,
+                       int , int ,
+                       int , int ) = 0; // must be provided by the concrete implementation
+    virtual ~SepFilter2D() {} // virtual, empty destructor
+};
+
+
+struct CV_EXPORTS Morph // abstract interface: apply() is pure virtual, create() is marked CV_DEPRECATED
+{
+    CV_DEPRECATED static Ptr<hal::Morph> create(int , int , int , int , int ,
+                                    int , uchar * , size_t ,
+                                    int , int ,
+                                    int , int ,
+                                    int , const double *,
+                                    int , bool , bool ); // parameters are unnamed in this header
+    virtual void apply(uchar * , size_t , uchar * , size_t , int , int ,
+                       int , int , int , int ,
+                       int , int , int , int ) = 0; // must be provided by the concrete implementation
+    virtual ~Morph() {} // virtual, empty destructor
+};
+
+//! @endcond
+//---------------------------
+
+CV_EXPORTS void filter2D(int stype, int dtype, int kernel_type,
+                         uchar * src_data, size_t src_step,
+                         uchar * dst_data, size_t dst_step,
+                         int width, int height,
+                         int full_width, int full_height,
+                         int offset_x, int offset_y,
+                         uchar * kernel_data, size_t kernel_step,
+                         int kernel_width, int kernel_height,
+                         int anchor_x, int anchor_y,
+                         double delta, int borderType,
+                         bool isSubmatrix);
+
+CV_EXPORTS void sepFilter2D(int stype, int dtype, int ktype,
+                            uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int full_width, int full_height,
+                            int offset_x, int offset_y,
+                            uchar * kernelx_data, int kernelx_len,
+                            uchar * kernely_data, int kernely_len,
+                            int anchor_x, int anchor_y,
+                            double delta, int borderType);
+
+CV_EXPORTS void morph(int op, int src_type, int dst_type,
+                      uchar * src_data, size_t src_step,
+                      uchar * dst_data, size_t dst_step,
+                      int width, int height,
+                      int roi_width, int roi_height, int roi_x, int roi_y,
+                      int roi_width2, int roi_height2, int roi_x2, int roi_y2,
+                      int kernel_type, uchar * kernel_data, size_t kernel_step,
+                      int kernel_width, int kernel_height, int anchor_x, int anchor_y,
+                      int borderType, const double borderValue[4],
+                      int iterations, bool isSubmatrix);
+
+
+CV_EXPORTS void resize(int src_type,
+                       const uchar * src_data, size_t src_step, int src_width, int src_height,
+                       uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                       double inv_scale_x, double inv_scale_y, int interpolation);
+
+CV_EXPORTS void warpAffine(int src_type,
+                           const uchar * src_data, size_t src_step, int src_width, int src_height,
+                           uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                           const double M[6], int interpolation, int borderType, const double borderValue[4]);
+
+CV_EXPORTS void warpPerspective(int src_type,
+                               const uchar * src_data, size_t src_step, int src_width, int src_height,
+                               uchar * dst_data, size_t dst_step, int dst_width, int dst_height,
+                               const double M[9], int interpolation, int borderType, const double borderValue[4]);
+
+CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, int dcn, bool swapBlue);
+
+CV_EXPORTS void cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step,
+                               uchar * dst_data, size_t dst_step,
+                               int width, int height,
+                               int scn, bool swapBlue, int greenBits);
+
+CV_EXPORTS void cvtBGR5x5toBGR(const uchar * src_data, size_t src_step,
+                               uchar * dst_data, size_t dst_step,
+                               int width, int height,
+                               int dcn, bool swapBlue, int greenBits);
+
+CV_EXPORTS void cvtBGRtoGray(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height,
+                             int depth, int scn, bool swapBlue);
+
+CV_EXPORTS void cvtGraytoBGR(const uchar * src_data, size_t src_step,
+                             uchar * dst_data, size_t dst_step,
+                             int width, int height,
+                             int depth, int dcn);
+
+CV_EXPORTS void cvtBGR5x5toGray(const uchar * src_data, size_t src_step,
+                                uchar * dst_data, size_t dst_step,
+                                int width, int height,
+                                int greenBits);
+
+CV_EXPORTS void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step,
+                                uchar * dst_data, size_t dst_step,
+                                int width, int height,
+                                int greenBits);
+CV_EXPORTS void cvtBGRtoYUV(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isCbCr);
+
+CV_EXPORTS void cvtYUVtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isCbCr);
+
+CV_EXPORTS void cvtBGRtoXYZ(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue);
+
+CV_EXPORTS void cvtXYZtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue);
+
+CV_EXPORTS void cvtBGRtoHSV(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV);
+
+CV_EXPORTS void cvtHSVtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV);
+
+CV_EXPORTS void cvtBGRtoLab(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int scn, bool swapBlue, bool isLab, bool srgb);
+
+CV_EXPORTS void cvtLabtoBGR(const uchar * src_data, size_t src_step,
+                            uchar * dst_data, size_t dst_step,
+                            int width, int height,
+                            int depth, int dcn, bool swapBlue, bool isLab, bool srgb);
+
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+//! Separate Y and UV planes
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int dst_width, int dst_height,
+                                    int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                      uchar * dst_data, size_t dst_step,
+                                      int dst_width, int dst_height,
+                                      int dcn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step,
+                                      uchar * dst_data, size_t dst_step,
+                                      int width, int height,
+                                      int scn, bool swapBlue, int uIdx);
+
+//! Separate Y and UV planes
+CV_EXPORTS void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
+                                    uchar * y_data, uchar * uv_data, size_t dst_step,
+                                    int width, int height,
+                                    int scn, bool swapBlue, int uIdx);
+
+CV_EXPORTS void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step,
+                                    uchar * dst_data, size_t dst_step,
+                                    int width, int height,
+                                    int dcn, bool swapBlue, int uIdx, int ycn);
+
+CV_EXPORTS void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step,
+                                        uchar * dst_data, size_t dst_step,
+                                        int width, int height);
+
+CV_EXPORTS void cvtMultipliedRGBAtoRGBA(const uchar * src_data, size_t src_step,
+                                        uchar * dst_data, size_t dst_step,
+                                        int width, int height);
+
+CV_EXPORTS void integral(int depth, int sdepth, int sqdepth,
+                         const uchar* src, size_t srcstep,
+                         uchar* sum, size_t sumstep,
+                         uchar* sqsum, size_t sqsumstep,
+                         uchar* tilted, size_t tstep,
+                         int width, int height, int cn);
+
+//! @}
+
+}}
+
+#endif // CV_IMGPROC_HAL_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/interface.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/interface.h
new file mode 100644
index 0000000..f8dbcfe
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/hal/interface.h
@@ -0,0 +1,46 @@
+#ifndef OPENCV_IMGPROC_HAL_INTERFACE_H
+#define OPENCV_IMGPROC_HAL_INTERFACE_H
+
+//! @addtogroup imgproc_hal_interface
+//! @{
+
+//! @name Interpolation modes
+//! @sa cv::InterpolationFlags
+//! @{
+#define CV_HAL_INTER_NEAREST 0
+#define CV_HAL_INTER_LINEAR 1
+#define CV_HAL_INTER_CUBIC 2
+#define CV_HAL_INTER_AREA 3
+#define CV_HAL_INTER_LANCZOS4 4
+//! @}
+
+//! @name Morphology operations
+//! @sa cv::MorphTypes
+//! @{
+#define CV_HAL_MORPH_ERODE 0
+#define CV_HAL_MORPH_DILATE 1
+//! @}
+
+//! @name Threshold types
+//! @sa cv::ThresholdTypes
+//! @{
+#define CV_HAL_THRESH_BINARY      0
+#define CV_HAL_THRESH_BINARY_INV  1
+#define CV_HAL_THRESH_TRUNC       2
+#define CV_HAL_THRESH_TOZERO      3
+#define CV_HAL_THRESH_TOZERO_INV  4
+#define CV_HAL_THRESH_MASK        7
+#define CV_HAL_THRESH_OTSU        8
+#define CV_HAL_THRESH_TRIANGLE    16
+//! @}
+
+//! @name Adaptive threshold algorithm
+//! @sa cv::AdaptiveThresholdTypes
+//! @{
+#define CV_HAL_ADAPTIVE_THRESH_MEAN_C     0
+#define CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C 1
+//! @}
+
+//! @}
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc.hpp
new file mode 100644
index 0000000..4175bd0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/imgproc.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc_c.h
new file mode 100644
index 0000000..86dc119
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/imgproc_c.h
@@ -0,0 +1,1177 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_IMGPROC_C_H
+#define OPENCV_IMGPROC_IMGPROC_C_H
+
+#include "opencv2/imgproc/types_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup imgproc_c
+@{
+*/
+
+/*********************** Background statistics accumulation *****************************/
+
+/** @brief Adds image to accumulator
+@see cv::accumulate
+*/
+CVAPI(void)  cvAcc( const CvArr* image, CvArr* sum,
+                   const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds squared image to accumulator
+@see cv::accumulateSquare
+*/
+CVAPI(void)  cvSquareAcc( const CvArr* image, CvArr* sqsum,
+                         const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds a product of two images to accumulator
+@see cv::accumulateProduct
+*/
+CVAPI(void)  cvMultiplyAcc( const CvArr* image1, const CvArr* image2, CvArr* acc,
+                           const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @brief Adds image to accumulator with weights: acc = acc*(1-alpha) + image*alpha
+@see cv::accumulateWeighted
+*/
+CVAPI(void)  cvRunningAvg( const CvArr* image, CvArr* acc, double alpha,
+                          const CvArr* mask CV_DEFAULT(NULL) );
+
+/****************************************************************************************\
+*                                    Image Processing                                    *
+\****************************************************************************************/
+
+/** @brief Copies the source 2D array into the interior of a larger destination array and
+   makes a border of the specified type (IPL_BORDER_*) around the copied area. */
+CVAPI(void) cvCopyMakeBorder( const CvArr* src, CvArr* dst, CvPoint offset,
+                              int bordertype, CvScalar value CV_DEFAULT(cvScalarAll(0)));
+
+/** @brief Smooths the image in one of several ways.
+
+@param src The source image
+@param dst The destination image
+@param smoothtype Type of the smoothing, see SmoothMethod_c
+@param size1 The first parameter of the smoothing operation, the aperture width. Must be a
+positive odd number (1, 3, 5, ...)
+@param size2 The second parameter of the smoothing operation, the aperture height. Ignored by
+CV_MEDIAN and CV_BILATERAL methods. In the case of simple scaled/non-scaled and Gaussian blur if
+size2 is zero, it is set to size1. Otherwise it must be a positive odd number.
+@param sigma1 In the case of a Gaussian parameter this parameter may specify Gaussian \f$\sigma\f$
+(standard deviation). If it is zero, it is calculated from the kernel size:
+\f[\sigma  = 0.3 (n/2 - 1) + 0.8  \quad   \text{where}   \quad  n= \begin{array}{l l} \mbox{\texttt{size1} for horizontal kernel} \\ \mbox{\texttt{size2} for vertical kernel} \end{array}\f]
+Using standard sigma for small kernels ( \f$3\times 3\f$ to \f$7\times 7\f$ ) gives better speed. If
+sigma1 is not zero, while size1 and size2 are zeros, the kernel size is calculated from the
+sigma (to provide accurate enough operation).
+@param sigma2 additional parameter for bilateral filtering
+
+@see cv::GaussianBlur, cv::blur, cv::medianBlur, cv::bilateralFilter.
+ */
+CVAPI(void) cvSmooth( const CvArr* src, CvArr* dst,
+                      int smoothtype CV_DEFAULT(CV_GAUSSIAN),
+                      int size1 CV_DEFAULT(3),
+                      int size2 CV_DEFAULT(0),
+                      double sigma1 CV_DEFAULT(0),
+                      double sigma2 CV_DEFAULT(0));
+
+/** @brief Convolves an image with the kernel.
+
+@param src input image.
+@param dst output image of the same size and the same number of channels as src.
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+
+@see cv::filter2D
+ */
+CVAPI(void) cvFilter2D( const CvArr* src, CvArr* dst, const CvMat* kernel,
+                        CvPoint anchor CV_DEFAULT(cvPoint(-1,-1)));
+
+/** @brief Finds integral image: SUM(X,Y) = sum(x<X,y<Y)I(x,y)
+@see cv::integral
+*/
+CVAPI(void) cvIntegral( const CvArr* image, CvArr* sum,
+                       CvArr* sqsum CV_DEFAULT(NULL),
+                       CvArr* tilted_sum CV_DEFAULT(NULL));
+
+/** @brief Smooths the input image with a Gaussian kernel and then down-samples it.
+
+   dst_width = floor(src_width/2)[+1],
+   dst_height = floor(src_height/2)[+1]
+   @see cv::pyrDown
+*/
+CVAPI(void)  cvPyrDown( const CvArr* src, CvArr* dst,
+                        int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Up-samples the image and smooths the result with a Gaussian kernel.
+
+   dst_width = src_width*2,
+   dst_height = src_height*2
+   @see cv::pyrUp
+*/
+CVAPI(void)  cvPyrUp( const CvArr* src, CvArr* dst,
+                      int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Builds pyramid for an image
+@see buildPyramid
+*/
+CVAPI(CvMat**) cvCreatePyramid( const CvArr* img, int extra_layers, double rate,
+                                const CvSize* layer_sizes CV_DEFAULT(0),
+                                CvArr* bufarr CV_DEFAULT(0),
+                                int calc CV_DEFAULT(1),
+                                int filter CV_DEFAULT(CV_GAUSSIAN_5x5) );
+
+/** @brief Releases pyramid */
+CVAPI(void)  cvReleasePyramid( CvMat*** pyramid, int extra_layers );
+
+
+/** @brief Filters image using meanshift algorithm
+@see cv::pyrMeanShiftFiltering
+*/
+CVAPI(void) cvPyrMeanShiftFiltering( const CvArr* src, CvArr* dst,
+    double sp, double sr, int max_level CV_DEFAULT(1),
+    CvTermCriteria termcrit CV_DEFAULT(cvTermCriteria(CV_TERMCRIT_ITER+CV_TERMCRIT_EPS,5,1)));
+
+/** @brief Segments image using seed "markers"
+@see cv::watershed
+*/
+CVAPI(void) cvWatershed( const CvArr* image, CvArr* markers );
+
+/** @brief Calculates an image derivative using generalized Sobel
+
+   (aperture_size = 1,3,5,7) or Scharr (aperture_size = -1) operator.
+   Scharr can be used only for the first dx or dy derivative
+@see cv::Sobel
+*/
+CVAPI(void) cvSobel( const CvArr* src, CvArr* dst,
+                    int xorder, int yorder,
+                    int aperture_size CV_DEFAULT(3));
+
+/** @brief Calculates the image Laplacian: (d2/dx2 + d2/dy2)I
+@see cv::Laplacian
+*/
+CVAPI(void) cvLaplace( const CvArr* src, CvArr* dst,
+                      int aperture_size CV_DEFAULT(3) );
+
+/** @brief Converts input array pixels from one color space to another
+@see cv::cvtColor
+*/
+CVAPI(void)  cvCvtColor( const CvArr* src, CvArr* dst, int code );
+
+
+/** @brief Resizes image (input array is resized to fit the destination array)
+@see cv::resize
+*/
+CVAPI(void)  cvResize( const CvArr* src, CvArr* dst,
+                       int interpolation CV_DEFAULT( CV_INTER_LINEAR ));
+
+/** @brief Warps image with affine transform
+@note ::cvGetQuadrangleSubPix is similar to ::cvWarpAffine, but the outliers are extrapolated using
+replication border mode.
+@see cv::warpAffine
+*/
+CVAPI(void)  cvWarpAffine( const CvArr* src, CvArr* dst, const CvMat* map_matrix,
+                           int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                           CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Computes affine transform matrix for mapping src[i] to dst[i] (i=0,1,2)
+@see cv::getAffineTransform
+*/
+CVAPI(CvMat*) cvGetAffineTransform( const CvPoint2D32f * src,
+                                    const CvPoint2D32f * dst,
+                                    CvMat * map_matrix );
+
+/** @brief Computes the 2D rotation matrix for the given center, angle and scale
+@see cv::getRotationMatrix2D
+*/
+CVAPI(CvMat*)  cv2DRotationMatrix( CvPoint2D32f center, double angle,
+                                   double scale, CvMat* map_matrix );
+
+/** @brief Warps image with perspective (projective) transform
+@see cv::warpPerspective
+*/
+CVAPI(void)  cvWarpPerspective( const CvArr* src, CvArr* dst, const CvMat* map_matrix,
+                                int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                                CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Computes perspective transform matrix for mapping src[i] to dst[i] (i=0,1,2,3)
+@see cv::getPerspectiveTransform
+*/
+CVAPI(CvMat*) cvGetPerspectiveTransform( const CvPoint2D32f* src,
+                                         const CvPoint2D32f* dst,
+                                         CvMat* map_matrix );
+
+/** @brief Performs generic geometric transformation using the specified coordinate maps
+@see cv::remap
+*/
+CVAPI(void)  cvRemap( const CvArr* src, CvArr* dst,
+                      const CvArr* mapx, const CvArr* mapy,
+                      int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS),
+                      CvScalar fillval CV_DEFAULT(cvScalarAll(0)) );
+
+/** @brief Converts mapx & mapy from floating-point to integer formats for cvRemap
+@see cv::convertMaps
+*/
+CVAPI(void)  cvConvertMaps( const CvArr* mapx, const CvArr* mapy,
+                            CvArr* mapxy, CvArr* mapalpha );
+
+/** @brief Performs forward or inverse log-polar image transform
+@see cv::warpPolar
+*/
+CVAPI(void)  cvLogPolar( const CvArr* src, CvArr* dst,
+                         CvPoint2D32f center, double M,
+                         int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));
+
+/** @brief Performs forward or inverse linear-polar image transform
+@see cv::warpPolar
+*/
+CVAPI(void)  cvLinearPolar( const CvArr* src, CvArr* dst,
+                         CvPoint2D32f center, double maxRadius,
+                         int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS));
+
+/** @brief Returns a structuring element of the specified size and shape for morphological operations.
+
+@note the created structuring element IplConvKernel\* element must be released in the end using
+`cvReleaseStructuringElement(&element)`.
+
+@param cols Width of the structuring element
+@param rows Height of the structuring element
+@param anchor_x x-coordinate of the anchor
+@param anchor_y y-coordinate of the anchor
+@param shape element shape that could be one of the cv::MorphShapes_c
+@param values integer array of cols*rows elements that specifies the custom shape of the
+structuring element, when shape=CV_SHAPE_CUSTOM.
+
+@see cv::getStructuringElement
+ */
+ CVAPI(IplConvKernel*)  cvCreateStructuringElementEx(
+            int cols, int  rows, int  anchor_x, int  anchor_y,
+            int shape, int* values CV_DEFAULT(NULL) );
+
+/** @brief Releases structuring element
+@see cvCreateStructuringElementEx
+*/
+CVAPI(void)  cvReleaseStructuringElement( IplConvKernel** element );
+
+/** @brief Erodes input image (applies minimum filter) one or more times.
+   If element pointer is NULL, 3x3 rectangular element is used
+@see cv::erode
+*/
+CVAPI(void)  cvErode( const CvArr* src, CvArr* dst,
+                      IplConvKernel* element CV_DEFAULT(NULL),
+                      int iterations CV_DEFAULT(1) );
+
+/** @brief Dilates input image (applies maximum filter) one or more times.
+
+   If element pointer is NULL, 3x3 rectangular element is used
+@see cv::dilate
+*/
+CVAPI(void)  cvDilate( const CvArr* src, CvArr* dst,
+                       IplConvKernel* element CV_DEFAULT(NULL),
+                       int iterations CV_DEFAULT(1) );
+
+/** @brief Performs complex morphological transformation
+@see cv::morphologyEx
+*/
+CVAPI(void)  cvMorphologyEx( const CvArr* src, CvArr* dst,
+                             CvArr* temp, IplConvKernel* element,
+                             int operation, int iterations CV_DEFAULT(1) );
+
+/** @brief Calculates all spatial and central moments up to the 3rd order
+@see cv::moments
+*/
+CVAPI(void) cvMoments( const CvArr* arr, CvMoments* moments, int binary CV_DEFAULT(0));
+
+/** @brief Retrieve spatial moments */
+CVAPI(double)  cvGetSpatialMoment( CvMoments* moments, int x_order, int y_order );
+/** @brief Retrieve central moments */
+CVAPI(double)  cvGetCentralMoment( CvMoments* moments, int x_order, int y_order );
+/** @brief Retrieve normalized central moments */
+CVAPI(double)  cvGetNormalizedCentralMoment( CvMoments* moments,
+                                             int x_order, int y_order );
+
+/** @brief Calculates 7 Hu's invariants from precalculated spatial and central moments
+@see cv::HuMoments
+*/
+CVAPI(void) cvGetHuMoments( CvMoments*  moments, CvHuMoments*  hu_moments );
+
+/*********************************** data sampling **************************************/
+
+/** @brief Fetches pixels that belong to the specified line segment and stores them to the buffer.
+
+   Returns the number of retrieved points.
+@see cv::LineSegmentDetector
+*/
+CVAPI(int)  cvSampleLine( const CvArr* image, CvPoint pt1, CvPoint pt2, void* buffer,
+                          int connectivity CV_DEFAULT(8));
+
+/** @brief Retrieves the rectangular image region with specified center from the input array.
+
+ dst(x,y) <- src(x + center.x - dst_width/2, y + center.y - dst_height/2).
+ Values of pixels with fractional coordinates are retrieved using bilinear interpolation
+@see cv::getRectSubPix
+*/
+CVAPI(void)  cvGetRectSubPix( const CvArr* src, CvArr* dst, CvPoint2D32f center );
+
+
+/** @brief Retrieves quadrangle from the input array.
+
+    matrixarr = ( a11  a12 | b1 )   dst(x,y) <- src(A[x y]' + b)
+                ( a21  a22 | b2 )   (bilinear interpolation is used to retrieve pixels
+                                     with fractional coordinates)
+@see cvWarpAffine
+*/
+CVAPI(void)  cvGetQuadrangleSubPix( const CvArr* src, CvArr* dst,
+                                    const CvMat* map_matrix );
+
+/** @brief Measures similarity between template and overlapped windows in the source image
+   and fills the resultant image with the measurements
+@see cv::matchTemplate
+*/
+CVAPI(void)  cvMatchTemplate( const CvArr* image, const CvArr* templ,
+                              CvArr* result, int method );
+
+/** @brief Computes earth mover distance between
+   two weighted point sets (called signatures)
+@see cv::EMD
+*/
+CVAPI(float)  cvCalcEMD2( const CvArr* signature1,
+                          const CvArr* signature2,
+                          int distance_type,
+                          CvDistanceFunction distance_func CV_DEFAULT(NULL),
+                          const CvArr* cost_matrix CV_DEFAULT(NULL),
+                          CvArr* flow CV_DEFAULT(NULL),
+                          float* lower_bound CV_DEFAULT(NULL),
+                          void* userdata CV_DEFAULT(NULL));
+
+/****************************************************************************************\
+*                              Contours retrieving                                       *
+\****************************************************************************************/
+
+/** @brief Retrieves outer and optionally inner boundaries of white (non-zero) connected
+   components in the black (zero) background
+@see cv::findContours, cvStartFindContours, cvFindNextContour, cvSubstituteContour, cvEndFindContours
+*/
+CVAPI(int)  cvFindContours( CvArr* image, CvMemStorage* storage, CvSeq** first_contour,
+                            int header_size CV_DEFAULT(sizeof(CvContour)),
+                            int mode CV_DEFAULT(CV_RETR_LIST),
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @brief Initializes contour retrieving process.
+
+   Calls cvStartFindContours.
+   Calls cvFindNextContour until null pointer is returned
+   or some other condition becomes true.
+   Calls cvEndFindContours at the end.
+@see cvFindContours
+*/
+CVAPI(CvContourScanner)  cvStartFindContours( CvArr* image, CvMemStorage* storage,
+                            int header_size CV_DEFAULT(sizeof(CvContour)),
+                            int mode CV_DEFAULT(CV_RETR_LIST),
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @brief Retrieves next contour
+@see cvFindContours
+*/
+CVAPI(CvSeq*)  cvFindNextContour( CvContourScanner scanner );
+
+
+/** @brief Substitutes the last retrieved contour with the new one
+
+   (if the substitutor is null, the last retrieved contour is removed from the tree)
+@see cvFindContours
+*/
+CVAPI(void)   cvSubstituteContour( CvContourScanner scanner, CvSeq* new_contour );
+
+
+/** @brief Releases contour scanner and returns pointer to the first outer contour
+@see cvFindContours
+*/
+CVAPI(CvSeq*)  cvEndFindContours( CvContourScanner* scanner );
+
+/** @brief Approximates Freeman chain(s) with a polygonal curve.
+
+This is a standalone contour approximation routine, not represented in the new interface. When
+cvFindContours retrieves contours as Freeman chains, it calls the function to get approximated
+contours, represented as polygons.
+
+@param src_seq Pointer to the approximated Freeman chain that can refer to other chains.
+@param storage Storage location for the resulting polylines.
+@param method Approximation method (see the description of the function cvFindContours).
+@param parameter Method parameter (not used now).
+@param minimal_perimeter Approximates only those contours whose perimeters are not less than
+minimal_perimeter . Other chains are removed from the resulting structure.
+@param recursive Recursion flag. If it is non-zero, the function approximates all chains that can
+be obtained from chain by using the h_next or v_next links. Otherwise, the single input chain is
+approximated.
+@see cvStartReadChainPoints, cvReadChainPoint
+ */
+CVAPI(CvSeq*) cvApproxChains( CvSeq* src_seq, CvMemStorage* storage,
+                            int method CV_DEFAULT(CV_CHAIN_APPROX_SIMPLE),
+                            double parameter CV_DEFAULT(0),
+                            int  minimal_perimeter CV_DEFAULT(0),
+                            int  recursive CV_DEFAULT(0));
+
+/** @brief Initializes Freeman chain reader.
+
+   The reader is used to iteratively get coordinates of all the chain points.
+   If the Freeman codes should be read as is, a simple sequence reader should be used
+@see cvApproxChains
+*/
+CVAPI(void) cvStartReadChainPoints( CvChain* chain, CvChainPtReader* reader );
+
+/** @brief Retrieves the next chain point
+@see cvApproxChains
+*/
+CVAPI(CvPoint) cvReadChainPoint( CvChainPtReader* reader );
+
+
+/****************************************************************************************\
+*                            Contour Processing and Shape Analysis                       *
+\****************************************************************************************/
+
+/** @brief Approximates a single polygonal curve (contour) or
+   a tree of polygonal curves (contours)
+@see cv::approxPolyDP
+*/
+CVAPI(CvSeq*)  cvApproxPoly( const void* src_seq,
+                             int header_size, CvMemStorage* storage,
+                             int method, double eps,
+                             int recursive CV_DEFAULT(0));
+
+/** @brief Calculates perimeter of a contour or length of a part of contour
+@see cv::arcLength
+*/
+CVAPI(double)  cvArcLength( const void* curve,
+                            CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ),
+                            int is_closed CV_DEFAULT(-1));
+
+/** @brief Calculates the perimeter of a closed contour.
+Shorthand for cvArcLength( contour, CV_WHOLE_SEQ, 1 ), i.e. the whole sequence with is_closed = 1. */
+CV_INLINE double cvContourPerimeter( const void* contour )
+{
+    return cvArcLength( contour, CV_WHOLE_SEQ, 1 );
+}
+
+
+/** @brief Calculates contour bounding rectangle (update=1) or
+   just retrieves pre-calculated rectangle (update=0)
+@see cv::boundingRect
+*/
+CVAPI(CvRect)  cvBoundingRect( CvArr* points, int update CV_DEFAULT(0) );
+
+/** @brief Calculates area of a contour or contour segment
+@see cv::contourArea
+*/
+CVAPI(double)  cvContourArea( const CvArr* contour,
+                              CvSlice slice CV_DEFAULT(CV_WHOLE_SEQ),
+                              int oriented CV_DEFAULT(0));
+
+/** @brief Finds minimum area rotated rectangle bounding a set of points
+@see cv::minAreaRect
+*/
+CVAPI(CvBox2D)  cvMinAreaRect2( const CvArr* points,
+                                CvMemStorage* storage CV_DEFAULT(NULL));
+
+/** @brief Finds minimum enclosing circle for a set of points
+@see cv::minEnclosingCircle
+*/
+CVAPI(int)  cvMinEnclosingCircle( const CvArr* points,
+                                  CvPoint2D32f* center, float* radius );
+
+/** @brief Compares two contours by matching their moments
+@see cv::matchShapes
+*/
+CVAPI(double)  cvMatchShapes( const void* object1, const void* object2,
+                              int method, double parameter CV_DEFAULT(0));
+
+/** @brief Calculates exact convex hull of 2d point set
+@see cv::convexHull
+*/
+CVAPI(CvSeq*) cvConvexHull2( const CvArr* input,
+                             void* hull_storage CV_DEFAULT(NULL),
+                             int orientation CV_DEFAULT(CV_CLOCKWISE),
+                             int return_points CV_DEFAULT(0));
+
+/** @brief Checks whether the contour is convex or not (returns 1 if convex, 0 if not)
+@see cv::isContourConvex
+*/
+CVAPI(int)  cvCheckContourConvexity( const CvArr* contour );
+
+
+/** @brief Finds convexity defects for the contour
+@see cv::convexityDefects
+*/
+CVAPI(CvSeq*)  cvConvexityDefects( const CvArr* contour, const CvArr* convexhull,
+                                   CvMemStorage* storage CV_DEFAULT(NULL));
+
+/** @brief Fits ellipse into a set of 2d points
+@see cv::fitEllipse
+*/
+CVAPI(CvBox2D) cvFitEllipse2( const CvArr* points );
+
+/** @brief Finds minimum rectangle containing two given rectangles */
+CVAPI(CvRect)  cvMaxRect( const CvRect* rect1, const CvRect* rect2 );
+
+/** @brief Finds coordinates of the box vertices */
+CVAPI(void) cvBoxPoints( CvBox2D box, CvPoint2D32f pt[4] );
+
+/** @brief Initializes sequence header for a matrix (column or row vector) of points
+
+   a wrapper for cvMakeSeqHeaderForArray (it does not initialize bounding rectangle!!!) */
+CVAPI(CvSeq*) cvPointSeqFromMat( int seq_kind, const CvArr* mat,
+                                 CvContour* contour_header,
+                                 CvSeqBlock* block );
+
+/** @brief Checks whether the point is inside polygon, outside, on an edge (at a vertex).
+
+   Returns positive, negative or zero value, correspondingly.
+   Optionally, measures a signed distance between
+   the point and the nearest polygon edge (measure_dist=1)
+@see cv::pointPolygonTest
+*/
+CVAPI(double) cvPointPolygonTest( const CvArr* contour,
+                                  CvPoint2D32f pt, int measure_dist );
+
+/****************************************************************************************\
+*                                  Histogram functions                                   *
+\****************************************************************************************/
+
+/** @brief Creates a histogram.
+
+The function creates a histogram of the specified size and returns a pointer to the created
+histogram. If the array ranges is 0, the histogram bin ranges must be specified later via the
+function cvSetHistBinRanges. Though cvCalcHist and cvCalcBackProject may process 8-bit images
+without setting bin ranges, they assume they are equally spaced in 0 to 255 bins.
+
+@param dims Number of histogram dimensions.
+@param sizes Array of the histogram dimension sizes.
+@param type Histogram representation format. CV_HIST_ARRAY means that the histogram data is
+represented as a multi-dimensional dense array CvMatND. CV_HIST_SPARSE means that histogram data
+is represented as a multi-dimensional sparse array CvSparseMat.
+@param ranges Array of ranges for the histogram bins. Its meaning depends on the uniform parameter
+value. The ranges are used when the histogram is calculated or backprojected to determine which
+histogram bin corresponds to which value/tuple of values from the input image(s).
+@param uniform Uniformity flag. If not zero, the histogram has evenly spaced bins and for every
+\f$0<=i<cDims\f$ ranges[i] is an array of two numbers: lower and upper boundaries for the i-th
+histogram dimension. The whole range [lower,upper] is then split into dims[i] equal parts to
+determine the i-th input tuple value ranges for every histogram bin. And if uniform=0 , then the
+i-th element of the ranges array contains dims[i]+1 elements: \f$\texttt{lower}_0,
+\texttt{upper}_0, \texttt{lower}_1, \texttt{upper}_1 = \texttt{lower}_2,
+...
+\texttt{upper}_{dims[i]-1}\f$ where \f$\texttt{lower}_j\f$ and \f$\texttt{upper}_j\f$ are lower
+and upper boundaries of the i-th input tuple value for the j-th bin, respectively. In either
+case, the input values that are beyond the specified range for a histogram bin are not counted
+by cvCalcHist and filled with 0 by cvCalcBackProject.
+ */
+CVAPI(CvHistogram*)  cvCreateHist( int dims, int* sizes, int type,
+                                   float** ranges CV_DEFAULT(NULL),
+                                   int uniform CV_DEFAULT(1));
+
+/** @brief Sets the bounds of the histogram bins.
+
+This is a standalone function for setting bin ranges in the histogram. For a more detailed
+description of the parameters ranges and uniform, see the cvCreateHist function that can initialize
+the ranges as well. Ranges for the histogram bins must be set before the histogram is calculated or
+the backproject of the histogram is calculated.
+
+@param hist Histogram.
+@param ranges Array of bin ranges arrays. See cvCreateHist for details.
+@param uniform Uniformity flag. See cvCreateHist for details.
+ */
+CVAPI(void)  cvSetHistBinRanges( CvHistogram* hist, float** ranges,
+                                int uniform CV_DEFAULT(1));
+
+/** @brief Makes a histogram out of an array.
+
+The function initializes the histogram, whose header and bins are allocated by the user.
+cvReleaseHist does not need to be called afterwards. Only dense histograms can be initialized this
+way. The function returns hist.
+
+@param dims Number of the histogram dimensions.
+@param sizes Array of the histogram dimension sizes.
+@param hist Histogram header initialized by the function.
+@param data Array used to store histogram bins.
+@param ranges Histogram bin ranges. See cvCreateHist for details.
+@param uniform Uniformity flag. See cvCreateHist for details.
+ */
+CVAPI(CvHistogram*)  cvMakeHistHeaderForArray(
+                            int  dims, int* sizes, CvHistogram* hist,
+                            float* data, float** ranges CV_DEFAULT(NULL),
+                            int uniform CV_DEFAULT(1));
+
+/** @brief Releases the histogram.
+
+The function releases the histogram (header and the data). The pointer to the histogram is cleared
+by the function. If \*hist pointer is already NULL, the function does nothing.
+
+@param hist Double pointer to the released histogram.
+ */
+CVAPI(void)  cvReleaseHist( CvHistogram** hist );
+
+/** @brief Clears the histogram.
+
+The function sets all of the histogram bins to 0 in case of a dense histogram and removes all
+histogram bins in case of a sparse array.
+
+@param hist Histogram.
+ */
+CVAPI(void)  cvClearHist( CvHistogram* hist );
+
+/** @brief Finds the minimum and maximum histogram bins.
+
+The function finds the minimum and maximum histogram bins and their positions. All of output
+arguments are optional. Among several extrema with the same value the ones with the minimum index
+(in the lexicographical order) are returned. In case of several maximums or minimums, the earliest
+in the lexicographical order (extrema locations) is returned.
+
+@param hist Histogram.
+@param min_value Pointer to the minimum value of the histogram.
+@param max_value Pointer to the maximum value of the histogram.
+@param min_idx Pointer to the array of coordinates for the minimum.
+@param max_idx Pointer to the array of coordinates for the maximum.
+ */
+CVAPI(void)  cvGetMinMaxHistValue( const CvHistogram* hist,
+                                   float* min_value, float* max_value,
+                                   int* min_idx CV_DEFAULT(NULL),
+                                   int* max_idx CV_DEFAULT(NULL));
+
+
+/** @brief Normalizes the histogram.
+
+The function normalizes the histogram bins by scaling them so that the sum of the bins becomes equal
+to factor.
+
+@param hist Pointer to the histogram.
+@param factor Normalization factor.
+ */
+CVAPI(void)  cvNormalizeHist( CvHistogram* hist, double factor );
+
+
+/** @brief Thresholds the histogram.
+
+The function clears histogram bins that are below the specified threshold.
+
+@param hist Pointer to the histogram.
+@param threshold Threshold level.
+ */
+CVAPI(void)  cvThreshHist( CvHistogram* hist, double threshold );
+
+
+/** Compares two histograms */
+CVAPI(double)  cvCompareHist( const CvHistogram* hist1,
+                              const CvHistogram* hist2,
+                              int method);
+
+/** @brief Copies a histogram.
+
+The function makes a copy of the histogram. If the second histogram pointer \*dst is NULL, a new
+histogram of the same size as src is created. Otherwise, both histograms must have equal types and
+sizes. Then the function copies the bin values of the source histogram to the destination histogram
+and sets the same bin value ranges as in src.
+
+@param src Source histogram.
+@param dst Pointer to the destination histogram.
+ */
+CVAPI(void)  cvCopyHist( const CvHistogram* src, CvHistogram** dst );
+
+
+/** @brief Calculates bayesian probabilistic histograms
+   (each of src and dst is an array of _number_ histograms) */
+CVAPI(void)  cvCalcBayesianProb( CvHistogram** src, int number,
+                                CvHistogram** dst);
+
+/** @brief Calculates array histogram
+@see cv::calcHist
+*/
+CVAPI(void)  cvCalcArrHist( CvArr** arr, CvHistogram* hist,
+                            int accumulate CV_DEFAULT(0),
+                            const CvArr* mask CV_DEFAULT(NULL) );
+
+/** @overload */
+CV_INLINE  void  cvCalcHist( IplImage** image, CvHistogram* hist,
+                             int accumulate CV_DEFAULT(0),
+                             const CvArr* mask CV_DEFAULT(NULL) )
+{   /* IplImage convenience wrapper: the image array is cast to CvArr** and forwarded unchanged */
+    cvCalcArrHist( (CvArr**)image, hist, accumulate, mask );
+}
+
+/** @brief Calculates back project
+@see cvCalcBackProject, cv::calcBackProject
+*/
+CVAPI(void)  cvCalcArrBackProject( CvArr** image, CvArr* dst,
+                                   const CvHistogram* hist );
+
+#define  cvCalcBackProject(image, dst, hist) cvCalcArrBackProject((CvArr**)(image), (dst), (hist)) /* casts image to CvArr** and forwards; arguments are parenthesized so expression arguments expand safely */
+
+
+/** @brief Locates a template within an image by using a histogram comparison.
+
+The function calculates the back projection by comparing histograms of the source image patches with
+the given histogram. The function is similar to matchTemplate, but instead of comparing the raster
+patch with all its possible positions within the search window, the function CalcBackProjectPatch
+compares histograms. See the algorithm diagram below:
+
+![image](pics/backprojectpatch.png)
+
+@param image Source images (though, you may pass CvMat\*\* as well).
+@param dst Destination image.
+@param range
+@param hist Histogram.
+@param method Comparison method passed to cvCompareHist (see the function description).
+@param factor Normalization factor for histograms that affects the normalization scale of the
+destination image. Pass 1 if not sure.
+
+@see cvCalcBackProjectPatch
+ */
+CVAPI(void)  cvCalcArrBackProjectPatch( CvArr** image, CvArr* dst, CvSize range,
+                                        CvHistogram* hist, int method,
+                                        double factor );
+
+#define  cvCalcBackProjectPatch( image, dst, range, hist, method, factor ) /* casts image to CvArr** and forwards; arguments are parenthesized so expression arguments expand safely */ \
+     cvCalcArrBackProjectPatch( (CvArr**)(image), (dst), (range), (hist), (method), (factor) )
+
+
+/** @brief Divides one histogram by another.
+
+The function calculates the object probability density from two histograms as:
+
+\f[\texttt{disthist} (I)= \forkthree{0}{if \(\texttt{hist1}(I)=0\)}{\texttt{scale}}{if \(\texttt{hist1}(I) \ne 0\) and \(\texttt{hist2}(I) > \texttt{hist1}(I)\)}{\frac{\texttt{hist2}(I) \cdot \texttt{scale}}{\texttt{hist1}(I)}}{if \(\texttt{hist1}(I) \ne 0\) and \(\texttt{hist2}(I) \le \texttt{hist1}(I)\)}\f]
+
+@param hist1 First histogram (the divisor).
+@param hist2 Second histogram.
+@param dst_hist Destination histogram.
+@param scale Scale factor for the destination histogram.
+ */
+CVAPI(void)  cvCalcProbDensity( const CvHistogram* hist1, const CvHistogram* hist2,
+                                CvHistogram* dst_hist, double scale CV_DEFAULT(255) );
+
+/** @brief equalizes histogram of 8-bit single-channel image
+@see cv::equalizeHist
+*/
+CVAPI(void)  cvEqualizeHist( const CvArr* src, CvArr* dst );
+
+
+/** @brief Applies distance transform to binary image
+@see cv::distanceTransform
+*/
+CVAPI(void)  cvDistTransform( const CvArr* src, CvArr* dst,
+                              int distance_type CV_DEFAULT(CV_DIST_L2),
+                              int mask_size CV_DEFAULT(3),
+                              const float* mask CV_DEFAULT(NULL),
+                              CvArr* labels CV_DEFAULT(NULL),
+                              int labelType CV_DEFAULT(CV_DIST_LABEL_CCOMP));
+
+
+/** @brief Applies fixed-level threshold to grayscale image.
+
+   This is a basic operation applied before retrieving contours
+@see cv::threshold
+*/
+CVAPI(double)  cvThreshold( const CvArr*  src, CvArr*  dst,
+                            double  threshold, double  max_value,
+                            int threshold_type );
+
+/** @brief Applies adaptive threshold to grayscale image.
+
+   The two parameters for methods CV_ADAPTIVE_THRESH_MEAN_C and
+   CV_ADAPTIVE_THRESH_GAUSSIAN_C are:
+   neighborhood size (3, 5, 7 etc.),
+   and a constant subtracted from mean (...,-3,-2,-1,0,1,2,3,...)
+@see cv::adaptiveThreshold
+*/
+CVAPI(void)  cvAdaptiveThreshold( const CvArr* src, CvArr* dst, double max_value,
+                                  int adaptive_method CV_DEFAULT(CV_ADAPTIVE_THRESH_MEAN_C),
+                                  int threshold_type CV_DEFAULT(CV_THRESH_BINARY),
+                                  int block_size CV_DEFAULT(3),
+                                  double param1 CV_DEFAULT(5));
+
+/** @brief Fills the connected component until the color difference gets large enough
+@see cv::floodFill
+*/
+CVAPI(void)  cvFloodFill( CvArr* image, CvPoint seed_point,
+                          CvScalar new_val, CvScalar lo_diff CV_DEFAULT(cvScalarAll(0)),
+                          CvScalar up_diff CV_DEFAULT(cvScalarAll(0)),
+                          CvConnectedComp* comp CV_DEFAULT(NULL),
+                          int flags CV_DEFAULT(4),
+                          CvArr* mask CV_DEFAULT(NULL));
+
+/****************************************************************************************\
+*                                  Feature detection                                     *
+\****************************************************************************************/
+
+/** @brief Runs canny edge detector
+@see cv::Canny
+*/
+CVAPI(void)  cvCanny( const CvArr* image, CvArr* edges, double threshold1,
+                      double threshold2, int  aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates constraint image for corner detection
+
+   Dx^2 * Dyy + Dxx * Dy^2 - 2 * Dx * Dy * Dxy.
+   Applying threshold to the result gives coordinates of corners
+@see cv::preCornerDetect
+*/
+CVAPI(void) cvPreCornerDetect( const CvArr* image, CvArr* corners,
+                               int aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates eigen values and vectors of 2x2
+   gradient covariation matrix at every image pixel
+@see cv::cornerEigenValsAndVecs
+*/
+CVAPI(void)  cvCornerEigenValsAndVecs( const CvArr* image, CvArr* eigenvv,
+                                       int block_size, int aperture_size CV_DEFAULT(3) );
+
+/** @brief Calculates minimal eigenvalue for 2x2 gradient covariation matrix at
+   every image pixel
+@see cv::cornerMinEigenVal
+*/
+CVAPI(void)  cvCornerMinEigenVal( const CvArr* image, CvArr* eigenval,
+                                  int block_size, int aperture_size CV_DEFAULT(3) );
+
+/** @brief Harris corner detector:
+
+   Calculates det(M) - k*(trace(M)^2), where M is 2x2 gradient covariation matrix for each pixel
+@see cv::cornerHarris
+*/
+CVAPI(void)  cvCornerHarris( const CvArr* image, CvArr* harris_response,
+                             int block_size, int aperture_size CV_DEFAULT(3),
+                             double k CV_DEFAULT(0.04) );
+
+/** @brief Adjust corner position using some sort of gradient search
+@see cv::cornerSubPix
+*/
+CVAPI(void)  cvFindCornerSubPix( const CvArr* image, CvPoint2D32f* corners,
+                                 int count, CvSize win, CvSize zero_zone,
+                                 CvTermCriteria  criteria );
+
+/** @brief Finds a sparse set of points within the selected region
+   that seem to be easy to track
+@see cv::goodFeaturesToTrack
+*/
+CVAPI(void)  cvGoodFeaturesToTrack( const CvArr* image, CvArr* eig_image,
+                                    CvArr* temp_image, CvPoint2D32f* corners,
+                                    int* corner_count, double  quality_level,
+                                    double  min_distance,
+                                    const CvArr* mask CV_DEFAULT(NULL),
+                                    int block_size CV_DEFAULT(3),
+                                    int use_harris CV_DEFAULT(0),
+                                    double k CV_DEFAULT(0.04) );
+
+/** @brief Finds lines on binary image using one of several methods.
+
+   line_storage is either memory storage or 1 x _max number of lines_ CvMat, its
+   number of columns is changed by the function.
+   method is one of CV_HOUGH_*;
+   rho, theta and threshold are used for each of those methods;
+   param1 ~ line length, param2 ~ line gap - for probabilistic,
+   param1 ~ srn, param2 ~ stn - for multi-scale
+@see cv::HoughLines
+*/
+CVAPI(CvSeq*)  cvHoughLines2( CvArr* image, void* line_storage, int method,
+                              double rho, double theta, int threshold,
+                              double param1 CV_DEFAULT(0), double param2 CV_DEFAULT(0),
+                              double min_theta CV_DEFAULT(0), double max_theta CV_DEFAULT(CV_PI));
+
+/** @brief Finds circles in the image
+@see cv::HoughCircles
+*/
+CVAPI(CvSeq*) cvHoughCircles( CvArr* image, void* circle_storage,
+                              int method, double dp, double min_dist,
+                              double param1 CV_DEFAULT(100),
+                              double param2 CV_DEFAULT(100),
+                              int min_radius CV_DEFAULT(0),
+                              int max_radius CV_DEFAULT(0));
+
+/** @brief Fits a line into set of 2d or 3d points in a robust way (M-estimator technique)
+@see cv::fitLine
+*/
+CVAPI(void)  cvFitLine( const CvArr* points, int dist_type, double param,
+                        double reps, double aeps, float* line );
+
+/****************************************************************************************\
+*                                     Drawing                                            *
+\****************************************************************************************/
+
+/****************************************************************************************\
+*       Drawing functions work with images/matrices of arbitrary type.                   *
+*       For color images the channel order is BGR[A]                                     *
+*       Antialiasing is supported only for 8-bit image now.                              *
+*       All the functions include parameter color that means rgb value (that may be      *
+*       constructed with CV_RGB macro) for color images and brightness                   *
+*       for grayscale images.                                                            *
+*       If a drawn figure is partially or completely outside of the image, it is clipped.*
+\****************************************************************************************/
+
+#define CV_FILLED -1 /* negative thickness value: the figure is drawn filled (see cvRectangle) */
+
+#define CV_AA 16 /* line_type value; NOTE(review): presumably selects antialiased drawing - confirm */
+
+/** @brief Draws 4-connected, 8-connected or antialiased line segment connecting two points
+@see cv::line
+*/
+CVAPI(void)  cvLine( CvArr* img, CvPoint pt1, CvPoint pt2,
+                     CvScalar color, int thickness CV_DEFAULT(1),
+                     int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+/** @brief Draws a rectangle given two opposite corners of the rectangle (pt1 & pt2)
+
+   if thickness<0 (e.g. thickness == CV_FILLED), the filled box is drawn
+@see cv::rectangle
+*/
+CVAPI(void)  cvRectangle( CvArr* img, CvPoint pt1, CvPoint pt2,
+                          CvScalar color, int thickness CV_DEFAULT(1),
+                          int line_type CV_DEFAULT(8),
+                          int shift CV_DEFAULT(0));
+
+/** @brief Draws a rectangle specified by a CvRect structure
+@see cv::rectangle
+*/
+CVAPI(void)  cvRectangleR( CvArr* img, CvRect r,
+                           CvScalar color, int thickness CV_DEFAULT(1),
+                           int line_type CV_DEFAULT(8),
+                           int shift CV_DEFAULT(0));
+
+
+/** @brief Draws a circle with specified center and radius.
+
+   Thickness works in the same way as with cvRectangle
+@see cv::circle
+*/
+CVAPI(void)  cvCircle( CvArr* img, CvPoint center, int radius,
+                       CvScalar color, int thickness CV_DEFAULT(1),
+                       int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Draws ellipse outline, filled ellipse, elliptic arc or filled elliptic sector
+
+   depending on _thickness_, _start_angle_ and _end_angle_ parameters. The resultant figure
+   is rotated by _angle_. All the angles are in degrees
+@see cv::ellipse
+*/
+CVAPI(void)  cvEllipse( CvArr* img, CvPoint center, CvSize axes,
+                        double angle, double start_angle, double end_angle,
+                        CvScalar color, int thickness CV_DEFAULT(1),
+                        int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Draws the full ellipse (0..360 degrees) given by a rotated box: centre box.center, semi-axes half of box.size, rotation box.angle */
+CV_INLINE  void  cvEllipseBox( CvArr* img, CvBox2D box, CvScalar color,
+                               int thickness CV_DEFAULT(1),
+                               int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) )
+{
+    const int half_width = cvRound( box.size.width*0.5 );
+    const int half_height = cvRound( box.size.height*0.5 );
+    const CvPoint center = cvPointFrom32f( box.center );
+
+    cvEllipse( img, center, cvSize( half_width, half_height ), box.angle,
+               0, 360, color, thickness, line_type, shift );
+}
+
+/** @brief Fills convex or monotonous polygon.
+@see cv::fillConvexPoly
+*/
+CVAPI(void)  cvFillConvexPoly( CvArr* img, const CvPoint* pts, int npts, CvScalar color,
+                               int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0));
+
+/** @brief Fills an area bounded by one or more arbitrary polygons
+@see cv::fillPoly
+*/
+CVAPI(void)  cvFillPoly( CvArr* img, CvPoint** pts, const int* npts,
+                         int contours, CvScalar color,
+                         int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+/** @brief Draws one or more polygonal curves
+@see cv::polylines
+*/
+CVAPI(void)  cvPolyLine( CvArr* img, CvPoint** pts, const int* npts, int contours,
+                         int is_closed, CvScalar color, int thickness CV_DEFAULT(1),
+                         int line_type CV_DEFAULT(8), int shift CV_DEFAULT(0) );
+
+#define cvDrawRect cvRectangle /* the cvDraw* names in this group are plain aliases of the drawing functions */
+#define cvDrawLine cvLine
+#define cvDrawCircle cvCircle
+#define cvDrawEllipse cvEllipse
+#define cvDrawPolyLine cvPolyLine
+
+/** @brief Clips the line segment connecting *pt1 and *pt2
+   by the rectangular window
+
+   (0<=x<img_size.width, 0<=y<img_size.height).
+@see cv::clipLine
+*/
+CVAPI(int) cvClipLine( CvSize img_size, CvPoint* pt1, CvPoint* pt2 );
+
+/** @brief Initializes line iterator.
+
+Initially, line_iterator->ptr will point to pt1 (or pt2, see left_to_right description) location in
+the image. Returns the number of pixels on the line between the ending points.
+@see cv::LineIterator
+*/
+CVAPI(int)  cvInitLineIterator( const CvArr* image, CvPoint pt1, CvPoint pt2,
+                                CvLineIterator* line_iterator,
+                                int connectivity CV_DEFAULT(8),
+                                int left_to_right CV_DEFAULT(0));
+
+#define CV_NEXT_LINE_POINT( line_iterator )                     \
+{   /* steps the iterator to the next point: updates err and ptr without branching */ \
+    int _line_iterator_mask = (line_iterator).err < 0 ? -1 : 0; /* -1 when err < 0, else 0 */ \
+    (line_iterator).err += (line_iterator).minus_delta +        \
+        ((line_iterator).plus_delta & _line_iterator_mask);     /* plus_delta is added only when err was < 0 */ \
+    (line_iterator).ptr += (line_iterator).minus_step +         \
+        ((line_iterator).plus_step & _line_iterator_mask);      /* plus_step is added only when err was < 0 */ \
+}
+
+
+#define CV_FONT_HERSHEY_SIMPLEX         0 /* CV_FONT_* identifiers: values for the font_face argument of cvInitFont */
+#define CV_FONT_HERSHEY_PLAIN           1
+#define CV_FONT_HERSHEY_DUPLEX          2
+#define CV_FONT_HERSHEY_COMPLEX         3
+#define CV_FONT_HERSHEY_TRIPLEX         4
+#define CV_FONT_HERSHEY_COMPLEX_SMALL   5
+#define CV_FONT_HERSHEY_SCRIPT_SIMPLEX  6
+#define CV_FONT_HERSHEY_SCRIPT_COMPLEX  7
+
+#define CV_FONT_ITALIC                 16 /* NOTE(review): looks like a flag combined with a face id - confirm against cvInitFont */
+
+#define CV_FONT_VECTOR0    CV_FONT_HERSHEY_SIMPLEX /* alias of CV_FONT_HERSHEY_SIMPLEX */
+
+
+/** Font structure: initialized by cvInitFont, passed to cvPutText and cvGetTextSize */
+typedef struct CvFont
+{
+  const char* nameFont;   //!< Qt: font name
+  CvScalar color;       //!< Qt: font color -> cvScalar(blue_component, green_component, red_component[, alpha_component])
+    int         font_face;    //!< one of CV_FONT_*; Qt: bool italic
+    const int*  ascii;      //!< font data and metrics
+    const int*  greek;      //!< NOTE(review): presumably the Greek counterpart of ascii - confirm
+    const int*  cyrillic;   //!< NOTE(review): presumably the Cyrillic counterpart of ascii - confirm
+    float       hscale, vscale; //!< horizontal and vertical scale (see the cvInitFont parameters of the same name)
+    float       shear;      //!< slope coefficient: 0 - normal, >0 - italic
+    int         thickness;    //!< letters thickness; Qt: weight
+    float       dx;       //!< horizontal interval between letters
+    int         line_type;    //!< Qt: PointSize
+}
+CvFont;
+
+/** @brief Initializes font structure (OpenCV 1.x API).
+
+The function initializes the font structure that can be passed to text rendering functions.
+
+@param font Pointer to the font structure initialized by the function
+@param font_face Font name identifier. See cv::HersheyFonts and corresponding old CV_* identifiers.
+@param hscale Horizontal scale. If equal to 1.0f , the characters have the original width
+depending on the font type. If equal to 0.5f , the characters are of half the original width.
+@param vscale Vertical scale. If equal to 1.0f , the characters have the original height depending
+on the font type. If equal to 0.5f , the characters are of half the original height.
+@param shear Approximate tangent of the character slope relative to the vertical line. A zero
+value means a non-italic font, 1.0f means about a 45 degree slope, etc.
+@param thickness Thickness of the text strokes
+@param line_type Type of the strokes, see line description
+
+@sa cvPutText
+ */
+CVAPI(void)  cvInitFont( CvFont* font, int font_face,
+                         double hscale, double vscale,
+                         double shear CV_DEFAULT(0),
+                         int thickness CV_DEFAULT(1),
+                         int line_type CV_DEFAULT(8));
+
+CV_INLINE CvFont cvFont( double scale, int thickness CV_DEFAULT(1) )
+{
+    CvFont plain_font; /* set up below: CV_FONT_HERSHEY_PLAIN, hscale = vscale = scale, shear 0, line type CV_AA */
+    cvInitFont( &plain_font, CV_FONT_HERSHEY_PLAIN, scale, scale, 0, thickness, CV_AA );
+    return plain_font;
+}
+
+/** @brief Renders text stroke with specified font and color at specified location.
+   CvFont should be initialized with cvInitFont
+@see cvInitFont, cvGetTextSize, cvFont, cv::putText
+*/
+CVAPI(void)  cvPutText( CvArr* img, const char* text, CvPoint org,
+                        const CvFont* font, CvScalar color );
+
+/** @brief Calculates bounding box of text stroke (useful for alignment)
+@see cv::getTextSize
+*/
+CVAPI(void)  cvGetTextSize( const char* text_string, const CvFont* font,
+                            CvSize* text_size, int* baseline );
+
+/** @brief Unpacks color value
+
+if arrtype is CV_8UC?, _color_ is treated as packed color value, otherwise the first channels
+(depending on arrtype) of destination scalar are set to the same value = _color_
+*/
+CVAPI(CvScalar)  cvColorToScalar( double packed_color, int arrtype );
+
+/** @brief Returns the polygon points which make up the given ellipse.
+
+The ellipse is defined by the box of size 'axes' rotated 'angle' around the 'center'. A partial
+sweep of the ellipse arc can be done by specifying arc_start and arc_end to be something other than
+0 and 360, respectively. The input array 'pts' must be large enough to hold the result. The total
+number of points stored into 'pts' is returned by this function.
+@see cv::ellipse2Poly
+*/
+CVAPI(int) cvEllipse2Poly( CvPoint center, CvSize axes,
+                 int angle, int arc_start, int arc_end, CvPoint * pts, int delta );
+
+/** @brief Draws contour outlines or filled interiors on the image
+@see cv::drawContours
+*/
+CVAPI(void)  cvDrawContours( CvArr *img, CvSeq* contour,
+                             CvScalar external_color, CvScalar hole_color,
+                             int max_level, int thickness CV_DEFAULT(1),
+                             int line_type CV_DEFAULT(8),
+                             CvPoint offset CV_DEFAULT(cvPoint(0,0)));
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/segmentation.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/segmentation.hpp
new file mode 100644
index 0000000..c40d501
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/segmentation.hpp
@@ -0,0 +1,141 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_IMGPROC_SEGMENTATION_HPP
+#define OPENCV_IMGPROC_SEGMENTATION_HPP
+
+#include "opencv2/imgproc.hpp"
+
+namespace cv {
+
+namespace segmentation {
+
+//! @addtogroup imgproc_segmentation
+//! @{
+
+
+/** @brief Intelligent Scissors image segmentation
+ *
+ * This class is used to find the path (contour) between two points
+ * which can be used for image segmentation.
+ *
+ * Usage example:
+ * @snippet snippets/imgproc_segmentation.cpp usage_example_intelligent_scissors
+ *
+ * Reference: <a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.138.3811&rep=rep1&type=pdf">"Intelligent Scissors for Image Composition"</a>
+ * algorithm designed by Eric N. Mortensen and William A. Barrett, Brigham Young University
+ * @cite Mortensen95intelligentscissors
+ */
+class CV_EXPORTS_W_SIMPLE IntelligentScissorsMB
+{
+public:
+    CV_WRAP
+    IntelligentScissorsMB();
+
+    /** @brief Specify weights of feature functions
+     *
+     * Consider keeping weights normalized (sum of weights equal to 1.0)
+     * Discrete dynamic programming (DP) goal is minimization of costs between pixels.
+     *
+     * @param weight_non_edge Specify cost of non-edge pixels (default: 0.43f)
+     * @param weight_gradient_direction Specify cost of gradient direction function (default: 0.43f)
+     * @param weight_gradient_magnitude Specify cost of gradient magnitude function (default: 0.14f)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setWeights(float weight_non_edge, float weight_gradient_direction, float weight_gradient_magnitude);
+
+    /** @brief Specify gradient magnitude max value threshold
+     *
+     * Zero limit value is used to disable gradient magnitude thresholding (default behavior, as described in original article).
+     * Otherwise pixels with `gradient magnitude >= threshold` have zero cost.
+     *
+     * @note Thresholding should be used for images with irregular regions (to avoid getting stuck on parameters from high-contrast areas, like embedded logos).
+     *
+     * @param gradient_magnitude_threshold_max Specify gradient magnitude max value threshold (default: 0, disabled)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setGradientMagnitudeMaxLimit(float gradient_magnitude_threshold_max = 0.0f);
+
+    /** @brief Switch to "Laplacian Zero-Crossing" edge feature extractor and specify its parameters
+     *
+     * This feature extractor is used by default according to the article.
+     *
+     * Implementation has additional filtering for regions with low-amplitude noise.
+     * This filtering is enabled through parameter of minimal gradient amplitude (use some small value 4, 8, 16).
+     *
+     * @note Current implementation of this feature extractor is based on processing of grayscale images (color image is converted to grayscale image first).
+     *
+     * @note Canny edge detector is a bit slower, but provides better results (especially on color images): use setEdgeFeatureCannyParameters().
+     *
+     * @param gradient_magnitude_min_value Minimal gradient magnitude value for edge pixels (default: 0, check is disabled)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setEdgeFeatureZeroCrossingParameters(float gradient_magnitude_min_value = 0.0f);
+
+    /** @brief Switch edge feature extractor to use Canny edge detector
+     *
+     * @note "Laplacian Zero-Crossing" feature extractor is used by default (following the original article)
+     *
+     * @sa Canny
+     */
+    CV_WRAP
+    IntelligentScissorsMB& setEdgeFeatureCannyParameters(
+            double threshold1, double threshold2,
+            int apertureSize = 3, bool L2gradient = false
+    );
+
+    /** @brief Specify input image and extract image features
+     *
+     * @param image input image. Type is #CV_8UC1 / #CV_8UC3
+     */
+    CV_WRAP
+    IntelligentScissorsMB& applyImage(InputArray image);
+
+    /** @brief Specify custom features of input image
+     *
+     * Customized advanced variant of applyImage() call.
+     *
+     * @param non_edge Specify cost of non-edge pixels. Type is CV_8UC1. Expected values are `{0, 1}`.
+     * @param gradient_direction Specify gradient direction feature. Type is CV_32FC2. Values are expected to be normalized: `x^2 + y^2 == 1`
+     * @param gradient_magnitude Specify cost of gradient magnitude function: Type is CV_32FC1. Values should be in range `[0, 1]`.
+     * @param image **Optional parameter**. Must be specified if subset of features is specified (non-specified features are calculated internally)
+     */
+    CV_WRAP
+    IntelligentScissorsMB& applyImageFeatures(
+            InputArray non_edge, InputArray gradient_direction, InputArray gradient_magnitude,
+            InputArray image = noArray()
+    );
+
+    /** @brief Prepares a map of optimal paths for the given source point on the image
+     *
+     * @note applyImage() / applyImageFeatures() must be called before this call
+     *
+     * @param sourcePt The source point used to find the paths
+     */
+    CV_WRAP void buildMap(const Point& sourcePt);
+
+    /** @brief Extracts optimal contour for the given target point on the image
+     *
+     * @note buildMap() must be called before this call
+     *
+     * @param targetPt The target point
+     * @param[out] contour The list of pixels which contains optimal path between the source and the target points of the image. Type is CV_32SC2 (compatible with `std::vector<Point>`)
+     * @param backward Flag to indicate reverse order of retrieved pixels (use "true" value to fetch points from the target to the source point)
+     */
+    CV_WRAP void getContour(const Point& targetPt, OutputArray contour, bool backward = false) const;
+
+#ifndef CV_DOXYGEN
+    struct Impl;
+    inline Impl* getImpl() const { return impl.get(); }
+protected:
+    std::shared_ptr<Impl> impl;
+#endif
+};
+
+//! @}
+
+}  // namespace segmentation
+}  // namespace cv
+
+#endif // OPENCV_IMGPROC_SEGMENTATION_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/types_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/types_c.h
new file mode 100644
index 0000000..d3e55f5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/imgproc/types_c.h
@@ -0,0 +1,659 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_IMGPROC_TYPES_C_H
+#define OPENCV_IMGPROC_TYPES_C_H
+
+#include "opencv2/core/core_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup imgproc_c
+  @{
+*/
+
+/** Connected component structure */
+typedef struct CvConnectedComp
+{
+    double area;    /**<area of the connected component  */
+    CvScalar value; /**<average color of the connected component */
+    CvRect rect;    /**<ROI of the component  */
+    CvSeq* contour; /**<optional component boundary
+                      (the contour might have child contours corresponding to the holes)*/
+}
+CvConnectedComp;
+
+/** Image smooth methods */
+enum SmoothMethod_c
+{
+    /** linear convolution with \f$\texttt{size1}\times\texttt{size2}\f$ box kernel (all 1's). If
+    you want to smooth different pixels with different-size box kernels, you can use the integral
+    image that is computed using integral */
+    CV_BLUR_NO_SCALE =0,
+    /** linear convolution with \f$\texttt{size1}\times\texttt{size2}\f$ box kernel (all
+    1's) with subsequent scaling by \f$1/(\texttt{size1}\cdot\texttt{size2})\f$ */
+    CV_BLUR  =1,
+    /** linear convolution with a \f$\texttt{size1}\times\texttt{size2}\f$ Gaussian kernel */
+    CV_GAUSSIAN  =2,
+    /** median filter with a \f$\texttt{size1}\times\texttt{size1}\f$ square aperture */
+    CV_MEDIAN =3,
+    /** bilateral filter with a \f$\texttt{size1}\times\texttt{size1}\f$ square aperture, color
+    sigma= sigma1 and spatial sigma= sigma2. If size1=0, the aperture square side is set to
+    cvRound(sigma2\*1.5)\*2+1. See cv::bilateralFilter */
+    CV_BILATERAL =4
+};
+
+/** Filters used in pyramid decomposition */
+enum
+{
+    CV_GAUSSIAN_5x5 = 7
+};
+
+/** Special filters */
+enum
+{
+    CV_SCHARR =-1,
+    CV_MAX_SOBEL_KSIZE =7
+};
+
+/** Constants for color conversion */
+enum
+{
+    CV_BGR2BGRA    =0,
+    CV_RGB2RGBA    =CV_BGR2BGRA,
+
+    CV_BGRA2BGR    =1,
+    CV_RGBA2RGB    =CV_BGRA2BGR,
+
+    CV_BGR2RGBA    =2,
+    CV_RGB2BGRA    =CV_BGR2RGBA,
+
+    CV_RGBA2BGR    =3,
+    CV_BGRA2RGB    =CV_RGBA2BGR,
+
+    CV_BGR2RGB     =4,
+    CV_RGB2BGR     =CV_BGR2RGB,
+
+    CV_BGRA2RGBA   =5,
+    CV_RGBA2BGRA   =CV_BGRA2RGBA,
+
+    CV_BGR2GRAY    =6,
+    CV_RGB2GRAY    =7,
+    CV_GRAY2BGR    =8,
+    CV_GRAY2RGB    =CV_GRAY2BGR,
+    CV_GRAY2BGRA   =9,
+    CV_GRAY2RGBA   =CV_GRAY2BGRA,
+    CV_BGRA2GRAY   =10,
+    CV_RGBA2GRAY   =11,
+
+    CV_BGR2BGR565  =12,
+    CV_RGB2BGR565  =13,
+    CV_BGR5652BGR  =14,
+    CV_BGR5652RGB  =15,
+    CV_BGRA2BGR565 =16,
+    CV_RGBA2BGR565 =17,
+    CV_BGR5652BGRA =18,
+    CV_BGR5652RGBA =19,
+
+    CV_GRAY2BGR565 =20,
+    CV_BGR5652GRAY =21,
+
+    CV_BGR2BGR555  =22,
+    CV_RGB2BGR555  =23,
+    CV_BGR5552BGR  =24,
+    CV_BGR5552RGB  =25,
+    CV_BGRA2BGR555 =26,
+    CV_RGBA2BGR555 =27,
+    CV_BGR5552BGRA =28,
+    CV_BGR5552RGBA =29,
+
+    CV_GRAY2BGR555 =30,
+    CV_BGR5552GRAY =31,
+
+    CV_BGR2XYZ     =32,
+    CV_RGB2XYZ     =33,
+    CV_XYZ2BGR     =34,
+    CV_XYZ2RGB     =35,
+
+    CV_BGR2YCrCb   =36,
+    CV_RGB2YCrCb   =37,
+    CV_YCrCb2BGR   =38,
+    CV_YCrCb2RGB   =39,
+
+    CV_BGR2HSV     =40,
+    CV_RGB2HSV     =41,
+
+    CV_BGR2Lab     =44,
+    CV_RGB2Lab     =45,
+
+    CV_BayerBG2BGR =46,
+    CV_BayerGB2BGR =47,
+    CV_BayerRG2BGR =48,
+    CV_BayerGR2BGR =49,
+
+    CV_BayerBG2RGB =CV_BayerRG2BGR,
+    CV_BayerGB2RGB =CV_BayerGR2BGR,
+    CV_BayerRG2RGB =CV_BayerBG2BGR,
+    CV_BayerGR2RGB =CV_BayerGB2BGR,
+
+    CV_BGR2Luv     =50,
+    CV_RGB2Luv     =51,
+    CV_BGR2HLS     =52,
+    CV_RGB2HLS     =53,
+
+    CV_HSV2BGR     =54,
+    CV_HSV2RGB     =55,
+
+    CV_Lab2BGR     =56,
+    CV_Lab2RGB     =57,
+    CV_Luv2BGR     =58,
+    CV_Luv2RGB     =59,
+    CV_HLS2BGR     =60,
+    CV_HLS2RGB     =61,
+
+    CV_BayerBG2BGR_VNG =62,
+    CV_BayerGB2BGR_VNG =63,
+    CV_BayerRG2BGR_VNG =64,
+    CV_BayerGR2BGR_VNG =65,
+
+    CV_BayerBG2RGB_VNG =CV_BayerRG2BGR_VNG,
+    CV_BayerGB2RGB_VNG =CV_BayerGR2BGR_VNG,
+    CV_BayerRG2RGB_VNG =CV_BayerBG2BGR_VNG,
+    CV_BayerGR2RGB_VNG =CV_BayerGB2BGR_VNG,
+
+    CV_BGR2HSV_FULL = 66,
+    CV_RGB2HSV_FULL = 67,
+    CV_BGR2HLS_FULL = 68,
+    CV_RGB2HLS_FULL = 69,
+
+    CV_HSV2BGR_FULL = 70,
+    CV_HSV2RGB_FULL = 71,
+    CV_HLS2BGR_FULL = 72,
+    CV_HLS2RGB_FULL = 73,
+
+    CV_LBGR2Lab     = 74,
+    CV_LRGB2Lab     = 75,
+    CV_LBGR2Luv     = 76,
+    CV_LRGB2Luv     = 77,
+
+    CV_Lab2LBGR     = 78,
+    CV_Lab2LRGB     = 79,
+    CV_Luv2LBGR     = 80,
+    CV_Luv2LRGB     = 81,
+
+    CV_BGR2YUV      = 82,
+    CV_RGB2YUV      = 83,
+    CV_YUV2BGR      = 84,
+    CV_YUV2RGB      = 85,
+
+    CV_BayerBG2GRAY = 86,
+    CV_BayerGB2GRAY = 87,
+    CV_BayerRG2GRAY = 88,
+    CV_BayerGR2GRAY = 89,
+
+    //YUV 4:2:0 formats family
+    CV_YUV2RGB_NV12 = 90,
+    CV_YUV2BGR_NV12 = 91,
+    CV_YUV2RGB_NV21 = 92,
+    CV_YUV2BGR_NV21 = 93,
+    CV_YUV420sp2RGB = CV_YUV2RGB_NV21,
+    CV_YUV420sp2BGR = CV_YUV2BGR_NV21,
+
+    CV_YUV2RGBA_NV12 = 94,
+    CV_YUV2BGRA_NV12 = 95,
+    CV_YUV2RGBA_NV21 = 96,
+    CV_YUV2BGRA_NV21 = 97,
+    CV_YUV420sp2RGBA = CV_YUV2RGBA_NV21,
+    CV_YUV420sp2BGRA = CV_YUV2BGRA_NV21,
+
+    CV_YUV2RGB_YV12 = 98,
+    CV_YUV2BGR_YV12 = 99,
+    CV_YUV2RGB_IYUV = 100,
+    CV_YUV2BGR_IYUV = 101,
+    CV_YUV2RGB_I420 = CV_YUV2RGB_IYUV,
+    CV_YUV2BGR_I420 = CV_YUV2BGR_IYUV,
+    CV_YUV420p2RGB = CV_YUV2RGB_YV12,
+    CV_YUV420p2BGR = CV_YUV2BGR_YV12,
+
+    CV_YUV2RGBA_YV12 = 102,
+    CV_YUV2BGRA_YV12 = 103,
+    CV_YUV2RGBA_IYUV = 104,
+    CV_YUV2BGRA_IYUV = 105,
+    CV_YUV2RGBA_I420 = CV_YUV2RGBA_IYUV,
+    CV_YUV2BGRA_I420 = CV_YUV2BGRA_IYUV,
+    CV_YUV420p2RGBA = CV_YUV2RGBA_YV12,
+    CV_YUV420p2BGRA = CV_YUV2BGRA_YV12,
+
+    CV_YUV2GRAY_420 = 106,
+    CV_YUV2GRAY_NV21 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_NV12 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_YV12 = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_IYUV = CV_YUV2GRAY_420,
+    CV_YUV2GRAY_I420 = CV_YUV2GRAY_420,
+    CV_YUV420sp2GRAY = CV_YUV2GRAY_420,
+    CV_YUV420p2GRAY = CV_YUV2GRAY_420,
+
+    //YUV 4:2:2 formats family
+    CV_YUV2RGB_UYVY = 107,
+    CV_YUV2BGR_UYVY = 108,
+    //CV_YUV2RGB_VYUY = 109,
+    //CV_YUV2BGR_VYUY = 110,
+    CV_YUV2RGB_Y422 = CV_YUV2RGB_UYVY,
+    CV_YUV2BGR_Y422 = CV_YUV2BGR_UYVY,
+    CV_YUV2RGB_UYNV = CV_YUV2RGB_UYVY,
+    CV_YUV2BGR_UYNV = CV_YUV2BGR_UYVY,
+
+    CV_YUV2RGBA_UYVY = 111,
+    CV_YUV2BGRA_UYVY = 112,
+    //CV_YUV2RGBA_VYUY = 113,
+    //CV_YUV2BGRA_VYUY = 114,
+    CV_YUV2RGBA_Y422 = CV_YUV2RGBA_UYVY,
+    CV_YUV2BGRA_Y422 = CV_YUV2BGRA_UYVY,
+    CV_YUV2RGBA_UYNV = CV_YUV2RGBA_UYVY,
+    CV_YUV2BGRA_UYNV = CV_YUV2BGRA_UYVY,
+
+    CV_YUV2RGB_YUY2 = 115,
+    CV_YUV2BGR_YUY2 = 116,
+    CV_YUV2RGB_YVYU = 117,
+    CV_YUV2BGR_YVYU = 118,
+    CV_YUV2RGB_YUYV = CV_YUV2RGB_YUY2,
+    CV_YUV2BGR_YUYV = CV_YUV2BGR_YUY2,
+    CV_YUV2RGB_YUNV = CV_YUV2RGB_YUY2,
+    CV_YUV2BGR_YUNV = CV_YUV2BGR_YUY2,
+
+    CV_YUV2RGBA_YUY2 = 119,
+    CV_YUV2BGRA_YUY2 = 120,
+    CV_YUV2RGBA_YVYU = 121,
+    CV_YUV2BGRA_YVYU = 122,
+    CV_YUV2RGBA_YUYV = CV_YUV2RGBA_YUY2,
+    CV_YUV2BGRA_YUYV = CV_YUV2BGRA_YUY2,
+    CV_YUV2RGBA_YUNV = CV_YUV2RGBA_YUY2,
+    CV_YUV2BGRA_YUNV = CV_YUV2BGRA_YUY2,
+
+    CV_YUV2GRAY_UYVY = 123,
+    CV_YUV2GRAY_YUY2 = 124,
+    //CV_YUV2GRAY_VYUY = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_Y422 = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_UYNV = CV_YUV2GRAY_UYVY,
+    CV_YUV2GRAY_YVYU = CV_YUV2GRAY_YUY2,
+    CV_YUV2GRAY_YUYV = CV_YUV2GRAY_YUY2,
+    CV_YUV2GRAY_YUNV = CV_YUV2GRAY_YUY2,
+
+    // alpha premultiplication
+    CV_RGBA2mRGBA = 125,
+    CV_mRGBA2RGBA = 126,
+
+    CV_RGB2YUV_I420 = 127,
+    CV_BGR2YUV_I420 = 128,
+    CV_RGB2YUV_IYUV = CV_RGB2YUV_I420,
+    CV_BGR2YUV_IYUV = CV_BGR2YUV_I420,
+
+    CV_RGBA2YUV_I420 = 129,
+    CV_BGRA2YUV_I420 = 130,
+    CV_RGBA2YUV_IYUV = CV_RGBA2YUV_I420,
+    CV_BGRA2YUV_IYUV = CV_BGRA2YUV_I420,
+    CV_RGB2YUV_YV12  = 131,
+    CV_BGR2YUV_YV12  = 132,
+    CV_RGBA2YUV_YV12 = 133,
+    CV_BGRA2YUV_YV12 = 134,
+
+    // Edge-Aware Demosaicing
+    CV_BayerBG2BGR_EA = 135,
+    CV_BayerGB2BGR_EA = 136,
+    CV_BayerRG2BGR_EA = 137,
+    CV_BayerGR2BGR_EA = 138,
+
+    CV_BayerBG2RGB_EA = CV_BayerRG2BGR_EA,
+    CV_BayerGB2RGB_EA = CV_BayerGR2BGR_EA,
+    CV_BayerRG2RGB_EA = CV_BayerBG2BGR_EA,
+    CV_BayerGR2RGB_EA = CV_BayerGB2BGR_EA,
+
+    CV_BayerBG2BGRA =139,
+    CV_BayerGB2BGRA =140,
+    CV_BayerRG2BGRA =141,
+    CV_BayerGR2BGRA =142,
+
+    CV_BayerBG2RGBA =CV_BayerRG2BGRA,
+    CV_BayerGB2RGBA =CV_BayerGR2BGRA,
+    CV_BayerRG2RGBA =CV_BayerBG2BGRA,
+    CV_BayerGR2RGBA =CV_BayerGB2BGRA,
+
+    CV_COLORCVT_MAX  = 143
+};
+
+
+/** Sub-pixel interpolation methods */
+enum
+{
+    CV_INTER_NN        =0,
+    CV_INTER_LINEAR    =1,
+    CV_INTER_CUBIC     =2,
+    CV_INTER_AREA      =3,
+    CV_INTER_LANCZOS4  =4
+};
+
+/** ... and other image warping flags */
+enum
+{
+    CV_WARP_FILL_OUTLIERS =8,
+    CV_WARP_INVERSE_MAP  =16
+};
+
+/** Shapes of a structuring element for morphological operations
+@see cv::MorphShapes, cv::getStructuringElement
+*/
+enum MorphShapes_c
+{
+    CV_SHAPE_RECT      =0,
+    CV_SHAPE_CROSS     =1,
+    CV_SHAPE_ELLIPSE   =2,
+    CV_SHAPE_CUSTOM    =100 //!< custom structuring element
+};
+
+/** Morphological operations */
+enum
+{
+    CV_MOP_ERODE        =0,
+    CV_MOP_DILATE       =1,
+    CV_MOP_OPEN         =2,
+    CV_MOP_CLOSE        =3,
+    CV_MOP_GRADIENT     =4,
+    CV_MOP_TOPHAT       =5,
+    CV_MOP_BLACKHAT     =6
+};
+
+/** Spatial and central moments */
+typedef struct CvMoments
+{
+    double  m00, m10, m01, m20, m11, m02, m30, m21, m12, m03; /**< spatial moments */
+    double  mu20, mu11, mu02, mu30, mu21, mu12, mu03; /**< central moments */
+    double  inv_sqrt_m00; /**< m00 != 0 ? 1/sqrt(m00) : 0 */
+
+#if defined(CV__ENABLE_C_API_CTORS) && defined(__cplusplus)
+    CvMoments(){}
+    CvMoments(const cv::Moments& m)
+    {
+        m00 = m.m00; m10 = m.m10; m01 = m.m01;
+        m20 = m.m20; m11 = m.m11; m02 = m.m02;
+        m30 = m.m30; m21 = m.m21; m12 = m.m12; m03 = m.m03;
+        mu20 = m.mu20; mu11 = m.mu11; mu02 = m.mu02;
+        mu30 = m.mu30; mu21 = m.mu21; mu12 = m.mu12; mu03 = m.mu03;
+        double am00 = std::abs(m.m00);
+        inv_sqrt_m00 = am00 > DBL_EPSILON ? 1./std::sqrt(am00) : 0;
+    }
+    operator cv::Moments() const
+    {
+        return cv::Moments(m00, m10, m01, m20, m11, m02, m30, m21, m12, m03);
+    }
+#endif
+}
+CvMoments;
+
+#ifdef __cplusplus
+} // extern "C"
+
+CV_INLINE CvMoments cvMoments()
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    CvMoments self = CV_STRUCT_INITIALIZER; return self;
+#else
+    return CvMoments();
+#endif
+}
+
+CV_INLINE CvMoments cvMoments(const cv::Moments& m)
+{
+#if !defined(CV__ENABLE_C_API_CTORS)
+    double am00 = std::abs(m.m00);
+    CvMoments self = {
+        m.m00, m.m10, m.m01, m.m20, m.m11, m.m02, m.m30, m.m21, m.m12, m.m03,
+        m.mu20, m.mu11, m.mu02, m.mu30, m.mu21, m.mu12, m.mu03,
+        am00 > DBL_EPSILON ? 1./std::sqrt(am00) : 0
+    };
+    return self;
+#else
+    return CvMoments(m);
+#endif
+}
+
+extern "C" {
+#endif // __cplusplus
+
+/** Hu invariants */
+typedef struct CvHuMoments
+{
+    double hu1, hu2, hu3, hu4, hu5, hu6, hu7; /**< Hu invariants */
+}
+CvHuMoments;
+
+/** Template matching methods */
+enum
+{
+    CV_TM_SQDIFF        =0,
+    CV_TM_SQDIFF_NORMED =1,
+    CV_TM_CCORR         =2,
+    CV_TM_CCORR_NORMED  =3,
+    CV_TM_CCOEFF        =4,
+    CV_TM_CCOEFF_NORMED =5
+};
+
+typedef float (CV_CDECL * CvDistanceFunction)( const float* a, const float* b, void* user_param );
+
+/** Contour retrieval modes */
+enum
+{
+    CV_RETR_EXTERNAL=0,
+    CV_RETR_LIST=1,
+    CV_RETR_CCOMP=2,
+    CV_RETR_TREE=3,
+    CV_RETR_FLOODFILL=4
+};
+
+/** Contour approximation methods */
+enum
+{
+    CV_CHAIN_CODE=0,
+    CV_CHAIN_APPROX_NONE=1,
+    CV_CHAIN_APPROX_SIMPLE=2,
+    CV_CHAIN_APPROX_TC89_L1=3,
+    CV_CHAIN_APPROX_TC89_KCOS=4,
+    CV_LINK_RUNS=5
+};
+
+/*
+Internal structure that is used for sequential retrieving contours from the image.
+It supports both hierarchical and plain (non-hierarchical) variants of Suzuki algorithm.
+*/
+typedef struct _CvContourScanner* CvContourScanner;
+
+/** Freeman chain reader state */
+typedef struct CvChainPtReader
+{
+    CV_SEQ_READER_FIELDS()
+    char      code;
+    CvPoint   pt;
+    schar     deltas[8][2];
+}
+CvChainPtReader;
+
+/** initializes 8-element array for fast access to 3x3 neighborhood of a pixel */
+#define  CV_INIT_3X3_DELTAS( deltas, step, nch )            \
+    ((deltas)[0] =  (nch),  (deltas)[1] = -(step) + (nch),  \
+     (deltas)[2] = -(step), (deltas)[3] = -(step) - (nch),  \
+     (deltas)[4] = -(nch),  (deltas)[5] =  (step) - (nch),  \
+     (deltas)[6] =  (step), (deltas)[7] =  (step) + (nch))
+
+
+/** Contour approximation algorithms */
+enum
+{
+    CV_POLY_APPROX_DP = 0
+};
+
+/** Shape matching methods */
+enum
+{
+    CV_CONTOURS_MATCH_I1  =1, //!< \f[I_1(A,B) =  \sum _{i=1...7}  \left |  \frac{1}{m^A_i} -  \frac{1}{m^B_i} \right |\f]
+    CV_CONTOURS_MATCH_I2  =2, //!< \f[I_2(A,B) =  \sum _{i=1...7}  \left | m^A_i - m^B_i  \right |\f]
+    CV_CONTOURS_MATCH_I3  =3  //!< \f[I_3(A,B) =  \max _{i=1...7}  \frac{ \left| m^A_i - m^B_i \right| }{ \left| m^A_i \right| }\f]
+};
+
+/** Shape orientation */
+enum
+{
+    CV_CLOCKWISE         =1,
+    CV_COUNTER_CLOCKWISE =2
+};
+
+
+/** Convexity defect */
+typedef struct CvConvexityDefect
+{
+    CvPoint* start; /**< point of the contour where the defect begins */
+    CvPoint* end; /**< point of the contour where the defect ends */
+    CvPoint* depth_point; /**< the point within the defect that is farthest from the convex hull */
+    float depth; /**< distance between the farthest point and the convex hull */
+} CvConvexityDefect;
+
+
+/** Histogram comparison methods */
+enum
+{
+    CV_COMP_CORREL        =0,
+    CV_COMP_CHISQR        =1,
+    CV_COMP_INTERSECT     =2,
+    CV_COMP_BHATTACHARYYA =3,
+    CV_COMP_HELLINGER     =CV_COMP_BHATTACHARYYA,
+    CV_COMP_CHISQR_ALT    =4,
+    CV_COMP_KL_DIV        =5
+};
+
+/** Mask size for distance transform */
+enum
+{
+    CV_DIST_MASK_3   =3,
+    CV_DIST_MASK_5   =5,
+    CV_DIST_MASK_PRECISE =0
+};
+
+/** Content of output label array: connected components or pixels */
+enum
+{
+  CV_DIST_LABEL_CCOMP = 0,
+  CV_DIST_LABEL_PIXEL = 1
+};
+
+/** Distance types for Distance Transform and M-estimators */
+enum
+{
+    CV_DIST_USER    =-1,  /**< User defined distance */
+    CV_DIST_L1      =1,   /**< distance = |x1-x2| + |y1-y2| */
+    CV_DIST_L2      =2,   /**< the simple euclidean distance */
+    CV_DIST_C       =3,   /**< distance = max(|x1-x2|,|y1-y2|) */
+    CV_DIST_L12     =4,   /**< L1-L2 metric: distance = 2(sqrt(1+x*x/2) - 1) */
+    CV_DIST_FAIR    =5,   /**< distance = c^2(|x|/c-log(1+|x|/c)), c = 1.3998 */
+    CV_DIST_WELSCH  =6,   /**< distance = c^2/2(1-exp(-(x/c)^2)), c = 2.9846 */
+    CV_DIST_HUBER   =7    /**< distance = |x|<c ? x^2/2 : c(|x|-c/2), c=1.345 */
+};
+
+
+/** Threshold types */
+enum
+{
+    CV_THRESH_BINARY      =0,  /**< value = value > threshold ? max_value : 0       */
+    CV_THRESH_BINARY_INV  =1,  /**< value = value > threshold ? 0 : max_value       */
+    CV_THRESH_TRUNC       =2,  /**< value = value > threshold ? threshold : value   */
+    CV_THRESH_TOZERO      =3,  /**< value = value > threshold ? value : 0           */
+    CV_THRESH_TOZERO_INV  =4,  /**< value = value > threshold ? 0 : value           */
+    CV_THRESH_MASK        =7,
+    CV_THRESH_OTSU        =8, /**< use Otsu algorithm to choose the optimal threshold value;
+                                 combine the flag with one of the above CV_THRESH_* values */
+    CV_THRESH_TRIANGLE    =16  /**< use Triangle algorithm to choose the optimal threshold value;
+                                 combine the flag with one of the above CV_THRESH_* values, but not
+                                 with CV_THRESH_OTSU */
+};
+
+/** Adaptive threshold methods */
+enum
+{
+    CV_ADAPTIVE_THRESH_MEAN_C  =0,
+    CV_ADAPTIVE_THRESH_GAUSSIAN_C  =1
+};
+
+/** FloodFill flags */
+enum
+{
+    CV_FLOODFILL_FIXED_RANGE =(1 << 16),
+    CV_FLOODFILL_MASK_ONLY   =(1 << 17)
+};
+
+
+/** Canny edge detector flags */
+enum
+{
+    CV_CANNY_L2_GRADIENT  =(int)(1U << 31) /**< bit 31 (sign bit of a 32-bit int); shifted as unsigned because 1 << 31 overflows signed int, which is undefined behavior in C */
+};
+
+/** Variants of a Hough transform */
+enum
+{
+    CV_HOUGH_STANDARD =0,
+    CV_HOUGH_PROBABILISTIC =1,
+    CV_HOUGH_MULTI_SCALE =2,
+    CV_HOUGH_GRADIENT =3
+};
+
+
+/* Fast search data structures  */
+struct CvFeatureTree;
+struct CvLSH;
+struct CvLSHOperations;
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv.hpp
new file mode 100644
index 0000000..d17b94a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv.hpp
@@ -0,0 +1,95 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_ALL_HPP
+#define OPENCV_ALL_HPP
+
+// File that defines which modules were included during the build of OpenCV
+// These are purely the defines of the correct HAVE_OPENCV_modulename values
+#include "opencv2/opencv_modules.hpp"
+
+// Then the list of defines is checked to include the correct headers
+// Core library is always included --> without it, no OpenCV functionality is available
+#include "opencv2/core.hpp"
+
+// Then the optional modules are checked
+#ifdef HAVE_OPENCV_CALIB3D
+#include "opencv2/calib3d.hpp"
+#endif
+#ifdef HAVE_OPENCV_FEATURES2D
+#include "opencv2/features2d.hpp"
+#endif
+#ifdef HAVE_OPENCV_DNN
+#include "opencv2/dnn.hpp"
+#endif
+#ifdef HAVE_OPENCV_FLANN
+#include "opencv2/flann.hpp"
+#endif
+#ifdef HAVE_OPENCV_HIGHGUI
+#include "opencv2/highgui.hpp"
+#endif
+#ifdef HAVE_OPENCV_IMGCODECS
+#include "opencv2/imgcodecs.hpp"
+#endif
+#ifdef HAVE_OPENCV_IMGPROC
+#include "opencv2/imgproc.hpp"
+#endif
+#ifdef HAVE_OPENCV_ML
+#include "opencv2/ml.hpp"
+#endif
+#ifdef HAVE_OPENCV_OBJDETECT
+#include "opencv2/objdetect.hpp"
+#endif
+#ifdef HAVE_OPENCV_PHOTO
+#include "opencv2/photo.hpp"
+#endif
+#ifdef HAVE_OPENCV_STITCHING
+#include "opencv2/stitching.hpp"
+#endif
+#ifdef HAVE_OPENCV_VIDEO
+#include "opencv2/video.hpp"
+#endif
+#ifdef HAVE_OPENCV_VIDEOIO
+#include "opencv2/videoio.hpp"
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv_modules.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv_modules.hpp
new file mode 100644
index 0000000..5fafd9c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/opencv_modules.hpp
@@ -0,0 +1,20 @@
+/*
+ *      ** File generated automatically, do not modify **
+ *
+ * This file defines the list of modules available in current build configuration
+ *
+ *
+*/
+
+// This definition means that OpenCV is built with enabled non-free code.
+// For example, patented algorithms for non-profit/non-commercial use only.
+/* #undef OPENCV_ENABLE_NONFREE */
+
+#define HAVE_OPENCV_CORE
+#define HAVE_OPENCV_FEATURES2D
+#define HAVE_OPENCV_HIGHGUI
+#define HAVE_OPENCV_IMGPROC
+#define HAVE_OPENCV_PHOTO
+#define HAVE_OPENCV_VIDEO
+
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo.hpp
new file mode 100644
index 0000000..c2e89a3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo.hpp
@@ -0,0 +1,858 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_PHOTO_HPP
+#define OPENCV_PHOTO_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+
+/**
+@defgroup photo Computational Photography
+
+This module includes photo processing algorithms
+@{
+    @defgroup photo_inpaint Inpainting
+    @defgroup photo_denoise Denoising
+    @defgroup photo_hdr HDR imaging
+
+This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment,
+camera calibration with multiple exposures and exposure fusion.
+
+    @defgroup photo_decolor Contrast Preserving Decolorization
+
+Useful links:
+
+http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html
+
+    @defgroup photo_clone Seamless Cloning
+
+Useful links:
+
+https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp
+
+    @defgroup photo_render Non-Photorealistic Rendering
+
+Useful links:
+
+http://www.inf.ufrgs.br/~eslgastal/DomainTransform
+
+https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/
+
+    @defgroup photo_c C API
+@}
+  */
+
+namespace cv
+{
+
+//! @addtogroup photo
+//! @{
+
+//! @addtogroup photo_inpaint
+//! @{
+//! the inpainting algorithm
+enum
+{
+    INPAINT_NS    = 0, //!< Use Navier-Stokes based method
+    INPAINT_TELEA = 1 //!< Use the algorithm proposed by Alexandru Telea @cite Telea04
+};
+
+/** @brief Restores the selected region in an image using the region neighborhood.
+
+@param src Input 8-bit, 16-bit unsigned or 32-bit float 1-channel or 8-bit 3-channel image.
+@param inpaintMask Inpainting mask, 8-bit 1-channel image. Non-zero pixels indicate the area that
+needs to be inpainted.
+@param dst Output image with the same size and type as src .
+@param inpaintRadius Radius of a circular neighborhood of each point inpainted that is considered
+by the algorithm.
+@param flags Inpainting method that could be cv::INPAINT_NS or cv::INPAINT_TELEA
+
+The function reconstructs the selected image area from the pixel near the area boundary. The
+function may be used to remove dust and scratches from a scanned photo, or to remove undesirable
+objects from still images or video. See <http://en.wikipedia.org/wiki/Inpainting> for more details.
+
+@note
+   -   An example using the inpainting technique can be found at
+        opencv_source_code/samples/cpp/inpaint.cpp
+   -   (Python) An example using the inpainting technique can be found at
+        opencv_source_code/samples/python/inpaint.py
+ */
+CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
+        OutputArray dst, double inpaintRadius, int flags );
+
+//! @} photo_inpaint
+
+//! @addtogroup photo_denoise
+//! @{
+
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise expected to be a gaussian white noise
+
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
+removes image details, smaller h value preserves details but also preserves some noise
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of colored
+image in different colorspaces. Such approach is used in fastNlMeansDenoisingColored by converting
+image to CIELAB colorspace and then separately denoise L and AB components with different h
+parameter.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise expected to be a gaussian white noise
+
+@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of colored
+image in different colorspaces. Such approach is used in fastNlMeansDenoisingColored by converting
+image to CIELAB colorspace and then separately denoise L and AB components with different h
+parameter.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize = 7, int searchWindowSize = 21,
+                                        int normType = NORM_L2);
+
+/** @brief Modification of fastNlMeansDenoising function for colored images
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+removes noise but also removes image details, smaller h value preserves details but also preserves
+some noise
+@param hColor The same as h but for color components. For most images value equals 10
+will be enough to remove colored noise and do not distort colors
+
+The function converts image to CIELAB colorspace and then separately denoise L and AB components
+with given h parameters using fastNlMeansDenoising function.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingColored( InputArray src, OutputArray dst,
+        float h = 3, float hColor = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. For more details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
+
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. For more details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
+
+@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel images sequence. All images should
+have the same type and size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+                                             int imgToDenoiseIndex, int temporalWindowSize,
+                                             const std::vector<float>& h,
+                                             int templateWindowSize = 7, int searchWindowSize = 21,
+                                             int normType = NORM_L2);
+
+/** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
+
+@param srcImgs Input 8-bit 3-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+removes noise but also removes image details, smaller h value preserves details but also preserves
+some noise.
+@param hColor The same as h but for color components.
+
+The function converts images to CIELAB colorspace and then separately denoise L and AB components
+with given h parameters using fastNlMeansDenoisingMulti function.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingColoredMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, float hColor = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Primal-dual algorithm is an algorithm for solving special types of variational problems (that is,
+finding a function to minimize some functional). As the image denoising, in particular, may be seen
+as the variational problem, primal-dual algorithm then can be used to perform denoising and this is
+exactly what is implemented.
+
+It should be noted, that this implementation was taken from the July 2013 blog entry
+@cite MA13 , which also contained (slightly more general) ready-to-use source code in Python.
+Subsequently, that code was rewritten in C++ using OpenCV by Vadim Pisarevsky at the end
+of July 2013 and finally it was slightly adapted by later authors.
+
+Although the thorough discussion and justification of the algorithm involved may be found in
+@cite ChambolleEtAl, it might make sense to skim over it here, following @cite MA13 . To begin
+with, we consider the 1-byte gray-level images as the functions from the rectangular domain of
+pixels (it may be seen as set
+\f$\left\{(x,y)\in\mathbb{N}\times\mathbb{N}\mid 1\leq x\leq n,\;1\leq y\leq m\right\}\f$ for some
+\f$m,\;n\in\mathbb{N}\f$) into \f$\{0,1,\dots,255\}\f$. We shall denote the noised images as \f$f_i\f$ and with
+this view, given some image \f$x\f$ of the same size, we may measure how bad it is by the formula
+
+\f[\left\|\left\|\nabla x\right\|\right\| + \lambda\sum_i\left\|\left\|x-f_i\right\|\right\|\f]
+
+\f$\|\|\cdot\|\|\f$ here denotes \f$L_2\f$-norm and as you see, the first addend states that we want our
+image to be smooth (ideally, having zero gradient, thus being constant) and the second states that
+we want our result to be close to the observations we've got. If we treat \f$x\f$ as a function, this is
+exactly the functional that we seek to minimize and here the Primal-Dual algorithm comes into play.
+
+@param observations This array should contain one or more noised versions of the image that is to
+be restored.
+@param result Here the denoised image will be stored. There is no need to do pre-allocation of
+storage space, as it will be automatically allocated, if necessary.
+@param lambda Corresponds to \f$\lambda\f$ in the formulas above. As it is enlarged, the smooth
+(blurred) images are treated more favorably than detailed (but maybe more noised) ones. Roughly
+speaking, as it becomes smaller, the result will be more blurred but more severe outliers will be
+removed.
+@param niters Number of iterations that the algorithm will run. Of course, the more iterations the
+better, but it is hard to quantitatively refine this statement, so just use the default and
+increase it if the results are poor.
+ */
+CV_EXPORTS_W void denoise_TVL1(const std::vector<Mat>& observations,Mat& result, double lambda=1.0, int niters=30);
+
+//! @} photo_denoise
+
+//! @addtogroup photo_hdr
+//! @{
+
+enum { LDR_SIZE = 256 };
+
+/** @brief Base class for tonemapping algorithms - tools that are used to map HDR image to 8-bit range.
+ */
+class CV_EXPORTS_W Tonemap : public Algorithm
+{
+public:
+    /** @brief Tonemaps image
+
+    @param src source image - CV_32FC3 Mat (float 32 bits 3 channels)
+    @param dst destination image - CV_32FC3 Mat with values in [0, 1] range
+     */
+    CV_WRAP virtual void process(InputArray src, OutputArray dst) = 0;
+
+    CV_WRAP virtual float getGamma() const = 0;
+    CV_WRAP virtual void setGamma(float gamma) = 0;
+};
+
+/** @brief Creates simple linear mapper with gamma correction
+
+@param gamma positive value for gamma correction. Gamma value of 1.0 implies no correction, gamma
+equal to 2.2f is suitable for most displays.
+Generally gamma \> 1 brightens the image and gamma \< 1 darkens it.
+ */
+CV_EXPORTS_W Ptr<Tonemap> createTonemap(float gamma = 1.0f);
+
+/** @brief Adaptive logarithmic mapping is a fast global tonemapping algorithm that scales the image in
+logarithmic domain.
+
+Since it's a global operator the same function is applied to all the pixels, it is controlled by the
+bias parameter.
+
+Optional saturation enhancement is possible as described in @cite FL02 .
+
+For more information see @cite DM03 .
+ */
+class CV_EXPORTS_W TonemapDrago : public Tonemap
+{
+public:
+
+    CV_WRAP virtual float getSaturation() const = 0;
+    CV_WRAP virtual void setSaturation(float saturation) = 0;
+
+    CV_WRAP virtual float getBias() const = 0;
+    CV_WRAP virtual void setBias(float bias) = 0;
+};
+
+/** @brief Creates TonemapDrago object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param saturation positive saturation enhancement value. 1.0 preserves saturation, values greater
+than 1 increase saturation and values less than 1 decrease it.
+@param bias value for bias function in [0, 1] range. Values from 0.7 to 0.9 usually give best
+results, default value is 0.85.
+ */
+CV_EXPORTS_W Ptr<TonemapDrago> createTonemapDrago(float gamma = 1.0f, float saturation = 1.0f, float bias = 0.85f);
+
+
+/** @brief This is a global tonemapping operator that models human visual system.
+
+Mapping function is controlled by adaptation parameter, that is computed using light adaptation and
+color adaptation.
+
+For more information see @cite RD05 .
+ */
+class CV_EXPORTS_W TonemapReinhard : public Tonemap
+{
+public:
+    CV_WRAP virtual float getIntensity() const = 0;
+    CV_WRAP virtual void setIntensity(float intensity) = 0;
+
+    CV_WRAP virtual float getLightAdaptation() const = 0;
+    CV_WRAP virtual void setLightAdaptation(float light_adapt) = 0;
+
+    CV_WRAP virtual float getColorAdaptation() const = 0;
+    CV_WRAP virtual void setColorAdaptation(float color_adapt) = 0;
+};
+
+/** @brief Creates TonemapReinhard object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param intensity result intensity in [-8, 8] range. Greater intensity produces brighter results.
+@param light_adapt light adaptation in [0, 1] range. If 1 adaptation is based only on pixel
+value, if 0 it's global, otherwise it's a weighted mean of these two cases.
+@param color_adapt chromatic adaptation in [0, 1] range. If 1 channels are treated independently,
+if 0 adaptation level is the same for each channel.
+ */
+CV_EXPORTS_W Ptr<TonemapReinhard>
+createTonemapReinhard(float gamma = 1.0f, float intensity = 0.0f, float light_adapt = 1.0f, float color_adapt = 0.0f);
+
+/** @brief This algorithm transforms image to contrast using gradients on all levels of gaussian pyramid,
+transforms contrast values to HVS response and scales the response. After this the image is
+reconstructed from new contrast values.
+
+For more information see @cite MM06 .
+ */
+class CV_EXPORTS_W TonemapMantiuk : public Tonemap
+{
+public:
+    CV_WRAP virtual float getScale() const = 0;
+    CV_WRAP virtual void setScale(float scale) = 0;
+
+    CV_WRAP virtual float getSaturation() const = 0;
+    CV_WRAP virtual void setSaturation(float saturation) = 0;
+};
+
+/** @brief Creates TonemapMantiuk object
+
+@param gamma gamma value for gamma correction. See createTonemap
+@param scale contrast scale factor. HVS response is multiplied by this parameter, thus compressing
+dynamic range. Values from 0.6 to 0.9 produce best results.
+@param saturation saturation enhancement value. See createTonemapDrago
+ */
+CV_EXPORTS_W Ptr<TonemapMantiuk>
+createTonemapMantiuk(float gamma = 1.0f, float scale = 0.7f, float saturation = 1.0f);
+
+/** @brief The base class for algorithms that align images of the same scene with different exposures
+ */
+class CV_EXPORTS_W AlignExposures : public Algorithm
+{
+public:
+    /** @brief Aligns images
+
+    @param src vector of input images
+    @param dst vector of aligned images
+    @param times vector of exposure time values for each image
+    @param response 256x1 matrix with inverse camera response function for each pixel value, it should
+    have the same number of channels as images.
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst,
+                                 InputArray times, InputArray response) = 0;
+};
+
+/** @brief This algorithm converts images to median threshold bitmaps (1 for pixels brighter than median
+luminance and 0 otherwise) and then aligns the resulting bitmaps using bit operations.
+
+It is invariant to exposure, so exposure values and camera response are not necessary.
+
+In this implementation new image regions are filled with zeros.
+
+For more information see @cite GW03 .
+ */
+class CV_EXPORTS_W AlignMTB : public AlignExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+
+    /** @brief Short version of process, that doesn't take extra arguments.
+
+    @param src vector of input images
+    @param dst vector of aligned images
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, std::vector<Mat>& dst) = 0;
+
+    /** @brief Calculates shift between two images, i.e. how to shift the second image to align it with the
+    first.
+
+    @param img0 first image
+    @param img1 second image
+     */
+    CV_WRAP virtual Point calculateShift(InputArray img0, InputArray img1) = 0;
+    /** @brief Helper function that shifts a Mat, filling new regions with zeros.
+
+    @param src input image
+    @param dst result image
+    @param shift shift value
+     */
+    CV_WRAP virtual void shiftMat(InputArray src, OutputArray dst, const Point shift) = 0;
+    /** @brief Computes median threshold and exclude bitmaps of given image.
+
+    @param img input image
+    @param tb median threshold bitmap
+    @param eb exclude bitmap
+     */
+    CV_WRAP virtual void computeBitmaps(InputArray img, OutputArray tb, OutputArray eb) = 0;
+
+    CV_WRAP virtual int getMaxBits() const = 0;
+    CV_WRAP virtual void setMaxBits(int max_bits) = 0;
+
+    CV_WRAP virtual int getExcludeRange() const = 0;
+    CV_WRAP virtual void setExcludeRange(int exclude_range) = 0;
+
+    CV_WRAP virtual bool getCut() const = 0;
+    CV_WRAP virtual void setCut(bool value) = 0;
+};
+
+/** @brief Creates AlignMTB object
+
+@param max_bits logarithm to the base 2 of maximal shift in each dimension. Values of 5 and 6 are
+usually good enough (31 and 63 pixels shift respectively).
+@param exclude_range range for exclusion bitmap that is constructed to suppress noise around the
+median value.
+@param cut if true cuts images, otherwise fills the new regions with zeros.
+ */
+CV_EXPORTS_W Ptr<AlignMTB> createAlignMTB(int max_bits = 6, int exclude_range = 4, bool cut = true);
+
+/** @brief The base class for camera response calibration algorithms.
+ */
+class CV_EXPORTS_W CalibrateCRF : public Algorithm
+{
+public:
+    /** @brief Recovers inverse camera response.
+
+    @param src vector of input images
+    @param dst 256x1 matrix with inverse camera response function
+    @param times vector of exposure time values for each image
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0;
+};
+
+/** @brief Inverse camera response function is extracted for each brightness value by minimizing an objective
+function as linear system. Objective function is constructed using pixel values on the same position
+in all images, extra term is added to make the result smoother.
+
+For more information see @cite DM97 .
+ */
+class CV_EXPORTS_W CalibrateDebevec : public CalibrateCRF
+{
+public:
+    CV_WRAP virtual float getLambda() const = 0;
+    CV_WRAP virtual void setLambda(float lambda) = 0;
+
+    CV_WRAP virtual int getSamples() const = 0;
+    CV_WRAP virtual void setSamples(int samples) = 0;
+
+    CV_WRAP virtual bool getRandom() const = 0;
+    CV_WRAP virtual void setRandom(bool random) = 0;
+};
+
+/** @brief Creates CalibrateDebevec object
+
+@param samples number of pixel locations to use
+@param lambda smoothness term weight. Greater values produce smoother results, but can alter the
+response.
+@param random if true sample pixel locations are chosen at random, otherwise they form a
+rectangular grid.
+ */
+CV_EXPORTS_W Ptr<CalibrateDebevec> createCalibrateDebevec(int samples = 70, float lambda = 10.0f, bool random = false);
+
+/** @brief Inverse camera response function is extracted for each brightness value by minimizing an objective
+function as linear system. This algorithm uses all image pixels.
+
+For more information see @cite RB99 .
+ */
+class CV_EXPORTS_W CalibrateRobertson : public CalibrateCRF
+{
+public:
+    CV_WRAP virtual int getMaxIter() const = 0;
+    CV_WRAP virtual void setMaxIter(int max_iter) = 0;
+
+    CV_WRAP virtual float getThreshold() const = 0;
+    CV_WRAP virtual void setThreshold(float threshold) = 0;
+
+    CV_WRAP virtual Mat getRadiance() const = 0;
+};
+
+/** @brief Creates CalibrateRobertson object
+
+@param max_iter maximal number of Gauss-Seidel solver iterations.
+@param threshold target difference between results of two successive steps of the minimization.
+ */
+CV_EXPORTS_W Ptr<CalibrateRobertson> createCalibrateRobertson(int max_iter = 30, float threshold = 0.01f);
+
+/** @brief The base class for algorithms that can merge an exposure sequence into a single image.
+ */
+class CV_EXPORTS_W MergeExposures : public Algorithm
+{
+public:
+    /** @brief Merges images.
+
+    @param src vector of input images
+    @param dst result image
+    @param times vector of exposure time values for each image
+    @param response 256x1 matrix with inverse camera response function for each pixel value, it should
+    have the same number of channels as images.
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) = 0;
+};
+
+/** @brief The resulting HDR image is calculated as weighted average of the exposures considering exposure
+values and camera response.
+
+For more information see @cite DM97 .
+ */
+class CV_EXPORTS_W MergeDebevec : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0;
+};
+
+/** @brief Creates MergeDebevec object
+ */
+CV_EXPORTS_W Ptr<MergeDebevec> createMergeDebevec();
+
+/** @brief Pixels are weighted using contrast, saturation and well-exposedness measures, then images are
+combined using laplacian pyramids.
+
+The resulting image weight is constructed as weighted average of contrast, saturation and
+well-exposedness measures.
+
+The resulting image doesn't require tonemapping and can be converted to 8-bit image by multiplying
+by 255, but it's recommended to apply gamma correction and/or linear tonemapping.
+
+For more information see @cite MK07 .
+ */
+class CV_EXPORTS_W MergeMertens : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+    /** @brief Short version of process, that doesn't take extra arguments.
+
+    @param src vector of input images
+    @param dst result image
+     */
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst) = 0;
+
+    CV_WRAP virtual float getContrastWeight() const = 0;
+    CV_WRAP virtual void setContrastWeight(float contrast_weiht) = 0;
+
+    CV_WRAP virtual float getSaturationWeight() const = 0;
+    CV_WRAP virtual void setSaturationWeight(float saturation_weight) = 0;
+
+    CV_WRAP virtual float getExposureWeight() const = 0;
+    CV_WRAP virtual void setExposureWeight(float exposure_weight) = 0;
+};
+
+/** @brief Creates MergeMertens object
+
+@param contrast_weight contrast measure weight. See MergeMertens.
+@param saturation_weight saturation measure weight
+@param exposure_weight well-exposedness measure weight
+ */
+CV_EXPORTS_W Ptr<MergeMertens>
+createMergeMertens(float contrast_weight = 1.0f, float saturation_weight = 1.0f, float exposure_weight = 0.0f);
+
+/** @brief The resulting HDR image is calculated as weighted average of the exposures considering exposure
+values and camera response.
+
+For more information see @cite RB99 .
+ */
+class CV_EXPORTS_W MergeRobertson : public MergeExposures
+{
+public:
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst,
+                                 InputArray times, InputArray response) CV_OVERRIDE = 0;
+    CV_WRAP virtual void process(InputArrayOfArrays src, OutputArray dst, InputArray times) = 0;
+};
+
+/** @brief Creates MergeRobertson object
+ */
+CV_EXPORTS_W Ptr<MergeRobertson> createMergeRobertson();
+
+//! @} photo_hdr
+
+//! @addtogroup photo_decolor
+//! @{
+
+/** @brief Transforms a color image to a grayscale image. It is a basic tool in digital printing, stylized
+black-and-white photograph rendering, and in many single channel image processing applications
+@cite CL12 .
+
+@param src Input 8-bit 3-channel image.
+@param grayscale Output 8-bit 1-channel image.
+@param color_boost Output 8-bit 3-channel image.
+
+This function is to be applied on color images.
+ */
+CV_EXPORTS_W void decolor( InputArray src, OutputArray grayscale, OutputArray color_boost);
+
+//! @} photo_decolor
+
+//! @addtogroup photo_clone
+//! @{
+
+
+//! seamlessClone algorithm flags
+enum
+{
+    /** The power of the method is fully expressed when inserting objects with complex outlines into a new background*/
+    NORMAL_CLONE = 1,
+    /** The classic method, color-based selection and alpha masking might be time consuming and often leaves an undesirable
+    halo. Seamless cloning, even averaged with the original image, is not effective. Mixed seamless cloning based on a loose selection proves effective.*/
+    MIXED_CLONE  = 2,
+    /** Monochrome transfer allows the user to easily replace certain features of one object by alternative features.*/
+    MONOCHROME_TRANSFER = 3};
+
+
+/** @example samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp
+An example using seamlessClone function
+*/
+/** @brief Image editing tasks concern either global changes (color/intensity corrections, filters,
+deformations) or local changes concerned to a selection. Here we are interested in achieving local
+changes, ones that are restricted to a region manually selected (ROI), in a seamless and effortless
+manner. The extent of the changes ranges from slight distortions to complete replacement by novel
+content @cite PM03 .
+
+@param src Input 8-bit 3-channel image.
+@param dst Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param p Point in dst image where object is placed.
+@param blend Output image with the same size and type as dst.
+@param flags Cloning method that could be cv::NORMAL_CLONE, cv::MIXED_CLONE or cv::MONOCHROME_TRANSFER
+ */
+CV_EXPORTS_W void seamlessClone( InputArray src, InputArray dst, InputArray mask, Point p,
+        OutputArray blend, int flags);
+
+/** @brief Given an original color image, two differently colored versions of this image can be mixed
+seamlessly.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param red_mul R-channel multiply factor.
+@param green_mul G-channel multiply factor.
+@param blue_mul B-channel multiply factor.
+
+Multiplication factor is between .5 to 2.5.
+ */
+CV_EXPORTS_W void colorChange(InputArray src, InputArray mask, OutputArray dst, float red_mul = 1.0f,
+        float green_mul = 1.0f, float blue_mul = 1.0f);
+
+/** @brief Applying an appropriate non-linear transformation to the gradient field inside the selection and
+then integrating back with a Poisson solver, modifies locally the apparent illumination of an image.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src.
+@param alpha Value ranges between 0-2.
+@param beta Value ranges between 0-2.
+
+This is useful to highlight under-exposed foreground objects or to reduce specular reflections.
+ */
+CV_EXPORTS_W void illuminationChange(InputArray src, InputArray mask, OutputArray dst,
+        float alpha = 0.2f, float beta = 0.4f);
+
+/** @brief By retaining only the gradients at edge locations, before integrating with the Poisson solver, one
+washes out the texture of the selected region, giving its contents a flat aspect. Here Canny Edge %Detector is used.
+
+@param src Input 8-bit 3-channel image.
+@param mask Input 8-bit 1 or 3-channel image.
+@param dst Output image with the same size and type as src.
+@param low_threshold %Range from 0 to 100.
+@param high_threshold Value \> 100.
+@param kernel_size The size of the Sobel kernel to be used.
+
+@note
+The algorithm assumes that the color of the source image is close to that of the destination. This
+assumption means that when the colors don't match, the source image color gets tinted toward the
+color of the destination image.
+ */
+CV_EXPORTS_W void textureFlattening(InputArray src, InputArray mask, OutputArray dst,
+        float low_threshold = 30, float high_threshold = 45,
+        int kernel_size = 3);
+
+//! @} photo_clone
+
+//! @addtogroup photo_render
+//! @{
+
+//! Edge preserving filters
+enum
+{
+    RECURS_FILTER = 1, //!< Recursive Filtering
+    NORMCONV_FILTER = 2 //!< Normalized Convolution Filtering
+};
+
+/** @brief Filtering is the fundamental operation in image and video processing. Edge-preserving smoothing
+filters are used in many different applications @cite EM11 .
+
+@param src Input 8-bit 3-channel image.
+@param dst Output 8-bit 3-channel image.
+@param flags Edge preserving filters: cv::RECURS_FILTER or cv::NORMCONV_FILTER
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void edgePreservingFilter(InputArray src, OutputArray dst, int flags = 1,
+        float sigma_s = 60, float sigma_r = 0.4f);
+
+/** @brief This filter enhances the details of a particular image.
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void detailEnhance(InputArray src, OutputArray dst, float sigma_s = 10,
+        float sigma_r = 0.15f);
+
+/** @example samples/cpp/tutorial_code/photo/non_photorealistic_rendering/npr_demo.cpp
+An example using non-photorealistic line drawing functions
+*/
+/** @brief Pencil-like non-photorealistic line drawing
+
+@param src Input 8-bit 3-channel image.
+@param dst1 Output 8-bit 1-channel image.
+@param dst2 Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+@param shade_factor %Range between 0 to 0.1.
+ */
+CV_EXPORTS_W void pencilSketch(InputArray src, OutputArray dst1, OutputArray dst2,
+        float sigma_s = 60, float sigma_r = 0.07f, float shade_factor = 0.02f);
+
+/** @brief Stylization aims to produce digital imagery with a wide variety of effects not focused on
+photorealism. Edge-aware filters are ideal for stylization, as they can abstract regions of low
+contrast while preserving, or enhancing, high-contrast features.
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src.
+@param sigma_s %Range between 0 to 200.
+@param sigma_r %Range between 0 to 1.
+ */
+CV_EXPORTS_W void stylization(InputArray src, OutputArray dst, float sigma_s = 60,
+        float sigma_r = 0.45f);
+
+//! @} photo_render
+
+//! @} photo
+
+} // cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/cuda.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/cuda.hpp
new file mode 100644
index 0000000..a2f3816
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/cuda.hpp
@@ -0,0 +1,132 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_PHOTO_CUDA_HPP
+#define OPENCV_PHOTO_CUDA_HPP
+
+#include "opencv2/core/cuda.hpp"
+
+namespace cv { namespace cuda {
+
+//! @addtogroup photo_denoise
+//! @{
+
+/** @brief Performs pure non local means denoising without any simplification, and thus it is not fast.
+
+@param src Source image. Supports only CV_8UC1, CV_8UC2 and CV_8UC3.
+@param dst Destination image.
+@param h Filter sigma regulating filter strength for color.
+@param search_window Size of search window.
+@param block_size Size of block used for computing weights.
+@param borderMode Border type. See borderInterpolate for details. BORDER_REFLECT101 ,
+BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supported for now.
+@param stream Stream for the asynchronous version.
+
+@sa
+   fastNlMeansDenoising
+ */
+CV_EXPORTS void nonLocalMeans(InputArray src, OutputArray dst,
+                              float h,
+                              int search_window = 21,
+                              int block_size = 7,
+                              int borderMode = BORDER_DEFAULT,
+                              Stream& stream = Stream::Null());
+
+/** @brief Performs image denoising using the Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising> with several computational
+optimizations. The noise is expected to be Gaussian white noise.
+
+@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h Parameter regulating filter strength. A big h value perfectly removes noise but also
+removes image details; a smaller h value preserves details but also preserves some noise.
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: the greater search_window, the greater
+the denoising time. Recommended value is 21 pixels.
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value is 7 pixels.
+@param stream Stream for the asynchronous invocations.
+
+This function is expected to be applied to grayscale images. For colored images see
+fastNlMeansDenoisingColored.
+
+@sa
+   fastNlMeansDenoising
+ */
+CV_EXPORTS void fastNlMeansDenoising(InputArray src, OutputArray dst,
+                                     float h,
+                                     int search_window = 21,
+                                     int block_size = 7,
+                                     Stream& stream = Stream::Null());
+
+/** @brief Modification of the fastNlMeansDenoising function for colored images.
+
+@param src Input 8-bit 3-channel image.
+@param dst Output image with the same size and type as src .
+@param h_luminance Parameter regulating filter strength. A big value perfectly removes noise but
+also removes image details; a smaller value preserves details but also preserves some noise.
+@param photo_render The same as h_luminance but for color components. For most images a value of 10
+is enough to remove colored noise without distorting colors.
+@param search_window Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: the greater search_window, the greater
+the denoising time. Recommended value is 21 pixels.
+@param block_size Size in pixels of the template patch that is used to compute weights. Should be
+odd. Recommended value is 7 pixels.
+@param stream Stream for the asynchronous invocations.
+
+The function converts the image to CIELAB colorspace and then separately denoises L and AB components
+with the given h parameters using the FastNonLocalMeansDenoising::simpleMethod function.
+
+@sa
+   fastNlMeansDenoisingColored
+ */
+CV_EXPORTS void fastNlMeansDenoisingColored(InputArray src, OutputArray dst,
+                                            float h_luminance, float photo_render,
+                                            int search_window = 21,
+                                            int block_size = 7,
+                                            Stream& stream = Stream::Null());
+
+//! @} photo_denoise
+
+}} // namespace cv { namespace cuda {
+
+#endif /* OPENCV_PHOTO_CUDA_HPP */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/legacy/constants_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/legacy/constants_c.h
new file mode 100644
index 0000000..ec1d440
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/legacy/constants_c.h
@@ -0,0 +1,14 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_PHOTO_LEGACY_CONSTANTS_H
+#define OPENCV_PHOTO_LEGACY_CONSTANTS_H
+
+enum InpaintingModes
+{
+    CV_INPAINT_NS      =0,
+    CV_INPAINT_TELEA   =1
+};
+
+#endif // OPENCV_PHOTO_LEGACY_CONSTANTS_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/photo.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/photo.hpp
new file mode 100644
index 0000000..8af5e9f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/photo/photo.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/photo.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video.hpp
new file mode 100644
index 0000000..a3dde60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video.hpp
@@ -0,0 +1,59 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_VIDEO_HPP
+#define OPENCV_VIDEO_HPP
+
+/**
+  @defgroup video Video Analysis
+  @{
+    @defgroup video_motion Motion Analysis
+    @defgroup video_track Object Tracking
+    @defgroup video_c C API
+  @}
+*/
+
+#include "opencv2/video/tracking.hpp"
+#include "opencv2/video/background_segm.hpp"
+
+#endif //OPENCV_VIDEO_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/background_segm.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/background_segm.hpp
new file mode 100644
index 0000000..e1dfa15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/background_segm.hpp
@@ -0,0 +1,317 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_BACKGROUND_SEGM_HPP
+#define OPENCV_BACKGROUND_SEGM_HPP
+
+#include "opencv2/core.hpp"
+
+namespace cv
+{
+
+//! @addtogroup video_motion
+//! @{
+
+/** @brief Base class for background/foreground segmentation.
+
+The class is only used to define the common interface for the whole family of background/foreground
+segmentation algorithms.
+ */
+class CV_EXPORTS_W BackgroundSubtractor : public Algorithm
+{
+public:
+    /** @brief Computes a foreground mask.
+
+    @param image Next video frame.
+    @param fgmask The output foreground mask as an 8-bit binary image.
+    @param learningRate The value between 0 and 1 that indicates how fast the background model is
+    learned. A negative value makes the algorithm use an automatically chosen learning
+    rate. 0 means that the background model is not updated at all, 1 means that the background model
+    is completely reinitialized from the last frame.
+     */
+    CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) = 0;
+
+    /** @brief Computes a background image.
+
+    @param backgroundImage The output background image.
+
+    @note Sometimes the background image can be very blurry, as it contains the average background
+    statistics.
+     */
+    CV_WRAP virtual void getBackgroundImage(OutputArray backgroundImage) const = 0;
+};
+
+
+/** @brief Gaussian Mixture-based Background/Foreground Segmentation Algorithm.
+
+The class implements the Gaussian mixture model background subtraction described in @cite Zivkovic2004
+and @cite Zivkovic2006 .
+ */
+class CV_EXPORTS_W BackgroundSubtractorMOG2 : public BackgroundSubtractor
+{
+public:
+    /** @brief Returns the number of last frames that affect the background model
+    */
+    CV_WRAP virtual int getHistory() const = 0;
+    /** @brief Sets the number of last frames that affect the background model
+    */
+    CV_WRAP virtual void setHistory(int history) = 0;
+
+    /** @brief Returns the number of gaussian components in the background model
+    */
+    CV_WRAP virtual int getNMixtures() const = 0;
+    /** @brief Sets the number of gaussian components in the background model.
+
+    The model needs to be reinitialized to reserve memory.
+    */
+    CV_WRAP virtual void setNMixtures(int nmixtures) = 0;//needs reinitialization!
+
+    /** @brief Returns the "background ratio" parameter of the algorithm
+
+    If a foreground pixel keeps semi-constant value for about backgroundRatio\*history frames, it's
+    considered background and added to the model as a center of a new component. It corresponds to TB
+    parameter in the paper.
+     */
+    CV_WRAP virtual double getBackgroundRatio() const = 0;
+    /** @brief Sets the "background ratio" parameter of the algorithm
+    */
+    CV_WRAP virtual void setBackgroundRatio(double ratio) = 0;
+
+    /** @brief Returns the variance threshold for the pixel-model match
+
+    The main threshold on the squared Mahalanobis distance to decide if the sample is well described by
+    the background model or not. Related to Cthr from the paper.
+     */
+    CV_WRAP virtual double getVarThreshold() const = 0;
+    /** @brief Sets the variance threshold for the pixel-model match
+    */
+    CV_WRAP virtual void setVarThreshold(double varThreshold) = 0;
+
+    /** @brief Returns the variance threshold for the pixel-model match used for new mixture component generation
+
+    Threshold for the squared Mahalanobis distance that helps decide when a sample is close to the
+    existing components (corresponds to Tg in the paper). If a pixel is not close to any component, it
+    is considered foreground or added as a new component. 3 sigma =\> Tg=3\*3=9 is default. A smaller Tg
+    value generates more components. A higher Tg value may result in a small number of components but
+    they can grow too large.
+     */
+    CV_WRAP virtual double getVarThresholdGen() const = 0;
+    /** @brief Sets the variance threshold for the pixel-model match used for new mixture component generation
+    */
+    CV_WRAP virtual void setVarThresholdGen(double varThresholdGen) = 0;
+
+    /** @brief Returns the initial variance of each gaussian component
+    */
+    CV_WRAP virtual double getVarInit() const = 0;
+    /** @brief Sets the initial variance of each gaussian component
+    */
+    CV_WRAP virtual void setVarInit(double varInit) = 0;
+
+    CV_WRAP virtual double getVarMin() const = 0;
+    CV_WRAP virtual void setVarMin(double varMin) = 0;
+
+    CV_WRAP virtual double getVarMax() const = 0;
+    CV_WRAP virtual void setVarMax(double varMax) = 0;
+
+    /** @brief Returns the complexity reduction threshold
+
+    This parameter defines the number of samples needed to accept to prove the component exists. CT=0.05
+    is a default value for all the samples. By setting CT=0 you get an algorithm very similar to the
+    standard Stauffer&Grimson algorithm.
+     */
+    CV_WRAP virtual double getComplexityReductionThreshold() const = 0;
+    /** @brief Sets the complexity reduction threshold
+    */
+    CV_WRAP virtual void setComplexityReductionThreshold(double ct) = 0;
+
+    /** @brief Returns the shadow detection flag
+
+    If true, the algorithm detects shadows and marks them. See createBackgroundSubtractorMOG2 for
+    details.
+     */
+    CV_WRAP virtual bool getDetectShadows() const = 0;
+    /** @brief Enables or disables shadow detection
+    */
+    CV_WRAP virtual void setDetectShadows(bool detectShadows) = 0;
+
+    /** @brief Returns the shadow value
+
+    Shadow value is the value used to mark shadows in the foreground mask. Default value is 127. Value 0
+    in the mask always means background, 255 means foreground.
+     */
+    CV_WRAP virtual int getShadowValue() const = 0;
+    /** @brief Sets the shadow value
+    */
+    CV_WRAP virtual void setShadowValue(int value) = 0;
+
+    /** @brief Returns the shadow threshold
+
+    A shadow is detected if pixel is a darker version of the background. The shadow threshold (Tau in
+    the paper) is a threshold defining how much darker the shadow can be. Tau= 0.5 means that if a pixel
+    is more than twice darker then it is not shadow. See Prati, Mikic, Trivedi and Cucchiara,
+    *Detecting Moving Shadows...*, IEEE PAMI,2003.
+     */
+    CV_WRAP virtual double getShadowThreshold() const = 0;
+    /** @brief Sets the shadow threshold
+    */
+    CV_WRAP virtual void setShadowThreshold(double threshold) = 0;
+
+    /** @brief Computes a foreground mask.
+
+    @param image Next video frame. Floating point frame will be used without scaling and should be in range \f$[0,255]\f$.
+    @param fgmask The output foreground mask as an 8-bit binary image.
+    @param learningRate The value between 0 and 1 that indicates how fast the background model is
+    learnt. Negative parameter value makes the algorithm to use some automatically chosen learning
+    rate. 0 means that the background model is not updated at all, 1 means that the background model
+    is completely reinitialized from the last frame.
+     */
+    CV_WRAP virtual void apply(InputArray image, OutputArray fgmask, double learningRate=-1) CV_OVERRIDE = 0;
+};
+
+/** @brief Creates MOG2 Background Subtractor
+
+@param history Length of the history.
+@param varThreshold Threshold on the squared Mahalanobis distance between the pixel and the model
+to decide whether a pixel is well described by the background model. This parameter does not
+affect the background update.
+@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
+speed a bit, so if you do not need this feature, set the parameter to false.
+ */
+CV_EXPORTS_W Ptr<BackgroundSubtractorMOG2>
+    createBackgroundSubtractorMOG2(int history=500, double varThreshold=16,
+                                   bool detectShadows=true);
+
+/** @brief K-nearest neighbours - based Background/Foreground Segmentation Algorithm.
+
+The class implements the K-nearest neighbours background subtraction described in @cite Zivkovic2006 .
+Very efficient if number of foreground pixels is low.
+ */
+class CV_EXPORTS_W BackgroundSubtractorKNN : public BackgroundSubtractor
+{
+public:
+    /** @brief Returns the number of last frames that affect the background model
+    */
+    CV_WRAP virtual int getHistory() const = 0;
+    /** @brief Sets the number of last frames that affect the background model
+    */
+    CV_WRAP virtual void setHistory(int history) = 0;
+
+    /** @brief Returns the number of data samples in the background model
+    */
+    CV_WRAP virtual int getNSamples() const = 0;
+    /** @brief Sets the number of data samples in the background model.
+
+    The model needs to be reinitialized to reserve memory.
+    */
+    CV_WRAP virtual void setNSamples(int _nN) = 0;//needs reinitialization!
+
+    /** @brief Returns the threshold on the squared distance between the pixel and the sample
+
+    The threshold on the squared distance between the pixel and the sample to decide whether a pixel is
+    close to a data sample.
+     */
+    CV_WRAP virtual double getDist2Threshold() const = 0;
+    /** @brief Sets the threshold on the squared distance
+    */
+    CV_WRAP virtual void setDist2Threshold(double _dist2Threshold) = 0;
+
+    /** @brief Returns the number of neighbours, the k in the kNN.
+
+    K is the number of samples that need to be within dist2Threshold in order to decide that that
+    pixel is matching the kNN background model.
+     */
+    CV_WRAP virtual int getkNNSamples() const = 0;
+    /** @brief Sets the k in the kNN. How many nearest neighbours need to match.
+    */
+    CV_WRAP virtual void setkNNSamples(int _nkNN) = 0;
+
+    /** @brief Returns the shadow detection flag
+
+    If true, the algorithm detects shadows and marks them. See createBackgroundSubtractorKNN for
+    details.
+     */
+    CV_WRAP virtual bool getDetectShadows() const = 0;
+    /** @brief Enables or disables shadow detection
+    */
+    CV_WRAP virtual void setDetectShadows(bool detectShadows) = 0;
+
+    /** @brief Returns the shadow value
+
+    Shadow value is the value used to mark shadows in the foreground mask. Default value is 127. Value 0
+    in the mask always means background, 255 means foreground.
+     */
+    CV_WRAP virtual int getShadowValue() const = 0;
+    /** @brief Sets the shadow value
+    */
+    CV_WRAP virtual void setShadowValue(int value) = 0;
+
+    /** @brief Returns the shadow threshold
+
+    A shadow is detected if pixel is a darker version of the background. The shadow threshold (Tau in
+    the paper) is a threshold defining how much darker the shadow can be. Tau= 0.5 means that if a pixel
+    is more than twice darker then it is not shadow. See Prati, Mikic, Trivedi and Cucchiara,
+    *Detecting Moving Shadows...*, IEEE PAMI,2003.
+     */
+    CV_WRAP virtual double getShadowThreshold() const = 0;
+    /** @brief Sets the shadow threshold
+     */
+    CV_WRAP virtual void setShadowThreshold(double threshold) = 0;
+};
+
+/** @brief Creates KNN Background Subtractor
+
+@param history Length of the history.
+@param dist2Threshold Threshold on the squared distance between the pixel and the sample to decide
+whether a pixel is close to that sample. This parameter does not affect the background update.
+@param detectShadows If true, the algorithm will detect shadows and mark them. It decreases the
+speed a bit, so if you do not need this feature, set the parameter to false.
+ */
+CV_EXPORTS_W Ptr<BackgroundSubtractorKNN>
+    createBackgroundSubtractorKNN(int history=500, double dist2Threshold=400.0,
+                                   bool detectShadows=true);
+
+//! @} video_motion
+
+} // cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/detail/tracking.detail.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/detail/tracking.detail.hpp
new file mode 100644
index 0000000..1e61079
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/detail/tracking.detail.hpp
@@ -0,0 +1,406 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEO_DETAIL_TRACKING_HPP
+#define OPENCV_VIDEO_DETAIL_TRACKING_HPP
+
+/*
+ * Partially based on:
+ * ====================================================================================================================
+ *  - [AAM] S. Salti, A. Cavallaro, L. Di Stefano, Adaptive Appearance Modeling for Video Tracking: Survey and Evaluation
+ *  - [AMVOT] X. Li, W. Hu, C. Shen, Z. Zhang, A. Dick, A. van den Hengel, A Survey of Appearance Models in Visual Object Tracking
+ *
+ * This Tracking API has been designed with PlantUML. If you modify this API please change UML files under modules/tracking/doc/uml
+ *
+ */
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace detail {
+inline namespace tracking {
+
+/** @addtogroup tracking_detail
+@{
+*/
+
+/************************************ TrackerFeature Base Classes ************************************/
+
+/** @brief Abstract base class that represents a single feature used by a tracker.
+*/
+class CV_EXPORTS TrackerFeature
+{
+public:
+    virtual ~TrackerFeature();
+
+    /** @brief Compute the features in the images collection
+    @param images The input images
+    @param response The output response
+    */
+    void compute(const std::vector<Mat>& images, Mat& response);
+
+protected:
+    virtual bool computeImpl(const std::vector<Mat>& images, Mat& response) = 0;  // pure virtual: must be provided by concrete features
+};
+
+/** @brief Class that manages the extraction and selection of features
+
+@cite AAM Feature Extraction and Feature Set Refinement (Feature Processing and Feature Selection).
+See table I and section III C @cite AMVOT Appearance modelling -\> Visual representation (Table II,
+section 3.1 - 3.2)
+
+TrackerFeatureSet is an aggregation of TrackerFeature
+
+@sa
+   TrackerFeature
+
+*/
+class CV_EXPORTS TrackerFeatureSet
+{
+public:
+    TrackerFeatureSet();
+
+    ~TrackerFeatureSet();
+
+    /** @brief Extract features from the images collection
+    @param images The input images
+    */
+    void extraction(const std::vector<Mat>& images);
+
+    /** @brief Add a TrackerFeature to the collection. Return true if the TrackerFeature is added, false otherwise
+    @param feature The TrackerFeature to add
+    */
+    bool addTrackerFeature(const Ptr<TrackerFeature>& feature);
+
+    /** @brief Get the TrackerFeature collection (a vector of TrackerFeature pointers)
+    */
+    const std::vector<Ptr<TrackerFeature>>& getTrackerFeatures() const;
+
+    /** @brief Get the responses
+    @note Be sure to call TrackerFeatureSet::extraction before TrackerFeatureSet::getResponses
+    */
+    const std::vector<Mat>& getResponses() const;
+
+private:
+    void clearResponses();
+    bool blockAddTrackerFeature;
+
+    std::vector<Ptr<TrackerFeature>> features;  // list of features
+    std::vector<Mat> responses;  // list of responses after compute
+};
+
+/************************************ TrackerSampler Base Classes ************************************/
+
+/** @brief Abstract base class that represents the algorithm used by a specific
+sampler.
+*/
+class CV_EXPORTS TrackerSamplerAlgorithm
+{
+public:
+    virtual ~TrackerSamplerAlgorithm();
+
+    /** @brief Computes the regions starting from a position in an image.
+
+    @return true if samples are computed, false otherwise
+
+    @param image The current frame
+    @param boundingBox The bounding box from which regions can be calculated
+
+    @param[out] sample The computed samples (@cite AAM Fig. 1, variable Sk)
+    */
+    virtual bool sampling(const Mat& image, const Rect& boundingBox, std::vector<Mat>& sample) = 0;
+};
+
+/**
+ * \brief Class that manages the sampler in order to select regions used to update the model of the tracker
+ * [AAM] Sampling and Labeling. See table I and section III B
+ */
+
+/** @brief Class that manages the sampler in order to select regions used to update the model of the tracker
+
+@cite AAM Sampling and Labeling. See table I and section III B
+
+TrackerSampler is an aggregation of TrackerSamplerAlgorithm
+@sa
+   TrackerSamplerAlgorithm
+ */
+class CV_EXPORTS TrackerSampler
+{
+public:
+    TrackerSampler();
+
+    ~TrackerSampler();
+
+    /** @brief Computes the regions starting from a position in an image
+    @param image The current frame
+    @param boundingBox The bounding box from which regions can be calculated
+    */
+    void sampling(const Mat& image, Rect boundingBox);
+
+    /** @brief Return the collection of TrackerSamplerAlgorithm objects
+    */
+    const std::vector<Ptr<TrackerSamplerAlgorithm>>& getSamplers() const;
+
+    /** @brief Return the samples from all TrackerSamplerAlgorithm objects, @cite AAM Fig. 1 variable Sk
+    */
+    const std::vector<Mat>& getSamples() const;
+
+    /** @brief Add a TrackerSamplerAlgorithm to the collection. Return true if the sampler is added, false otherwise
+    @param sampler The TrackerSamplerAlgorithm to add
+    */
+    bool addTrackerSamplerAlgorithm(const Ptr<TrackerSamplerAlgorithm>& sampler);
+
+private:
+    std::vector<Ptr<TrackerSamplerAlgorithm>> samplers;
+    std::vector<Mat> samples;
+    bool blockAddTrackerSampler;
+
+    void clearSamples();
+};
+
+/************************************ TrackerModel Base Classes ************************************/
+
+/** @brief Abstract base class for TrackerTargetState that represents a possible state of the target.
+
+See @cite AAM \f$\hat{x}^{i}_{k}\f$ all the states candidates.
+
+Inherit from this class to define your own target state. In your own implementation you can add
+scale variation, width, height, orientation, etc.
+*/
+class CV_EXPORTS TrackerTargetState
+{
+public:
+    virtual ~TrackerTargetState() {};
+    /** @brief Get the position
+    * @return The position
+    */
+    Point2f getTargetPosition() const;
+
+    /** @brief Set the position
+    * @param position The position
+    */
+    void setTargetPosition(const Point2f& position);
+    /** @brief Get the width of the target
+    * @return The width of the target
+    */
+    int getTargetWidth() const;
+
+    /** @brief Set the width of the target
+    * @param width The width of the target
+    */
+    void setTargetWidth(int width);
+    /** @brief Get the height of the target
+    * @return The height of the target
+    */
+    int getTargetHeight() const;
+
+    /** @brief Set the height of the target
+    * @param height The height of the target
+    */
+    void setTargetHeight(int height);
+
+protected:
+    Point2f targetPosition;
+    int targetWidth;
+    int targetHeight;
+};
+
+/** @brief Represents the model of the target at frame \f$k\f$ (all states and scores)
+
+See @cite AAM The set of the pairs \f$\langle \hat{x}^{i}_{k}, C^{i}_{k} \rangle\f$
+@sa TrackerTargetState
+*/
+typedef std::vector<std::pair<Ptr<TrackerTargetState>, float>> ConfidenceMap;
+
+/** @brief Represents the estimated states for all frames
+
+@cite AAM \f$x_{k}\f$ is the trajectory of the target up to time \f$k\f$
+
+@sa TrackerTargetState
+*/
+typedef std::vector<Ptr<TrackerTargetState>> Trajectory;
+
+/** @brief Abstract base class for TrackerStateEstimator that estimates the most likely target state.
+
+See @cite AAM State estimator
+
+See @cite AMVOT Statistical modeling (Fig. 3), Table III (generative) - IV (discriminative) - V (hybrid)
+*/
+class CV_EXPORTS TrackerStateEstimator
+{
+public:
+    virtual ~TrackerStateEstimator();
+
+    /** @brief Estimate the most likely target state, return the estimated state
+    @param confidenceMaps The overall appearance model as a list of ConfidenceMap
+    */
+    Ptr<TrackerTargetState> estimate(const std::vector<ConfidenceMap>& confidenceMaps);
+
+    /** @brief Update the ConfidenceMap with the scores
+    @param confidenceMaps The overall appearance model as a list of ConfidenceMap
+    */
+    void update(std::vector<ConfidenceMap>& confidenceMaps);
+
+    /** @brief Create TrackerStateEstimator by tracker state estimator type
+    @param trackeStateEstimatorType The TrackerStateEstimator name
+
+    The modes available now:
+
+    -   "BOOSTING" -- Boosting-based discriminative appearance models. See @cite AMVOT section 4.4
+
+    The modes available soon:
+
+    -   "SVM" -- SVM-based discriminative appearance models. See @cite AMVOT section 4.5
+    */
+    static Ptr<TrackerStateEstimator> create(const String& trackeStateEstimatorType);
+
+    /** @brief Get the name of the specific TrackerStateEstimator
+    */
+    String getClassName() const;
+
+protected:
+    virtual Ptr<TrackerTargetState> estimateImpl(const std::vector<ConfidenceMap>& confidenceMaps) = 0;
+    virtual void updateImpl(std::vector<ConfidenceMap>& confidenceMaps) = 0;
+    String className;
+};
+
+/** @brief Abstract class that represents the model of the target.
+
+It must be instantiated by a specialized tracker
+
+See @cite AAM Ak
+
+Inherit from this class to define your own TrackerModel
+*/
+class CV_EXPORTS TrackerModel
+{
+public:
+    TrackerModel();
+
+    virtual ~TrackerModel();
+
+    /** @brief Set the TrackerStateEstimator, return true if the tracker state estimator is added, false otherwise
+    @param trackerStateEstimator The TrackerStateEstimator
+    @note You can add only one TrackerStateEstimator
+    */
+    bool setTrackerStateEstimator(Ptr<TrackerStateEstimator> trackerStateEstimator);
+
+    /** @brief Estimate the most likely target location
+
+    @cite AAM ME, Model Estimation table I
+    @param responses Features extracted from TrackerFeatureSet
+    */
+    void modelEstimation(const std::vector<Mat>& responses);
+
+    /** @brief Update the model
+
+    @cite AAM MU, Model Update table I
+    */
+    void modelUpdate();
+
+    /** @brief Run the TrackerStateEstimator, return true if it is possible to estimate a new state, false otherwise
+    */
+    bool runStateEstimator();
+
+    /** @brief Set the current TrackerTargetState in the Trajectory
+    @param lastTargetState The current TrackerTargetState
+    */
+    void setLastTargetState(const Ptr<TrackerTargetState>& lastTargetState);
+
+    /** @brief Get the last TrackerTargetState from the Trajectory
+    */
+    Ptr<TrackerTargetState> getLastTargetState() const;
+
+    /** @brief Get the list of ConfidenceMap objects
+    */
+    const std::vector<ConfidenceMap>& getConfidenceMaps() const;
+
+    /** @brief Get the last ConfidenceMap for the current frame
+    */
+    const ConfidenceMap& getLastConfidenceMap() const;
+
+    /** @brief Get the TrackerStateEstimator
+    */
+    Ptr<TrackerStateEstimator> getTrackerStateEstimator() const;
+
+private:
+    void clearCurrentConfidenceMap();
+
+protected:
+    std::vector<ConfidenceMap> confidenceMaps;
+    Ptr<TrackerStateEstimator> stateEstimator;
+    ConfidenceMap currentConfidenceMap;
+    Trajectory trajectory;
+    int maxCMLength;
+
+    virtual void modelEstimationImpl(const std::vector<Mat>& responses) = 0;
+    virtual void modelUpdateImpl() = 0;
+};
+
+/************************************ Specific TrackerStateEstimator Classes ************************************/
+
+// None
+
+/************************************ Specific TrackerSamplerAlgorithm Classes ************************************/
+
+/** @brief TrackerSamplerAlgorithm based on CSC (current state centered), used by MIL algorithm TrackerMIL
+ */
+class CV_EXPORTS TrackerSamplerCSC : public TrackerSamplerAlgorithm
+{
+public:
+    ~TrackerSamplerCSC();
+
+    enum MODE
+    {
+        MODE_INIT_POS = 1,  //!< mode for init positive samples
+        MODE_INIT_NEG = 2,  //!< mode for init negative samples
+        MODE_TRACK_POS = 3,  //!< mode for update positive samples
+        MODE_TRACK_NEG = 4,  //!< mode for update negative samples
+        MODE_DETECT = 5  //!< mode for detect samples
+    };
+
+    struct CV_EXPORTS Params
+    {
+        Params();
+        float initInRad;  //!< radius for gathering positive instances during init
+        float trackInPosRad;  //!< radius for gathering positive instances during tracking
+        float searchWinSize;  //!< size of search window
+        int initMaxNegNum;  //!< # negative samples to use during init
+        int trackMaxPosNum;  //!< # positive samples to use during training
+        int trackMaxNegNum;  //!< # negative samples to use during training
+    };
+
+    /** @brief Constructor
+    @param parameters TrackerSamplerCSC parameters (see TrackerSamplerCSC::Params)
+    */
+    TrackerSamplerCSC(const TrackerSamplerCSC::Params& parameters = TrackerSamplerCSC::Params());
+
+    /** @brief Set the sampling mode of TrackerSamplerCSC
+    @param samplingMode The sampling mode, one of the TrackerSamplerCSC::MODE values
+
+    The modes are:
+
+    -   MODE_INIT_POS = 1 -- for the positive sampling in initialization step
+    -   MODE_INIT_NEG = 2 -- for the negative sampling in initialization step
+    -   MODE_TRACK_POS = 3 -- for the positive sampling in update step
+    -   MODE_TRACK_NEG = 4 -- for the negative sampling in update step
+    -   MODE_DETECT = 5 -- for the sampling in detection step
+    */
+    void setMode(int samplingMode);
+
+    bool sampling(const Mat& image, const Rect& boundingBox, std::vector<Mat>& sample) CV_OVERRIDE;
+
+private:
+    Params params;
+    int mode;
+    RNG rng;
+
+    std::vector<Mat> sampleImage(const Mat& img, int x, int y, int w, int h, float inrad, float outrad = 0, int maxnum = 1000000);
+};
+
+//! @}
+
+}}}  // namespace cv::detail::tracking
+
+#endif  // OPENCV_VIDEO_DETAIL_TRACKING_HPP
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/legacy/constants_c.h b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/legacy/constants_c.h
new file mode 100644
index 0000000..1a98f52
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/legacy/constants_c.h
@@ -0,0 +1,16 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_VIDEO_LEGACY_CONSTANTS_H
+#define OPENCV_VIDEO_LEGACY_CONSTANTS_H
+
+enum
+{
+    CV_LKFLOW_PYR_A_READY = 1,
+    CV_LKFLOW_PYR_B_READY = 2,
+    CV_LKFLOW_INITIAL_GUESSES = 4,
+    CV_LKFLOW_GET_MIN_EIGENVALS = 8
+};
+
+#endif // OPENCV_VIDEO_LEGACY_CONSTANTS_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/tracking.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/tracking.hpp
new file mode 100644
index 0000000..7ec6bc5
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/tracking.hpp
@@ -0,0 +1,857 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_TRACKING_HPP
+#define OPENCV_TRACKING_HPP
+
+#include "opencv2/core.hpp"
+#include "opencv2/imgproc.hpp"
+
+namespace cv
+{
+
+//! @addtogroup video_track
+//! @{
+
+enum { OPTFLOW_USE_INITIAL_FLOW     = 4,
+       OPTFLOW_LK_GET_MIN_EIGENVALS = 8,
+       OPTFLOW_FARNEBACK_GAUSSIAN   = 256
+     };
+
+/** @brief Finds an object center, size, and orientation.
+
+@param probImage Back projection of the object histogram. See calcBackProject.
+@param window Initial search window.
+@param criteria Stop criteria for the underlying meanShift.
+returns
+(in old interfaces) Number of iterations CAMSHIFT took to converge
+The function implements the CAMSHIFT object tracking algorithm @cite Bradski98 . First, it finds an
+object center using meanShift and then adjusts the window size and finds the optimal rotation. The
+function returns the rotated rectangle structure that includes the object position, size, and
+orientation. The next position of the search window can be obtained with RotatedRect::boundingRect()
+
+See the OpenCV sample camshiftdemo.c that tracks colored objects.
+
+@note
+-   (Python) A sample explaining the camshift tracking algorithm can be found at
+    opencv_source_code/samples/python/camshift.py
+ */
+CV_EXPORTS_W RotatedRect CamShift( InputArray probImage, CV_IN_OUT Rect& window,
+                                   TermCriteria criteria );
+/** @example samples/cpp/camshiftdemo.cpp
+An example using the mean-shift tracking algorithm
+*/
+
+/** @brief Finds an object on a back projection image.
+
+@param probImage Back projection of the object histogram. See calcBackProject for details.
+@param window Initial search window.
+@param criteria Stop criteria for the iterative search algorithm.
+returns
+:   Number of iterations meanShift took to converge.
+The function implements the iterative object search algorithm. It takes the input back projection of
+an object and the initial position. The mass center in window of the back projection image is
+computed and the search window center shifts to the mass center. The procedure is repeated until the
+specified number of iterations criteria.maxCount is done or until the window center shifts by less
+than criteria.epsilon. The algorithm is used inside CamShift and, unlike CamShift , the search
+window size or orientation do not change during the search. You can simply pass the output of
+calcBackProject to this function. But better results can be obtained if you pre-filter the back
+projection and remove the noise. For example, you can do this by retrieving connected components
+with findContours , throwing away contours with small area ( contourArea ), and rendering the
+remaining contours with drawContours.
+
+ */
+CV_EXPORTS_W int meanShift( InputArray probImage, CV_IN_OUT Rect& window, TermCriteria criteria );
+
+/** @brief Constructs the image pyramid which can be passed to calcOpticalFlowPyrLK.
+
+@param img 8-bit input image.
+@param pyramid output pyramid.
+@param winSize window size of optical flow algorithm. Must be not less than winSize argument of
+calcOpticalFlowPyrLK. It is needed to calculate required padding for pyramid levels.
+@param maxLevel 0-based maximal pyramid level number.
+@param withDerivatives set to precompute gradients for the every pyramid level. If pyramid is
+constructed without the gradients then calcOpticalFlowPyrLK will calculate them internally.
+@param pyrBorder the border mode for pyramid layers.
+@param derivBorder the border mode for gradients.
+@param tryReuseInputImage put ROI of input image into the pyramid if possible. You can pass false
+to force data copying.
+@return number of levels in constructed pyramid. Can be less than maxLevel.
+ */
+CV_EXPORTS_W int buildOpticalFlowPyramid( InputArray img, OutputArrayOfArrays pyramid,
+                                          Size winSize, int maxLevel, bool withDerivatives = true,
+                                          int pyrBorder = BORDER_REFLECT_101,
+                                          int derivBorder = BORDER_CONSTANT,
+                                          bool tryReuseInputImage = true );
+
+/** @example samples/cpp/lkdemo.cpp
+An example using the Lucas-Kanade optical flow algorithm
+*/
+
+/** @brief Calculates an optical flow for a sparse feature set using the iterative Lucas-Kanade method with
+pyramids.
+
+@param prevImg first 8-bit input image or pyramid constructed by buildOpticalFlowPyramid.
+@param nextImg second input image or pyramid of the same size and the same type as prevImg.
+@param prevPts vector of 2D points for which the flow needs to be found; point coordinates must be
+single-precision floating-point numbers.
+@param nextPts output vector of 2D points (with single-precision floating-point coordinates)
+containing the calculated new positions of input features in the second image; when
+OPTFLOW_USE_INITIAL_FLOW flag is passed, the vector must have the same size as in the input.
+@param status output status vector (of unsigned chars); each element of the vector is set to 1 if
+the flow for the corresponding features has been found, otherwise, it is set to 0.
+@param err output vector of errors; each element of the vector is set to an error for the
+corresponding feature, type of the error measure can be set in flags parameter; if the flow wasn't
+found then the error is not defined (use the status parameter to find such cases).
+@param winSize size of the search window at each pyramid level.
+@param maxLevel 0-based maximal pyramid level number; if set to 0, pyramids are not used (single
+level), if set to 1, two levels are used, and so on; if pyramids are passed to input then
+algorithm will use as many levels as pyramids have but no more than maxLevel.
+@param criteria parameter, specifying the termination criteria of the iterative search algorithm
+(after the specified maximum number of iterations criteria.maxCount or when the search window
+moves by less than criteria.epsilon).
+@param flags operation flags:
+ -   **OPTFLOW_USE_INITIAL_FLOW** uses initial estimations, stored in nextPts; if the flag is
+     not set, then prevPts is copied to nextPts and is considered the initial estimate.
+ -   **OPTFLOW_LK_GET_MIN_EIGENVALS** use minimum eigen values as an error measure (see
+     minEigThreshold description); if the flag is not set, then L1 distance between patches
+     around the original and a moved point, divided by number of pixels in a window, is used as an
+     error measure.
+@param minEigThreshold the algorithm calculates the minimum eigen value of a 2x2 normal matrix of
+optical flow equations (this matrix is called a spatial gradient matrix in @cite Bouguet00), divided
+by number of pixels in a window; if this value is less than minEigThreshold, then a corresponding
+feature is filtered out and its flow is not processed, so it allows to remove bad points and get a
+performance boost.
+
+The function implements a sparse iterative version of the Lucas-Kanade optical flow in pyramids. See
+@cite Bouguet00 . The function is parallelized with the TBB library.
+
+@note
+
+-   An example using the Lucas-Kanade optical flow algorithm can be found at
+    opencv_source_code/samples/cpp/lkdemo.cpp
+-   (Python) An example using the Lucas-Kanade optical flow algorithm can be found at
+    opencv_source_code/samples/python/lk_track.py
+-   (Python) An example using the Lucas-Kanade tracker for homography matching can be found at
+    opencv_source_code/samples/python/lk_homography.py
+ */
+CV_EXPORTS_W void calcOpticalFlowPyrLK( InputArray prevImg, InputArray nextImg,
+                                        InputArray prevPts, InputOutputArray nextPts,
+                                        OutputArray status, OutputArray err,
+                                        Size winSize = Size(21,21), int maxLevel = 3,
+                                        TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
+                                        int flags = 0, double minEigThreshold = 1e-4 );
+
+/** @brief Computes a dense optical flow using the Gunnar Farneback's algorithm.
+
+@param prev first 8-bit single-channel input image.
+@param next second input image of the same size and the same type as prev.
+@param flow computed flow image that has the same size as prev and type CV_32FC2.
+@param pyr_scale parameter, specifying the image scale (\<1) to build pyramids for each image;
+pyr_scale=0.5 means a classical pyramid, where each next layer is twice smaller than the previous
+one.
+@param levels number of pyramid layers including the initial image; levels=1 means that no extra
+layers are created and only the original images are used.
+@param winsize averaging window size; larger values increase the algorithm robustness to image
+noise and give more chances for fast motion detection, but yield more blurred motion field.
+@param iterations number of iterations the algorithm does at each pyramid level.
+@param poly_n size of the pixel neighborhood used to find polynomial expansion in each pixel;
+larger values mean that the image will be approximated with smoother surfaces, yielding more
+robust algorithm and more blurred motion field, typically poly_n =5 or 7.
+@param poly_sigma standard deviation of the Gaussian that is used to smooth derivatives used as a
+basis for the polynomial expansion; for poly_n=5, you can set poly_sigma=1.1, for poly_n=7, a
+good value would be poly_sigma=1.5.
+@param flags operation flags that can be a combination of the following:
+ -   **OPTFLOW_USE_INITIAL_FLOW** uses the input flow as an initial flow approximation.
+ -   **OPTFLOW_FARNEBACK_GAUSSIAN** uses the Gaussian \f$\texttt{winsize}\times\texttt{winsize}\f$
+     filter instead of a box filter of the same size for optical flow estimation; usually, this
+     option gives a more accurate flow than with a box filter, at the cost of lower speed;
+     normally, winsize for a Gaussian window should be set to a larger value to achieve the same
+     level of robustness.
+
+The function finds an optical flow for each prev pixel using the @cite Farneback2003 algorithm so that
+
+\f[\texttt{prev} (y,x)  \sim \texttt{next} ( y + \texttt{flow} (y,x)[1],  x + \texttt{flow} (y,x)[0])\f]
+
+@note
+
+-   An example using the optical flow algorithm described by Gunnar Farneback can be found at
+    opencv_source_code/samples/cpp/fback.cpp
+-   (Python) An example using the optical flow algorithm described by Gunnar Farneback can be
+    found at opencv_source_code/samples/python/opt_flow.py
+ */
+CV_EXPORTS_W void calcOpticalFlowFarneback( InputArray prev, InputArray next, InputOutputArray flow,
+                                            double pyr_scale, int levels, int winsize,
+                                            int iterations, int poly_n, double poly_sigma,
+                                            int flags );
+
+/** @brief Computes an optimal affine transformation between two 2D point sets.
+
+@param src First input 2D point set stored in std::vector or Mat, or an image stored in Mat.
+@param dst Second input 2D point set of the same size and the same type as src, or another image.
+@param fullAffine If true, the function finds an optimal affine transformation with no additional
+restrictions (6 degrees of freedom). Otherwise, the class of transformations to choose from is
+limited to combinations of translation, rotation, and uniform scaling (4 degrees of freedom).
+
+The function finds an optimal affine transform *[A|b]* (a 2 x 3 floating-point matrix) that
+approximates best the affine transformation between:
+
+*   Two point sets
+*   Two raster images. In this case, the function first finds some features in the src image and
+    finds the corresponding features in dst image. After that, the problem is reduced to the first
+    case.
+In case of point sets, the problem is formulated as follows: you need to find a 2x2 matrix *A* and
+2x1 vector *b* so that:
+
+\f[[A^*|b^*] = arg  \min _{[A|b]}  \sum _i  \| \texttt{dst}[i] - A { \texttt{src}[i]}^T - b  \| ^2\f]
+where src[i] and dst[i] are the i-th points in src and dst, respectively
+\f$[A|b]\f$ can be either arbitrary (when fullAffine=true ) or have a form of
+\f[\begin{bmatrix} a_{11} & a_{12} & b_1  \\ -a_{12} & a_{11} & b_2  \end{bmatrix}\f]
+when fullAffine=false.
+
+@deprecated Use cv::estimateAffine2D, cv::estimateAffinePartial2D instead. If you are using this function
+with images, extract points using cv::calcOpticalFlowPyrLK and then use the estimation functions.
+
+@sa
+estimateAffine2D, estimateAffinePartial2D, getAffineTransform, getPerspectiveTransform, findHomography
+ */
+CV_DEPRECATED CV_EXPORTS Mat estimateRigidTransform( InputArray src, InputArray dst, bool fullAffine );
+
+enum
+{
+    MOTION_TRANSLATION = 0,
+    MOTION_EUCLIDEAN   = 1,
+    MOTION_AFFINE      = 2,
+    MOTION_HOMOGRAPHY  = 3
+};
+
+/** @brief Computes the Enhanced Correlation Coefficient value between two images @cite EP08 .
+
+@param templateImage single-channel template image; CV_8U or CV_32F array.
+@param inputImage single-channel input image to be warped to provide an image similar to
+ templateImage, same type as templateImage.
+@param inputMask An optional mask to indicate valid values of inputImage.
+
+@sa
+findTransformECC
+ */
+
+CV_EXPORTS_W double computeECC(InputArray templateImage, InputArray inputImage, InputArray inputMask = noArray());
+
+/** @example samples/cpp/image_alignment.cpp
+An example using the image alignment ECC algorithm
+*/
+
+/** @brief Finds the geometric transform (warp) between two images in terms of the ECC criterion @cite EP08 .
+
+@param templateImage single-channel template image; CV_8U or CV_32F array.
+@param inputImage single-channel input image which should be warped with the final warpMatrix in
+order to provide an image similar to templateImage, same type as templateImage.
+@param warpMatrix floating-point \f$2\times 3\f$ or \f$3\times 3\f$ mapping matrix (warp).
+@param motionType parameter, specifying the type of motion:
+ -   **MOTION_TRANSLATION** sets a translational motion model; warpMatrix is \f$2\times 3\f$ with
+     the first \f$2\times 2\f$ part being the unity matrix and the rest two parameters being
+     estimated.
+ -   **MOTION_EUCLIDEAN** sets a Euclidean (rigid) transformation as motion model; three
+     parameters are estimated; warpMatrix is \f$2\times 3\f$.
+ -   **MOTION_AFFINE** sets an affine motion model (DEFAULT); six parameters are estimated;
+     warpMatrix is \f$2\times 3\f$.
+ -   **MOTION_HOMOGRAPHY** sets a homography as a motion model; eight parameters are
+     estimated; warpMatrix is \f$3\times 3\f$.
+@param criteria parameter, specifying the termination criteria of the ECC algorithm;
+criteria.epsilon defines the threshold of the increment in the correlation coefficient between two
+iterations (a negative criteria.epsilon makes criteria.maxCount the only termination criterion).
+Default values are shown in the declaration above.
+@param inputMask An optional mask to indicate valid values of inputImage.
+@param gaussFiltSize An optional value indicating size of gaussian blur filter; (DEFAULT: 5)
+
+The function estimates the optimum transformation (warpMatrix) with respect to ECC criterion
+(@cite EP08), that is
+
+\f[\texttt{warpMatrix} = \arg\max_{W} \texttt{ECC}(\texttt{templateImage}(x,y),\texttt{inputImage}(x',y'))\f]
+
+where
+
+\f[\begin{bmatrix} x' \\ y' \end{bmatrix} = W \cdot \begin{bmatrix} x \\ y \\ 1 \end{bmatrix}\f]
+
+(the equation holds with homogeneous coordinates for homography). It returns the final enhanced
+correlation coefficient, that is the correlation coefficient between the template image and the
+final warped input image. When a \f$3\times 3\f$ matrix is given with motionType = 0, 1 or 2, the third
+row is ignored.
+
+Unlike findHomography and estimateRigidTransform, the function findTransformECC implements an
+area-based alignment that builds on intensity similarities. In essence, the function updates the
+initial transformation that roughly aligns the images. If this information is missing, the identity
+warp (unity matrix) is used as an initialization. Note that if images undergo strong
+displacements/rotations, an initial transformation that roughly aligns the images is necessary
+(e.g., a simple euclidean/similarity transform that allows for the images showing the same image
+content approximately). Use inverse warping in the second image to take an image close to the first
+one, i.e. use the flag WARP_INVERSE_MAP with warpAffine or warpPerspective. See also the OpenCV
+sample image_alignment.cpp that demonstrates the use of the function. Note that the function throws
+an exception if the algorithm does not converge.
+
+@sa
+computeECC, estimateAffine2D, estimateAffinePartial2D, findHomography
+ */
+CV_EXPORTS_W double findTransformECC( InputArray templateImage, InputArray inputImage,
+                                      InputOutputArray warpMatrix, int motionType,
+                                      TermCriteria criteria,
+                                      InputArray inputMask, int gaussFiltSize);
+
+/** @overload */
+CV_EXPORTS_W
+double findTransformECC(InputArray templateImage, InputArray inputImage,
+    InputOutputArray warpMatrix, int motionType = MOTION_AFFINE,
+    TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 50, 0.001),
+    InputArray inputMask = noArray());
+
+/** @example samples/cpp/kalman.cpp
+An example using the standard Kalman filter
+*/
+
+/** @brief Kalman filter class.
+
+The class implements a standard Kalman filter <http://en.wikipedia.org/wiki/Kalman_filter>,
+@cite Welch95 . However, you can modify transitionMatrix, controlMatrix, and measurementMatrix to get
+an extended Kalman filter functionality.
+@note In C API when CvKalman\* kalmanFilter structure is not needed anymore, it should be released
+with cvReleaseKalman(&kalmanFilter)
+ */
+class CV_EXPORTS_W KalmanFilter
+{
+public:
+    CV_WRAP KalmanFilter();
+    /** @overload
+    @param dynamParams Dimensionality of the state.
+    @param measureParams Dimensionality of the measurement.
+    @param controlParams Dimensionality of the control vector.
+    @param type Type of the created matrices that should be CV_32F or CV_64F.
+    */
+    CV_WRAP KalmanFilter( int dynamParams, int measureParams, int controlParams = 0, int type = CV_32F );
+
+    /** @brief Re-initializes Kalman filter. The previous content is destroyed.
+
+    @param dynamParams Dimensionality of the state.
+    @param measureParams Dimensionality of the measurement.
+    @param controlParams Dimensionality of the control vector.
+    @param type Type of the created matrices that should be CV_32F or CV_64F.
+     */
+    void init( int dynamParams, int measureParams, int controlParams = 0, int type = CV_32F );
+
+    /** @brief Computes a predicted state.
+
+    @param control The optional input control
+     */
+    CV_WRAP const Mat& predict( const Mat& control = Mat() );
+
+    /** @brief Updates the predicted state from the measurement.
+
+    @param measurement The measured system parameters
+     */
+    CV_WRAP const Mat& correct( const Mat& measurement );
+
+    CV_PROP_RW Mat statePre;           //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
+    CV_PROP_RW Mat statePost;          //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
+    CV_PROP_RW Mat transitionMatrix;   //!< state transition matrix (A)
+    CV_PROP_RW Mat controlMatrix;      //!< control matrix (B) (not used if there is no control)
+    CV_PROP_RW Mat measurementMatrix;  //!< measurement matrix (H)
+    CV_PROP_RW Mat processNoiseCov;    //!< process noise covariance matrix (Q)
+    CV_PROP_RW Mat measurementNoiseCov;//!< measurement noise covariance matrix (R)
+    CV_PROP_RW Mat errorCovPre;        //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q
+    CV_PROP_RW Mat gain;               //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R)
+    CV_PROP_RW Mat errorCovPost;       //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k)
+
+    // temporary matrices
+    Mat temp1;
+    Mat temp2;
+    Mat temp3;
+    Mat temp4;
+    Mat temp5;
+};
+
+
+/** @brief Read a .flo file
+
+ @param path Path to the file to be loaded
+
+ The function readOpticalFlow loads a flow field from a file and returns it as a single matrix.
+ Resulting Mat has a type CV_32FC2 - floating-point, 2-channel. First channel corresponds to the
+ flow in the horizontal direction (u), second - vertical (v).
+ */
+CV_EXPORTS_W Mat readOpticalFlow( const String& path );
+/** @brief Write a .flo file to disk
+
+ @param path Path to the file to be written
+ @param flow Flow field to be stored
+
+ The function stores a flow field in a file, returns true on success, false otherwise.
+ The flow field must be a 2-channel, floating-point matrix (CV_32FC2). First channel corresponds
+ to the flow in the horizontal direction (u), second - vertical (v).
+ */
+CV_EXPORTS_W bool writeOpticalFlow( const String& path, InputArray flow );
+
+/**
+   Base class for dense optical flow algorithms
+*/
+class CV_EXPORTS_W DenseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates an optical flow.
+
+    @param I0 first 8-bit single-channel input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+     */
+    CV_WRAP virtual void calc( InputArray I0, InputArray I1, InputOutputArray flow ) = 0;
+    /** @brief Releases all inner buffers.
+    */
+    CV_WRAP virtual void collectGarbage() = 0;
+};
+
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS_W SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.
+
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Input/output vector (declared as InputOutputArray) of 2D points; on output it contains the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+     */
+    CV_WRAP virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray()) = 0;
+};
+
+
+/** @brief Class computing a dense optical flow using Gunnar Farneback's algorithm.
+ */
+class CV_EXPORTS_W FarnebackOpticalFlow : public DenseOpticalFlow
+{
+public:
+    CV_WRAP virtual int getNumLevels() const = 0;
+    CV_WRAP virtual void setNumLevels(int numLevels) = 0;
+
+    CV_WRAP virtual double getPyrScale() const = 0;
+    CV_WRAP virtual void setPyrScale(double pyrScale) = 0;
+
+    CV_WRAP virtual bool getFastPyramids() const = 0;
+    CV_WRAP virtual void setFastPyramids(bool fastPyramids) = 0;
+
+    CV_WRAP virtual int getWinSize() const = 0;
+    CV_WRAP virtual void setWinSize(int winSize) = 0;
+
+    CV_WRAP virtual int getNumIters() const = 0;
+    CV_WRAP virtual void setNumIters(int numIters) = 0;
+
+    CV_WRAP virtual int getPolyN() const = 0;
+    CV_WRAP virtual void setPolyN(int polyN) = 0;
+
+    CV_WRAP virtual double getPolySigma() const = 0;
+    CV_WRAP virtual void setPolySigma(double polySigma) = 0;
+
+    CV_WRAP virtual int getFlags() const = 0;
+    CV_WRAP virtual void setFlags(int flags) = 0;
+
+    CV_WRAP static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+/** @brief Variational optical flow refinement.
+
+This class implements variational refinement of the input flow field, i.e.
+it uses input flow to initialize the minimization of the following functional:
+\f$E(U) = \int_{\Omega} \delta \Psi(E_I) + \gamma \Psi(E_G) + \alpha \Psi(E_S) \f$,
+where \f$E_I,E_G,E_S\f$ are color constancy, gradient constancy and smoothness terms
+respectively. \f$\Psi(s^2)=\sqrt{s^2+\epsilon^2}\f$ is a robust penalizer to limit the
+influence of outliers. A complete formulation and a description of the minimization
+procedure can be found in @cite Brox2004 .
+*/
+class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow
+{
+public:
+    /** @brief @ref calc function overload to handle separate horizontal (u) and vertical (v) flow components
+    (to avoid extra splits/merges) */
+    CV_WRAP virtual void calcUV(InputArray I0, InputArray I1, InputOutputArray flow_u, InputOutputArray flow_v) = 0;
+
+    /** @brief Number of outer (fixed-point) iterations in the minimization procedure.
+    @see setFixedPointIterations */
+    CV_WRAP virtual int getFixedPointIterations() const = 0;
+    /** @copybrief getFixedPointIterations @see getFixedPointIterations */
+    CV_WRAP virtual void setFixedPointIterations(int val) = 0;
+
+    /** @brief Number of inner successive over-relaxation (SOR) iterations
+        in the minimization procedure to solve the respective linear system.
+    @see setSorIterations */
+    CV_WRAP virtual int getSorIterations() const = 0;
+    /** @copybrief getSorIterations @see getSorIterations */
+    CV_WRAP virtual void setSorIterations(int val) = 0;
+
+    /** @brief Relaxation factor in SOR
+    @see setOmega */
+    CV_WRAP virtual float getOmega() const = 0;
+    /** @copybrief getOmega @see getOmega */
+    CV_WRAP virtual void setOmega(float val) = 0;
+
+    /** @brief Weight of the smoothness term
+    @see setAlpha */
+    CV_WRAP virtual float getAlpha() const = 0;
+    /** @copybrief getAlpha @see getAlpha */
+    CV_WRAP virtual void setAlpha(float val) = 0;
+
+    /** @brief Weight of the color constancy term
+    @see setDelta */
+    CV_WRAP virtual float getDelta() const = 0;
+    /** @copybrief getDelta @see getDelta */
+    CV_WRAP virtual void setDelta(float val) = 0;
+
+    /** @brief Weight of the gradient constancy term
+    @see setGamma */
+    CV_WRAP virtual float getGamma() const = 0;
+    /** @copybrief getGamma @see getGamma */
+    CV_WRAP virtual void setGamma(float val) = 0;
+
+    /** @brief Creates an instance of VariationalRefinement
+    */
+    CV_WRAP static Ptr<VariationalRefinement> create();
+};
+
+/** @brief DIS optical flow algorithm.
+
+This class implements the Dense Inverse Search (DIS) optical flow algorithm. More
+details about the algorithm can be found at @cite Kroeger2016 . Includes three presets with preselected
+parameters to provide reasonable trade-off between speed and quality. However, even the slowest preset is
+still relatively fast, use DeepFlow if you need better quality and don't care about speed.
+
+This implementation includes several additional features compared to the algorithm described in the paper,
+including spatial propagation of flow vectors (@ref getUseSpatialPropagation), as well as an option to
+utilize an initial flow approximation passed to @ref calc (which is, essentially, temporal propagation,
+if the previous frame's flow field is passed).
+*/
+class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow
+{
+public:
+    enum
+    {
+        PRESET_ULTRAFAST = 0,
+        PRESET_FAST = 1,
+        PRESET_MEDIUM = 2
+    };
+
+    /** @brief Finest level of the Gaussian pyramid on which the flow is computed (zero level
+        corresponds to the original image resolution). The final flow is obtained by bilinear upscaling.
+        @see setFinestScale */
+    CV_WRAP virtual int getFinestScale() const = 0;
+    /** @copybrief getFinestScale @see getFinestScale */
+    CV_WRAP virtual void setFinestScale(int val) = 0;
+
+    /** @brief Size of an image patch for matching (in pixels). Normally, default 8x8 patches work well
+        enough in most cases.
+        @see setPatchSize */
+    CV_WRAP virtual int getPatchSize() const = 0;
+    /** @copybrief getPatchSize @see getPatchSize */
+    CV_WRAP virtual void setPatchSize(int val) = 0;
+
+    /** @brief Stride between neighbor patches. Must be less than patch size. Lower values correspond
+        to higher flow quality.
+        @see setPatchStride */
+    CV_WRAP virtual int getPatchStride() const = 0;
+    /** @copybrief getPatchStride @see getPatchStride */
+    CV_WRAP virtual void setPatchStride(int val) = 0;
+
+    /** @brief Maximum number of gradient descent iterations in the patch inverse search stage. Higher values
+        may improve quality in some cases.
+        @see setGradientDescentIterations */
+    CV_WRAP virtual int getGradientDescentIterations() const = 0;
+    /** @copybrief getGradientDescentIterations @see getGradientDescentIterations */
+    CV_WRAP virtual void setGradientDescentIterations(int val) = 0;
+
+    /** @brief Number of fixed point iterations of variational refinement per scale. Set to zero to
+        disable variational refinement completely. Higher values will typically result in more smooth and
+        high-quality flow.
+    @see setVariationalRefinementIterations */
+    CV_WRAP virtual int getVariationalRefinementIterations() const = 0;
+    /** @copybrief getVariationalRefinementIterations @see getVariationalRefinementIterations */
+    CV_WRAP virtual void setVariationalRefinementIterations(int val) = 0;
+
+    /** @brief Weight of the smoothness term
+    @see setVariationalRefinementAlpha */
+    CV_WRAP virtual float getVariationalRefinementAlpha() const = 0;
+    /** @copybrief getVariationalRefinementAlpha @see getVariationalRefinementAlpha */
+    CV_WRAP virtual void setVariationalRefinementAlpha(float val) = 0;
+
+    /** @brief Weight of the color constancy term
+    @see setVariationalRefinementDelta */
+    CV_WRAP virtual float getVariationalRefinementDelta() const = 0;
+    /** @copybrief getVariationalRefinementDelta @see getVariationalRefinementDelta */
+    CV_WRAP virtual void setVariationalRefinementDelta(float val) = 0;
+
+    /** @brief Weight of the gradient constancy term
+    @see setVariationalRefinementGamma */
+    CV_WRAP virtual float getVariationalRefinementGamma() const = 0;
+    /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
+    CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
+
+
+    /** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on
+        by default as it typically provides a noticeable quality boost because of increased robustness to
+        illumination variations. Turn it off if you are certain that your sequence doesn't contain any changes
+        in illumination.
+    @see setUseMeanNormalization */
+    CV_WRAP virtual bool getUseMeanNormalization() const = 0;
+    /** @copybrief getUseMeanNormalization @see getUseMeanNormalization */
+    CV_WRAP virtual void setUseMeanNormalization(bool val) = 0;
+
+    /** @brief Whether to use spatial propagation of good optical flow vectors. This option is turned on by
+        default, as it tends to work better on average and can sometimes help recover from major errors
+        introduced by the coarse-to-fine scheme employed by the DIS optical flow algorithm. Turning this
+        option off can make the output flow field a bit smoother, however.
+    @see setUseSpatialPropagation */
+    CV_WRAP virtual bool getUseSpatialPropagation() const = 0;
+    /** @copybrief getUseSpatialPropagation @see getUseSpatialPropagation */
+    CV_WRAP virtual void setUseSpatialPropagation(bool val) = 0;
+
+    /** @brief Creates an instance of DISOpticalFlow
+
+    @param preset one of PRESET_ULTRAFAST, PRESET_FAST and PRESET_MEDIUM
+    */
+    CV_WRAP static Ptr<DISOpticalFlow> create(int preset = DISOpticalFlow::PRESET_FAST);
+};
+
+/** @brief Class used for calculating a sparse optical flow.
+
+The class can calculate an optical flow for a sparse feature set using the
+iterative Lucas-Kanade method with pyramids.
+
+@sa calcOpticalFlowPyrLK
+
+*/
+class CV_EXPORTS_W SparsePyrLKOpticalFlow : public SparseOpticalFlow
+{
+public:
+    CV_WRAP virtual Size getWinSize() const = 0;
+    CV_WRAP virtual void setWinSize(Size winSize) = 0;
+
+    CV_WRAP virtual int getMaxLevel() const = 0;
+    CV_WRAP virtual void setMaxLevel(int maxLevel) = 0;
+
+    CV_WRAP virtual TermCriteria getTermCriteria() const = 0;
+    CV_WRAP virtual void setTermCriteria(TermCriteria& crit) = 0; // NOTE(review): non-const reference, so a temporary TermCriteria cannot be passed directly
+
+    CV_WRAP virtual int getFlags() const = 0;
+    CV_WRAP virtual void setFlags(int flags) = 0;
+
+    CV_WRAP virtual double getMinEigThreshold() const = 0;
+    CV_WRAP virtual void setMinEigThreshold(double minEigThreshold) = 0;
+
+    CV_WRAP static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3, TermCriteria crit =
+            TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 0.01),
+            int flags = 0,
+            double minEigThreshold = 1e-4);
+};
+
+
+
+
+/** @brief Base abstract class for the long-term tracker
+ */
+class CV_EXPORTS_W Tracker
+{
+protected:
+    Tracker();
+public:
+    virtual ~Tracker();
+
+    /** @brief Initialize the tracker with a known bounding box that surrounds the target
+    @param image The initial frame
+    @param boundingBox The initial bounding box
+    */
+    CV_WRAP virtual
+    void init(InputArray image, const Rect& boundingBox) = 0;
+
+    /** @brief Update the tracker, find the new most likely bounding box for the target
+    @param image The current frame
+    @param boundingBox The bounding box that represents the new target location if true was returned;
+    not modified otherwise
+
+    @return True means that the target was located and false means that the tracker cannot locate the target in
+    the current frame. Note that the latter *does not* imply that the tracker has failed; maybe the target is indeed
+    missing from the frame (say, out of sight)
+    */
+    CV_WRAP virtual
+    bool update(InputArray image, CV_OUT Rect& boundingBox) = 0;
+};
+
+
+
+/** @brief The MIL algorithm trains a classifier in an online manner to separate the object from the
+background.
+
+Multiple Instance Learning avoids the drift problem for a robust tracking. The implementation is
+based on @cite MIL .
+
+Original code can be found here <http://vision.ucsd.edu/~bbabenko/project_miltrack.shtml>
+ */
+class CV_EXPORTS_W TrackerMIL : public Tracker
+{
+protected:
+    TrackerMIL();  // use ::create()
+public:
+    virtual ~TrackerMIL() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        // parameters for the sampler
+        CV_PROP_RW float samplerInitInRadius;  //!< radius for gathering positive instances during init
+        CV_PROP_RW int samplerInitMaxNegNum;  //!< number of negative samples to use during init
+        CV_PROP_RW float samplerSearchWinSize;  //!< size of search window
+        CV_PROP_RW float samplerTrackInRadius;  //!< radius for gathering positive instances during tracking
+        CV_PROP_RW int samplerTrackMaxPosNum;  //!< number of positive samples to use during tracking
+        CV_PROP_RW int samplerTrackMaxNegNum;  //!< number of negative samples to use during tracking
+        CV_PROP_RW int featureSetNumFeatures;  //!< number of features
+    };
+
+    /** @brief Create MIL tracker instance
+     *  @param parameters MIL parameters, see TrackerMIL::Params
+     */
+    static CV_WRAP
+    Ptr<TrackerMIL> create(const TrackerMIL::Params &parameters = TrackerMIL::Params());
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+
+
+/** @brief the GOTURN (Generic Object Tracking Using Regression Networks) tracker
+ *
+ *  GOTURN (@cite GOTURN) is a kind of tracker based on Convolutional Neural Networks (CNN). While taking all advantages of CNN trackers,
+ *  GOTURN is much faster because it is trained offline, without online fine-tuning.
+ *  GOTURN tracker addresses the problem of single target tracking: given a bounding box label of an object in the first frame of the video,
+ *  we track that object through the rest of the video. NOTE: Current method of GOTURN does not handle occlusions; however, it is fairly
+ *  robust to viewpoint changes, lighting changes, and deformations.
+ *  Inputs of GOTURN are two RGB patches representing Target and Search patches resized to 227x227.
+ *  Outputs of GOTURN are predicted bounding box coordinates, relative to Search patch coordinate system, in format X1,Y1,X2,Y2.
+ *  Original paper is here: <http://davheld.github.io/GOTURN/GOTURN.pdf>
+ *  As well as the original authors' implementation: <https://github.com/davheld/GOTURN#train-the-tracker>
+ *  The implementation of the training algorithm is kept separately here due to third-party dependencies:
+ *  <https://github.com/Auron-X/GOTURN_Training_Toolkit>
+ *  GOTURN architecture goturn.prototxt and trained model goturn.caffemodel are accessible on opencv_extra GitHub repository.
+ */
+class CV_EXPORTS_W TrackerGOTURN : public Tracker
+{
+protected:
+    TrackerGOTURN();  // use ::create()
+public:
+    virtual ~TrackerGOTURN() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string modelTxt;
+        CV_PROP_RW std::string modelBin;
+    };
+
+    /** @brief Create GOTURN tracker instance
+    @param parameters GOTURN parameters, see TrackerGOTURN::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerGOTURN> create(const TrackerGOTURN::Params& parameters = TrackerGOTURN::Params());
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+class CV_EXPORTS_W TrackerDaSiamRPN : public Tracker
+{
+protected:
+    TrackerDaSiamRPN();  // use ::create()
+public:
+    virtual ~TrackerDaSiamRPN() CV_OVERRIDE;
+
+    struct CV_EXPORTS_W_SIMPLE Params
+    {
+        CV_WRAP Params();
+        CV_PROP_RW std::string model;
+        CV_PROP_RW std::string kernel_cls1;
+        CV_PROP_RW std::string kernel_r1;
+        CV_PROP_RW int backend;
+        CV_PROP_RW int target;
+    };
+
+    /** @brief Create DaSiamRPN tracker instance
+    @param parameters DaSiamRPN parameters, see TrackerDaSiamRPN::Params
+    */
+    static CV_WRAP
+    Ptr<TrackerDaSiamRPN> create(const TrackerDaSiamRPN::Params& parameters = TrackerDaSiamRPN::Params());
+
+    /** @brief Return tracking score
+    */
+    CV_WRAP virtual float getTrackingScore() = 0;
+
+    //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE;
+    //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE;
+};
+
+
+//! @} video_track
+
+} // cv
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/video.hpp b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/video.hpp
new file mode 100644
index 0000000..8267b85
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/opencv2/video/video.hpp
@@ -0,0 +1,48 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __OPENCV_BUILD
+#error this is a compatibility header which should not be used inside the OpenCV library
+#endif
+
+#include "opencv2/video.hpp"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/rknn_api.h b/duix-sdk/src/main/cpp/third/arm/include/rknn_api.h
new file mode 100644
index 0000000..8007931
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/rknn_api.h
@@ -0,0 +1,697 @@
+/****************************************************************************
+*
+*    Copyright (c) 2017 - 2022 by Rockchip Corp.  All rights reserved.
+*
+*    The material in this file is confidential and contains trade secrets
+*    of Rockchip Corporation. This is proprietary information owned by
+*    Rockchip Corporation. No part of this work may be disclosed,
+*    reproduced, copied, transmitted, or used in any way for any purpose,
+*    without the express written permission of Rockchip Corporation.
+*
+*****************************************************************************/
+
+
+#ifndef _RKNN_API_H
+#define _RKNN_API_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/*
+    Definition of the extended flags for rknn_init (passed in its `flag` argument).
+*/
+/* set high priority context. */
+#define RKNN_FLAG_PRIOR_HIGH                    0x00000000
+
+/* set medium priority context. */
+#define RKNN_FLAG_PRIOR_MEDIUM                  0x00000001
+
+/* set low priority context. */
+#define RKNN_FLAG_PRIOR_LOW                     0x00000002
+
+/* asynchronous mode.
+   when enabled, rknn_outputs_get will not block for too long because it directly retrieves the result of
+   the previous frame, which can increase the frame rate in single-threaded mode, but at the cost of
+   rknn_outputs_get not retrieving the result of the current frame.
+   in multi-threaded mode you do not need to turn this mode on. */
+#define RKNN_FLAG_ASYNC_MASK                    0x00000004
+
+/* collect performance mode.
+   when enabled, you can get detailed performance reports via rknn_query(ctx, RKNN_QUERY_PERF_DETAIL, ...),
+   but it will reduce the frame rate. */
+#define RKNN_FLAG_COLLECT_PERF_MASK             0x00000008
+
+/* allocate all memory outside, including weight/internal/inputs/outputs */
+#define RKNN_FLAG_MEM_ALLOC_OUTSIDE             0x00000010
+
+/* weight sharing with the same network structure */
+#define RKNN_FLAG_SHARE_WEIGHT_MEM              0x00000020
+
+/* send fence fd from outside */
+#define RKNN_FLAG_FENCE_IN_OUTSIDE              0x00000040
+
+/* get fence fd from inside */
+#define RKNN_FLAG_FENCE_OUT_OUTSIDE             0x00000080
+
+/* dummy init flag: only total_weight_size and total_internal_size can be obtained via rknn_query */
+#define RKNN_FLAG_COLLECT_MODEL_INFO_ONLY       0x00000100
+
+/*
+    Error codes returned by the RKNN API (0 on success, negative on failure).
+*/
+#define RKNN_SUCC                               0       /* execution succeeded. */
+#define RKNN_ERR_FAIL                           -1      /* execution failed. */
+#define RKNN_ERR_TIMEOUT                        -2      /* execution timed out. */
+#define RKNN_ERR_DEVICE_UNAVAILABLE             -3      /* device is unavailable. */
+#define RKNN_ERR_MALLOC_FAIL                    -4      /* memory allocation failed. */
+#define RKNN_ERR_PARAM_INVALID                  -5      /* parameter is invalid. */
+#define RKNN_ERR_MODEL_INVALID                  -6      /* model is invalid. */
+#define RKNN_ERR_CTX_INVALID                    -7      /* context is invalid. */
+#define RKNN_ERR_INPUT_INVALID                  -8      /* input is invalid. */
+#define RKNN_ERR_OUTPUT_INVALID                 -9      /* output is invalid. */
+#define RKNN_ERR_DEVICE_UNMATCH                 -10     /* the device does not match, please update rknn sdk
+                                                           and npu driver/firmware. */
+#define RKNN_ERR_INCOMPATILE_PRE_COMPILE_MODEL  -11     /* This RKNN model uses pre_compile mode, but is not compatible with the current driver. (identifier spelling kept as-is) */
+#define RKNN_ERR_INCOMPATILE_OPTIMIZATION_LEVEL_VERSION  -12     /* This RKNN model sets an optimization level, but is not compatible with the current driver. (identifier spelling kept as-is) */
+#define RKNN_ERR_TARGET_PLATFORM_UNMATCH        -13     /* This RKNN model sets a target platform, but is not compatible with the current platform. */
+
+/*
+    Size limits used by the tensor structures below.
+*/
+#define RKNN_MAX_DIMS                           16      /* maximum dimension of tensor. */
+#define RKNN_MAX_NUM_CHANNEL                    15      /* maximum channel number of input tensor. */
+#define RKNN_MAX_NAME_LEN                       256     /* maximum name length of tensor. */
+#define RKNN_MAX_DYNAMIC_SHAPE_NUM              512     /* maximum number of dynamic shape for each input. */
+
+#ifdef __arm__
+typedef uint32_t rknn_context;
+#else
+typedef uint64_t rknn_context;
+#endif
+
+
+/*
+    The query command for rknn_query
+*/
+typedef enum _rknn_query_cmd {
+    RKNN_QUERY_IN_OUT_NUM = 0,                              /* query the number of input & output tensor. */
+    RKNN_QUERY_INPUT_ATTR = 1,                              /* query the attribute of input tensor. */
+    RKNN_QUERY_OUTPUT_ATTR = 2,                             /* query the attribute of output tensor. */
+    RKNN_QUERY_PERF_DETAIL = 3,                             /* query the detail performance, need set
+                                                               RKNN_FLAG_COLLECT_PERF_MASK when call rknn_init,
+                                                               this query needs to be valid after rknn_outputs_get. */
+    RKNN_QUERY_PERF_RUN = 4,                                /* query the time of run,
+                                                               this query needs to be valid after rknn_outputs_get. */
+    RKNN_QUERY_SDK_VERSION = 5,                             /* query the sdk & driver version */
+
+    RKNN_QUERY_MEM_SIZE = 6,                                /* query the weight & internal memory size */
+    RKNN_QUERY_CUSTOM_STRING = 7,                           /* query the custom string */
+
+    RKNN_QUERY_NATIVE_INPUT_ATTR = 8,                       /* query the attribute of native input tensor. */
+    RKNN_QUERY_NATIVE_OUTPUT_ATTR = 9,                      /* query the attribute of native output tensor. */
+
+    RKNN_QUERY_NATIVE_NC1HWC2_INPUT_ATTR = 8,               /* alias of RKNN_QUERY_NATIVE_INPUT_ATTR (same value, 8). */
+    RKNN_QUERY_NATIVE_NC1HWC2_OUTPUT_ATTR = 9,              /* alias of RKNN_QUERY_NATIVE_OUTPUT_ATTR (same value, 9). */
+
+    RKNN_QUERY_NATIVE_NHWC_INPUT_ATTR = 10,                 /* query the attribute of native input tensor. */
+    RKNN_QUERY_NATIVE_NHWC_OUTPUT_ATTR = 11,                /* query the attribute of native output tensor. */
+
+    RKNN_QUERY_DEVICE_MEM_INFO = 12,                        /* query the attribute of rknn memory information. */
+
+    RKNN_QUERY_INPUT_DYNAMIC_RANGE = 13,                    /* query the dynamic shape range of rknn input tensor. */
+    RKNN_QUERY_CURRENT_INPUT_ATTR = 14,                     /* query the current shape of rknn input tensor, only valid for dynamic rknn model */
+    RKNN_QUERY_CURRENT_OUTPUT_ATTR = 15,                    /* query the current shape of rknn output tensor, only valid for dynamic rknn model */
+
+    RKNN_QUERY_CMD_MAX                                      /* sentinel: one past the last command above (implicitly 16). */
+} rknn_query_cmd;
+
+/*
+    the tensor data type.
+*/
+typedef enum _rknn_tensor_type {
+    RKNN_TENSOR_FLOAT32 = 0,                            /* data type is float32. */
+    RKNN_TENSOR_FLOAT16,                                /* data type is float16. */
+    RKNN_TENSOR_INT8,                                   /* data type is int8. */
+    RKNN_TENSOR_UINT8,                                  /* data type is uint8. */
+    RKNN_TENSOR_INT16,                                  /* data type is int16. */
+    RKNN_TENSOR_UINT16,                                 /* data type is uint16. */
+    RKNN_TENSOR_INT32,                                  /* data type is int32. */
+    RKNN_TENSOR_UINT32,                                 /* data type is uint32. */
+    RKNN_TENSOR_INT64,                                  /* data type is int64. */
+    RKNN_TENSOR_BOOL,                                   /* data type is bool. */
+
+    RKNN_TENSOR_TYPE_MAX                                /* sentinel: number of data types listed above. */
+} rknn_tensor_type;
+
+inline static const char* get_type_string(rknn_tensor_type type)
+{
+    /* Returns the short label for a tensor data type. Enumerators are tested one by one;
+       any value without a label (e.g. RKNN_TENSOR_TYPE_MAX) yields "UNKNOW". */
+    if (type == RKNN_TENSOR_FLOAT32) return "FP32";
+    if (type == RKNN_TENSOR_FLOAT16) return "FP16";
+    if (type == RKNN_TENSOR_INT8)    return "INT8";
+    if (type == RKNN_TENSOR_UINT8)   return "UINT8";
+    if (type == RKNN_TENSOR_INT16)   return "INT16";
+    if (type == RKNN_TENSOR_UINT16)  return "UINT16";
+    if (type == RKNN_TENSOR_INT32)   return "INT32";
+    if (type == RKNN_TENSOR_UINT32)  return "UINT32";
+    if (type == RKNN_TENSOR_INT64)   return "INT64";
+    if (type == RKNN_TENSOR_BOOL)    return "BOOL";
+    return "UNKNOW";
+}
+
+/*
+    the quantitative type.
+*/
+typedef enum _rknn_tensor_qnt_type {
+    RKNN_TENSOR_QNT_NONE = 0,                           /* none. */
+    RKNN_TENSOR_QNT_DFP,                                /* dynamic fixed point. */
+    RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC,                  /* asymmetric affine. */
+
+    RKNN_TENSOR_QNT_MAX                                 /* sentinel: number of quantization types listed above. */
+} rknn_tensor_qnt_type;
+
+inline static const char* get_qnt_type_string(rknn_tensor_qnt_type type)
+{
+    /* Returns the short label for a quantization type;
+       any value without a label (e.g. RKNN_TENSOR_QNT_MAX) yields "UNKNOW". */
+    if (type == RKNN_TENSOR_QNT_NONE)              return "NONE";
+    if (type == RKNN_TENSOR_QNT_DFP)               return "DFP";
+    if (type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC) return "AFFINE";
+    return "UNKNOW";
+}
+
+/*
+    the tensor data format.
+*/
+typedef enum _rknn_tensor_format {
+    RKNN_TENSOR_NCHW = 0,                               /* data format is NCHW. */
+    RKNN_TENSOR_NHWC,                                   /* data format is NHWC. */
+    RKNN_TENSOR_NC1HWC2,                                /* data format is NC1HWC2. */
+    RKNN_TENSOR_UNDEFINED,                              /* data format is undefined. */
+
+    RKNN_TENSOR_FORMAT_MAX                              /* sentinel: number of formats listed above. */
+} rknn_tensor_format;
+
+/*
+    the mode of running on target NPU core.
+*/
+typedef enum _rknn_core_mask {
+    RKNN_NPU_CORE_AUTO = 0,                                       /* default, run on NPU core randomly. */
+    RKNN_NPU_CORE_0 = 1,                                          /* run on NPU core 0. */
+    RKNN_NPU_CORE_1 = 2,                                          /* run on NPU core 1. */
+    RKNN_NPU_CORE_2 = 4,                                          /* run on NPU core 2. */
+    RKNN_NPU_CORE_0_1 = RKNN_NPU_CORE_0 | RKNN_NPU_CORE_1,        /* run on NPU core 0 and core 1. */
+    RKNN_NPU_CORE_0_1_2 = RKNN_NPU_CORE_0_1 | RKNN_NPU_CORE_2,    /* run on NPU core 0, core 1 and core 2. */
+
+    RKNN_NPU_CORE_UNDEFINED,                                      /* implicitly 8 (RKNN_NPU_CORE_0_1_2 + 1). */
+} rknn_core_mask;
+
+inline static const char* get_format_string(rknn_tensor_format fmt)
+{
+    /* Returns the label for a tensor data format;
+       any value without a label (e.g. RKNN_TENSOR_FORMAT_MAX) yields "UNKNOW". */
+    if (fmt == RKNN_TENSOR_NCHW)      return "NCHW";
+    if (fmt == RKNN_TENSOR_NHWC)      return "NHWC";
+    if (fmt == RKNN_TENSOR_NC1HWC2)   return "NC1HWC2";
+    if (fmt == RKNN_TENSOR_UNDEFINED) return "UNDEFINED";
+    return "UNKNOW";
+}
+
+/*
+    the information for RKNN_QUERY_IN_OUT_NUM.
+*/
+typedef struct _rknn_input_output_num {
+    uint32_t n_input;                                   /* the number of input. */
+    uint32_t n_output;                                  /* the number of output. */
+} rknn_input_output_num;
+
+/*
+    the information for RKNN_QUERY_INPUT_ATTR / RKNN_QUERY_OUTPUT_ATTR.
+*/
+typedef struct _rknn_tensor_attr {
+    uint32_t index;                                     /* input parameter, the index of input/output tensor,
+                                                           must be set before calling rknn_query. */
+
+    uint32_t n_dims;                                    /* the number of dimensions. */
+    uint32_t dims[RKNN_MAX_DIMS];                       /* the dimensions array. */
+    char name[RKNN_MAX_NAME_LEN];                       /* the name of tensor. */
+
+    uint32_t n_elems;                                   /* the number of elements. */
+    uint32_t size;                                      /* the size of tensor in bytes. */
+
+    rknn_tensor_format fmt;                             /* the data format of tensor. */
+    rknn_tensor_type type;                              /* the data type of tensor. */
+    rknn_tensor_qnt_type qnt_type;                      /* the quantitative type of tensor. */
+    int8_t fl;                                          /* fractional length for RKNN_TENSOR_QNT_DFP. */
+    int32_t zp;                                         /* zero point for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */
+    float scale;                                        /* scale for RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC. */
+
+    uint32_t w_stride;                                  /* the stride of tensor along the width dimension of input,
+                                                           Note: it is read-only, 0 means equal to width. */
+    uint32_t size_with_stride;                          /* the size of tensor with stride, in bytes. */
+
+    uint8_t pass_through;                               /* pass through mode, for rknn_set_io_mem interface.
+                                                           if TRUE, the buf data is passed directly to the input node of the rknn model
+                                                                    without any conversion. the following variables do not need to be set.
+                                                           if FALSE, the buf data is converted into an input consistent with the model
+                                                                     according to the following type and fmt. so the following variables
+                                                                     need to be set.*/
+    uint32_t h_stride;                                  /* the stride along the height dimension of input,
+                                                           Note: it is write-only, if it was set to 0, h_stride = height. */
+} rknn_tensor_attr;
+
+typedef struct _rknn_input_range {
+    uint32_t index;                                                 /* input parameter, the index of input/output tensor,
+                                                                        need set before call rknn_query. */
+    uint32_t shape_number;                                          /* the number of shape. */
+    rknn_tensor_format fmt;                                         /* the data format of tensor. */
+    char name[RKNN_MAX_NAME_LEN];                                   /* the name of tensor. */
+    uint32_t dyn_range[RKNN_MAX_DYNAMIC_SHAPE_NUM][RKNN_MAX_DIMS];  /* the dynamic input dimensions range. */
+    uint32_t n_dims;                                                /* the number of dimensions. */
+
+} rknn_input_range;
+
+/*
+    the information for RKNN_QUERY_PERF_DETAIL.
+*/
+typedef struct _rknn_perf_detail {
+    char* perf_data;                                    /* the string pointer of perf detail. the user does not need to free it. */
+    uint64_t data_len;                                  /* the string length. */
+} rknn_perf_detail;
+
+/*
+    the information for RKNN_QUERY_PERF_RUN.
+*/
+typedef struct _rknn_perf_run {
+    int64_t run_duration;                               /* real inference time (us) */
+} rknn_perf_run;
+
+/*
+    the information for RKNN_QUERY_SDK_VERSION.
+*/
+typedef struct _rknn_sdk_version {
+    char api_version[256];                              /* the version of rknn api. */
+    char drv_version[256];                              /* the version of rknn driver. */
+} rknn_sdk_version;
+
+/*
+    the information for RKNN_QUERY_MEM_SIZE.
+*/
+typedef struct _rknn_mem_size {
+    uint32_t total_weight_size;                         /* the weight memory size */
+    uint32_t total_internal_size;                       /* the internal memory size, exclude inputs/outputs */
+    uint64_t total_dma_allocated_size;                  /* total dma memory allocated size */
+    uint32_t total_sram_size;                           /* total system sram size reserved for rknn */
+    uint32_t free_sram_size;                            /* free system sram size reserved for rknn */
+    uint32_t reserved[10];                              /* reserved */
+} rknn_mem_size;
+
+/*
+    the information for RKNN_QUERY_CUSTOM_STRING.
+*/
+typedef struct _rknn_custom_string {
+    char string[1024];                                  /* the string of custom, lengths max to 1024 bytes */
+} rknn_custom_string;
+
+/*
+   The flags of rknn_tensor_mem.
+*/
+typedef enum _rknn_tensor_mem_flags {
+    RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE = 1,           /* Used to mark in rknn_destroy_mem() whether it is necessary to release the "mem" pointer itself.
+                                                         If the flag RKNN_TENSOR_MEMORY_FLAGS_ALLOC_INSIDE is set, rknn_destroy_mem() will call free(mem). */
+    RKNN_TENSOR_MEMORY_FLAGS_FROM_FD      = 2,           /* Used to mark in rknn_create_mem_from_fd() whether it is necessary to release the "mem" pointer itself.
+                                                         If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_FD is set, rknn_destroy_mem() will call free(mem). */
+    RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS    = 3,           /* Used to mark in rknn_create_mem_from_phys() whether it is necessary to release the "mem" pointer itself.
+                                                         If the flag RKNN_TENSOR_MEMORY_FLAGS_FROM_PHYS is set, rknn_destroy_mem() will call free(mem). */
+    RKNN_TENSOR_MEMORY_FLAGS_UNKNOWN                     /* implicitly 4. NOTE(review): values are sequential, not single bits (FROM_PHYS == 3 == ALLOC_INSIDE | FROM_FD), so presumably meant to be compared with == rather than tested with & - confirm. */
+} rknn_tensor_mem_flags;
+
+/*
+    the memory information of tensor.
+*/
+typedef struct _rknn_tensor_memory {
+    void*            virt_addr;                         /* the virtual address of tensor buffer. */
+    uint64_t         phys_addr;                         /* the physical address of tensor buffer. */
+    int32_t          fd;                                /* the fd of tensor buffer. */
+    int32_t          offset;                            /* indicates the offset of the memory. */
+    uint32_t         size;                              /* the size of tensor buffer. */
+    uint32_t         flags;                             /* the flags of tensor buffer, reserved */
+    void *           priv_data;                         /* the private data of tensor buffer. */
+} rknn_tensor_mem;
+
+/*
+    the input information for rknn_input_set.
+*/
+typedef struct _rknn_input {
+    uint32_t index;                                     /* the input index. */
+    void* buf;                                          /* the input buf for index. */
+    uint32_t size;                                      /* the size of input buf. */
+    uint8_t pass_through;                               /* pass through mode.
+                                                           if TRUE, the buf data is passed directly to the input node of the rknn model
+                                                                    without any conversion. the following variables do not need to be set.
+                                                           if FALSE, the buf data is converted into an input consistent with the model
+                                                                     according to the following type and fmt. so the following variables
+                                                                     need to be set.*/
+    rknn_tensor_type type;                              /* the data type of input buf. */
+    rknn_tensor_format fmt;                             /* the data format of input buf.
+                                                           currently the internal input format of NPU is NCHW by default.
+                                                           so entering NCHW data can avoid the format conversion in the driver. */
+} rknn_input;
+
+/*
+    the output information for rknn_outputs_get.
+*/
+typedef struct _rknn_output {
+    uint8_t want_float;                                 /* whether to convert the output data to float. */
+    uint8_t is_prealloc;                                /* whether buf is pre-allocated.
+                                                           if TRUE, the following variables need to be set.
+                                                           if FALSE, the following variables do not need to be set. */
+    uint32_t index;                                     /* the output index. */
+    void* buf;                                          /* the output buf for index.
+                                                           when is_prealloc = FALSE and rknn_outputs_release is called,
+                                                           this buf pointer will be freed; do not use it afterwards. */
+    uint32_t size;                                      /* the size of output buf. */
+} rknn_output;
+
+/*
+    the extend information for rknn_init.
+*/
+typedef struct _rknn_init_extend {
+    rknn_context ctx;                                    /* rknn context */
+    int32_t      real_model_offset;                      /* real rknn model file offset, only valid when init context with rknn file path */
+    uint32_t     real_model_size;                        /* real rknn model file size, only valid when init context with rknn file path */
+    uint8_t      reserved[120];                          /* reserved */
+} rknn_init_extend;
+
+/*
+    the extend information for rknn_run.
+*/
+typedef struct _rknn_run_extend {
+    uint64_t frame_id;                                  /* output parameter, indicate current frame id of run. */
+    int32_t non_block;                                  /* block flag of run, 0 is block else 1 is non block */
+    int32_t timeout_ms;                                 /* timeout for block mode, in milliseconds */
+    int32_t fence_fd;                                   /* fence fd from other unit */
+} rknn_run_extend;
+
+/*
+    the extend information for rknn_outputs_get.
+*/
+typedef struct _rknn_output_extend {
+    uint64_t frame_id;                                  /* output parameter, indicate the frame id of outputs, corresponds to
+                                                           struct rknn_run_extend.frame_id.*/
+} rknn_output_extend;
+
+
+/*  rknn_init
+
+    initialize the context and load the rknn model.
+
+    input:
+        rknn_context* context       the pointer of context handle.
+        void* model                 if size > 0, pointer to the rknn model, if size = 0, filepath to the rknn model.
+        uint32_t size               the size of rknn model.
+        uint32_t flag               extend flag, see the define of RKNN_FLAG_XXX_XXX.
+        rknn_init_extend* extend    the extend information of init.
+    return:
+        int                         error code.
+*/
+int rknn_init(rknn_context* context, void* model, uint32_t size, uint32_t flag, rknn_init_extend* extend);
+
+/*  rknn_dup_context
+
+    NOTE(review): the original text here ("initial the context and load the rknn model.") is identical to rknn_init's; presumably this duplicates context_in into context_out - confirm.
+
+    input:
+        rknn_context* context_in       the pointer of context in handle.
+        rknn_context* context_out      the pointer of context out handle.
+    return:
+        int                         error code.
+*/
+int rknn_dup_context(rknn_context* context_in, rknn_context* context_out);
+
+/*  rknn_destroy
+
+    unload the rknn model and destroy the context.
+
+    input:
+        rknn_context context        the handle of context.
+    return:
+        int                         error code.
+*/
+int rknn_destroy(rknn_context context);
+
+
+/*  rknn_query
+
+    query the information about model or others. see rknn_query_cmd.
+
+    input:
+        rknn_context context        the handle of context.
+        rknn_query_cmd cmd          the command of query.
+        void* info                  the buffer point of information.
+        uint32_t size               the size of information.
+    return:
+        int                         error code.
+*/
+int rknn_query(rknn_context context, rknn_query_cmd cmd, void* info, uint32_t size);
+
+
+/*  rknn_inputs_set
+
+    set inputs information by input index of rknn model.
+    inputs information see rknn_input.
+
+    input:
+        rknn_context context        the handle of context.
+        uint32_t n_inputs           the number of inputs.
+        rknn_input inputs[]         the arrays of inputs information, see rknn_input.
+    return:
+        int                         error code
+*/
+int rknn_inputs_set(rknn_context context, uint32_t n_inputs, rknn_input inputs[]);
+
+/*
+    rknn_set_batch_core_num
+
+    set rknn batch core_num.
+
+    input:
+        rknn_context context        the handle of context.
+        int core_num                the core number.
+    return:
+        int                         error code.
+
+*/
+int rknn_set_batch_core_num(rknn_context context, int core_num);
+
+/*  rknn_set_core_mask
+
+    set rknn core mask.(only supported on RK3588 now)
+
+    RKNN_NPU_CORE_AUTO: auto mode, default value
+    RKNN_NPU_CORE_0: core 0 mode
+    RKNN_NPU_CORE_1: core 1 mode
+    RKNN_NPU_CORE_2: core 2 mode
+    RKNN_NPU_CORE_0_1: combine core 0/1 mode
+    RKNN_NPU_CORE_0_1_2: combine core 0/1/2 mode
+
+    input:
+        rknn_context context        the handle of context.
+        rknn_core_mask core_mask    the core mask.
+    return:
+        int                         error code.
+*/
+int rknn_set_core_mask(rknn_context context, rknn_core_mask core_mask);
+
+/*  rknn_run
+
+    run the model to execute inference.
+
+    input:
+        rknn_context context        the handle of context.
+        rknn_run_extend* extend     the extend information of run.
+    return:
+        int                         error code.
+*/
+int rknn_run(rknn_context context, rknn_run_extend* extend);
+
+
+/*  rknn_wait
+
+    wait the model after execute inference.
+
+    input:
+        rknn_context context        the handle of context.
+        rknn_run_extend* extend     the extend information of run.
+    return:
+        int                         error code.
+*/
+int rknn_wait(rknn_context context, rknn_run_extend* extend);
+
+
+/*  rknn_outputs_get
+
+    wait for the inference to finish and get the outputs.
+    this function will block until the inference finishes.
+    the results will be written to outputs[].
+
+    input:
+        rknn_context context        the handle of context.
+        uint32_t n_outputs          the number of outputs.
+        rknn_output outputs[]       the arrays of output, see rknn_output.
+        rknn_output_extend* extend  the extend information of output.
+    return:
+        int                         error code.
+*/
+int rknn_outputs_get(rknn_context context, uint32_t n_outputs, rknn_output outputs[], rknn_output_extend* extend);
+
+
+/*  rknn_outputs_release
+
+    release the outputs obtained from rknn_outputs_get.
+    after this call, the rknn_output[x].buf obtained from rknn_outputs_get will
+    also be freed when rknn_output[x].is_prealloc = FALSE.
+
+    input:
+        rknn_context context        the handle of context.
+        uint32_t n_ouputs           the number of outputs.
+        rknn_output outputs[]       the arrays of output.
+    return:
+        int                         error code
+*/
+int rknn_outputs_release(rknn_context context, uint32_t n_ouputs, rknn_output outputs[]);
+
+
+/* new api for zero copy */
+
+/*  rknn_create_mem_from_phys (memory allocated outside)
+
+    initialize tensor memory from physical address.
+
+    input:
+        rknn_context ctx            the handle of context.
+        uint64_t phys_addr          physical address.
+        void *virt_addr             virtual address.
+        uint32_t size               the size of tensor buffer.
+    return:
+        rknn_tensor_mem             the pointer of tensor memory information.
+*/
+rknn_tensor_mem* rknn_create_mem_from_phys(rknn_context ctx, uint64_t phys_addr, void *virt_addr, uint32_t size);
+
+
+/*  rknn_create_mem_from_fd (memory allocated outside)
+
+    initialize tensor memory from a file descriptor.
+
+    input:
+        rknn_context ctx            the handle of context.
+        int32_t fd                  file descriptor.
+        void *virt_addr             virtual address.
+        uint32_t size               the size of tensor buffer.
+        int32_t offset              indicates the offset of the memory (virt_addr without offset).
+    return:
+        rknn_tensor_mem             the pointer of tensor memory information.
+*/
+rknn_tensor_mem* rknn_create_mem_from_fd(rknn_context ctx, int32_t fd, void *virt_addr, uint32_t size, int32_t offset);
+
+
+/*  rknn_create_mem_from_mb_blk (memory allocated outside)
+
+    create tensor memory from mb_blk.
+
+    input:
+        rknn_context ctx            the handle of context.
+        void *mb_blk                mb_blk allocate from system api.
+        int32_t offset              indicates the offset of the memory.
+    return:
+        rknn_tensor_mem             the pointer of tensor memory information.
+*/
+rknn_tensor_mem* rknn_create_mem_from_mb_blk(rknn_context ctx, void *mb_blk, int32_t offset);
+
+
+/*  rknn_create_mem (memory allocated inside)
+
+    create tensor memory.
+
+    input:
+        rknn_context ctx            the handle of context.
+        uint32_t size               the size of tensor buffer.
+    return:
+        rknn_tensor_mem             the pointer of tensor memory information.
+*/
+rknn_tensor_mem* rknn_create_mem(rknn_context ctx, uint32_t size);
+
+
+/*  rknn_destroy_mem (support allocate inside and outside)
+
+    destroy tensor memory.
+
+    input:
+        rknn_context ctx            the handle of context.
+        rknn_tensor_mem *mem        the pointer of tensor memory information.
+    return:
+        int                         error code
+*/
+int rknn_destroy_mem(rknn_context ctx, rknn_tensor_mem *mem);
+
+
+/*  rknn_set_weight_mem
+
+    set the weight memory.
+
+    input:
+        rknn_context ctx            the handle of context.
+        rknn_tensor_mem *mem        the array of tensor memory information
+    return:
+        int                         error code.
+*/
+int rknn_set_weight_mem(rknn_context ctx, rknn_tensor_mem *mem);
+
+
+/*  rknn_set_internal_mem
+
+    set the internal memory.
+
+    input:
+        rknn_context ctx            the handle of context.
+        rknn_tensor_mem *mem        the array of tensor memory information
+    return:
+        int                         error code.
+*/
+int rknn_set_internal_mem(rknn_context ctx, rknn_tensor_mem *mem);
+
+
+/*  rknn_set_io_mem
+
+    set the input and output tensors buffer.
+
+    input:
+        rknn_context ctx            the handle of context.
+        rknn_tensor_mem *mem        the array of tensor memory information.
+        rknn_tensor_attr *attr      the attribute of input or output tensor buffer.
+    return:
+        int                         error code.
+*/
+int rknn_set_io_mem(rknn_context ctx, rknn_tensor_mem *mem, rknn_tensor_attr *attr);
+
+/*  rknn_set_input_shape
+
+    set the input tensor shape (only valid for dynamic shape rknn model).
+
+    input:
+        rknn_context ctx            the handle of context.
+        rknn_tensor_attr *attr      the attribute of input or output tensor buffer.
+    return:
+        int                         error code.
+*/
+int rknn_set_input_shape(rknn_context ctx, rknn_tensor_attr* attr);
+
+#ifdef __cplusplus
+} //extern "C"
+#endif
+
+#endif  //_RKNN_API_H
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/bmp.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/bmp.h
new file mode 100644
index 0000000..0d1e4dc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/bmp.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __BMP_H__
+#define __BMP_H__
+
+#include "./turbojpeg.h"
+
+/*
+ * Give the declarations below C linkage when this header is included from
+ * C++.  The matching closing brace sits just before the final #endif, so
+ * this opener is required to keep the braces balanced in C++ builds.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Presumably loads the BMP file `filename`: buf, w and h are out-parameters
+ * (pixel buffer and dimensions, by their names); pf is presumably a TurboJPEG
+ * pixel format and bottomup a row-order flag -- TODO confirm against bmp.c.
+ */
+int loadbmp(char *filename, unsigned char **buf, int *w, int *h, int pf,
+	int bottomup);
+
+/*
+ * Presumably writes the w x h image in buf to the BMP file `filename`; pf and
+ * bottomup presumably mean the same as for loadbmp -- TODO confirm.
+ */
+int savebmp(char *filename, unsigned char *buf, int w, int h, int pf,
+	int bottomup);
+
+/* Returns a message string, presumably for the most recent BMP error. */
+const char *bmpgeterr(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cderror.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cderror.h
new file mode 100644
index 0000000..e19c475
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cderror.h
@@ -0,0 +1,134 @@
+/*
+ * cderror.h
+ *
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file defines the error and message codes for the cjpeg/djpeg
+ * applications.  These strings are not needed as part of the JPEG library
+ * proper.
+ * Edit this file to add new codes, or to translate the message strings to
+ * some other language.
+ */
+
+/*
+ * To define the enum list of message codes, include this file without
+ * defining macro JMESSAGE.  To create a message string table, include it
+ * again with a suitable JMESSAGE definition (see jerror.c for an example).
+ */
+#ifndef JMESSAGE
+#ifndef CDERROR_H
+#define CDERROR_H
+/* First time through, define the enum list */
+#define JMAKE_ENUM_LIST
+#else
+/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
+#define JMESSAGE(code,string)
+#endif /* CDERROR_H */
+#endif /* JMESSAGE */
+
+#ifdef JMAKE_ENUM_LIST
+
+typedef enum {
+
+#define JMESSAGE(code,string)	code ,
+
+#endif /* JMAKE_ENUM_LIST */
+
+JMESSAGE(JMSG_FIRSTADDONCODE=1000, NULL) /* Must be first entry! */
+
+#ifdef BMP_SUPPORTED
+JMESSAGE(JERR_BMP_BADCMAP, "Unsupported BMP colormap format")
+JMESSAGE(JERR_BMP_BADDEPTH, "Only 8- and 24-bit BMP files are supported")
+JMESSAGE(JERR_BMP_BADHEADER, "Invalid BMP file: bad header length")
+JMESSAGE(JERR_BMP_BADPLANES, "Invalid BMP file: biPlanes not equal to 1")
+JMESSAGE(JERR_BMP_COLORSPACE, "BMP output must be grayscale or RGB")
+JMESSAGE(JERR_BMP_COMPRESSED, "Sorry, compressed BMPs not yet supported")
+JMESSAGE(JERR_BMP_EMPTY, "Empty BMP image")
+JMESSAGE(JERR_BMP_NOT, "Not a BMP file - does not start with BM")
+JMESSAGE(JTRC_BMP, "%ux%u 24-bit BMP image")
+JMESSAGE(JTRC_BMP_MAPPED, "%ux%u 8-bit colormapped BMP image")
+JMESSAGE(JTRC_BMP_OS2, "%ux%u 24-bit OS2 BMP image")
+JMESSAGE(JTRC_BMP_OS2_MAPPED, "%ux%u 8-bit colormapped OS2 BMP image")
+#endif /* BMP_SUPPORTED */
+
+#ifdef GIF_SUPPORTED
+JMESSAGE(JERR_GIF_BUG, "GIF output got confused")
+JMESSAGE(JERR_GIF_CODESIZE, "Bogus GIF codesize %d")
+JMESSAGE(JERR_GIF_COLORSPACE, "GIF output must be grayscale or RGB")
+JMESSAGE(JERR_GIF_IMAGENOTFOUND, "Too few images in GIF file")
+JMESSAGE(JERR_GIF_NOT, "Not a GIF file")
+JMESSAGE(JTRC_GIF, "%ux%ux%d GIF image")
+JMESSAGE(JTRC_GIF_BADVERSION,
+	 "Warning: unexpected GIF version number '%c%c%c'")
+JMESSAGE(JTRC_GIF_EXTENSION, "Ignoring GIF extension block of type 0x%02x")
+JMESSAGE(JTRC_GIF_NONSQUARE, "Caution: nonsquare pixels in input")
+JMESSAGE(JWRN_GIF_BADDATA, "Corrupt data in GIF file")
+JMESSAGE(JWRN_GIF_CHAR, "Bogus char 0x%02x in GIF file, ignoring")
+JMESSAGE(JWRN_GIF_ENDCODE, "Premature end of GIF image")
+JMESSAGE(JWRN_GIF_NOMOREDATA, "Ran out of GIF bits")
+#endif /* GIF_SUPPORTED */
+
+#ifdef PPM_SUPPORTED
+JMESSAGE(JERR_PPM_COLORSPACE, "PPM output must be grayscale or RGB")
+JMESSAGE(JERR_PPM_NONNUMERIC, "Nonnumeric data in PPM file")
+JMESSAGE(JERR_PPM_NOT, "Not a PPM/PGM file")
+JMESSAGE(JTRC_PGM, "%ux%u PGM image")
+JMESSAGE(JTRC_PGM_TEXT, "%ux%u text PGM image")
+JMESSAGE(JTRC_PPM, "%ux%u PPM image")
+JMESSAGE(JTRC_PPM_TEXT, "%ux%u text PPM image")
+#endif /* PPM_SUPPORTED */
+
+#ifdef RLE_SUPPORTED
+JMESSAGE(JERR_RLE_BADERROR, "Bogus error code from RLE library")
+JMESSAGE(JERR_RLE_COLORSPACE, "RLE output must be grayscale or RGB")
+JMESSAGE(JERR_RLE_DIMENSIONS, "Image dimensions (%ux%u) too large for RLE")
+JMESSAGE(JERR_RLE_EMPTY, "Empty RLE file")
+JMESSAGE(JERR_RLE_EOF, "Premature EOF in RLE header")
+JMESSAGE(JERR_RLE_MEM, "Insufficient memory for RLE header")
+JMESSAGE(JERR_RLE_NOT, "Not an RLE file")
+JMESSAGE(JERR_RLE_TOOMANYCHANNELS, "Cannot handle %d output channels for RLE")
+JMESSAGE(JERR_RLE_UNSUPPORTED, "Cannot handle this RLE setup")
+JMESSAGE(JTRC_RLE, "%ux%u full-color RLE file")
+JMESSAGE(JTRC_RLE_FULLMAP, "%ux%u full-color RLE file with map of length %d")
+JMESSAGE(JTRC_RLE_GRAY, "%ux%u grayscale RLE file")
+JMESSAGE(JTRC_RLE_MAPGRAY, "%ux%u grayscale RLE file with map of length %d")
+JMESSAGE(JTRC_RLE_MAPPED, "%ux%u colormapped RLE file with map of length %d")
+#endif /* RLE_SUPPORTED */
+
+#ifdef TARGA_SUPPORTED
+JMESSAGE(JERR_TGA_BADCMAP, "Unsupported Targa colormap format")
+JMESSAGE(JERR_TGA_BADPARMS, "Invalid or unsupported Targa file")
+JMESSAGE(JERR_TGA_COLORSPACE, "Targa output must be grayscale or RGB")
+JMESSAGE(JTRC_TGA, "%ux%u RGB Targa image")
+JMESSAGE(JTRC_TGA_GRAY, "%ux%u grayscale Targa image")
+JMESSAGE(JTRC_TGA_MAPPED, "%ux%u colormapped Targa image")
+#else
+JMESSAGE(JERR_TGA_NOTCOMP, "Targa support was not compiled")
+#endif /* TARGA_SUPPORTED */
+
+JMESSAGE(JERR_BAD_CMAP_FILE,
+	 "Color map file is invalid or of unsupported format")
+JMESSAGE(JERR_TOO_MANY_COLORS,
+	 "Output file format cannot handle %d colormap entries")
+JMESSAGE(JERR_UNGETC_FAILED, "ungetc failed")
+#ifdef TARGA_SUPPORTED
+JMESSAGE(JERR_UNKNOWN_FORMAT,
+	 "Unrecognized input file format --- perhaps you need -targa")
+#else
+JMESSAGE(JERR_UNKNOWN_FORMAT, "Unrecognized input file format")
+#endif
+JMESSAGE(JERR_UNSUPPORTED_FORMAT, "Unsupported output file format")
+
+#ifdef JMAKE_ENUM_LIST
+
+  JMSG_LASTADDONCODE
+} ADDON_MESSAGE_CODE;
+
+#undef JMAKE_ENUM_LIST
+#endif /* JMAKE_ENUM_LIST */
+
+/* Zap JMESSAGE macro so that future re-inclusions do nothing by default */
+#undef JMESSAGE
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cdjpeg.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cdjpeg.h
new file mode 100644
index 0000000..ed024ac
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cdjpeg.h
@@ -0,0 +1,187 @@
+/*
+ * cdjpeg.h
+ *
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains common declarations for the sample applications
+ * cjpeg and djpeg.  It is NOT used by the core JPEG library.
+ */
+
+#define JPEG_CJPEG_DJPEG	/* define proper options in jconfig.h */
+#define JPEG_INTERNAL_OPTIONS	/* cjpeg.c,djpeg.c need to see xxx_SUPPORTED */
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"		/* get library error codes too */
+#include "cderror.h"		/* get application-specific error codes */
+
+
+/*
+ * Object interface for cjpeg's source file decoding modules
+ */
+
+typedef struct cjpeg_source_struct * cjpeg_source_ptr;
+
+struct cjpeg_source_struct {	/* method slots plus state for one input-file reader */
+  JMETHOD(void, start_input, (j_compress_ptr cinfo,
+			      cjpeg_source_ptr sinfo));
+  JMETHOD(JDIMENSION, get_pixel_rows, (j_compress_ptr cinfo,
+				       cjpeg_source_ptr sinfo)); /* presumably returns rows read -- confirm */
+  JMETHOD(void, finish_input, (j_compress_ptr cinfo,
+			       cjpeg_source_ptr sinfo));
+  /* Source file; presumably filled in by cjpeg.c after creation -- confirm. */
+  FILE *input_file;
+  /* Pixel-row buffer and its height in rows (cf. djpeg_dest_struct). */
+  JSAMPARRAY buffer;
+  JDIMENSION buffer_height;
+};
+
+
+/*
+ * Object interface for djpeg's output file encoding modules
+ */
+
+typedef struct djpeg_dest_struct * djpeg_dest_ptr;
+
+struct djpeg_dest_struct {
+  /* start_output is called after jpeg_start_decompress finishes.
+   * The color map will be ready at this time, if one is needed.
+   */
+  JMETHOD(void, start_output, (j_decompress_ptr cinfo,
+			       djpeg_dest_ptr dinfo));
+  /* Emit the specified number of pixel rows from the buffer. */
+  JMETHOD(void, put_pixel_rows, (j_decompress_ptr cinfo,
+				 djpeg_dest_ptr dinfo,
+				 JDIMENSION rows_supplied));
+  /* Finish up at the end of the image. */
+  JMETHOD(void, finish_output, (j_decompress_ptr cinfo,
+				djpeg_dest_ptr dinfo));
+
+  /* Target file spec; filled in by djpeg.c after object is created. */
+  FILE * output_file;
+
+  /* Output pixel-row buffer.  Created by module init or start_output.
+   * Width is cinfo->output_width * cinfo->output_components;
+   * height is buffer_height.
+   */
+  JSAMPARRAY buffer;
+  JDIMENSION buffer_height;
+};
+
+
+/*
+ * cjpeg/djpeg may need to perform extra passes to convert to or from
+ * the source/destination file format.  The JPEG library does not know
+ * about these passes, but we'd like them to be counted by the progress
+ * monitor.  We use an expanded progress monitor object to hold the
+ * additional pass count.
+ */
+
+struct cdjpeg_progress_mgr {
+  struct jpeg_progress_mgr pub;	/* fields known to JPEG library */
+  int completed_extra_passes;	/* extra passes completed */
+  int total_extra_passes;	/* total number of extra passes */
+  /* last printed percentage stored here to avoid multiple printouts */
+  int percent_done;
+};
+
+typedef struct cdjpeg_progress_mgr * cd_progress_ptr;
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jinit_read_bmp		jIRdBMP
+#define jinit_write_bmp		jIWrBMP
+#define jinit_read_gif		jIRdGIF
+#define jinit_write_gif		jIWrGIF
+#define jinit_read_ppm		jIRdPPM
+#define jinit_write_ppm		jIWrPPM
+#define jinit_read_rle		jIRdRLE
+#define jinit_write_rle		jIWrRLE
+#define jinit_read_targa	jIRdTarga
+#define jinit_write_targa	jIWrTarga
+#define read_quant_tables	RdQTables
+#define read_scan_script	RdScnScript
+#define set_quality_ratings     SetQRates
+#define set_quant_slots		SetQSlots
+#define set_sample_factors	SetSFacts
+#define read_color_map		RdCMap
+#define enable_signal_catcher	EnSigCatcher
+#define start_progress_monitor	StProgMon
+#define end_progress_monitor	EnProgMon
+#define read_stdin		RdStdin
+#define write_stdout		WrStdout
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* Module selection routines for I/O modules. */
+
+EXTERN(cjpeg_source_ptr) jinit_read_bmp JPP((j_compress_ptr cinfo));
+EXTERN(djpeg_dest_ptr) jinit_write_bmp JPP((j_decompress_ptr cinfo,
+					    boolean is_os2));
+EXTERN(cjpeg_source_ptr) jinit_read_gif JPP((j_compress_ptr cinfo));
+EXTERN(djpeg_dest_ptr) jinit_write_gif JPP((j_decompress_ptr cinfo));
+EXTERN(cjpeg_source_ptr) jinit_read_ppm JPP((j_compress_ptr cinfo));
+EXTERN(djpeg_dest_ptr) jinit_write_ppm JPP((j_decompress_ptr cinfo));
+EXTERN(cjpeg_source_ptr) jinit_read_rle JPP((j_compress_ptr cinfo));
+EXTERN(djpeg_dest_ptr) jinit_write_rle JPP((j_decompress_ptr cinfo));
+EXTERN(cjpeg_source_ptr) jinit_read_targa JPP((j_compress_ptr cinfo));
+EXTERN(djpeg_dest_ptr) jinit_write_targa JPP((j_decompress_ptr cinfo));
+
+/* cjpeg support routines (in rdswitch.c) */
+
+EXTERN(boolean) read_quant_tables JPP((j_compress_ptr cinfo, char * filename,
+				       boolean force_baseline));
+EXTERN(boolean) read_scan_script JPP((j_compress_ptr cinfo, char * filename));
+EXTERN(boolean) set_quality_ratings JPP((j_compress_ptr cinfo, char *arg,
+					 boolean force_baseline));
+EXTERN(boolean) set_quant_slots JPP((j_compress_ptr cinfo, char *arg));
+EXTERN(boolean) set_sample_factors JPP((j_compress_ptr cinfo, char *arg));
+
+/* djpeg support routines (in rdcolmap.c) */
+
+EXTERN(void) read_color_map JPP((j_decompress_ptr cinfo, FILE * infile));
+
+/* common support routines (in cdjpeg.c) */
+
+EXTERN(void) enable_signal_catcher JPP((j_common_ptr cinfo));
+EXTERN(void) start_progress_monitor JPP((j_common_ptr cinfo,
+					 cd_progress_ptr progress));
+EXTERN(void) end_progress_monitor JPP((j_common_ptr cinfo));
+EXTERN(boolean) keymatch JPP((char * arg, const char * keyword, int minchars));
+EXTERN(FILE *) read_stdin JPP((void));
+EXTERN(FILE *) write_stdout JPP((void));
+
+/* miscellaneous useful macros */
+
+#ifdef DONT_USE_B_MODE		/* define mode parameters for fopen() */
+#define READ_BINARY	"r"
+#define WRITE_BINARY	"w"
+#else
+#ifdef VMS			/* VMS is very nonstandard */
+#define READ_BINARY	"rb", "ctx=stm"
+#define WRITE_BINARY	"wb", "ctx=stm"
+#else				/* standard ANSI-compliant case */
+#define READ_BINARY	"rb"
+#define WRITE_BINARY	"wb"
+#endif
+#endif
+
+#ifndef EXIT_FAILURE		/* define exit() codes if not provided */
+#define EXIT_FAILURE  1
+#endif
+#ifndef EXIT_SUCCESS
+#ifdef VMS
+#define EXIT_SUCCESS  1		/* VMS is very nonstandard */
+#else
+#define EXIT_SUCCESS  0
+#endif
+#endif
+#ifndef EXIT_WARNING
+#ifdef VMS
+#define EXIT_WARNING  1		/* VMS is very nonstandard */
+#else
+#define EXIT_WARNING  2
+#endif
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/config.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/config.h
new file mode 100644
index 0000000..6e38c88
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/config.h
@@ -0,0 +1,131 @@
+/* config.h.  Generated from config.h.in by configure.  */
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Build number */
+#define BUILD "20110829"
+
+/* Support arithmetic encoding */
+#define C_ARITH_CODING_SUPPORTED 1
+
+/* Support arithmetic decoding */
+#define D_ARITH_CODING_SUPPORTED 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <jni.h> header file. */
+/* #undef HAVE_JNI_H */
+
+/* Define to 1 if you have the `memcpy' function. */
+#define HAVE_MEMCPY 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `memset' function. */
+#define HAVE_MEMSET 1
+
+/* Define if your compiler supports prototypes */
+#define HAVE_PROTOTYPES 1
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#define HAVE_STDDEF_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#define HAVE_UNSIGNED_CHAR 1
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#define HAVE_UNSIGNED_SHORT 1
+
+/* Compiler does not support pointers to undefined structures. */
+/* #undef INCOMPLETE_TYPES_BROKEN */
+
+/* libjpeg API version */
+#define JPEG_LIB_VERSION 62
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Define if you have BSD-like bzero and bcopy */
+/* #undef NEED_BSD_STRINGS */
+
+/* Define if you need short function names */
+/* #undef NEED_SHORT_EXTERNAL_NAMES */
+
+/* Define if you have sys/types.h */
+#define NEED_SYS_TYPES_H 1
+
+/* Name of package */
+#define PACKAGE "libjpeg-turbo"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT ""
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "libjpeg-turbo 1.1.90"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "libjpeg-turbo"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "1.1.90"
+
+/* Define if shift is unsigned */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Version number of package */
+#define VERSION "1.1.90"
+
+/* Use accelerated SIMD routines. */
+#define WITH_SIMD 1
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+/* # undef __CHAR_UNSIGNED__ */
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cpu-features.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cpu-features.h
new file mode 100644
index 0000000..39c1db3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/cpu-features.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _ARM_MACHINE_CPU_FEATURES_H
+#define _ARM_MACHINE_CPU_FEATURES_H
+
+/* The purpose of this file is to define several macros corresponding
+ * to CPU features that may or may not be available at build time
+ * on the target CPU.
+ *
+ * This is done to abstract us from the various ARM Architecture
+ * quirks and alphabet soup.
+ *
+ * IMPORTANT: We have no intention to support anything below an ARMv4T !
+ */
+
+/* __ARM_ARCH__ is a number corresponding to the ARM revision
+ * we're going to support.
+ *
+ * It looks like our toolchain doesn't define __ARM_ARCH__, so guess it
+ * from the per-revision __ARM_ARCH_<n>__ macros tested below.
+ *
+ * NOTE(review): only ARMv4T..ARMv7 macros are tested; an ARMv8/AArch64
+ * build presumably reaches the final #error unless __ARM_ARCH__ is
+ * predefined -- confirm for the ABIs this SDK targets.
+ */
+#ifndef __ARM_ARCH__
+
+#  if defined __ARM_ARCH_7__   || defined __ARM_ARCH_7A__ || \
+      defined __ARM_ARCH_7R__  || defined __ARM_ARCH_7M__
+
+#    define __ARM_ARCH__ 7
+
+#  elif defined __ARM_ARCH_6__   || defined __ARM_ARCH_6J__ || \
+      defined __ARM_ARCH_6K__  || defined __ARM_ARCH_6Z__ || \
+      defined __ARM_ARCH_6KZ__ || defined __ARM_ARCH_6T2__
+#
+#    define __ARM_ARCH__ 6
+#
+#  elif defined __ARM_ARCH_5__ || defined __ARM_ARCH_5T__ || \
+        defined __ARM_ARCH_5TE__ || defined __ARM_ARCH_5TEJ__
+#
+#    define __ARM_ARCH__ 5
+#
+#  elif defined __ARM_ARCH_4T__
+#
+#    define __ARM_ARCH__ 4
+#
+#  elif defined __ARM_ARCH_4__
+#    error ARMv4 is not supported, please use ARMv4T at a minimum
+#  else
+#    error Unknown or unsupported ARM architecture
+#  endif
+#endif
+
+/* experimental feature used to check that our ARMv4 workarounds
+ * work correctly without a real ARMv4 machine */
+#ifdef BIONIC_EXPERIMENTAL_FORCE_ARMV4
+#  undef  __ARM_ARCH__
+#  define __ARM_ARCH__  4
+#endif
+
+/* define __ARM_HAVE_5TE if we have the ARMv5TE instructions */
+#if __ARM_ARCH__ > 5
+#  define  __ARM_HAVE_5TE  1
+#elif __ARM_ARCH__ == 5
+#  if defined __ARM_ARCH_5TE__ || defined __ARM_ARCH_5TEJ__
+#    define __ARM_HAVE_5TE  1
+#  endif
+#endif
+
+/* instructions introduced in ARMv5 */
+#if __ARM_ARCH__ >= 5
+#  define  __ARM_HAVE_BLX  1
+#  define  __ARM_HAVE_CLZ  1
+#  define  __ARM_HAVE_LDC2 1
+#  define  __ARM_HAVE_MCR2 1
+#  define  __ARM_HAVE_MRC2 1
+#  define  __ARM_HAVE_STC2 1
+#endif
+
+/* ARMv5TE introduces a few instructions */
+#if __ARM_HAVE_5TE
+#  define  __ARM_HAVE_PLD   1
+#  define  __ARM_HAVE_MCRR  1
+#  define  __ARM_HAVE_MRRC  1
+#endif
+
+/* define __ARM_HAVE_HALFWORD_MULTIPLY when half-word multiply instructions
+ * are available; this means variants of: smul, smulw, smla, smlaw, smlal
+ */
+#if __ARM_HAVE_5TE
+#  define  __ARM_HAVE_HALFWORD_MULTIPLY  1
+#endif
+
+/* define __ARM_HAVE_PAIR_LOAD_STORE when 64-bit memory loads and stores
+ * into/from a pair of 32-bit registers are supported through 'ldrd' and 'strd'
+ */
+#if __ARM_HAVE_5TE
+#  define  __ARM_HAVE_PAIR_LOAD_STORE 1
+#endif
+
+/* define __ARM_HAVE_SATURATED_ARITHMETIC if you have the saturated integer
+ * arithmetic instructions: qadd, qdadd, qsub, qdsub
+ */
+#if __ARM_HAVE_5TE
+#  define  __ARM_HAVE_SATURATED_ARITHMETIC 1
+#endif
+
+/* define __ARM_HAVE_PC_INTERWORK when a direct assignment to the
+ * pc register will switch into thumb/ARM mode depending on bit 0
+ * of the new instruction address. Before ARMv5, this was not the
+ * case, and you have to write:
+ *
+ *     mov  r0, [<some address>]
+ *     bx   r0
+ *
+ * instead of:
+ *
+ *     ldr  pc, [<some address>]
+ *
+ * note that this affects any instruction that explicitly changes the
+ * value of the pc register, including ldm { ...,pc } or 'add pc, #offset'
+ */
+#if __ARM_ARCH__ >= 5
+#  define __ARM_HAVE_PC_INTERWORK
+#endif
+
+/* define __ARM_HAVE_LDREX_STREX for ARMv6 and ARMv7 architecture to be
+ * used in replacement of deprecated swp instruction
+ */
+#if __ARM_ARCH__ >= 6
+#  define __ARM_HAVE_LDREX_STREX
+#endif
+
+/* define __ARM_HAVE_DMB for ARMv7 architecture
+ */
+#if __ARM_ARCH__ >= 7
+#  define __ARM_HAVE_DMB
+#endif
+
+/* define __ARM_HAVE_LDREXD for ARMv7 architecture
+ * (also present in ARMv6K, but not implemented in ARMv7-M, neither of which
+ * we care about)
+ */
+#if __ARM_ARCH__ >= 7
+#  define __ARM_HAVE_LDREXD
+#endif
+
+/* define __ARM_HAVE_VFP if we have VFPv3
+ */
+#if __ARM_ARCH__ >= 7 && defined __VFP_FP__
+#  define __ARM_HAVE_VFP
+#endif
+
+/* define __ARM_HAVE_NEON for ARMv7 architecture if we support the
+ * Neon SIMD instruction set extensions. This also implies
+ * that VFPv3-D32 is supported.
+ */
+#if __ARM_ARCH__ >= 7 && defined __ARM_NEON__
+#  define __ARM_HAVE_NEON
+#endif
+
+/* Assembly-only macros */
+
+/* define a handy PLD(address) macro since the cache preload
+ * is an optional opcode
+ */
+#if __ARM_HAVE_PLD
+#  define  PLD(reg,offset)    pld    [reg, offset]
+#else
+#  define  PLD(reg,offset)    /* nothing */
+#endif
+
+#endif /* _ARM_MACHINE_CPU_FEATURES_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jchuff.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jchuff.h
new file mode 100644
index 0000000..a9599fc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jchuff.h
@@ -0,0 +1,47 @@
+/*
+ * jchuff.h
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains declarations for Huffman entropy encoding routines
+ * that are shared between the sequential encoder (jchuff.c) and the
+ * progressive encoder (jcphuff.c).  No other modules need to see these.
+ */
+
+/* The legal range of a DCT coefficient is
+ *  -1024 .. +1023  for 8-bit data;
+ * -16384 .. +16383 for 12-bit data.
+ * Hence the magnitude should always fit in 10 or 14 bits respectively.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#define MAX_COEF_BITS 10
+#else
+#define MAX_COEF_BITS 14
+#endif
+
+/* Derived data constructed for each Huffman table */
+
+typedef struct {
+  unsigned int ehufco[256];	/* code for each symbol */
+  char ehufsi[256];		/* length of code for each symbol */
+  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
+} c_derived_tbl;
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_make_c_derived_tbl	jMkCDerived
+#define jpeg_gen_optimal_table	jGenOptTbl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* Expand a Huffman table definition into the derived format */
+EXTERN(void) jpeg_make_c_derived_tbl
+	JPP((j_compress_ptr cinfo, boolean isDC, int tblno,
+	     c_derived_tbl ** pdtbl));
+
+/* Generate an optimal table definition given the specified counts */
+EXTERN(void) jpeg_gen_optimal_table
+	JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jconfig.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jconfig.h
new file mode 100644
index 0000000..3f12221
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jconfig.h
@@ -0,0 +1,62 @@
+/* jconfig.h.  Generated from jconfig.h.in by configure.  */
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION 62
+
+/* Support arithmetic encoding */
+#define C_ARITH_CODING_SUPPORTED 1
+
+/* Support arithmetic decoding */
+#define D_ARITH_CODING_SUPPORTED 1
+
+/* Define if your compiler supports prototypes */
+#define HAVE_PROTOTYPES 1
+
+/* Define to 1 if you have the <stddef.h> header file. */
+#define HAVE_STDDEF_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if the system has the type `unsigned char'. */
+#define HAVE_UNSIGNED_CHAR 1
+
+/* Define to 1 if the system has the type `unsigned short'. */
+#define HAVE_UNSIGNED_SHORT 1
+
+/* Define if you want use complete types */
+/* #undef INCOMPLETE_TYPES_BROKEN */
+
+/* Define if you have BSD-like bzero and bcopy */
+/* #undef NEED_BSD_STRINGS */
+
+/* Define if you need short function names */
+/* #undef NEED_SHORT_EXTERNAL_NAMES */
+
+/* Define if you have sys/types.h */
+#define NEED_SYS_TYPES_H 1
+
+/* Define if shift is unsigned */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */
+
+/* Use accelerated SIMD routines. */
+#define WITH_SIMD 1
+
+/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
+#ifndef __CHAR_UNSIGNED__
+/* # undef __CHAR_UNSIGNED__ */
+#endif
+
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdct.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdct.h
new file mode 100644
index 0000000..7b49a97
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdct.h
@@ -0,0 +1,184 @@
+/*
+ * jdct.h
+ *
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This include file contains common declarations for the forward and
+ * inverse DCT modules.  These declarations are private to the DCT managers
+ * (jcdctmgr.c, jddctmgr.c) and the individual DCT algorithms.
+ * The individual DCT algorithms are kept in separate files to ease 
+ * machine-dependent tuning (e.g., assembly coding).
+ */
+
+
+/*
+ * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+ * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
+ * (short with WITH_SIMD) for 8-bit samples, INT32 for 12-bit samples.  (NOTE:
+ * Floating-point DCT implementations use an array of type FAST_FLOAT, instead.)
+ * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * The DCT outputs are returned scaled up by a factor of 8; they therefore
+ * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
+ * convention improves accuracy in integer implementations and saves some
+ * work in floating-point ones.
+ * Quantization of the output coefficients is done by jcdctmgr.c. This
+ * step requires an unsigned type and also one with twice the bits.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+#ifndef WITH_SIMD
+typedef int DCTELEM;		/* 16 or 32 bits is fine */
+typedef unsigned int UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#else
+typedef short DCTELEM;  /* prefer 16 bit with SIMD for parallelism */
+typedef unsigned short UDCTELEM;
+typedef unsigned int UDCTELEM2;
+#endif
+#else
+typedef INT32 DCTELEM;		/* must have 32 bits */
+typedef UINT32 UDCTELEM;
+typedef unsigned long long UDCTELEM2;
+#endif
+
+
+/*
+ * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer
+ * to an output sample array.  The routine must dequantize the input data as
+ * well as perform the IDCT; for dequantization, it uses the multiplier table
+ * pointed to by compptr->dct_table.  The output data is to be placed into the
+ * sample array starting at a specified column.  (Any row offset needed will
+ * be applied to the array pointer before it is passed to the IDCT code.)
+ * Note that the number of samples emitted by the IDCT routine is
+ * DCT_scaled_size * DCT_scaled_size.
+ */
+
+/* typedef inverse_DCT_method_ptr is declared in jpegint.h */
+
+/*
+ * Each IDCT routine has its own ideas about the best dct_table element type.
+ */
+
+typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+#if BITS_IN_JSAMPLE == 8
+typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS  2	/* fractional bits in scale factors */
+#else
+typedef INT32 IFAST_MULT_TYPE;	/* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS  13	/* fractional bits in scale factors */
+#endif
+typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+
+
+/*
+ * Each IDCT routine is responsible for range-limiting its results and
+ * converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
+ * be quite far out of range if the input data is corrupt, so a bulletproof
+ * range-limiting step is required.  We use a mask-and-table-lookup method
+ * to do the combined operations quickly.  See the comments with
+ * prepare_range_limit_table (in jdmaster.c) for more info.
+ */
+
+#define IDCT_range_limit(cinfo)  ((cinfo)->sample_range_limit + CENTERJSAMPLE)
+
+#define RANGE_MASK  (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_fdct_islow		jFDislow
+#define jpeg_fdct_ifast		jFDifast
+#define jpeg_fdct_float		jFDfloat
+#define jpeg_idct_islow		jRDislow
+#define jpeg_idct_ifast		jRDifast
+#define jpeg_idct_float		jRDfloat
+#define jpeg_idct_4x4		jRD4x4
+#define jpeg_idct_2x2		jRD2x2
+#define jpeg_idct_1x1		jRD1x1
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* Extern declarations for the forward and inverse DCT routines. */
+
+EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
+EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+
+EXTERN(void) jpeg_idct_islow
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_ifast
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_float
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_1x1
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+	 JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+
+
+/*
+ * Macros for handling fixed-point arithmetic; these are used by many
+ * but not all of the DCT/IDCT modules.
+ *
+ * All values are expected to be of type INT32.
+ * Fractional constants are scaled left by CONST_BITS bits.
+ * CONST_BITS is defined within each module using these macros,
+ * and may differ from one module to the next.
+ */
+
+#define ONE	((INT32) 1)
+#define CONST_SCALE (ONE << CONST_BITS)
+
+/* Convert a positive real constant to an integer scaled by CONST_SCALE.
+ * Caution: some C compilers fail to reduce "FIX(constant)" at compile time,
+ * thus causing a lot of useless floating-point operations at run time.
+ */
+
+#define FIX(x)	((INT32) ((x) * CONST_SCALE + 0.5))
+
+/* Descale and correctly round an INT32 value that's scaled by N bits.
+ * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
+ * the fudge factor is correct for either sign of X.
+ */
+
+#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
+
+/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
+ * This macro is used only when the two inputs will actually be no more than
+ * 16 bits wide, so that a 16x16->32 bit multiply can be used instead of a
+ * full 32x32 multiply.  This provides a useful speedup on many machines.
+ * Unfortunately there is no way to specify a 16x16->32 multiply portably
+ * in C, but some C compilers will do the right thing if you provide the
+ * correct combination of casts.
+ */
+
+#ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
+#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
+#endif
+#ifdef SHORTxLCONST_32		/* known to work with Microsoft C 6.0 */
+#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT32) (const)))
+#endif
+
+#ifndef MULTIPLY16C16		/* default definition */
+#define MULTIPLY16C16(var,const)  ((var) * (const))
+#endif
+
+/* Same except both inputs are variables. */
+
+#ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
+#define MULTIPLY16V16(var1,var2)  (((INT16) (var1)) * ((INT16) (var2)))
+#endif
+
+#ifndef MULTIPLY16V16		/* default definition */
+#define MULTIPLY16V16(var1,var2)  ((var1) * (var2))
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdhuff.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdhuff.h
new file mode 100644
index 0000000..47665a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jdhuff.h
@@ -0,0 +1,235 @@
+/*
+ * jdhuff.h
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2010-2011, D. R. Commander.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains declarations for Huffman entropy decoding routines
+ * that are shared between the sequential decoder (jdhuff.c) and the
+ * progressive decoder (jdphuff.c).  No other modules need to see these.
+ */
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_make_d_derived_tbl	jMkDDerived
+#define jpeg_fill_bit_buffer	jFilBitBuf
+#define jpeg_huff_decode	jHufDecode
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/* Derived data constructed for each Huffman table */
+
+#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
+
+typedef struct {
+  /* Basic tables: (element [0] of each array is unused) */
+  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
+  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
+  INT32 valoffset[18];		/* huffval[] offset for codes of length k */
+  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
+   * the smallest code of length k; so given a code of length k, the
+   * corresponding symbol is huffval[code + valoffset[k]]
+   */
+
+  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
+  JHUFF_TBL *pub;
+
+  /* Lookahead table: indexed by the next HUFF_LOOKAHEAD bits of
+   * the input data stream.  If the next Huffman code is no more
+   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
+   * the corresponding symbol directly from this table.
+   *
+   * As read by HUFF_DECODE and HUFF_DECODE_FAST, the low
+   * HUFF_LOOKAHEAD bits of each entry hold the symbol, and the
+   * bits above them hold the code length in bits (a length
+   * greater than HUFF_LOOKAHEAD means the code is too long).
+   */
+  int lookup[1<<HUFF_LOOKAHEAD];
+} d_derived_tbl;
+
+/* Expand a Huffman table definition into the derived format */
+EXTERN(void) jpeg_make_d_derived_tbl
+	JPP((j_decompress_ptr cinfo, boolean isDC, int tblno,
+	     d_derived_tbl ** pdtbl));
+
+
+/*
+ * Fetching the next N bits from the input stream is a time-critical operation
+ * for the Huffman decoders.  We implement it with a combination of inline
+ * macros and out-of-line subroutines.  Note that N (the number of bits
+ * demanded at one time) never exceeds 15 for JPEG use.
+ *
+ * We read source bytes into get_buffer and dole out bits as needed.
+ * If get_buffer already contains enough bits, they are fetched in-line
+ * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
+ * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
+ * as full as possible (not just to the number of bits needed; this
+ * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
+ * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
+ * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
+ * at least the requested number of bits --- dummy zeroes are inserted if
+ * necessary.
+ */
+
+#if __WORDSIZE == 64 || defined(_WIN64)
+
+typedef size_t bit_buf_type;	/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64		/* size of buffer in bits */
+
+#else
+
+typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  32		/* size of buffer in bits */
+
+#endif
+#define LOG_TWO_BIT_BUF_SIZE  5        /* = log_2(32); NOTE(review): not log_2 of the 64-bit BIT_BUF_SIZE -- confirm */
+
+/* If long is > 32 bits on your machine, and shifting/masking longs is
+ * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
+ * appropriately should be a win.  Unfortunately we can't define the size
+ * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
+ * because not all machines measure sizeof in 8-bit bytes.
+ */
+
+typedef struct {		/* Bitreading state saved across MCUs */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+} bitread_perm_state;
+
+typedef struct {		/* Bitreading working state within an MCU */
+  /* Current data source location */
+  /* We need a copy, rather than munging the original, in case of suspension */
+  const JOCTET * next_input_byte; /* => next byte to read from source */
+  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
+  /* Bit input buffer --- note these values are kept in register variables,
+   * not in this struct, inside the inner loops.
+   */
+  bit_buf_type get_buffer;	/* current bit-extraction buffer */
+  int bits_left;		/* # of unused bits in it */
+  /* Pointer needed by jpeg_fill_bit_buffer. */
+  j_decompress_ptr cinfo;	/* back link to decompress master record */
+} bitread_working_state;
+
+/* Macros to declare and load/save bitread local variables. */
+#define BITREAD_STATE_VARS  \
+	register bit_buf_type get_buffer;  \
+	register int bits_left;  \
+	bitread_working_state br_state
+
+#define BITREAD_LOAD_STATE(cinfop,permstate)  \
+	br_state.cinfo = cinfop; \
+	br_state.next_input_byte = cinfop->src->next_input_byte; \
+	br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+	get_buffer = permstate.get_buffer; \
+	bits_left = permstate.bits_left;
+
+#define BITREAD_SAVE_STATE(cinfop,permstate)  \
+	cinfop->src->next_input_byte = br_state.next_input_byte; \
+	cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+	permstate.get_buffer = get_buffer; \
+	permstate.bits_left = bits_left
+
+/*
+ * These macros provide the in-line portion of bit fetching.
+ * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
+ * before using GET_BITS, PEEK_BITS, or DROP_BITS.
+ * The variables get_buffer and bits_left are assumed to be locals,
+ * but the state struct might not be (jpeg_huff_decode needs this).
+ *	CHECK_BIT_BUFFER(state,n,action);
+ *		Ensure there are N bits in get_buffer; if suspend, take action.
+ *      val = GET_BITS(n);
+ *		Fetch next N bits.
+ *      val = PEEK_BITS(n);
+ *		Fetch next N bits without removing them from the buffer.
+ *	DROP_BITS(n);
+ *		Discard next N bits.
+ * The value N should be a simple variable, not an expression, because it
+ * is evaluated multiple times.
+ */
+
+#define CHECK_BIT_BUFFER(state,nbits,action) \
+	{ if (bits_left < (nbits)) {  \
+	    if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
+	      { action; }  \
+	    get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+
+#define GET_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+
+#define PEEK_BITS(nbits) \
+	(((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
+
+#define DROP_BITS(nbits) \
+	(bits_left -= (nbits))
+
+/* Load up the bit buffer to a depth of at least nbits */
+EXTERN(boolean) jpeg_fill_bit_buffer
+	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
+	     register int bits_left, int nbits));
+
+
+/*
+ * Code for extracting next Huffman-coded symbol from input bit stream.
+ * Again, this is time-critical and we make the main paths be macros.
+ *
+ * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
+ * without looping.  Usually, more than 95% of the Huffman codes will be 8
+ * or fewer bits long.  The few overlength codes are handled with a loop,
+ * which need not be inline code.
+ *
+ * Notes about the HUFF_DECODE macro:
+ * 1. Near the end of the data segment, we may fail to get enough bits
+ *    for a lookahead.  In that case, we do it the hard way.
+ * 2. If the lookahead table contains no entry, the next code must be
+ *    more than HUFF_LOOKAHEAD bits long.
+ * 3. jpeg_huff_decode returns -1 if forced to suspend.
+ */
+
+#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
+{ register int nb, look; \
+  if (bits_left < HUFF_LOOKAHEAD) { \
+    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+    if (bits_left < HUFF_LOOKAHEAD) { \
+      nb = 1; goto slowlabel; \
+    } \
+  } \
+  look = PEEK_BITS(HUFF_LOOKAHEAD); \
+  if ((nb = (htbl->lookup[look] >> HUFF_LOOKAHEAD)) <= HUFF_LOOKAHEAD) { \
+    DROP_BITS(nb); \
+    result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
+  } else { \
+slowlabel: \
+    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
+	{ failaction; } \
+    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+  } \
+}
+
+#define HUFF_DECODE_FAST(s,nb,htbl) \
+  FILL_BIT_BUFFER_FAST; \
+  s = PEEK_BITS(HUFF_LOOKAHEAD); \
+  s = htbl->lookup[s]; \
+  nb = s >> HUFF_LOOKAHEAD; \
+  /* Pre-execute the common case of nb <= HUFF_LOOKAHEAD */ \
+  DROP_BITS(nb); \
+  s = s & ((1 << HUFF_LOOKAHEAD) - 1); \
+  if (nb > HUFF_LOOKAHEAD) { \
+    /* Equivalent of jpeg_huff_decode() */ \
+    /* Don't use GET_BITS() here because we don't want to modify bits_left */ \
+    s = (get_buffer >> bits_left) & ((1 << (nb)) - 1); \
+    while (s > htbl->maxcode[nb]) { \
+      s <<= 1; \
+      s |= GET_BITS(1); \
+      nb++; \
+    } \
+    s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) & 0xFF ]; \
+  }
+
+/* Out-of-line case for Huffman code fetching */
+EXTERN(int) jpeg_huff_decode
+	JPP((bitread_working_state * state, register bit_buf_type get_buffer,
+	     register int bits_left, d_derived_tbl * htbl, int min_bits));
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jerror.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jerror.h
new file mode 100644
index 0000000..275086e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jerror.h
@@ -0,0 +1,314 @@
+/*
+ * jerror.h
+ *
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file defines the error and message codes for the JPEG library.
+ * Edit this file to add new codes, or to translate the message strings to
+ * some other language.
+ * A set of error-reporting macros are defined too.  Some applications using
+ * the JPEG library may wish to include this file to get the error codes
+ * and/or the macros.
+ */
+
+/*
+ * To define the enum list of message codes, include this file without
+ * defining macro JMESSAGE.  To create a message string table, include it
+ * again with a suitable JMESSAGE definition (see jerror.c for an example).
+ */
+#ifndef JMESSAGE
+#ifndef JERROR_H
+/* First time through, define the enum list */
+#define JMAKE_ENUM_LIST
+#else
+/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
+#define JMESSAGE(code,string)
+#endif /* JERROR_H */
+#endif /* JMESSAGE */
+
+#ifdef JMAKE_ENUM_LIST
+
+typedef enum {
+
+#define JMESSAGE(code,string)	code ,
+
+#endif /* JMAKE_ENUM_LIST */
+
+JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
+
+/* For maintenance convenience, list is alphabetical by message code name */
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_ARITH_NOTIMPL,
+	 "Sorry, arithmetic coding is not implemented")
+#endif
+JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
+JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
+JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
+JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#endif
+JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
+JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+	 "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
+JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
+JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
+JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
+JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length")
+JMESSAGE(JERR_BAD_LIB_VERSION,
+	 "Wrong JPEG library version: library is %d, caller expects %d")
+JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan")
+JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d")
+JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d")
+JMESSAGE(JERR_BAD_PROGRESSION,
+	 "Invalid progressive parameters Ss=%d Se=%d Ah=%d Al=%d")
+JMESSAGE(JERR_BAD_PROG_SCRIPT,
+	 "Invalid progressive parameters at scan script entry %d")
+JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors")
+JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d")
+JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d")
+JMESSAGE(JERR_BAD_STRUCT_SIZE,
+	 "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u")
+JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access")
+JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small")
+JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here")
+JMESSAGE(JERR_CCIR601_NOTIMPL, "CCIR601 sampling not implemented yet")
+JMESSAGE(JERR_COMPONENT_COUNT, "Too many color components: %d, max %d")
+JMESSAGE(JERR_CONVERSION_NOTIMPL, "Unsupported color conversion request")
+JMESSAGE(JERR_DAC_INDEX, "Bogus DAC index %d")
+JMESSAGE(JERR_DAC_VALUE, "Bogus DAC value 0x%x")
+JMESSAGE(JERR_DHT_INDEX, "Bogus DHT index %d")
+JMESSAGE(JERR_DQT_INDEX, "Bogus DQT index %d")
+JMESSAGE(JERR_EMPTY_IMAGE, "Empty JPEG image (DNL not supported)")
+JMESSAGE(JERR_EMS_READ, "Read from EMS failed")
+JMESSAGE(JERR_EMS_WRITE, "Write to EMS failed")
+JMESSAGE(JERR_EOI_EXPECTED, "Didn't expect more than one scan")
+JMESSAGE(JERR_FILE_READ, "Input file read error")
+JMESSAGE(JERR_FILE_WRITE, "Output file write error --- out of disk space?")
+JMESSAGE(JERR_FRACT_SAMPLE_NOTIMPL, "Fractional sampling not implemented yet")
+JMESSAGE(JERR_HUFF_CLEN_OVERFLOW, "Huffman code size table overflow")
+JMESSAGE(JERR_HUFF_MISSING_CODE, "Missing Huffman code table entry")
+JMESSAGE(JERR_IMAGE_TOO_BIG, "Maximum supported image dimension is %u pixels")
+JMESSAGE(JERR_INPUT_EMPTY, "Empty input file")
+JMESSAGE(JERR_INPUT_EOF, "Premature end of input file")
+JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
+	 "Cannot transcode due to multiple use of quantization table %d")
+JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
+JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
+JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
+JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+#endif
+JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
+JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
+JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
+JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined")
+JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x")
+JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)")
+JMESSAGE(JERR_QUANT_COMPONENTS,
+	 "Cannot quantize more than %d color components")
+JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors")
+JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors")
+JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers")
+JMESSAGE(JERR_SOF_NO_SOS, "Invalid JPEG file structure: missing SOS marker")
+JMESSAGE(JERR_SOF_UNSUPPORTED, "Unsupported JPEG process: SOF type 0x%02x")
+JMESSAGE(JERR_SOI_DUPLICATE, "Invalid JPEG file structure: two SOI markers")
+JMESSAGE(JERR_SOS_NO_SOF, "Invalid JPEG file structure: SOS before SOF")
+JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s")
+JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file")
+JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file")
+JMESSAGE(JERR_TFILE_WRITE,
+	 "Write failed on temporary file --- out of disk space?")
+JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines")
+JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x")
+JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up")
+JMESSAGE(JERR_WIDTH_OVERFLOW, "Image too wide for this implementation")
+JMESSAGE(JERR_XMS_READ, "Read from XMS failed")
+JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed")
+JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT)
+JMESSAGE(JMSG_VERSION, JVERSION)
+JMESSAGE(JTRC_16BIT_TABLES,
+	 "Caution: quantization tables are too coarse for baseline JPEG")
+JMESSAGE(JTRC_ADOBE,
+	 "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d")
+JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u")
+JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u")
+JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x")
+JMESSAGE(JTRC_DHT, "Define Huffman Table 0x%02x")
+JMESSAGE(JTRC_DQT, "Define Quantization Table %d  precision %d")
+JMESSAGE(JTRC_DRI, "Define Restart Interval %u")
+JMESSAGE(JTRC_EMS_CLOSE, "Freed EMS handle %u")
+JMESSAGE(JTRC_EMS_OPEN, "Obtained EMS handle %u")
+JMESSAGE(JTRC_EOI, "End Of Image")
+JMESSAGE(JTRC_HUFFBITS, "        %3d %3d %3d %3d %3d %3d %3d %3d")
+JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d  %d")
+JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
+	 "Warning: thumbnail image size does not match data length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION,
+	 "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_THUMBNAIL, "    with %d x %d thumbnail image")
+JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
+JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
+JMESSAGE(JTRC_QUANTVALS, "        %4u %4u %4u %4u %4u %4u %4u %4u")
+JMESSAGE(JTRC_QUANT_3_NCOLORS, "Quantizing to %d = %d*%d*%d colors")
+JMESSAGE(JTRC_QUANT_NCOLORS, "Quantizing to %d colors")
+JMESSAGE(JTRC_QUANT_SELECTED, "Selected %d colors for quantization")
+JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d")
+JMESSAGE(JTRC_RST, "RST%d")
+JMESSAGE(JTRC_SMOOTH_NOTIMPL,
+	 "Smoothing not supported with nonstandard sampling ratios")
+JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d")
+JMESSAGE(JTRC_SOF_COMPONENT, "    Component %d: %dhx%dv q=%d")
+JMESSAGE(JTRC_SOI, "Start of Image")
+JMESSAGE(JTRC_SOS, "Start Of Scan: %d components")
+JMESSAGE(JTRC_SOS_COMPONENT, "    Component %d: dc=%d ac=%d")
+JMESSAGE(JTRC_SOS_PARAMS, "  Ss=%d, Se=%d, Ah=%d, Al=%d")
+JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s")
+JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s")
+JMESSAGE(JTRC_THUMB_JPEG,
+	 "JFIF extension marker: JPEG-compressed thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_PALETTE,
+	 "JFIF extension marker: palette thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_RGB,
+	 "JFIF extension marker: RGB thumbnail image, length %u")
+JMESSAGE(JTRC_UNKNOWN_IDS,
+	 "Unrecognized component IDs %d %d %d, assuming YCbCr")
+JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
+JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
+JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+JMESSAGE(JWRN_BOGUS_PROGRESSION,
+	 "Inconsistent progression sequence for component %d coefficient %d")
+JMESSAGE(JWRN_EXTRANEOUS_DATA,
+	 "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x")
+JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment")
+JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code")
+JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d")
+JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file")
+JMESSAGE(JWRN_MUST_RESYNC,
+	 "Corrupt JPEG data: found marker 0x%02x instead of RST%d")
+JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG")
+JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#if defined(C_ARITH_CODING_SUPPORTED) || defined(D_ARITH_CODING_SUPPORTED)
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+#endif
+
+#ifdef JMAKE_ENUM_LIST
+
+  JMSG_LASTMSGCODE
+} J_MESSAGE_CODE;
+
+#undef JMAKE_ENUM_LIST
+#endif /* JMAKE_ENUM_LIST */
+
+/* Zap JMESSAGE macro so that future re-inclusions do nothing by default */
+#undef JMESSAGE
+
+
+#ifndef JERROR_H
+#define JERROR_H
+
+/* Macros to simplify using the error and trace message stuff */
+/* The first parameter is either type of cinfo pointer */
+
+/* Fatal errors (print message and exit) */
+#define ERREXIT(cinfo,code)  \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT1(cinfo,code,p1)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT2(cinfo,code,p1,p2)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT3(cinfo,code,p1,p2,p3)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT4(cinfo,code,p1,p2,p3,p4)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXITS(cinfo,code,str)  \
+  ((cinfo)->err->msg_code = (code), \
+   strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+
+#define MAKESTMT(stuff)		do { stuff } while (0)
+
+/* Nonfatal errors (we can keep going, but the data is probably corrupt) */
+#define WARNMS(cinfo,code)  \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+#define WARNMS1(cinfo,code,p1)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+#define WARNMS2(cinfo,code,p1,p2)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+
+/* Informational/debugging messages */
+#define TRACEMS(cinfo,lvl,code)  \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+#define TRACEMS1(cinfo,lvl,code,p1)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+#define TRACEMS2(cinfo,lvl,code,p1,p2)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+#define TRACEMS3(cinfo,lvl,code,p1,p2,p3)  \
+  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+	   (cinfo)->err->msg_code = (code); \
+	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4)  \
+  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+	   (cinfo)->err->msg_code = (code); \
+	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5)  \
+  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+	   _mp[4] = (p5); \
+	   (cinfo)->err->msg_code = (code); \
+	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8)  \
+  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
+	   _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+	   _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+	   (cinfo)->err->msg_code = (code); \
+	   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
+#define TRACEMSS(cinfo,lvl,code,str)  \
+  ((cinfo)->err->msg_code = (code), \
+   strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+
+#endif /* JERROR_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jinclude.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jinclude.h
new file mode 100644
index 0000000..0a4f151
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jinclude.h
@@ -0,0 +1,91 @@
+/*
+ * jinclude.h
+ *
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file exists to provide a single place to fix any problems with
+ * including the wrong system include files.  (Common problems are taken
+ * care of by the standard jconfig symbols, but on really weird systems
+ * you may have to edit this file.)
+ *
+ * NOTE: this file is NOT intended to be included by applications using the
+ * JPEG library.  Most applications need only include jpeglib.h.
+ */
+
+
+/* Include auto-config file to find out which system include files we need. */
+
+#include "jconfig.h"		/* auto configuration options */
+#define JCONFIG_INCLUDED	/* so that jpeglib.h doesn't do it again */
+
+/*
+ * We need the NULL macro and size_t typedef.
+ * On an ANSI-conforming system it is sufficient to include <stddef.h>.
+ * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
+ * pull in <sys/types.h> as well.
+ * Note that the core JPEG library does not require <stdio.h>;
+ * only the default error handler and data source/destination modules do.
+ * But we must pull it in because of the references to FILE in jpeglib.h.
+ * You can remove those references if you want to compile without <stdio.h>.
+ */
+
+#ifdef HAVE_STDDEF_H
+#include <stddef.h>
+#endif
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+#ifdef NEED_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#include <stdio.h>
+
+/*
+ * We need memory copying and zeroing functions, plus strncpy().
+ * ANSI and System V implementations declare these in <string.h>.
+ * BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
+ * Some systems may declare memset and memcpy in <memory.h>.
+ *
+ * NOTE: we assume the size parameters to these functions are of type size_t.
+ * Change the casts in these macros if not!
+ */
+
+#ifdef NEED_BSD_STRINGS
+
+#include <strings.h>
+#define MEMZERO(target,size)	bzero((void *)(target), (size_t)(size))
+#define MEMCOPY(dest,src,size)	bcopy((const void *)(src), (void *)(dest), (size_t)(size))
+
+#else /* not BSD, assume ANSI/SysV string lib */
+
+#include <string.h>
+#define MEMZERO(target,size)	memset((void *)(target), 0, (size_t)(size))
+#define MEMCOPY(dest,src,size)	memcpy((void *)(dest), (const void *)(src), (size_t)(size))
+
+#endif
+
+/*
+ * In ANSI C, and indeed any rational implementation, size_t is also the
+ * type returned by sizeof().  However, it seems there are some irrational
+ * implementations out there, in which sizeof() returns an int even though
+ * size_t is defined as long or unsigned long.  To ensure consistent results
+ * we always use this SIZEOF() macro in place of using sizeof() directly.
+ */
+
+#define SIZEOF(object)	((size_t) sizeof(object))
+
+/*
+ * The modules that use fread() and fwrite() always invoke them through
+ * these macros.  On some systems you may need to twiddle the argument casts.
+ * CAUTION: argument order is different from underlying functions!
+ */
+
+#define JFREAD(file,buf,sizeofbuf)  \
+  ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+#define JFWRITE(file,buf,sizeofbuf)  \
+  ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmemsys.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmemsys.h
new file mode 100644
index 0000000..b190945
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmemsys.h
@@ -0,0 +1,198 @@
+/*
+ * jmemsys.h
+ *
+ * Copyright (C) 1992-1997, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This include file defines the interface between the system-independent
+ * and system-dependent portions of the JPEG memory manager.  No other
+ * modules need include it.  (The system-independent portion is jmemmgr.c;
+ * there are several different versions of the system-dependent portion.)
+ *
+ * This file works as-is for the system-dependent memory managers supplied
+ * in the IJG distribution.  You may need to modify it if you write a
+ * custom memory manager.  If system-dependent changes are needed in
+ * this file, the best method is to #ifdef them based on a configuration
+ * symbol supplied in jconfig.h, as we have done with USE_MSDOS_MEMMGR
+ * and USE_MAC_MEMMGR.
+ */
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_get_small		jGetSmall
+#define jpeg_free_small		jFreeSmall
+#define jpeg_get_large		jGetLarge
+#define jpeg_free_large		jFreeLarge
+#define jpeg_mem_available	jMemAvail
+#define jpeg_open_backing_store	jOpenBackStore
+#define jpeg_mem_init		jMemInit
+#define jpeg_mem_term		jMemTerm
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/*
+ * These two functions are used to allocate and release small chunks of
+ * memory.  (Typically the total amount requested through jpeg_get_small is
+ * no more than 20K or so; this will be requested in chunks of a few K each.)
+ * Behavior should be the same as for the standard library functions malloc
+ * and free; in particular, jpeg_get_small must return NULL on failure.
+ * On most systems, these ARE malloc and free.  jpeg_free_small is passed the
+ * size of the object being freed, just in case it's needed.
+ * On an 80x86 machine using small-data memory model, these manage near heap.
+ */
+
+EXTERN(void *) jpeg_get_small JPP((j_common_ptr cinfo, size_t sizeofobject));
+EXTERN(void) jpeg_free_small JPP((j_common_ptr cinfo, void * object,
+				  size_t sizeofobject));
+
+/*
+ * These two functions are used to allocate and release large chunks of
+ * memory (up to the total free space designated by jpeg_mem_available).
+ * The interface is the same as above, except that on an 80x86 machine,
+ * far pointers are used.  On most other machines these are identical to
+ * the jpeg_get/free_small routines; but we keep them separate anyway,
+ * in case a different allocation strategy is desirable for large chunks.
+ */
+
+EXTERN(void FAR *) jpeg_get_large JPP((j_common_ptr cinfo,
+				       size_t sizeofobject));
+EXTERN(void) jpeg_free_large JPP((j_common_ptr cinfo, void FAR * object,
+				  size_t sizeofobject));
+
+/*
+ * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
+ * be requested in a single call to jpeg_get_large (and jpeg_get_small for that
+ * matter, but that case should never come into play).  This macro is needed
+ * to model the 64Kb-segment-size limit of far addressing on 80x86 machines.
+ * On those machines, we expect that jconfig.h will provide a proper value.
+ * On machines with 32-bit flat address spaces, any large constant may be used.
+ *
+ * NB: jmemmgr.c expects that MAX_ALLOC_CHUNK will be representable as type
+ * size_t and will be a multiple of sizeof(align_type).
+ */
+
+#ifndef MAX_ALLOC_CHUNK		/* may be overridden in jconfig.h */
+#define MAX_ALLOC_CHUNK  1000000000L
+#endif
+
+/*
+ * This routine computes the total space still available for allocation by
+ * jpeg_get_large.  If more space than this is needed, backing store will be
+ * used.  NOTE: any memory already allocated must not be counted.
+ *
+ * There is a minimum space requirement, corresponding to the minimum
+ * feasible buffer sizes; jmemmgr.c will request that much space even if
+ * jpeg_mem_available returns zero.  The maximum space needed, enough to hold
+ * all working storage in memory, is also passed in case it is useful.
+ * Finally, the total space already allocated is passed.  If no better
+ * method is available, cinfo->mem->max_memory_to_use - already_allocated
+ * is often a suitable calculation.
+ *
+ * It is OK for jpeg_mem_available to underestimate the space available
+ * (that'll just lead to more backing-store access than is really necessary).
+ * However, an overestimate will lead to failure.  Hence it's wise to subtract
+ * a slop factor from the true available space.  5% should be enough.
+ *
+ * On machines with lots of virtual memory, any large constant may be returned.
+ * Conversely, zero may be returned to always use the minimum amount of memory.
+ */
+
+EXTERN(size_t) jpeg_mem_available JPP((j_common_ptr cinfo,
+				     size_t min_bytes_needed,
+				     size_t max_bytes_needed,
+				     size_t already_allocated));
+
+
+/*
+ * This structure holds whatever state is needed to access a single
+ * backing-store object.  The read/write/close method pointers are called
+ * by jmemmgr.c to manipulate the backing-store object; all other fields
+ * are private to the system-dependent backing store routines.
+ */
+
+#define TEMP_NAME_LENGTH   64	/* max length of a temporary file's name */
+
+
+#ifdef USE_MSDOS_MEMMGR		/* DOS-specific junk */
+
+typedef unsigned short XMSH;	/* type of extended-memory handles */
+typedef unsigned short EMSH;	/* type of expanded-memory handles */
+
+typedef union {
+  short file_handle;		/* DOS file handle if it's a temp file */
+  XMSH xms_handle;		/* handle if it's a chunk of XMS */
+  EMSH ems_handle;		/* handle if it's a chunk of EMS */
+} handle_union;
+
+#endif /* USE_MSDOS_MEMMGR */
+
+#ifdef USE_MAC_MEMMGR		/* Mac-specific junk */
+#include <Files.h>
+#endif /* USE_MAC_MEMMGR */
+
+
+typedef struct backing_store_struct * backing_store_ptr;
+
+typedef struct backing_store_struct {
+  /* Methods for reading/writing/closing this backing-store object */
+  JMETHOD(void, read_backing_store, (j_common_ptr cinfo,
+				     backing_store_ptr info,
+				     void FAR * buffer_address,
+				     long file_offset, long byte_count));
+  JMETHOD(void, write_backing_store, (j_common_ptr cinfo,
+				      backing_store_ptr info,
+				      void FAR * buffer_address,
+				      long file_offset, long byte_count));
+  JMETHOD(void, close_backing_store, (j_common_ptr cinfo,
+				      backing_store_ptr info));
+
+  /* Private fields for system-dependent backing-store management */
+#ifdef USE_MSDOS_MEMMGR
+  /* For the MS-DOS manager (jmemdos.c), we need: */
+  handle_union handle;		/* reference to backing-store storage object */
+  char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
+#else
+#ifdef USE_MAC_MEMMGR
+  /* For the Mac manager (jmemmac.c), we need: */
+  short temp_file;		/* file reference number to temp file */
+  FSSpec tempSpec;		/* the FSSpec for the temp file */
+  char temp_name[TEMP_NAME_LENGTH]; /* name if it's a file */
+#else
+  /* For a typical implementation with temp files, we need: */
+  FILE * temp_file;		/* stdio reference to temp file */
+  char temp_name[TEMP_NAME_LENGTH]; /* name of temp file */
+#endif
+#endif
+} backing_store_info;
+
+
+/*
+ * Initial opening of a backing-store object.  This must fill in the
+ * read/write/close pointers in the object.  The read/write routines
+ * may take an error exit if the specified maximum file size is exceeded.
+ * (If jpeg_mem_available always returns a large value, this routine can
+ * just take an error exit.)
+ */
+
+EXTERN(void) jpeg_open_backing_store JPP((j_common_ptr cinfo,
+					  backing_store_ptr info,
+					  long total_bytes_needed));
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required.  jpeg_mem_init will be called before anything is
+ * allocated (and, therefore, nothing in cinfo is of use except the error
+ * manager pointer).  It should return a suitable default value for
+ * max_memory_to_use; this may subsequently be overridden by the surrounding
+ * application.  (Note that max_memory_to_use is only important if
+ * jpeg_mem_available chooses to consult it ... no one else will.)
+ * jpeg_mem_term may assume that all requested memory has been freed and that
+ * all opened backing-store objects have been closed.
+ */
+
+EXTERN(long) jpeg_mem_init JPP((j_common_ptr cinfo));
+EXTERN(void) jpeg_mem_term JPP((j_common_ptr cinfo));
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmorecfg.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmorecfg.h
new file mode 100644
index 0000000..f762ae7
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jmorecfg.h
@@ -0,0 +1,446 @@
+/*
+ * jmorecfg.h
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Copyright (C) 2009, 2011, D. R. Commander.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains additional configuration options that customize the
+ * JPEG software for special applications or support machine-dependent
+ * optimizations.  Most users will not need to touch this file.
+ */
+
+/*
+ * When we're building for android, turn on ANDROID_RGB by default.
+ * This is needed for components like skia which make use of the
+ * new encodings defined behind ANDROID_RGB. It's not a reasonable
+ * config to have ANDROID_RGB off.
+ */
+#ifdef ANDROID
+#ifndef ANDROID_RGB
+#define ANDROID_RGB
+#endif
+#endif
+
+/*
+ * Define BITS_IN_JSAMPLE as either
+ *   8   for 8-bit sample values (the usual setting)
+ *   12  for 12-bit sample values
+ * Only 8 and 12 are legal data precisions for lossy JPEG according to the
+ * JPEG standard, and the IJG code does not support anything else!
+ * We do not support run-time selection of data precision, sorry.
+ */
+
+#define BITS_IN_JSAMPLE  8	/* use 8 or 12 */
+
+
+/*
+ * Maximum number of components (color channels) allowed in JPEG image.
+ * To meet the letter of the JPEG spec, set this to 255.  However, darn
+ * few applications need more than 4 channels (maybe 5 for CMYK + alpha
+ * mask).  We recommend 10 as a reasonable compromise; use 4 if you are
+ * really short on memory.  (Each allowed component costs a hundred or so
+ * bytes of storage, whether actually used in an image or not.)
+ */
+
+#define MAX_COMPONENTS  10	/* maximum number of image components */
+
+
+/*
+ * Basic data types.
+ * You may need to change these if you have a machine with unusual data
+ * type sizes; for example, "char" not 8 bits, "short" not 16 bits,
+ * or "long" not 32 bits.  We don't care whether "int" is 16 or 32 bits,
+ * but it had better be at least 16.
+ */
+
+/* Representation of a single sample (pixel element value).
+ * We frequently allocate large arrays of these, so it's important to keep
+ * them small.  But if you have memory to burn and access to char or short
+ * arrays is very slow on your hardware, you might want to change these.
+ */
+
+#if BITS_IN_JSAMPLE == 8
+/* JSAMPLE should be the smallest type that will hold the values 0..255.
+ * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
+ */
+
+#ifdef HAVE_UNSIGNED_CHAR
+
+typedef unsigned char JSAMPLE;
+#define GETJSAMPLE(value)  ((int) (value))
+
+#else /* not HAVE_UNSIGNED_CHAR */
+
+typedef char JSAMPLE;
+#ifdef __CHAR_UNSIGNED__
+#define GETJSAMPLE(value)  ((int) (value))
+#else
+#define GETJSAMPLE(value)  ((int) (value) & 0xFF)
+#endif /* __CHAR_UNSIGNED__ */
+
+#endif /* HAVE_UNSIGNED_CHAR */
+
+#define MAXJSAMPLE	255
+#define CENTERJSAMPLE	128
+
+#endif /* BITS_IN_JSAMPLE == 8 */
+
+
+#if BITS_IN_JSAMPLE == 12
+/* JSAMPLE should be the smallest type that will hold the values 0..4095.
+ * On nearly all machines "short" will do nicely.
+ */
+
+typedef short JSAMPLE;
+#define GETJSAMPLE(value)  ((int) (value))
+
+#define MAXJSAMPLE	4095
+#define CENTERJSAMPLE	2048
+
+#endif /* BITS_IN_JSAMPLE == 12 */
+
+
+/* Representation of a DCT frequency coefficient.
+ * This should be a signed value of at least 16 bits; "short" is usually OK.
+ * Again, we allocate large arrays of these, but you can change to int
+ * if you have memory to burn and "short" is really slow.
+ */
+
+typedef short JCOEF;
+
+
+/* Compressed datastreams are represented as arrays of JOCTET.
+ * These must be EXACTLY 8 bits wide, at least once they are written to
+ * external storage.  Note that when using the stdio data source/destination
+ * managers, this is also the data type passed to fread/fwrite.
+ */
+
+#ifdef HAVE_UNSIGNED_CHAR
+
+typedef unsigned char JOCTET;
+#define GETJOCTET(value)  (value)
+
+#else /* not HAVE_UNSIGNED_CHAR */
+
+typedef char JOCTET;
+#ifdef __CHAR_UNSIGNED__
+#define GETJOCTET(value)  (value)
+#else
+#define GETJOCTET(value)  ((value) & 0xFF)
+#endif /* __CHAR_UNSIGNED__ */
+
+#endif /* HAVE_UNSIGNED_CHAR */
+
+
+/* These typedefs are used for various table entries and so forth.
+ * They must be at least as wide as specified; but making them too big
+ * won't cost a huge amount of memory, so we don't provide special
+ * extraction code like we did for JSAMPLE.  (In other words, these
+ * typedefs live at a different point on the speed/space tradeoff curve.)
+ */
+
+/* UINT8 must hold at least the values 0..255. */
+
+#ifdef HAVE_UNSIGNED_CHAR
+typedef unsigned char UINT8;
+#else /* not HAVE_UNSIGNED_CHAR */
+#ifdef __CHAR_UNSIGNED__
+typedef char UINT8;
+#else /* not __CHAR_UNSIGNED__ */
+typedef short UINT8;
+#endif /* __CHAR_UNSIGNED__ */
+#endif /* HAVE_UNSIGNED_CHAR */
+
+/* UINT16 must hold at least the values 0..65535. */
+
+#ifdef HAVE_UNSIGNED_SHORT
+typedef unsigned short UINT16;
+#else /* not HAVE_UNSIGNED_SHORT */
+typedef unsigned int UINT16;
+#endif /* HAVE_UNSIGNED_SHORT */
+
+/* INT16 must hold at least the values -32768..32767. */
+
+#ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
+typedef short INT16;
+#endif
+
+/* INT32 must hold at least signed 32-bit values. */
+
+#ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+typedef long INT32;
+#endif
+
+/* Datatype used for image dimensions.  The JPEG standard only supports
+ * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
+ * "unsigned int" is sufficient on all machines.  However, if you need to
+ * handle larger images and you don't mind deviating from the spec, you
+ * can change this datatype.
+ */
+
+typedef unsigned int JDIMENSION;
+
+#define JPEG_MAX_DIMENSION  65500L  /* a tad under 64K to prevent overflows */
+
+
+/* These macros are used in all function definitions and extern declarations.
+ * You could modify them if you need to change function linkage conventions;
+ * in particular, you'll need to do that to make the library a Windows DLL.
+ * Another application is to make all functions global for use with debuggers
+ * or code profilers that require it.
+ */
+
+/* a function called through method pointers: */
+#define METHODDEF(type)		static type
+/* a function used only in its module: */
+#define LOCAL(type)		static type
+/* a function referenced thru EXTERNs: */
+#define GLOBAL(type)		type
+/* a reference to a GLOBAL function: */
+#define EXTERN(type)		extern type
+
+
+/* This macro is used to declare a "method", that is, a function pointer.
+ * We want to supply prototype parameters if the compiler can cope.
+ * Note that the arglist parameter must be parenthesized!
+ * Again, you can customize this if you need special linkage keywords.
+ */
+
+#ifdef HAVE_PROTOTYPES
+#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#else
+#define JMETHOD(type,methodname,arglist)  type (*methodname) ()
+#endif
+
+
+/* Here is the pseudo-keyword for declaring pointers that must be "far"
+ * on 80x86 machines.  Most of the specialized coding for 80x86 is handled
+ * by just saying "FAR *" where such a pointer is needed.  In a few places
+ * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
+ */
+
+#ifdef NEED_FAR_POINTERS
+#define FAR  far
+#else
+#define FAR
+#endif
+
+
+/*
+ * On a few systems, type boolean and/or its values FALSE, TRUE may appear
+ * in standard header files.  Or you may have conflicts with application-
+ * specific header files that you want to include together with these files.
+ * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
+ */
+
+#ifndef HAVE_BOOLEAN
+typedef int boolean;
+#endif
+#ifndef FALSE			/* in case these macros already exist */
+#define FALSE	0		/* values of boolean */
+#endif
+#ifndef TRUE
+#define TRUE	1
+#endif
+
+
+/*
+ * The remaining options affect code selection within the JPEG library,
+ * but they don't need to be visible to most applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS or JPEG_INTERNAL_OPTIONS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+#define JPEG_INTERNAL_OPTIONS
+#endif
+
+#ifdef JPEG_INTERNAL_OPTIONS
+
+
+/*
+ * These defines indicate whether to include various optional functions.
+ * Undefining some of these symbols will produce a smaller but less capable
+ * library.  Note that you can leave certain source files out of the
+ * compilation/linking process if you've #undef'd the corresponding symbols.
+ * (You may HAVE to do that if your compiler doesn't like null source files.)
+ */
+
+/* Capability options common to encoder and decoder: */
+
+#define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
+#define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
+#define DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
+
+/* Encoder capability options: */
+
+#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
+/* Note: if you selected 12-bit data precision, it is dangerous to turn off
+ * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
+ * precision, so jchuff.c normally uses entropy optimization to compute
+ * usable tables for higher precision.  If you don't want to do optimization,
+ * you'll have to supply different default Huffman tables.
+ * The exact same statements apply for progressive JPEG: the default tables
+ * don't work for progressive mode.  (This may get fixed, however.)
+ */
+#define INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
+
+/* Decoder capability options: */
+
+#define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
+#define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
+#undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
+#define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
+#define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED	    /* 2-pass color quantization? */
+
+/* more capability options later, no doubt */
+
+
+/*
+ * Ordering of RGB data in scanlines passed to or from the application.
+ * If your application wants to deal with data in the order B,G,R, just
+ * change these macros.  You can also deal with formats such as R,G,B,X
+ * (one extra byte per pixel) by changing RGB_PIXELSIZE.  Note that changing
+ * the offsets will also change the order in which colormap data is organized.
+ * RESTRICTIONS:
+ * 1. The sample applications cjpeg,djpeg do NOT support modified RGB formats.
+ * 2. These macros only affect RGB<=>YCbCr color conversion, so they are not
+ *    useful if you are using JPEG color spaces other than YCbCr or grayscale.
+ * 3. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
+ *    is not 3 (they don't understand about dummy color components!).  So you
+ *    can't use color quantization if you change that value.
+ */
+
+#define RGB_RED		0	/* Offset of Red in an RGB scanline element */
+#define RGB_GREEN	1	/* Offset of Green */
+#define RGB_BLUE	2	/* Offset of Blue */
+#define RGB_PIXELSIZE	3	/* JSAMPLEs per RGB scanline element */
+
+#ifdef ANDROID_RGB
+#define RGB_ALPHA   3   /* Offset of Alpha */
+#endif
+
+#define JPEG_NUMCS 16
+
+#define EXT_RGB_RED        0
+#define EXT_RGB_GREEN      1
+#define EXT_RGB_BLUE       2
+#define EXT_RGB_PIXELSIZE  3
+
+#define EXT_RGBX_RED       0
+#define EXT_RGBX_GREEN     1
+#define EXT_RGBX_BLUE      2
+#define EXT_RGBX_PIXELSIZE 4
+
+#define EXT_BGR_RED        2
+#define EXT_BGR_GREEN      1
+#define EXT_BGR_BLUE       0
+#define EXT_BGR_PIXELSIZE  3
+
+#define EXT_BGRX_RED       2
+#define EXT_BGRX_GREEN     1
+#define EXT_BGRX_BLUE      0
+#define EXT_BGRX_PIXELSIZE 4
+
+#define EXT_XBGR_RED       3
+#define EXT_XBGR_GREEN     2
+#define EXT_XBGR_BLUE      1
+#define EXT_XBGR_PIXELSIZE 4
+
+#define EXT_XRGB_RED       1
+#define EXT_XRGB_GREEN     2
+#define EXT_XRGB_BLUE      3
+#define EXT_XRGB_PIXELSIZE 4
+
+#ifdef ANDROID_RGB
+#define RGB_ALPHA   3   /* Offset of Alpha */
+#endif
+
+static const int rgb_red[JPEG_NUMCS] = {
+  -1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
+  EXT_BGR_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED,
+  EXT_RGBX_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED
+};
+
+static const int rgb_green[JPEG_NUMCS] = {
+  -1, -1, RGB_GREEN, -1, -1, -1, EXT_RGB_GREEN, EXT_RGBX_GREEN,
+  EXT_BGR_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN,
+  EXT_RGBX_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN
+};
+
+static const int rgb_blue[JPEG_NUMCS] = {
+  -1, -1, RGB_BLUE, -1, -1, -1, EXT_RGB_BLUE, EXT_RGBX_BLUE,
+  EXT_BGR_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE,
+  EXT_RGBX_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = {
+  -1, -1, RGB_PIXELSIZE, -1, -1, -1, EXT_RGB_PIXELSIZE, EXT_RGBX_PIXELSIZE,
+  EXT_BGR_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE,
+  EXT_RGBX_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE
+};
+
+
+/*
+ * Define ANDROID_RGB to enable specific optimizations for Android
+ * (JCS_RGBA_8888 and JCS_RGB_565 support).  PACK_NEED_ALIGNMENT casts via
+ * size_t so 64-bit pointers are not truncated; WRITE_TWO_ALIGNED_PIXELS
+ * stores through int because INT32 is long here (64 bits wide on LP64).
+ */
+
+#ifdef ANDROID_RGB
+#define PACK_SHORT_565(r,g,b)  ((((r)<<8)&0xf800)|(((g)<<3)&0x7E0)|((b)>>3))
+#define PACK_TWO_PIXELS(l,r)   (((r)<<16) | (l))
+#define PACK_NEED_ALIGNMENT(ptr) (((size_t)(ptr))&3)
+#define WRITE_TWO_PIXELS(addr, pixels) do {     \
+         ((INT16*)(addr))[0] = (pixels);        \
+         ((INT16*)(addr))[1] = (pixels)>>16;    \
+    } while(0)
+#define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(int*)(addr)) = (pixels))
+#define DITHER_565_R(r, dither) ((r) + ((dither)&0xFF))
+#define DITHER_565_G(g, dither) ((g) + (((dither)&0xFF)>>1))
+#define DITHER_565_B(b, dither) ((b) + ((dither)&0xFF))
+#endif
+
+
+/* Definitions for speed-related optimizations. */
+
+/* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
+ * two 16-bit shorts is faster than multiplying two ints.  Define MULTIPLIER
+ * as short on such a machine.  MULTIPLIER must be at least 16 bits wide.
+ */
+
+#ifndef MULTIPLIER
+#ifndef WITH_SIMD
+#define MULTIPLIER  int		/* type for fastest integer multiply */
+#else
+#define MULTIPLIER short  /* prefer 16-bit with SIMD for parallelism */
+#endif
+#endif
+
+
+/* FAST_FLOAT should be either float or double, whichever is done faster
+ * by your compiler.  (Note that this type is only used in the floating point
+ * DCT routines, so it only matters if you've defined DCT_FLOAT_SUPPORTED.)
+ * Typically, float is faster in ANSI C compilers, while double is faster in
+ * pre-ANSI compilers (because they insist on converting to double anyway).
+ * The code below therefore chooses float if we have ANSI-style prototypes.
+ */
+
+#ifndef FAST_FLOAT
+#ifdef HAVE_PROTOTYPES
+#define FAST_FLOAT  float
+#else
+#define FAST_FLOAT  double
+#endif
+#endif
+
+#endif /* JPEG_INTERNAL_OPTIONS */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegcomp.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegcomp.h
new file mode 100644
index 0000000..1b9e0a4
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegcomp.h
@@ -0,0 +1,26 @@
+/*
+ * jpegcomp.h
+ *
+ * Copyright (C) 2010, D. R. Commander
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * JPEG compatibility macros
+ * These declarations are considered internal to the JPEG library; most
+ * applications using the library shouldn't need to include this file.
+ */
+
+#if JPEG_LIB_VERSION >= 70
+#define _DCT_scaled_size DCT_h_scaled_size
+#define _min_DCT_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
+#define _jpeg_width jpeg_width
+#define _jpeg_height jpeg_height
+#else
+#define _DCT_scaled_size DCT_scaled_size
+#define _min_DCT_scaled_size min_DCT_scaled_size
+#define _min_DCT_h_scaled_size min_DCT_scaled_size
+#define _min_DCT_v_scaled_size min_DCT_scaled_size
+#define _jpeg_width image_width
+#define _jpeg_height image_height
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegint.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegint.h
new file mode 100644
index 0000000..a849a47
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpegint.h
@@ -0,0 +1,460 @@
+/*
+ * jpegint.h
+ *
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file provides common declarations for the various JPEG modules.
+ * These declarations are considered internal to the JPEG library; most
+ * applications using the library shouldn't need to include this file.
+ */
+
+
+/* Declarations for both compression & decompression */
+
+typedef enum {			/* Operating modes for buffer controllers */
+	JBUF_PASS_THRU,		/* Plain stripwise operation */
+	/* Remaining modes require a full-image buffer to have been created */
+	JBUF_SAVE_SOURCE,	/* Run source subobject only, save output */
+	JBUF_CRANK_DEST,	/* Run dest subobject only, using saved data */
+	JBUF_SAVE_AND_PASS	/* Run both subobjects, save output */
+} J_BUF_MODE;
+
+/* Values of global_state field (jdapi.c has some dependencies on ordering!); CSTATE_* are 100-103, DSTATE_* are 200-210 */
+#define CSTATE_START	100	/* after create_compress */
+#define CSTATE_SCANNING	101	/* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK	102	/* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS	103	/* jpeg_write_coefficients done */
+#define DSTATE_START	200	/* after create_decompress */
+#define DSTATE_INHEADER	201	/* reading header markers, no SOS yet */
+#define DSTATE_READY	202	/* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD	203	/* reading multiscan file in start_decompress */
+#define DSTATE_PRESCAN	204	/* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING	205	/* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK	206	/* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE	207	/* expecting jpeg_start_output */
+#define DSTATE_BUFPOST	208	/* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS	209	/* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING	210	/* looking for EOI in jpeg_finish_decompress */
+
+
+/* Declarations for compression modules */
+
+/* Master control module, compression side: per-pass hooks plus pass-state flags */
+struct jpeg_comp_master {
+  JMETHOD(void, prepare_for_pass, (j_compress_ptr cinfo));
+  JMETHOD(void, pass_startup, (j_compress_ptr cinfo));
+  JMETHOD(void, finish_pass, (j_compress_ptr cinfo));
+
+  /* State variables made visible to other modules */
+  boolean call_pass_startup;	/* True if pass_startup must be called */
+  boolean is_last_pass;		/* True during last pass */
+};
+
+/* Main buffer control (downsampled-data buffer), compression side */
+struct jpeg_c_main_controller {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
+  JMETHOD(void, process_data, (j_compress_ptr cinfo,
+			       JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
+			       JDIMENSION in_rows_avail));
+};
+
+/* Compression preprocessing (downsampling input buffer control): takes JSAMPARRAY rows in, JSAMPIMAGE row groups out */
+struct jpeg_c_prep_controller {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
+  JMETHOD(void, pre_process_data, (j_compress_ptr cinfo,
+				   JSAMPARRAY input_buf,
+				   JDIMENSION *in_row_ctr,
+				   JDIMENSION in_rows_avail,
+				   JSAMPIMAGE output_buf,
+				   JDIMENSION *out_row_group_ctr,
+				   JDIMENSION out_row_groups_avail));
+};
+
+/* Coefficient buffer control, compression side; compress_data reports a boolean result */
+struct jpeg_c_coef_controller {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo, J_BUF_MODE pass_mode));
+  JMETHOD(boolean, compress_data, (j_compress_ptr cinfo,
+				   JSAMPIMAGE input_buf));
+};
+
+/* Colorspace conversion, compression side: JSAMPARRAY input rows to JSAMPIMAGE output */
+struct jpeg_color_converter {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
+  JMETHOD(void, color_convert, (j_compress_ptr cinfo,
+				JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+				JDIMENSION output_row, int num_rows));
+};
+
+/* Downsampling (compression side): JSAMPIMAGE in at in_row_index, JSAMPIMAGE out at out_row_group_index */
+struct jpeg_downsampler {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
+  JMETHOD(void, downsample, (j_compress_ptr cinfo,
+			     JSAMPIMAGE input_buf, JDIMENSION in_row_index,
+			     JSAMPIMAGE output_buf,
+			     JDIMENSION out_row_group_index));
+
+  boolean need_context_rows;	/* TRUE if need rows above & below */
+};
+
+/* Forward DCT (also controls coefficient quantization); one forward_DCT hook shared by all components */
+struct jpeg_forward_dct {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
+  /* perhaps this should be an array??? */
+  JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
+			      jpeg_component_info * compptr,
+			      JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+			      JDIMENSION start_row, JDIMENSION start_col,
+			      JDIMENSION num_blocks));
+};
+
+/* Entropy encoding: start_pass takes a gather_statistics flag; encode_mcu reports a boolean result */
+struct jpeg_entropy_encoder {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo, boolean gather_statistics));
+  JMETHOD(boolean, encode_mcu, (j_compress_ptr cinfo, JBLOCKROW *MCU_data));
+  JMETHOD(void, finish_pass, (j_compress_ptr cinfo));
+};
+
+/* Marker writing: header/trailer writers plus two low-level hooks for extra markers */
+struct jpeg_marker_writer {
+  JMETHOD(void, write_file_header, (j_compress_ptr cinfo));
+  JMETHOD(void, write_frame_header, (j_compress_ptr cinfo));
+  JMETHOD(void, write_scan_header, (j_compress_ptr cinfo));
+  JMETHOD(void, write_file_trailer, (j_compress_ptr cinfo));
+  JMETHOD(void, write_tables_only, (j_compress_ptr cinfo));
+  /* These routines are exported to allow insertion of extra markers */
+  /* Probably only COM and APPn markers should be written this way */
+  JMETHOD(void, write_marker_header, (j_compress_ptr cinfo, int marker,
+				      unsigned int datalen));
+  JMETHOD(void, write_marker_byte, (j_compress_ptr cinfo, int val));
+};
+
+
+/* Declarations for decompression modules */
+
+/* Master control module, decompression side: output-pass hooks plus a dummy-pass flag */
+struct jpeg_decomp_master {
+  JMETHOD(void, prepare_for_output_pass, (j_decompress_ptr cinfo));
+  JMETHOD(void, finish_output_pass, (j_decompress_ptr cinfo));
+
+  /* State variables made visible to other modules */
+  boolean is_dummy_pass;	/* True during 1st pass for 2-pass quant */
+};
+
+/* Input control module (decompression); gains two extra hooks when ANDROID is defined */
+struct jpeg_input_controller {
+  JMETHOD(int, consume_input, (j_decompress_ptr cinfo));
+  JMETHOD(void, reset_input_controller, (j_decompress_ptr cinfo));
+  JMETHOD(void, start_input_pass, (j_decompress_ptr cinfo));
+  JMETHOD(void, finish_input_pass, (j_decompress_ptr cinfo));
+
+  /* State variables made visible to other modules */
+  boolean has_multiple_scans;	/* True if file has multiple scans */
+  boolean eoi_reached;		/* True when EOI has been consumed */
+
+#ifdef ANDROID	/* members below exist only with ANDROID defined, so the struct layout depends on that macro */
+  JMETHOD(int, consume_input_build_huffman_index, (j_decompress_ptr cinfo,
+                    huffman_index *index, int scan_count));
+  JMETHOD(int, consume_markers, (j_decompress_ptr cinfo,
+                    huffman_index *index, int scan_count));
+#endif /* ANDROID */
+};
+
+/* Main buffer control (downsampled-data buffer), decompression side */
+struct jpeg_d_main_controller {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode));
+  JMETHOD(void, process_data, (j_decompress_ptr cinfo,
+			       JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+			       JDIMENSION out_rows_avail));
+};
+
+/* Coefficient buffer control, decompression side; gains index/tile members when ANDROID is defined */
+struct jpeg_d_coef_controller {
+  JMETHOD(void, start_input_pass, (j_decompress_ptr cinfo));
+  JMETHOD(int, consume_data, (j_decompress_ptr cinfo));
+  JMETHOD(void, start_output_pass, (j_decompress_ptr cinfo));
+  JMETHOD(int, decompress_data, (j_decompress_ptr cinfo,
+				 JSAMPIMAGE output_buf));
+
+  /* Pointer to array of coefficient virtual arrays, or NULL if none */
+  jvirt_barray_ptr *coef_arrays;
+
+#ifdef ANDROID	/* members below exist only with ANDROID defined, so the struct layout depends on that macro */
+  JMETHOD(int, consume_data_build_huffman_index, (j_decompress_ptr cinfo,
+                    huffman_index* index, int scan_count));
+
+ /* column number of the first and last tile, respectively */
+ int column_left_boundary;
+ int column_right_boundary;
+
+ /* column number of the first and last MCU, respectively */
+ int MCU_column_left_boundary;
+ int MCU_column_right_boundary;
+
+ /* the number of MCU columns to skip from the indexed MCU, iM,
+  * to the requested MCU boundary, rM, where iM is the MCU that we sample
+  * into our index and is the nearest one to the left of rM.
+  */
+ int MCU_columns_to_skip;
+
+#endif /* ANDROID */
+};
+
+/* Decompression postprocessing (color quantization buffer control): JSAMPIMAGE row groups in, JSAMPARRAY rows out */
+struct jpeg_d_post_controller {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, J_BUF_MODE pass_mode));
+  JMETHOD(void, post_process_data, (j_decompress_ptr cinfo,
+				    JSAMPIMAGE input_buf,
+				    JDIMENSION *in_row_group_ctr,
+				    JDIMENSION in_row_groups_avail,
+				    JSAMPARRAY output_buf,
+				    JDIMENSION *out_row_ctr,
+				    JDIMENSION out_rows_avail));
+};
+
+/* Marker reading & parsing (decompression); gains SOS-position members when ANDROID is defined */
+struct jpeg_marker_reader {
+  JMETHOD(void, reset_marker_reader, (j_decompress_ptr cinfo));
+  /* Read markers until SOS or EOI.
+   * Returns same codes as are defined for jpeg_consume_input:
+   * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+   */
+  JMETHOD(int, read_markers, (j_decompress_ptr cinfo));
+  /* Read a restart marker --- exported for use by entropy decoder only */
+  jpeg_marker_parser_method read_restart_marker;
+
+  /* State of marker reader --- nominally internal, but applications
+   * supplying COM or APPn handlers might like to know the state.
+   */
+  boolean saw_SOI;		/* found SOI? */
+  boolean saw_SOF;		/* found SOF? */
+  int next_restart_num;		/* next restart number expected (0-7) */
+  unsigned int discarded_bytes;	/* # of bytes skipped looking for a marker */
+
+#ifdef ANDROID	/* members below exist only with ANDROID defined, so the struct layout depends on that macro */
+  JMETHOD(void, get_sos_marker_position, (j_decompress_ptr cinfo,
+                   huffman_index *index));
+  /* NOTE(review): presumably the input position of the current SOS marker; units not shown here, confirm */
+  int current_sos_marker_position;
+#endif /* ANDROID */
+};
+
+/* Entropy decoding; gains Huffman-index hooks and an index pointer when ANDROID is defined */
+struct jpeg_entropy_decoder {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
+  JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo,
+				JBLOCKROW *MCU_data));
+
+  /* This is here to share code between baseline and progressive decoders; */
+  /* other modules probably should not use it */
+  boolean insufficient_data;	/* set TRUE after emitting warning */
+
+#ifdef ANDROID	/* members below exist only with ANDROID defined, so the struct layout depends on that macro */
+  JMETHOD(boolean, decode_mcu_discard_coef, (j_decompress_ptr cinfo));	/* same as decode_mcu but with no MCU_data argument */
+  JMETHOD(void, configure_huffman_decoder, (j_decompress_ptr cinfo,
+                    huffman_offset_data offset));	/* offset is passed by value */
+  JMETHOD(void, get_huffman_decoder_configuration, (j_decompress_ptr cinfo,
+                    huffman_offset_data *offset));	/* offset is passed by pointer (presumably an out-parameter) */
+
+  huffman_index *index;
+#endif /* ANDROID */
+};
+
+/* Inverse DCT (also performs dequantization): method signature, then the per-component method table */
+typedef JMETHOD(void, inverse_DCT_method_ptr,
+		(j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		 JCOEFPTR coef_block,
+		 JSAMPARRAY output_buf, JDIMENSION output_col));
+
+struct jpeg_inverse_dct {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
+  /* It is useful to allow each component to have a separate IDCT method. */
+  inverse_DCT_method_ptr inverse_DCT[MAX_COMPONENTS];	/* one slot per possible component */
+};
+
+/* Upsampling (note that upsampler must also call color converter): JSAMPIMAGE row groups in, JSAMPARRAY rows out */
+struct jpeg_upsampler {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
+  JMETHOD(void, upsample, (j_decompress_ptr cinfo,
+			   JSAMPIMAGE input_buf,
+			   JDIMENSION *in_row_group_ctr,
+			   JDIMENSION in_row_groups_avail,
+			   JSAMPARRAY output_buf,
+			   JDIMENSION *out_row_ctr,
+			   JDIMENSION out_rows_avail));
+
+  boolean need_context_rows;	/* TRUE if need rows above & below */
+};
+
+/* Colorspace conversion, decompression side: JSAMPIMAGE input at input_row to JSAMPARRAY output rows */
+struct jpeg_color_deconverter {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
+  JMETHOD(void, color_convert, (j_decompress_ptr cinfo,
+				JSAMPIMAGE input_buf, JDIMENSION input_row,
+				JSAMPARRAY output_buf, int num_rows));
+};
+
+/* Color quantization or color precision reduction; start_pass takes an is_pre_scan flag */
+struct jpeg_color_quantizer {
+  JMETHOD(void, start_pass, (j_decompress_ptr cinfo, boolean is_pre_scan));
+  JMETHOD(void, color_quantize, (j_decompress_ptr cinfo,
+				 JSAMPARRAY input_buf, JSAMPARRAY output_buf,
+				 int num_rows));
+  JMETHOD(void, finish_pass, (j_decompress_ptr cinfo));
+  JMETHOD(void, new_color_map, (j_decompress_ptr cinfo));
+};
+
+
+/* Miscellaneous useful macros */
+
+#undef MAX	/* drop any earlier definition so the one below always applies */
+#define MAX(a,b)	((a) > (b) ? (a) : (b))	/* evaluates the selected argument twice: avoid side effects */
+#undef MIN	/* drop any earlier definition so the one below always applies */
+#define MIN(a,b)	((a) < (b) ? (a) : (b))	/* evaluates the selected argument twice: avoid side effects */
+
+
+/* We assume that right shift corresponds to signed division by 2 with
+ * rounding towards minus infinity.  This is correct for typical "arithmetic
+ * shift" instructions that shift in copies of the sign bit.  But some
+ * C compilers implement >> with an unsigned shift.  For these machines you
+ * must define RIGHT_SHIFT_IS_UNSIGNED.
+ * RIGHT_SHIFT provides a proper signed right shift of an INT32 quantity.
+ * It is only applied with constant shift counts.  SHIFT_TEMPS must be
+ * included in the variables of any routine using RIGHT_SHIFT.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED	/* emulate a signed shift: OR sign bits back in for negative values; width is hard-coded as 32 */
+#define SHIFT_TEMPS	INT32 shift_temp;	/* declares the temporary that RIGHT_SHIFT assigns x to */
+#define RIGHT_SHIFT(x,shft)  \
+	((shift_temp = (x)) < 0 ? \
+	 (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
+	 (shift_temp >> (shft)))
+#else	/* native >> is used as-is */
+#define SHIFT_TEMPS	/* expands to nothing: no temporary needed */
+#define RIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif /* RIGHT_SHIFT_IS_UNSIGNED */
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+/* NOTE(review): jmin, j[sg]et_input_stream_position* and jinit_huff_decoder_no_data, declared further down, have no short form here. */
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jinit_compress_master	jICompress
+#define jinit_c_master_control	jICMaster
+#define jinit_c_main_controller	jICMainC
+#define jinit_c_prep_controller	jICPrepC
+#define jinit_c_coef_controller	jICCoefC
+#define jinit_color_converter	jICColor
+#define jinit_downsampler	jIDownsampler
+#define jinit_forward_dct	jIFDCT
+#define jinit_huff_encoder	jIHEncoder
+#define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
+#define jinit_marker_writer	jIMWriter
+#define jinit_master_decompress	jIDMaster
+#define jinit_d_main_controller	jIDMainC
+#define jinit_d_coef_controller	jIDCoefC
+#define jinit_d_post_controller	jIDPostC
+#define jinit_input_controller	jIInCtlr
+#define jinit_marker_reader	jIMReader
+#define jinit_huff_decoder	jIHDecoder
+#define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
+#define jinit_inverse_dct	jIIDCT
+#define jinit_upsampler		jIUpsampler
+#define jinit_color_deconverter	jIDColor
+#define jinit_1pass_quantizer	jI1Quant
+#define jinit_2pass_quantizer	jI2Quant
+#define jinit_merged_upsampler	jIMUpsampler
+#define jinit_memory_mgr	jIMemMgr
+#define jdiv_round_up		jDivRound
+#define jround_up		jRound
+#define jcopy_sample_rows	jCopySamples
+#define jcopy_block_row		jCopyBlocks
+#define jzero_far		jZeroFar
+#define jpeg_zigzag_order	jZIGTable
+#define jpeg_natural_order	jZAGTable
+#define jpeg_aritab		jAriTab
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/* Compression module initialization routines; each takes the j_compress_ptr instance */
+EXTERN(void) jinit_compress_master JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_c_master_control JPP((j_compress_ptr cinfo,
+					 boolean transcode_only));
+EXTERN(void) jinit_c_main_controller JPP((j_compress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_c_prep_controller JPP((j_compress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_c_coef_controller JPP((j_compress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
+/* Decompression module initialization routines; each takes the j_decompress_ptr instance */
+EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_d_main_controller JPP((j_decompress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_d_coef_controller JPP((j_decompress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
+					  boolean need_full_buffer));
+EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_1pass_quantizer JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_2pass_quantizer JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_merged_upsampler JPP((j_decompress_ptr cinfo));
+
+#ifdef ANDROID	/* decompression entry points declared only when ANDROID is defined */
+EXTERN(void) jinit_huff_decoder_no_data JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_decompress_per_scan_setup JPP((j_decompress_ptr cinfo));	/* wrapped in JPP like every other prototype in this header */
+#endif /* ANDROID */
+
+/* Memory manager initialization; takes the common j_common_ptr rather than a compress/decompress-specific pointer */
+EXTERN(void) jinit_memory_mgr JPP((j_common_ptr cinfo));
+
+/* Utility routines in jutils.c (NOTE(review): confirm the *_input_stream_position routines at the end are also defined there) */
+EXTERN(long) jdiv_round_up JPP((long a, long b));
+EXTERN(long) jround_up JPP((long a, long b));
+EXTERN(long) jmin JPP((long a, long b));
+EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
+				    JSAMPARRAY output_array, int dest_row,
+				    int num_rows, JDIMENSION num_cols));
+EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row,
+				  JDIMENSION num_blocks));
+EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
+EXTERN(void) jset_input_stream_position JPP((j_decompress_ptr cinfo,
+                    int offset));
+EXTERN(void) jset_input_stream_position_bit JPP((j_decompress_ptr cinfo,
+                    int byte_offset, int bit_left, INT32 buf));
+/* Positions are exchanged as plain int: set takes an int offset, get returns int. */
+EXTERN(int) jget_input_stream_position JPP((j_decompress_ptr cinfo));
+
+/* Constant tables in jutils.c */
+#if 0				/* This table is not actually needed in v6a */
+extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
+#endif /* 0: declaration above is compiled out */
+extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
+
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];
+
+/* Suppress undefined-structure complaints if necessary. */
+/* Supplies placeholder definitions of the two virtual-array control structs. */
+#ifdef INCOMPLETE_TYPES_BROKEN
+#ifndef AM_MEMORY_MANAGER	/* only jmemmgr.c defines these */
+struct jvirt_sarray_control { long dummy; };
+struct jvirt_barray_control { long dummy; };
+#endif /* AM_MEMORY_MANAGER */
+#endif /* INCOMPLETE_TYPES_BROKEN */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpeglib.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpeglib.h
new file mode 100644
index 0000000..3811d7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jpeglib.h
@@ -0,0 +1,1611 @@
+/*
+ * jpeglib.h
+ *
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Copyright (C) 2009-2011, D. R. Commander.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file defines the application interface for the JPEG library.
+ * Most applications using the library need only include this file,
+ * and perhaps jerror.h if they want to know the exact error codes.
+ */
+
+#ifndef JPEGLIB_H
+#define JPEGLIB_H
+
+
+#include "stdio.h"
+
+/*
+ * First we include the configuration files that record how this
+ * installation of the JPEG library is set up.  jconfig.h can be
+ * generated automatically for many systems.  jmorecfg.h contains
+ * manual configuration options that most people need not worry about.
+ */
+
+#ifndef JCONFIG_INCLUDED    /* in case jinclude.h already did */
+
+#include "jconfig.h"		/* widely used configuration options */
+
+#endif
+
+#include "jmorecfg.h"		/* seldom changed options */
+
+#ifndef ANDROID	/* NOTE(review): with ANDROID defined no extern "C" block is opened here; C++ includers presumably wrap this header themselves, confirm */
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif /* DONT_USE_EXTERN_C */
+#endif /* __cplusplus */
+#endif /* ANDROID */
+
+
+/* Various constants determining the sizes of things.
+ * All of these are specified by the JPEG standard, so don't change them
+ * if you want to be compatible.
+ */
+
+#define DCTSIZE            8    /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2        64    /* DCTSIZE squared; # of elements in a block */
+#define NUM_QUANT_TBLS      4    /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS       4    /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS      16    /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN   4    /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR     4    /* JPEG limit on sampling factors */
+/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
+ * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
+ * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
+ * to handle it.  We even let you do this from the jconfig.h file.  However,
+ * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
+ * sometimes emits noncompliant files doesn't mean you should too.
+ */
+#define C_MAX_BLOCKS_IN_MCU   10 /* compressor's limit on blocks per MCU */
+#ifndef D_MAX_BLOCKS_IN_MCU	/* keep a value already supplied (e.g. by jconfig.h) */
+#define D_MAX_BLOCKS_IN_MCU   10 /* decompressor's limit on blocks per MCU */
+#endif /* D_MAX_BLOCKS_IN_MCU */
+
+
+/* Data structures for images (arrays of samples and of DCT coefficients).
+ * On 80x86 machines, the image arrays are too big for near pointers,
+ * but the pointer arrays can fit in near memory.
+ * Each one-line comment below describes the typedef directly above it.
+ */
+
+typedef JSAMPLE FAR *JSAMPROW;
+/* ptr to one image row of pixel samples. */
+typedef JSAMPROW *JSAMPARRAY;
+/* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE;
+/* a 3-D sample array: top index is color */
+
+typedef JCOEF JBLOCK[DCTSIZE2];
+/* one block of coefficients */
+typedef JBLOCK FAR *JBLOCKROW;
+/* pointer to one row of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY;
+/* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE;
+/* a 3-D array of coefficient blocks */
+typedef JCOEF FAR *JCOEFPTR;    /* useful in a couple of places */
+
+
+/* Types for JPEG compression parameters and working tables. */
+
+
+/* DCT coefficient quantization tables: DCTSIZE2 step values plus a "sent" flag. */
+
+typedef struct {
+    /* This array gives the coefficient quantizers in natural array order
+     * (not the zigzag order in which they are stored in a JPEG DQT marker).
+     * CAUTION: IJG versions prior to v6a kept this array in zigzag order.
+     */
+    UINT16 quantval[DCTSIZE2];    /* quantization step for each coefficient */
+    /* This field is used only during compression.  It's initialized FALSE when
+     * the table is created, and set TRUE when it's been output to the file.
+     * You could suppress output of a table by setting this to TRUE.
+     * (See jpeg_suppress_tables for an example.)
+     */
+    boolean sent_table;        /* TRUE when table has been output */
+} JQUANT_TBL;
+
+
+/* Huffman coding tables: code-length counts, symbol list, and a "sent" flag. */
+
+typedef struct {
+    /* These two fields directly represent the contents of a JPEG DHT marker */
+    UINT8 bits[17];        /* bits[k] = # of symbols with codes of
+                            * length k bits; bits[0] is unused */
+    UINT8 huffval[256];        /* The symbols, in order of incr code length */
+    /* This field is used only during compression.  It's initialized FALSE when
+     * the table is created, and set TRUE when it's been output to the file.
+     * You could suppress output of a table by setting this to TRUE.
+     * (See jpeg_suppress_tables for an example.)
+     */
+    boolean sent_table;        /* TRUE when table has been output */
+} JHUFF_TBL;
+
+
+/* Basic info about one component (color channel). */
+/* Layout note: a short one-line comment placed right after a member describes that member (the one above it). */
+typedef struct {
+    /* These values are fixed over the whole image. */
+    /* For compression, they must be supplied by parameter setup; */
+    /* for decompression, they are read from the SOF marker. */
+    int component_id;
+    /* identifier for this component (0..255) */
+    int component_index;
+    /* its index in SOF or cinfo->comp_info[] */
+    int h_samp_factor;
+    /* horizontal sampling factor (1..4) */
+    int v_samp_factor;
+    /* vertical sampling factor (1..4) */
+    int quant_tbl_no;        /* quantization table selector (0..3) */
+    /* These values may vary between scans. */
+    /* For compression, they must be supplied by parameter setup; */
+    /* for decompression, they are read from the SOS marker. */
+    /* The decompressor output side may not use these variables. */
+    int dc_tbl_no;
+    /* DC entropy table selector (0..3) */
+    int ac_tbl_no;        /* AC entropy table selector (0..3) */
+
+    /* Remaining fields should be treated as private by applications. */
+
+    /* These values are computed during compression or decompression startup: */
+    /* Component's size in DCT blocks.
+     * Any dummy blocks added to complete an MCU are not counted; therefore
+     * these values do not depend on whether a scan is interleaved or not.
+     */
+    JDIMENSION width_in_blocks;
+    JDIMENSION height_in_blocks;
+    /* Size of a DCT block in samples.  Always DCTSIZE for compression.
+     * For decompression this is the size of the output from one DCT block,
+     * reflecting any scaling we choose to apply during the IDCT step.
+     * Values of 1,2,4,8 are likely to be supported.  Note that different
+     * components may receive different IDCT scalings.
+     */
+#if JPEG_LIB_VERSION >= 70	/* separate horizontal/vertical sizes; layout differs from the #else branch */
+    int DCT_h_scaled_size;
+    int DCT_v_scaled_size;
+#else
+    int DCT_scaled_size;
+#endif /* JPEG_LIB_VERSION >= 70 */
+    /* The downsampled dimensions are the component's actual, unpadded number
+     * of samples at the main buffer (preprocessing/compression interface), thus
+     * downsampled_width = ceil(image_width * Hi/Hmax)
+     * and similarly for height.  For decompression, IDCT scaling is included, so
+     * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
+     */
+    JDIMENSION downsampled_width;
+    /* actual width in samples */
+    JDIMENSION downsampled_height; /* actual height in samples */
+    /* This flag is used only for decompression.  In cases where some of the
+     * components will be ignored (eg grayscale output from YCbCr image),
+     * we can skip most computations for the unused components.
+     */
+    boolean component_needed;    /* do we need the value of this component? */
+
+    /* These values are computed before starting a scan of the component. */
+    /* The decompressor output side may not use these variables. */
+    int MCU_width;
+    /* number of blocks per MCU, horizontally */
+    int MCU_height;
+    /* number of blocks per MCU, vertically */
+    int MCU_blocks;
+    /* MCU_width * MCU_height */
+    int MCU_sample_width;
+    /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
+    int last_col_width;
+    /* # of non-dummy blocks across in last MCU */
+    int last_row_height;        /* # of non-dummy blocks down in last MCU */
+
+    /* Saved quantization table for component; NULL if none yet saved.
+     * See jdinput.c comments about the need for this information.
+     * This field is currently used only for decompression.
+     */
+    JQUANT_TBL *quant_table;
+
+    /* Private per-component storage for DCT or IDCT subsystem. */
+    void *dct_table;
+} jpeg_component_info;
+
+
+/* The script for encoding a multiple-scan file is an array of these: */
+/* (a one-line comment placed right after a member describes the member above it) */
+typedef struct {
+    int comps_in_scan;
+    /* number of components encoded in this scan */
+    int component_index[MAX_COMPS_IN_SCAN];
+    /* their SOF/comp_info[] indexes */
+    int Ss, Se;
+    /* progressive JPEG spectral selection parms */
+    int Ah, Al;            /* progressive JPEG successive approx. parms */
+} jpeg_scan_info;
+
+/* The decompressor can save APPn and COM markers in a list of these: */
+/* (a one-line comment placed right after a member describes the member above it) */
+typedef struct jpeg_marker_struct FAR *jpeg_saved_marker_ptr;
+
+struct jpeg_marker_struct {
+    jpeg_saved_marker_ptr next;
+    /* next in list, or NULL */
+    UINT8 marker;
+    /* marker code: JPEG_COM, or JPEG_APP0+n */
+    unsigned int original_length;
+    /* # bytes of data in the file */
+    unsigned int data_length;
+    /* # bytes of data saved at data[] */
+    JOCTET FAR *data;        /* the data contained in the marker */
+    /* the marker length word is not counted in data_length or original_length */
+};
+
+/* Known color spaces. */
+/* The two macros below are always defined as 1 here. */
+#define JCS_EXTENSIONS 1
+#define JCS_ALPHA_EXTENSIONS 1
+
+typedef enum {
+    JCS_UNKNOWN, /* error/unspecified */
+            JCS_GRAYSCALE, /* monochrome */
+            JCS_RGB, /* red/green/blue as specified by the RGB_RED, RGB_GREEN,
+				   RGB_BLUE, and RGB_PIXELSIZE macros */
+            JCS_YCbCr, /* Y/Cb/Cr (also known as YUV) */
+            JCS_CMYK, /* C/M/Y/K */
+            JCS_YCCK, /* Y/Cb/Cr/K */
+            JCS_EXT_RGB, /* red/green/blue */
+            JCS_EXT_RGBX, /* red/green/blue/x */
+            JCS_EXT_BGR, /* blue/green/red */
+            JCS_EXT_BGRX, /* blue/green/red/x */
+            JCS_EXT_XBGR, /* x/blue/green/red */
+            JCS_EXT_XRGB,        /* x/red/green/blue */
+    /* When out_color_space is set to JCS_EXT_RGBX, JCS_EXT_BGRX,
+       JCS_EXT_XBGR, or JCS_EXT_XRGB during decompression, the X byte is
+       undefined, and in order to ensure the best performance,
+       libjpeg-turbo can set that byte to whatever value it wishes.  Use
+       the following colorspace constants to ensure that the X byte is set
+       to 0xFF, so that it can be interpreted as an opaque alpha
+       channel. */
+            JCS_EXT_RGBA, /* red/green/blue/alpha */
+            JCS_EXT_BGRA, /* blue/green/red/alpha */
+            JCS_EXT_ABGR, /* alpha/blue/green/red */
+            JCS_EXT_ARGB,        /* alpha/red/green/blue */
+#ifdef ANDROID_RGB	/* without ANDROID_RGB the list ends in a trailing comma (fine in C99/C++11, rejected by strict C89) */
+    JCS_RGBA_8888, /* red/green/blue/alpha */
+            JCS_RGB_565     /* red/green/blue in 565 format */
+#endif /* ANDROID_RGB */
+} J_COLOR_SPACE;
+
+/* DCT/IDCT algorithm options, followed by the overridable default/fastest selections. */
+
+typedef enum {
+    JDCT_ISLOW, /* slow but accurate integer algorithm */
+            JDCT_IFAST, /* faster, less accurate integer method */
+            JDCT_FLOAT        /* floating-point: accurate, fast on fast HW */
+} J_DCT_METHOD;
+
+#ifndef JDCT_DEFAULT        /* may be overridden in jconfig.h */
+#define JDCT_DEFAULT  JDCT_ISLOW
+#endif /* JDCT_DEFAULT */
+#ifndef JDCT_FASTEST        /* may be overridden in jconfig.h */
+#define JDCT_FASTEST  JDCT_IFAST
+#endif /* JDCT_FASTEST */
+
+/* Dithering options for decompression (three modes). */
+
+typedef enum {
+    JDITHER_NONE, /* no dithering */
+            JDITHER_ORDERED, /* simple ordered dither */
+            JDITHER_FS        /* Floyd-Steinberg error diffusion dither */
+} J_DITHER_MODE;
+
+
+/* Common fields between JPEG compression and decompression master structs. */
+/* The last field has no trailing semicolon: each use site writes "jpeg_common_fields;" to supply it. */
+#define jpeg_common_fields \
+  struct jpeg_error_mgr * err;    /* Error handler module */\
+  struct jpeg_memory_mgr * mem;    /* Memory manager module */\
+  struct jpeg_progress_mgr * progress; /* Progress monitor, or NULL if none */\
+  void * client_data;        /* Available for use by application */\
+  boolean is_decompressor;    /* So common code can tell which is which */\
+  int global_state        /* For checking call sequence validity */
+
+/* Routines that are to be used by both halves of the library are declared
+ * to receive a pointer to this structure.  There are no actual instances of
+ * jpeg_common_struct, only of jpeg_compress_struct and jpeg_decompress_struct.
+ */
+struct jpeg_common_struct {
+    jpeg_common_fields;        /* Fields common to both master struct types */
+    /* Additional fields follow in an actual jpeg_compress_struct or
+     * jpeg_decompress_struct.  All three structs must agree on these
+     * initial fields, i.e. each must begin with jpeg_common_fields.  (This would be a lot cleaner in C++.)
+     */
+};
+
+typedef struct jpeg_common_struct *j_common_ptr;	/* pointer to the shared leading fields */
+typedef struct jpeg_compress_struct *j_compress_ptr;	/* pointer to a compression instance */
+typedef struct jpeg_decompress_struct *j_decompress_ptr;	/* pointer to a decompression instance */
+
+
+/* Master record for a compression instance */
+
+struct jpeg_compress_struct {
+    jpeg_common_fields;        /* Fields shared with jpeg_decompress_struct */
+
+    /* Destination for compressed data */
+    struct jpeg_destination_mgr *dest;
+
+    /* Description of source image --- these fields must be filled in by
+     * outer application before starting compression.  in_color_space must
+     * be correct before you can even call jpeg_set_defaults().
+     */
+
+    JDIMENSION image_width;
+    /* image_width: input image width */
+    JDIMENSION image_height;
+    /* image_height: input image height */
+    int input_components;
+    /* input_components: # of color components in input image */
+    J_COLOR_SPACE in_color_space;
+    /* in_color_space: colorspace of input image */
+
+    double input_gamma;        /* image gamma of input image */
+
+    /* Compression parameters --- these fields must be set before calling
+     * jpeg_start_compress().  We recommend calling jpeg_set_defaults() to
+     * initialize everything to reasonable defaults, then changing anything
+     * the application specifically wants to change.  That way you won't get
+     * burnt when new parameters are added.  Also note that there are several
+     * helper routines to simplify changing parameters.
+     */
+
+#if JPEG_LIB_VERSION >= 70
+    unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+    JDIMENSION jpeg_width;	/* scaled JPEG image width */
+    JDIMENSION jpeg_height;	/* scaled JPEG image height */
+    /* Dimensions of actual JPEG image that will be written to file,
+     * derived from input dimensions by scaling factors above.
+     * These fields are computed by jpeg_start_compress().
+     * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+     * in advance of calling jpeg_start_compress().
+     */
+#endif
+
+    int data_precision;
+    /* data_precision: bits of precision in image data */
+
+    int num_components;
+    /* num_components: # of color components in JPEG image */
+    J_COLOR_SPACE jpeg_color_space;
+    /* jpeg_color_space: colorspace of JPEG image */
+
+    jpeg_component_info *comp_info;
+    /* comp_info[i] describes component that appears i'th in SOF */
+
+    JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+#if JPEG_LIB_VERSION >= 70
+    int q_scale_factor[NUM_QUANT_TBLS];
+#endif
+    /* ptrs to coefficient quantization tables, or NULL if not defined,
+     * and corresponding scale factors (percentage, initialized 100).
+     */
+
+    JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+    JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+    /* ptrs to Huffman coding tables, or NULL if not defined */
+
+    UINT8 arith_dc_L[NUM_ARITH_TBLS];
+    /* arith_dc_L: L values for DC arith-coding tables */
+    UINT8 arith_dc_U[NUM_ARITH_TBLS];
+    /* arith_dc_U: U values for DC arith-coding tables */
+    UINT8 arith_ac_K[NUM_ARITH_TBLS];
+    /* arith_ac_K: Kx values for AC arith-coding tables */
+
+    int num_scans;
+    /* num_scans: # of entries in scan_info array */
+    const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */
+    /* The default value of scan_info is NULL, which causes a single-scan
+     * sequential JPEG file to be emitted.  To create a multi-scan file,
+     * set num_scans and scan_info to point to an array of scan definitions.
+     */
+
+    boolean raw_data_in;
+    /* raw_data_in: TRUE=caller supplies downsampled data */
+    boolean arith_code;
+    /* arith_code: TRUE=arithmetic coding, FALSE=Huffman */
+    boolean optimize_coding;
+    /* optimize_coding: TRUE=optimize entropy encoding parms */
+    boolean CCIR601_sampling;    /* TRUE=first samples are cosited */
+#if JPEG_LIB_VERSION >= 70
+    boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
+#endif
+    int smoothing_factor;
+    /* smoothing_factor: 1..100, or 0 for no input smoothing */
+    J_DCT_METHOD dct_method;    /* DCT algorithm selector */
+
+    /* The restart interval can be specified in absolute MCUs by setting
+     * restart_interval, or in MCU rows by setting restart_in_rows
+     * (in which case the correct restart_interval will be figured
+     * for each scan).
+     */
+    unsigned int restart_interval;
+    /* restart_interval: MCUs per restart, or 0 for no restart */
+    int restart_in_rows;        /* if > 0, MCU rows per restart interval */
+
+    /* Parameters controlling emission of special markers. */
+
+    boolean write_JFIF_header;
+    /* write_JFIF_header: should a JFIF marker be written? */
+    UINT8 JFIF_major_version;
+    /* What to write for the JFIF version number (major above, minor below) */
+    UINT8 JFIF_minor_version;
+    /* The next three values (density_unit, X_density, Y_density) are not */
+    /* used by the JPEG code, merely copied into the JFIF APP0 marker. */
+    /* density_unit can be 0 for unknown, 1 for dots/inch, or 2 for dots/cm. */
+    /* The pixel aspect ratio is X_density/Y_density even when density_unit=0. */
+    UINT8 density_unit;
+    /* density_unit: JFIF code for pixel size units */
+    UINT16 X_density;
+    /* X_density: Horizontal pixel density */
+    UINT16 Y_density;
+    /* Y_density: Vertical pixel density */
+    boolean write_Adobe_marker;    /* should an Adobe marker be written? */
+
+    /* State variable: index of next scanline to be written to
+     * jpeg_write_scanlines().  Application may use this to control its
+     * processing loop, e.g., "while (next_scanline < image_height)".
+     */
+
+    JDIMENSION next_scanline;    /* 0 .. image_height-1  */
+
+    /* Remaining fields are known throughout compressor, but generally
+     * should not be touched by a surrounding application.
+     */
+
+    /*
+     * These fields are computed during compression startup
+     */
+    boolean progressive_mode;
+    /* progressive_mode: TRUE if scan script uses progressive mode */
+    int max_h_samp_factor;
+    /* max_h_samp_factor: largest h_samp_factor */
+    int max_v_samp_factor;    /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+    int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+    int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#endif
+
+    JDIMENSION total_iMCU_rows;    /* # of iMCU rows to be input to coef ctlr */
+    /* The coefficient controller receives data in units of MCU rows as defined
+     * for fully interleaved scans (whether the JPEG file is interleaved or not).
+     * There are v_samp_factor * DCTSIZE sample rows of each component in an
+     * "iMCU" (interleaved MCU) row.
+     */
+
+    /*
+     * These fields are valid during any one scan.
+     * They describe the components and MCUs actually appearing in the scan.
+     */
+    int comps_in_scan;
+    /* comps_in_scan: # of JPEG components in this scan */
+    jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+    /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+    JDIMENSION MCUs_per_row;
+    /* MCUs_per_row: # of MCUs across the image */
+    JDIMENSION MCU_rows_in_scan;
+    /* MCU_rows_in_scan: # of MCU rows in the image */
+
+    int blocks_in_MCU;
+    /* blocks_in_MCU: # of DCT blocks per MCU */
+    int MCU_membership[C_MAX_BLOCKS_IN_MCU];
+    /* MCU_membership[i] is index in cur_comp_info of component owning */
+    /* i'th block in an MCU */
+
+    int Ss, Se, Ah, Al;        /* progressive JPEG parameters for scan */
+
+#if JPEG_LIB_VERSION >= 80
+    int block_size;		/* the basic DCT block size: 1..16 */
+    const int * natural_order;	/* natural-order position array */
+    int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+#endif
+
+    /*
+     * Links to compression subobjects (methods and private variables of modules)
+     */
+    struct jpeg_comp_master *master;
+    struct jpeg_c_main_controller *main;
+    struct jpeg_c_prep_controller *prep;
+    struct jpeg_c_coef_controller *coef;
+    struct jpeg_marker_writer *marker;
+    struct jpeg_color_converter *cconvert;
+    struct jpeg_downsampler *downsample;
+    struct jpeg_forward_dct *fdct;
+    struct jpeg_entropy_encoder *entropy;
+    jpeg_scan_info *script_space;
+    /* script_space: workspace for jpeg_simple_progression */
+    int script_space_size;
+};
+
+
+/* Master record for a decompression instance */
+
+struct jpeg_decompress_struct {
+    jpeg_common_fields;        /* Fields shared with jpeg_compress_struct */
+
+    /* Source of compressed data */
+    struct jpeg_source_mgr *src;
+
+    /* Basic description of image --- filled in by jpeg_read_header(). */
+    /* Application may inspect these values to decide how to process image. */
+
+    JDIMENSION original_image_width;
+    /* original_image_width: nominal image width (from SOF marker) */
+    JDIMENSION image_width;
+    /* image_width: image width -- NOTE(review): presumably may differ from original_image_width when tile_decode is used; confirm */
+    JDIMENSION image_height;
+    /* image_height: nominal image height */
+    int num_components;
+    /* num_components: # of color components in JPEG image */
+    J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
+
+    /* Decompression processing parameters --- these fields must be set before
+     * calling jpeg_start_decompress().  Note that jpeg_read_header() initializes
+     * them to default values.
+     */
+
+    J_COLOR_SPACE out_color_space;
+    /* out_color_space: colorspace for output */
+
+    unsigned int scale_num, scale_denom;
+    /* scale_num/scale_denom: fraction by which to scale image */
+
+    double output_gamma;
+    /* output_gamma: image gamma wanted in output */
+
+    boolean buffered_image;
+    /* buffered_image: TRUE=multiple output passes */
+    boolean raw_data_out;
+    /* raw_data_out: TRUE=downsampled data wanted */
+
+    J_DCT_METHOD dct_method;
+    /* dct_method: IDCT algorithm selector */
+    boolean do_fancy_upsampling;
+    /* do_fancy_upsampling: TRUE=apply fancy upsampling */
+    boolean do_block_smoothing;
+    /* do_block_smoothing: TRUE=apply interblock smoothing */
+
+    boolean quantize_colors;    /* TRUE=colormapped output wanted */
+    /* the following are ignored if not quantize_colors: */
+    J_DITHER_MODE dither_mode;
+    /* dither_mode: type of color dithering to use */
+    boolean two_pass_quantize;
+    /* two_pass_quantize: TRUE=use two-pass color quantization */
+    int desired_number_of_colors;    /* max # colors to use in created colormap */
+    /* these are significant only in buffered-image mode: */
+    boolean enable_1pass_quant;
+    /* enable_1pass_quant: enable future use of 1-pass quantizer */
+    boolean enable_external_quant;
+    /* enable_external_quant: enable future use of external colormap */
+    boolean enable_2pass_quant;    /* enable future use of 2-pass quantizer */
+
+    /* Description of actual output image that will be returned to application.
+     * These fields are computed by jpeg_start_decompress().
+     * You can also use jpeg_calc_output_dimensions() to determine these values
+     * in advance of calling jpeg_start_decompress().
+     */
+
+    JDIMENSION output_width;
+    /* output_width: scaled image width */
+    JDIMENSION output_height;
+    /* output_height: scaled image height */
+    int out_color_components;
+    /* out_color_components: # of color components in out_color_space */
+    int output_components;    /* # of color components returned */
+    /* output_components is 1 (a colormap index) when quantizing colors;
+     * otherwise it equals out_color_components.
+     */
+    int rec_outbuf_height;    /* min recommended height of scanline buffer */
+    /* If the buffer passed to jpeg_read_scanlines() is less than this many rows
+     * high, space and time will be wasted due to unnecessary data copying.
+     * Usually rec_outbuf_height will be 1 or 2, at most 4.
+     */
+
+    /* When quantizing colors, the output colormap is described by these fields.
+     * The application can supply a colormap by setting colormap non-NULL before
+     * calling jpeg_start_decompress; otherwise a colormap is created during
+     * jpeg_start_decompress or jpeg_start_output.
+     * The map has out_color_components rows and actual_number_of_colors columns.
+     */
+    int actual_number_of_colors;
+    /* actual_number_of_colors: number of entries in use */
+    JSAMPARRAY colormap;        /* The color map as a 2-D pixel array */
+
+    /* State variables: these variables indicate the progress of decompression.
+     * The application may examine these but must not modify them.
+     */
+
+    /* Row index of next scanline to be read from jpeg_read_scanlines().
+     * Application may use this to control its processing loop, e.g.,
+     * "while (output_scanline < output_height)".
+     */
+    JDIMENSION output_scanline;    /* 0 .. output_height-1  */
+
+    /* Current input scan number and number of iMCU rows completed in scan.
+     * These indicate the progress of the decompressor input side.
+     */
+    int input_scan_number;
+    /* input_scan_number: Number of SOS markers seen so far */
+    JDIMENSION input_iMCU_row;    /* Number of iMCU rows completed */
+
+    /* The "output scan number" is the notional scan being displayed by the
+     * output side.  The decompressor will not allow output scan/row number
+     * to get ahead of input scan/row, but it can fall arbitrarily far behind.
+     */
+    int output_scan_number;
+    /* output_scan_number: Nominal scan number being displayed */
+    JDIMENSION output_iMCU_row;    /* Number of iMCU rows read */
+
+    /* Current progression status.  coef_bits[c][i] indicates the precision
+     * with which component c's DCT coefficient i (in zigzag order) is known.
+     * It is -1 when no data has yet been received, otherwise it is the point
+     * transform (shift) value for the most recent scan of the coefficient
+     * (thus, 0 at completion of the progression).
+     * This pointer is NULL when reading a non-progressive file.
+     */
+    int (*coef_bits)[DCTSIZE2];    /* -1 or current Al value for each coef */
+
+    /* Internal JPEG parameters --- the application usually need not look at
+     * these fields.  Note that the decompressor output side may not use
+     * any parameters that can change between scans.
+     */
+
+    /* Quantization and Huffman tables are carried forward across input
+     * datastreams when processing abbreviated JPEG datastreams.
+     */
+
+    JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+    /* ptrs to coefficient quantization tables, or NULL if not defined */
+
+    JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+    JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+    /* ptrs to Huffman coding tables, or NULL if not defined */
+
+    /* These parameters are never carried across datastreams, since they
+     * are given in SOF/SOS markers or defined to be reset by SOI.
+     */
+
+    int data_precision;
+    /* data_precision: bits of precision in image data */
+
+    jpeg_component_info *comp_info;
+    /* comp_info[i] describes component that appears i'th in SOF */
+
+#if JPEG_LIB_VERSION >= 80
+    boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
+#endif
+#ifdef ANDROID
+    boolean tile_decode;          /* TRUE if using tile based decoding */
+#endif
+    boolean progressive_mode;
+    /* progressive_mode: TRUE if SOFn specifies progressive mode */
+    boolean arith_code;
+    /* arith_code: TRUE=arithmetic coding, FALSE=Huffman */
+
+    UINT8 arith_dc_L[NUM_ARITH_TBLS];
+    /* arith_dc_L: L values for DC arith-coding tables */
+    UINT8 arith_dc_U[NUM_ARITH_TBLS];
+    /* arith_dc_U: U values for DC arith-coding tables */
+    UINT8 arith_ac_K[NUM_ARITH_TBLS];
+    /* arith_ac_K: Kx values for AC arith-coding tables */
+
+    unsigned int restart_interval; /* MCUs per restart interval, or 0 for no restart */
+
+    /* These fields record data obtained from optional markers recognized by
+     * the JPEG library.
+     */
+    boolean saw_JFIF_marker;    /* TRUE iff a JFIF APP0 marker was found */
+    /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */
+    UINT8 JFIF_major_version;
+    /* JFIF version number (major above, minor below) */
+    UINT8 JFIF_minor_version;
+    UINT8 density_unit;
+    /* density_unit: JFIF code for pixel size units */
+    UINT16 X_density;
+    /* X_density: Horizontal pixel density */
+    UINT16 Y_density;
+    /* Y_density: Vertical pixel density */
+    boolean saw_Adobe_marker;
+    /* saw_Adobe_marker: TRUE iff an Adobe APP14 marker was found */
+    UINT8 Adobe_transform;
+    /* Adobe_transform: Color transform code from Adobe marker */
+
+    boolean CCIR601_sampling;    /* TRUE=first samples are cosited */
+
+    /* Aside from the specific data retained from APPn markers known to the
+     * library, the uninterpreted contents of any or all APPn and COM markers
+     * can be saved in a list for examination by the application.
+     */
+    jpeg_saved_marker_ptr marker_list; /* Head of list of saved markers */
+
+    /* Remaining fields are known throughout decompressor, but generally
+     * should not be touched by a surrounding application.
+     */
+
+    /*
+     * These fields are computed during decompression startup
+     */
+    int max_h_samp_factor;
+    /* max_h_samp_factor: largest h_samp_factor */
+    int max_v_samp_factor;    /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+    int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+    int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+#else
+    int min_DCT_scaled_size;    /* smallest DCT_scaled_size of any component */
+#endif
+
+    JDIMENSION total_iMCU_rows;    /* # of iMCU rows in image */
+    /* The coefficient controller's input and output progress is measured in
+     * units of "iMCU" (interleaved MCU) rows.  These are the same as MCU rows
+     * in fully interleaved JPEG scans, but are used whether the scan is
+     * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
+     * rows of each component.  Therefore, the IDCT output contains
+     * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
+     */
+
+    JSAMPLE *sample_range_limit; /* table for fast range-limiting */
+
+    /*
+     * These fields are valid during any one scan.
+     * They describe the components and MCUs actually appearing in the scan.
+     * Note that the decompressor output side must not use these fields.
+     */
+    int comps_in_scan;
+    /* comps_in_scan: # of JPEG components in this scan */
+    jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+    /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+    JDIMENSION MCUs_per_row;
+    /* MCUs_per_row: # of MCUs across the image */
+    JDIMENSION MCU_rows_in_scan;
+    /* MCU_rows_in_scan: # of MCU rows in the image */
+
+    int blocks_in_MCU;
+    /* blocks_in_MCU: # of DCT blocks per MCU */
+    int MCU_membership[D_MAX_BLOCKS_IN_MCU];
+    /* MCU_membership[i] is index in cur_comp_info of component owning */
+    /* i'th block in an MCU */
+
+    int Ss, Se, Ah, Al;        /* progressive JPEG parameters for scan */
+
+#if JPEG_LIB_VERSION >= 80
+    /* These fields are derived from Se of first SOS marker.
+     */
+    int block_size;		/* the basic DCT block size: 1..16 */
+    const int * natural_order; /* natural-order position array for entropy decode */
+    int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+#endif
+
+    /* This field is shared between entropy decoder and marker parser.
+     * It is either zero or the code of a JPEG marker that has been
+     * read from the data source, but has not yet been processed.
+     */
+    int unread_marker;
+
+    /*
+     * Links to decompression subobjects (methods, private variables of modules)
+     */
+    struct jpeg_decomp_master *master;
+    struct jpeg_d_main_controller *main;
+    struct jpeg_d_coef_controller *coef;
+    struct jpeg_d_post_controller *post;
+    struct jpeg_input_controller *inputctl;
+    struct jpeg_marker_reader *marker;
+    struct jpeg_entropy_decoder *entropy;
+    struct jpeg_inverse_dct *idct;
+    struct jpeg_upsampler *upsample;
+    struct jpeg_color_deconverter *cconvert;
+    struct jpeg_color_quantizer *cquantize;
+};
+
+
+typedef struct {
+    // Packed layout of bitstream_offset (widths presumably in bits, 27 + 5 = 32):
+    // |--- byte_offset ---|- bit_left -|
+    //  \------ 27 -------/ \---- 5 ----/
+    unsigned int bitstream_offset;
+    short prev_dc[3];
+
+    // remaining EOBs in EOBRUN
+    unsigned short EOBRUN;
+
+    // save the decoder's current bit buffer, entropy->bitstate.get_buffer.
+    INT32 get_buffer;
+
+    // save the restart info.
+    unsigned short restarts_to_go;
+    unsigned char next_restart_num;
+} huffman_offset_data;
+
+typedef struct {
+    // Per-scan record; huffman_index.scan points at entries of this type.
+    // The header starting position of this scan
+    unsigned int bitstream_offset;
+
+    // Number of components in this scan
+    int comps_in_scan;
+
+    // Number of MCUs in each row
+    int MCUs_per_row;
+    int MCU_rows_per_iMCU_row;  // NOTE(review): presumably MCU rows making up one iMCU row -- confirm
+
+    // The last MCU position and its dc value in this scan
+    huffman_offset_data prev_MCU_offset;
+    // NOTE(review): presumably the table of sampled index points for this scan -- confirm
+    huffman_offset_data **offset;
+} huffman_scan_header;
+
+#define DEFAULT_MCU_SAMPLE_SIZE 16
+
+typedef struct {
+    // Top-level index record; holds the per-scan headers via `scan`.
+    // The number of MCUs that we sample each time as an index point
+    int MCU_sample_size;
+
+    // Number of scans in this image
+    int scan_count;
+
+    // Number of iMCU rows in this image
+    int total_iMCU_rows;
+
+    // Memory used by the scan structs
+    size_t mem_used;
+    huffman_scan_header *scan;
+} huffman_index;
+
+
+/* "Object" declarations for JPEG modules that may be supplied or called
+ * directly by the surrounding application.
+ * As with all objects in the JPEG library, these structs only define the
+ * publicly visible methods and state variables of a module.  Additional
+ * private fields may exist after the public ones.
+ */
+
+
+/* Error handler object */
+
+struct jpeg_error_mgr {
+    /* Error exit handler: does not return to caller */
+    JMETHOD(void, error_exit, (j_common_ptr
+            cinfo));
+    /* Conditionally emit a trace or warning message */
+    JMETHOD(void, emit_message, (j_common_ptr
+            cinfo,
+            int msg_level));
+    /* Routine that actually outputs a trace or error message */
+    JMETHOD(void, output_message, (j_common_ptr
+            cinfo));
+    /* Format a message string for the most recent JPEG error or message */
+    JMETHOD(void, format_message, (j_common_ptr
+            cinfo,
+            char *buffer));
+
+#define JMSG_LENGTH_MAX  200    /* recommended size of format_message buffer */
+    /* Reset error state variables at start of a new image */
+    JMETHOD(void, reset_error_mgr, (j_common_ptr
+            cinfo));
+
+    /* The message ID code and any parameters are saved here.
+     * A message can have one string parameter or up to 8 int parameters.
+     */
+    int msg_code;
+#define JMSG_STR_PARM_MAX  80
+    union {
+        int i[8];
+        char s[JMSG_STR_PARM_MAX];
+    } msg_parm;
+
+    /* Standard state variables for error facility */
+
+    int trace_level;        /* max msg_level that will be displayed */
+
+    /* For recoverable corrupt-data errors, we emit a warning message,
+     * but keep going unless emit_message chooses to abort.  emit_message
+     * should count warnings in num_warnings.  The surrounding application
+     * can check for bad data by seeing if num_warnings is nonzero at the
+     * end of processing.
+     */
+    long num_warnings;        /* number of corrupt-data warnings */
+
+    /* These fields point to the table(s) of error message strings.
+     * An application can change the table pointer to switch to a different
+     * message list (typically, to change the language in which errors are
+     * reported).  Some applications may wish to add additional error codes
+     * that will be handled by the JPEG library error mechanism; the second
+     * table pointer is used for this purpose.
+     *
+     * First table includes all errors generated by JPEG library itself.
+     * Error code 0 is reserved for a "no such error string" message.
+     */
+    const char *const *jpeg_message_table;
+    /* jpeg_message_table: Library errors */
+    int last_jpeg_message;    /* Table contains strings 0..last_jpeg_message */
+    /* Second table can be added by application (see cjpeg/djpeg for example).
+     * It contains strings numbered first_addon_message..last_addon_message.
+     */
+    const char *const *addon_message_table;
+    /* addon_message_table: Non-library errors */
+    int first_addon_message;
+    /* first_addon_message: code for first string in addon table */
+    int last_addon_message;    /* code for last string in addon table */
+};
+
+
+/* Progress monitor object */
+
+struct jpeg_progress_mgr {
+    JMETHOD(void, progress_monitor, (j_common_ptr
+            cinfo));
+
+    long pass_counter;
+    /* pass_counter: work units completed in this pass */
+    long pass_limit;
+    /* pass_limit: total number of work units in this pass */
+    int completed_passes;
+    /* completed_passes: passes completed so far */
+    int total_passes;        /* total number of passes expected */
+};
+
+
+/* Data destination object for compression */
+
+struct jpeg_destination_mgr {
+    JOCTET *next_output_byte;
+    /* next_output_byte => next byte to write in buffer */
+    size_t free_in_buffer;    /* # of byte spaces remaining in buffer */
+
+    JMETHOD(void, init_destination, (j_compress_ptr
+            cinfo));
+
+    JMETHOD(boolean, empty_output_buffer, (j_compress_ptr
+            cinfo));
+
+    JMETHOD(void, term_destination, (j_compress_ptr
+            cinfo));
+};
+
+
+/* Data source object for decompression */
+
+struct jpeg_source_mgr {
+    const JOCTET *next_input_byte;
+    /* next_input_byte => next byte to read from buffer */
+    size_t bytes_in_buffer;    /* # of bytes remaining in buffer */
+#ifdef ANDROID
+    const JOCTET *start_input_byte;
+    /* start_input_byte => first byte to read from input */
+    size_t current_offset; /* current read offset within the input */
+#endif
+
+    JMETHOD(void, init_source, (j_decompress_ptr
+            cinfo));
+
+    JMETHOD(boolean, fill_input_buffer, (j_decompress_ptr
+            cinfo));
+
+    JMETHOD(void, skip_input_data, (j_decompress_ptr
+            cinfo,
+            long num_bytes));
+
+    JMETHOD(boolean, resync_to_restart, (j_decompress_ptr
+            cinfo,
+            int desired));
+
+    JMETHOD(void, term_source, (j_decompress_ptr
+            cinfo));
+
+#ifdef ANDROID
+    /* Android-only addition: seek method taking a byte offset into the input. */
+    JMETHOD(boolean, seek_input_data, (j_decompress_ptr
+            cinfo,
+            long byte_offset));
+
+#endif
+};
+
+
+/* Memory manager object.
+ * Allocates "small" objects (a few K total), "large" objects (tens of K),
+ * and "really big" objects (virtual arrays with backing store if needed).
+ * The memory manager does not allow individual objects to be freed; rather,
+ * each created object is assigned to a pool, and whole pools can be freed
+ * at once.  This is faster and more convenient than remembering exactly what
+ * to free, especially where malloc()/free() are not too speedy.
+ * NB: alloc routines never return NULL.  They exit to error_exit if not
+ * successful.
+ */
+
+#define JPOOL_PERMANENT    0    /* lasts until master record is destroyed */
+#define JPOOL_IMAGE    1    /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS    2
+
+typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
+typedef struct jvirt_barray_control *jvirt_barray_ptr;
+
+
+struct jpeg_memory_mgr {
+    /* Method pointers */
+    JMETHOD(void *, alloc_small, (j_common_ptr
+            cinfo,
+            int pool_id,
+            size_t sizeofobject));
+
+    JMETHOD(void FAR
+                    *, alloc_large, (j_common_ptr
+                    cinfo,
+                    int pool_id,
+                    size_t sizeofobject));
+
+    JMETHOD(JSAMPARRAY, alloc_sarray, (j_common_ptr
+            cinfo,
+            int pool_id,
+            JDIMENSION samplesperrow,
+            JDIMENSION numrows));
+
+    JMETHOD(JBLOCKARRAY, alloc_barray, (j_common_ptr
+            cinfo,
+            int pool_id,
+            JDIMENSION blocksperrow,
+            JDIMENSION numrows));
+
+    JMETHOD(jvirt_sarray_ptr, request_virt_sarray, (j_common_ptr
+            cinfo,
+            int pool_id,
+            boolean pre_zero,
+            JDIMENSION samplesperrow,
+            JDIMENSION numrows,
+            JDIMENSION maxaccess));
+
+    JMETHOD(jvirt_barray_ptr, request_virt_barray, (j_common_ptr
+            cinfo,
+            int pool_id,
+            boolean pre_zero,
+            JDIMENSION blocksperrow,
+            JDIMENSION numrows,
+            JDIMENSION maxaccess));
+
+    JMETHOD(void, realize_virt_arrays, (j_common_ptr
+            cinfo));
+
+    JMETHOD(JSAMPARRAY, access_virt_sarray, (j_common_ptr
+            cinfo,
+                    jvirt_sarray_ptr
+            ptr,
+                    JDIMENSION
+            start_row,
+                    JDIMENSION
+            num_rows,
+                    boolean
+            writable));
+
+    JMETHOD(JBLOCKARRAY, access_virt_barray, (j_common_ptr
+            cinfo,
+                    jvirt_barray_ptr
+            ptr,
+                    JDIMENSION
+            start_row,
+                    JDIMENSION
+            num_rows,
+                    boolean
+            writable));
+
+    JMETHOD(void, free_pool, (j_common_ptr
+            cinfo,
+            int pool_id));
+
+    JMETHOD(void, self_destruct, (j_common_ptr
+            cinfo));
+
+    /* Limit on memory allocation for this JPEG object.  (Note that this is
+     * merely advisory, not a guaranteed maximum; it only affects the space
+     * used for virtual-array buffers.)  May be changed by outer application
+     * after creating the JPEG object.
+     */
+    long max_memory_to_use;
+
+    /* Maximum allocation request accepted by alloc_large. */
+    long max_alloc_chunk;
+};
+
+
+/* Routine signature for application-supplied marker processing methods.
+ * Need not pass marker code since it is stored in cinfo->unread_marker.
+ */
+typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr
+        cinfo));
+
+
+/* Declarations for routines called by application.
+ * The JPP macro hides prototype parameters from compilers that can't cope.
+ * Note JPP requires double parentheses.
+ */
+
+#ifdef HAVE_PROTOTYPES
+#define JPP(arglist)    arglist
+#else
+#define JPP(arglist)	()
+#endif
+
+
+/* Short forms of external names for systems with brain-damaged linkers.
+ * We shorten external names to be unique in the first six letters, which
+ * is good enough for all known systems.
+ * (If your compiler itself needs names to be unique in less than 15 
+ * characters, you are out of luck.  Get a better compiler.)
+ */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_std_error		jStdError
+#define jpeg_CreateCompress	jCreaCompress
+#define jpeg_CreateDecompress	jCreaDecompress
+#define jpeg_destroy_compress	jDestCompress
+#define jpeg_destroy_decompress	jDestDecompress
+#define jpeg_stdio_dest		jStdDest
+#define jpeg_stdio_src		jStdSrc
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
+#endif
+#define jpeg_set_defaults	jSetDefaults
+#define jpeg_set_colorspace	jSetColorspace
+#define jpeg_default_colorspace	jDefColorspace
+#define jpeg_set_quality	jSetQuality
+#define jpeg_set_linear_quality	jSetLQuality
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_default_qtables	jDefQTables
+#endif
+#define jpeg_add_quant_table	jAddQuantTable
+#define jpeg_quality_scaling	jQualityScaling
+#define jpeg_simple_progression	jSimProgress
+#define jpeg_suppress_tables	jSuppressTables
+#define jpeg_alloc_quant_table	jAlcQTable
+#define jpeg_alloc_huff_table	jAlcHTable
+#define jpeg_start_compress	jStrtCompress
+#define jpeg_write_scanlines	jWrtScanlines
+#define jpeg_finish_compress	jFinCompress
+#if JPEG_LIB_VERSION >= 70
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
+#endif
+#define jpeg_write_raw_data	jWrtRawData
+#define jpeg_write_marker	jWrtMarker
+#define jpeg_write_m_header	jWrtMHeader
+#define jpeg_write_m_byte	jWrtMByte
+#define jpeg_write_tables	jWrtTables
+#define jpeg_read_header	jReadHeader
+#define jpeg_start_decompress	jStrtDecompress
+#define jpeg_read_scanlines	jReadScanlines
+#define jpeg_finish_decompress	jFinDecompress
+#define jpeg_read_raw_data	jReadRawData
+#define jpeg_has_multiple_scans	jHasMultScn
+#define jpeg_start_output	jStrtOutput
+#define jpeg_finish_output	jFinOutput
+#define jpeg_input_complete	jInComplete
+#define jpeg_new_colormap	jNewCMap
+#define jpeg_consume_input	jConsumeInput
+#if JPEG_LIB_VERSION >= 80
+#define jpeg_core_output_dimensions	jCoreDimensions
+#endif
+#define jpeg_calc_output_dimensions	jCalcDimensions
+#define jpeg_save_markers	jSaveMarkers
+#define jpeg_set_marker_processor	jSetMarker
+#define jpeg_read_coefficients	jReadCoefs
+#define jpeg_write_coefficients	jWrtCoefs
+#define jpeg_copy_critical_parameters	jCopyCrit
+#define jpeg_abort_compress	jAbrtCompress
+#define jpeg_abort_decompress	jAbrtDecompress
+#define jpeg_abort		jAbort
+#define jpeg_destroy		jDestroy
+#define jpeg_resync_to_restart	jResyncRestart
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/* Default error-management setup */
+EXTERN(struct jpeg_error_mgr *)jpeg_std_error
+        JPP((struct jpeg_error_mgr * err));
+
+/* Creation of JPEG compression and decompression objects.
+ * Applications call jpeg_create_compress() / jpeg_create_decompress().  Those
+ * macros expand to jpeg_CreateCompress / jpeg_CreateDecompress and pass along
+ * JPEG_LIB_VERSION plus the size of the caller's struct, which is the extra
+ * information used for version mismatch checking.
+ * NB: the error manager must be set up BEFORE calling jpeg_create_xxx.
+ */
+#define jpeg_create_compress(cinfo) \
+    jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t) sizeof(struct jpeg_compress_struct))
+#define jpeg_create_decompress(cinfo) \
+    jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                          (size_t) sizeof(struct jpeg_decompress_struct))
+
+EXTERN(void) jpeg_CreateCompress JPP((j_compress_ptr cinfo,
+                                      int version, size_t structsize));
+EXTERN(void) jpeg_CreateDecompress JPP((j_decompress_ptr cinfo,
+                                        int version, size_t structsize));
+
+/* Destruction of JPEG compression and decompression objects */
+EXTERN(void) jpeg_destroy_compress JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
+
+/* Standard data source and destination managers for stdio streams.
+ * The caller must open the FILE beforehand and close it afterwards.
+ */
+EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE *outfile));
+EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE *infile));
+
+#if JPEG_LIB_VERSION >= 80
+/* Data source and destination managers for memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+                                unsigned char **outbuffer,
+                                unsigned long *outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+                               unsigned char *inbuffer,
+                               unsigned long insize));
+#endif
+
+/* Default parameter setup for a compression object */
+EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
+
+/* Aids for setting up individual compression parameters */
+EXTERN(void) jpeg_set_colorspace JPP((j_compress_ptr cinfo,
+                                      J_COLOR_SPACE colorspace));
+EXTERN(void) jpeg_default_colorspace JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
+                                   boolean force_baseline));
+EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
+                                          int scale_factor,
+                                          boolean force_baseline));
+#if JPEG_LIB_VERSION >= 70
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+                                       boolean force_baseline));
+#endif
+EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
+                                       const unsigned int *basic_table,
+                                       int scale_factor,
+                                       boolean force_baseline));
+EXTERN(int) jpeg_quality_scaling JPP((int quality));
+EXTERN(void) jpeg_simple_progression JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_suppress_tables JPP((j_compress_ptr cinfo,
+                                       boolean suppress));
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table JPP((j_common_ptr cinfo));
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table JPP((j_common_ptr cinfo));
+
+/* Primary entry points for compression */
+EXTERN(void) jpeg_start_compress JPP((j_compress_ptr cinfo,
+                                      boolean write_all_tables));
+EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
+                                             JSAMPARRAY scanlines,
+                                             JDIMENSION num_lines));
+EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));
+
+#if JPEG_LIB_VERSION >= 70
+/* Precalculate the JPEG dimensions for the current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+#endif
+
+/* Used in place of jpeg_write_scanlines when writing raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
+                                            JSAMPIMAGE data,
+                                            JDIMENSION num_lines));
+
+/* Write a special marker; libjpeg.txt explains how to use this safely. */
+EXTERN(void) jpeg_write_marker JPP((j_compress_ptr cinfo, int marker,
+                                    const JOCTET *dataptr,
+                                    unsigned int datalen));
+/* Piecemeal variant of jpeg_write_marker. */
+EXTERN(void) jpeg_write_m_header JPP((j_compress_ptr cinfo, int marker,
+                                      unsigned int datalen));
+EXTERN(void) jpeg_write_m_byte JPP((j_compress_ptr cinfo, int val));
+
+/* Alternate compression function: only write an abbreviated table file. */
+EXTERN(void) jpeg_write_tables JPP((j_compress_ptr cinfo));
+
+/* Decompression startup: read the start of the JPEG datastream to find out
+ * what it contains.
+ */
+EXTERN(int) jpeg_read_header JPP((j_decompress_ptr cinfo,
+                                  boolean require_image));
+/* The return value is one of: */
+#define JPEG_SUSPENDED          0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK          1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
+/* With require_image = TRUE (the normal case) there is no need to check for
+ * JPEG_HEADER_TABLES_ONLY: an abbreviated file causes an error exit instead.
+ * JPEG_SUSPENDED can only occur with a data source module that is able to
+ * give a suspension return (the stdio source module doesn't).
+ */
+
+/* Primary entry points for decompression */
+EXTERN(boolean) jpeg_start_decompress JPP((j_decompress_ptr cinfo));
+
+#ifdef ANDROID
+EXTERN(boolean) jpeg_start_tile_decompress JPP((j_decompress_ptr cinfo));
+#endif
+
+EXTERN(JDIMENSION) jpeg_read_scanlines JPP((j_decompress_ptr cinfo,
+                                            JSAMPARRAY scanlines,
+                                            JDIMENSION max_lines));
+EXTERN(boolean) jpeg_finish_decompress JPP((j_decompress_ptr cinfo));
+
+/* Used in place of jpeg_read_scanlines when reading raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_read_raw_data JPP((j_decompress_ptr cinfo,
+                                           JSAMPIMAGE data,
+                                           JDIMENSION max_lines));
+
+#ifdef ANDROID
+/* Additional scanline readers, declared only when ANDROID is defined. */
+EXTERN(JDIMENSION) jpeg_read_scanlines_from JPP((j_decompress_ptr cinfo,
+                                                 JSAMPARRAY scanlines,
+                                                 int line_offset,
+                                                 JDIMENSION max_lines));
+EXTERN(JDIMENSION) jpeg_read_tile_scanline JPP((j_decompress_ptr cinfo,
+                                                huffman_index *index,
+                                                JSAMPARRAY scanlines));
+EXTERN(void) jpeg_init_read_tile_scanline JPP((j_decompress_ptr cinfo,
+                                               huffman_index *index,
+                                               int *start_x, int *start_y,
+                                               int *width, int *height));
+#endif
+
+/* Extra entry points for buffered-image mode. */
+EXTERN(boolean) jpeg_has_multiple_scans JPP((j_decompress_ptr cinfo));
+EXTERN(boolean) jpeg_start_output JPP((j_decompress_ptr cinfo,
+                                       int scan_number));
+EXTERN(boolean) jpeg_finish_output JPP((j_decompress_ptr cinfo));
+EXTERN(boolean) jpeg_input_complete JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_new_colormap JPP((j_decompress_ptr cinfo));
+EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
+/* The return value is one of the codes below, or JPEG_SUSPENDED (0,
+ * "Suspended due to lack of input data"), which is already #defined with the
+ * jpeg_read_header return codes.
+ */
+#define JPEG_REACHED_SOS    1 /* Reached start of new scan */
+#define JPEG_REACHED_EOI    2 /* Reached end of image */
+#define JPEG_ROW_COMPLETED  3 /* Completed one iMCU row */
+#define JPEG_SCAN_COMPLETED 4 /* Completed last iMCU row of a scan */
+
+/* Precalculate the output dimensions for the current decompression
+ * parameters.
+ */
+#if JPEG_LIB_VERSION >= 80
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
+#endif
+EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));
+
+/* Control which COM and APPn markers are saved into marker_list. */
+EXTERN(void) jpeg_save_markers JPP((j_decompress_ptr cinfo, int marker_code,
+                                    unsigned int length_limit));
+
+/* Install a special processing routine for COM or APPn markers. */
+EXTERN(void) jpeg_set_marker_processor JPP((j_decompress_ptr cinfo,
+                                            int marker_code,
+                                            jpeg_marker_parser_method routine));
+
+/* Raw DCT coefficient access (read or write), useful for lossless transcoding. */
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients JPP((j_decompress_ptr cinfo));
+EXTERN(void) jpeg_write_coefficients JPP((j_compress_ptr cinfo,
+                                          jvirt_barray_ptr *coef_arrays));
+EXTERN(void) jpeg_copy_critical_parameters JPP((j_decompress_ptr srcinfo,
+                                                j_compress_ptr dstinfo));
+
+/* Aborting compression or decompression before jpeg_finish_(de)compress has
+ * completed requires a cleanup to release memory, temporary files, etc.
+ * When the JPEG object is no longer needed, jpeg_destroy_(de)compress is
+ * enough; to clean the object up and reuse it, call one of these:
+ */
+EXTERN(void) jpeg_abort_compress JPP((j_compress_ptr cinfo));
+EXTERN(void) jpeg_abort_decompress JPP((j_decompress_ptr cinfo));
+
+/* Generic versions of jpeg_abort and jpeg_destroy that work on either flavor
+ * of JPEG object; they may be more convenient in some places.
+ */
+EXTERN(void) jpeg_abort JPP((j_common_ptr cinfo));
+EXTERN(void) jpeg_destroy JPP((j_common_ptr cinfo));
+
+/* Default restart-marker-resync procedure, for use by data source modules. */
+EXTERN(boolean) jpeg_resync_to_restart JPP((j_decompress_ptr cinfo,
+                                            int desired));
+
+#ifdef ANDROID
+
+/* huffman_index / huffman_offset_data entry points, declared only when
+ * ANDROID is defined.  Every argument list is wrapped in JPP(()) so these
+ * declarations follow the same prototype convention as all other entry
+ * points in this header.
+ */
+EXTERN(boolean) jpeg_build_huffman_index JPP((j_decompress_ptr cinfo,
+                                              huffman_index *index));
+
+EXTERN(void) jpeg_configure_huffman_decoder JPP((j_decompress_ptr cinfo,
+                                                 huffman_offset_data offset));
+
+EXTERN(void) jpeg_get_huffman_decoder_configuration JPP((j_decompress_ptr cinfo,
+                                                         huffman_offset_data *offset));
+
+EXTERN(void) jpeg_create_huffman_index JPP((j_decompress_ptr cinfo,
+                                            huffman_index *index));
+
+EXTERN(void) jpeg_configure_huffman_index_scan JPP((j_decompress_ptr cinfo,
+                                                    huffman_index *index,
+                                                    int scan_no, int offset));
+
+EXTERN(void) jpeg_destroy_huffman_index JPP((huffman_index *index));
+
+#endif
+
+/* These marker codes are exported since applications and data source modules
+ * are likely to want to use them.
+ * NOTE(review): each value is presumably the code byte that follows 0xFF in
+ * the datastream -- that comes from the JPEG marker format, not from anything
+ * established in this header.
+ */
+
+#define JPEG_RST0    0xD0    /* RST0 marker code */
+#define JPEG_EOI    0xD9    /* EOI marker code */
+#define JPEG_APP0    0xE0    /* APP0 marker code */
+#define JPEG_COM    0xFE    /* COM marker code */
+
+
+/* Some compilers emit warnings (or worse, errors) for structure types that
+ * are declared but never filled in.  When INCOMPLETE_TYPES_BROKEN is defined
+ * and JPEG_INTERNALS is not, keep such a compiler quiet by supplying a dummy
+ * one-member definition for each of the substructures listed below.
+ */
+
+#ifdef INCOMPLETE_TYPES_BROKEN
+#ifndef JPEG_INTERNALS		/* will be defined in jpegint.h */
+struct jvirt_sarray_control { long dummy; };
+struct jvirt_barray_control { long dummy; };
+struct jpeg_comp_master { long dummy; };
+struct jpeg_c_main_controller { long dummy; };
+struct jpeg_c_prep_controller { long dummy; };
+struct jpeg_c_coef_controller { long dummy; };
+struct jpeg_marker_writer { long dummy; };
+struct jpeg_color_converter { long dummy; };
+struct jpeg_downsampler { long dummy; };
+struct jpeg_forward_dct { long dummy; };
+struct jpeg_entropy_encoder { long dummy; };
+struct jpeg_decomp_master { long dummy; };
+struct jpeg_d_main_controller { long dummy; };
+struct jpeg_d_coef_controller { long dummy; };
+struct jpeg_d_post_controller { long dummy; };
+struct jpeg_input_controller { long dummy; };
+struct jpeg_marker_reader { long dummy; };
+struct jpeg_entropy_decoder { long dummy; };
+struct jpeg_inverse_dct { long dummy; };
+struct jpeg_upsampler { long dummy; };
+struct jpeg_color_deconverter { long dummy; };
+struct jpeg_color_quantizer { long dummy; };
+#endif /* JPEG_INTERNALS */
+#endif /* INCOMPLETE_TYPES_BROKEN */
+
+
+/*
+ * The JPEG library modules define JPEG_INTERNALS before including this file.
+ * The internal structure declarations are read only when that is true.
+ * Applications using the library should not include jpegint.h, but may wish
+ * to include jerror.h.
+ */
+
+#ifdef JPEG_INTERNALS
+#include "jpegint.h"		/* fetch private declarations */
+#include "jerror.h"		/* fetch error codes too */
+#endif
+
+#ifndef ANDROID
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}
+#endif
+#endif
+#endif
+
+#endif /* JPEGLIB_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimd.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimd.h
new file mode 100644
index 0000000..6ee99cc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimd.h
@@ -0,0 +1,666 @@
+/*
+ * simd/jsimd.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright 2011 D. R. Commander
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Bitmask for supported acceleration methods.
+ * JSIMD_NONE is zero; every other method occupies a distinct single bit, so
+ * several methods can be combined in one value.
+ */
+
+#define JSIMD_NONE       0x00
+#define JSIMD_MMX        0x01
+#define JSIMD_3DNOW      0x02
+#define JSIMD_SSE        0x04
+#define JSIMD_SSE2       0x08
+#define JSIMD_ARM_NEON   0x10
+
+/* Short forms of external names, enabled by NEED_SHORT_EXTERNAL_NAMES for
+ * linkers that need them.
+ * Each long name must be #defined exactly once: defining the same name twice
+ * with different replacements is an invalid macro redefinition.  Every
+ * jconst_* table is listed directly before the jsimd_* routine(s) of the same
+ * family (e.g. jconst_idct_float_sse before jsimd_idct_float_sse).
+ */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_simd_cpu_support                 jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
+#define jsimd_extrgb_ycc_convert_mmx          jSEXTRGBYCCM
+#define jsimd_extrgbx_ycc_convert_mmx         jSEXTRGBXYCCM
+#define jsimd_extbgr_ycc_convert_mmx          jSEXTBGRYCCM
+#define jsimd_extbgrx_ycc_convert_mmx         jSEXTBGRXYCCM
+#define jsimd_extxbgr_ycc_convert_mmx         jSEXTXBGRYCCM
+#define jsimd_extxrgb_ycc_convert_mmx         jSEXTXRGBYCCM
+#define jsimd_rgb_gray_convert_mmx            jSRGBGRYM
+#define jsimd_extrgb_gray_convert_mmx         jSEXTRGBGRYM
+#define jsimd_extrgbx_gray_convert_mmx        jSEXTRGBXGRYM
+#define jsimd_extbgr_gray_convert_mmx         jSEXTBGRGRYM
+#define jsimd_extbgrx_gray_convert_mmx        jSEXTBGRXGRYM
+#define jsimd_extxbgr_gray_convert_mmx        jSEXTXBGRGRYM
+#define jsimd_extxrgb_gray_convert_mmx        jSEXTXRGBGRYM
+#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
+#define jsimd_ycc_extrgb_convert_mmx          jSYCCEXTRGBM
+#define jsimd_ycc_extrgbx_convert_mmx         jSYCCEXTRGBXM
+#define jsimd_ycc_extbgr_convert_mmx          jSYCCEXTBGRM
+#define jsimd_ycc_extbgrx_convert_mmx         jSYCCEXTBGRXM
+#define jsimd_ycc_extxbgr_convert_mmx         jSYCCEXTXBGRM
+#define jsimd_ycc_extxrgb_convert_mmx         jSYCCEXTXRGBM
+#define jconst_rgb_ycc_convert_sse2           jSCRGBYCCS2
+#define jsimd_rgb_ycc_convert_sse2            jSRGBYCCS2
+#define jsimd_extrgb_ycc_convert_sse2         jSEXTRGBYCCS2
+#define jsimd_extrgbx_ycc_convert_sse2        jSEXTRGBXYCCS2
+#define jsimd_extbgr_ycc_convert_sse2         jSEXTBGRYCCS2
+#define jsimd_extbgrx_ycc_convert_sse2        jSEXTBGRXYCCS2
+#define jsimd_extxbgr_ycc_convert_sse2        jSEXTXBGRYCCS2
+#define jsimd_extxrgb_ycc_convert_sse2        jSEXTXRGBYCCS2
+#define jconst_rgb_gray_convert_sse2          jSCRGBGRYS2
+#define jsimd_rgb_gray_convert_sse2           jSRGBGRYS2
+#define jsimd_extrgb_gray_convert_sse2        jSEXTRGBGRYS2
+#define jsimd_extrgbx_gray_convert_sse2       jSEXTRGBXGRYS2
+#define jsimd_extbgr_gray_convert_sse2        jSEXTBGRGRYS2
+#define jsimd_extbgrx_gray_convert_sse2       jSEXTBGRXGRYS2
+#define jsimd_extxbgr_gray_convert_sse2       jSEXTXBGRGRYS2
+#define jsimd_extxrgb_gray_convert_sse2       jSEXTXRGBGRYS2
+#define jconst_ycc_rgb_convert_sse2           jSCYCCRGBS2
+#define jsimd_ycc_rgb_convert_sse2            jSYCCRGBS2
+#define jsimd_ycc_extrgb_convert_sse2         jSYCCEXTRGBS2
+#define jsimd_ycc_extrgbx_convert_sse2        jSYCCEXTRGBXS2
+#define jsimd_ycc_extbgr_convert_sse2         jSYCCEXTBGRS2
+#define jsimd_ycc_extbgrx_convert_sse2        jSYCCEXTBGRXS2
+#define jsimd_ycc_extxbgr_convert_sse2        jSYCCEXTXBGRS2
+#define jsimd_ycc_extxrgb_convert_sse2        jSYCCEXTXRGBS2
+#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
+#define jsimd_h2v2_downsample_sse2            jSDnH2V2S2
+#define jsimd_h2v1_downsample_sse2            jSDnH2V1S2
+#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
+#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM
+#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
+#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
+#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
+#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
+#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
+#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
+#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
+#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
+#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
+#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
+#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
+#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
+#define jsimd_h2v2_upsample_sse2              jSUpH2V2S2
+#define jsimd_h2v1_upsample_sse2              jSUpH2V1S2
+#define jconst_fancy_upsample_sse2            jSCFUpS2
+#define jsimd_h2v2_fancy_upsample_sse2        jSFUpH2V2S2
+#define jsimd_h2v1_fancy_upsample_sse2        jSFUpH2V1S2
+#define jconst_merged_upsample_sse2           jSCMUpS2
+#define jsimd_h2v2_merged_upsample_sse2       jSMUpH2V2S2
+#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2
+#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
+#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
+#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
+#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
+#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
+#define jsimd_h2v1_merged_upsample_sse2       jSMUpH2V1S2
+#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
+#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
+#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
+#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
+#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
+#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
+#define jsimd_convsamp_mmx                    jSConvM
+#define jsimd_convsamp_sse2                   jSConvS2
+#define jsimd_convsamp_float_3dnow            jSConvF3D
+#define jsimd_convsamp_float_sse              jSConvFS
+#define jsimd_convsamp_float_sse2             jSConvFS2
+#define jsimd_fdct_islow_mmx                  jSFDMIS
+#define jsimd_fdct_ifast_mmx                  jSFDMIF
+#define jconst_fdct_islow_sse2                jSCFDS2IS
+#define jsimd_fdct_islow_sse2                 jSFDS2IS
+#define jconst_fdct_ifast_sse2                jSCFDS2IF
+#define jsimd_fdct_ifast_sse2                 jSFDS2IF
+#define jsimd_fdct_float_3dnow                jSFD3DF
+#define jconst_fdct_float_sse                 jSCFDSF
+#define jsimd_fdct_float_sse                  jSFDSF
+#define jsimd_quantize_mmx                    jSQuantM
+#define jsimd_quantize_sse2                   jSQuantS2
+#define jsimd_quantize_float_3dnow            jSQuantF3D
+#define jsimd_quantize_float_sse              jSQuantFS
+#define jsimd_quantize_float_sse2             jSQuantFS2
+#define jsimd_idct_2x2_mmx                    jSIDM22
+#define jsimd_idct_4x4_mmx                    jSIDM44
+#define jconst_idct_red_sse2                  jSCIDS2R
+#define jsimd_idct_2x2_sse2                   jSIDS222
+#define jsimd_idct_4x4_sse2                   jSIDS244
+#define jsimd_idct_islow_mmx                  jSIDMIS
+#define jsimd_idct_ifast_mmx                  jSIDMIF
+#define jconst_idct_islow_sse2                jSCIDS2IS
+#define jsimd_idct_islow_sse2                 jSIDS2IS
+#define jconst_idct_ifast_sse2                jSCIDS2IF
+#define jsimd_idct_ifast_sse2                 jSIDS2IF
+#define jsimd_idct_float_3dnow                jSID3DF
+#define jconst_idct_float_sse                 jSCIDSF
+#define jsimd_idct_float_sse                  jSIDSF
+#define jconst_idct_float_sse2                jSCIDS2F
+#define jsimd_idct_float_sse2                 jSIDS2F
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* SIMD Ext: retrieve SIMD/CPU information.
+ * Takes no arguments and returns an unsigned int.
+ * NOTE(review): the result is presumably an OR of the JSIMD_* bitmask values
+ * defined at the top of this header -- confirm against the implementation.
+ */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+
+/* SIMD Color Space Conversion */
+/* rgb_ycc family, "_mmx" variants.  The seven entry points differ only in
+ * their name prefix (rgb, extrgb, extrgbx, extbgr, extbgrx, extxbgr, extxrgb)
+ * and share one signature: img_width, a JSAMPARRAY input_buf, a JSAMPIMAGE
+ * output_buf, output_row and num_rows.
+ * NOTE(review): each prefix presumably selects an input pixel layout --
+ * confirm against the implementation.
+ */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+/* rgb_gray family, "_mmx" variants: same seven name prefixes and the same
+ * signature (JSAMPARRAY input_buf, JSAMPIMAGE output_buf) as the rgb_ycc
+ * "_mmx" prototypes.
+ */
+EXTERN(void) jsimd_rgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+/* ycc_rgb family, "_mmx" variants.  Buffer types are swapped relative to the
+ * rgb_ycc prototypes: input_buf is a JSAMPIMAGE indexed from input_row and
+ * output_buf is a JSAMPARRAY; the width parameter is named out_width.
+ */
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+/* rgb_ycc family, "_sse2" variants: same seven name prefixes and the same
+ * signature as the "_mmx" prototypes.  The const int array is declared
+ * alongside them.
+ * NOTE(review): presumably the constant table used by these routines --
+ * confirm against the implementation.
+ */
+extern const int jconst_rgb_ycc_convert_sse2[];
+EXTERN(void) jsimd_rgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+/* rgb_gray family, "_sse2" variants: same seven name prefixes and the same
+ * signature as the "_mmx" prototypes, plus a const int array declared
+ * alongside them.
+ * NOTE(review): presumably the constant table used by these routines --
+ * confirm against the implementation.
+ */
+extern const int jconst_rgb_gray_convert_sse2[];
+EXTERN(void) jsimd_rgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_gray_convert_sse2
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+extern const int jconst_ycc_rgb_convert_sse2[];
+EXTERN(void) jsimd_ycc_rgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+EXTERN(void) jsimd_rgb_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgb_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extrgbx_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgr_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extbgrx_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxbgr_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_extxrgb_ycc_convert_neon
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+
+EXTERN(void) jsimd_ycc_rgb_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgb_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extrgbx_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgr_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extbgrx_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxbgr_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+EXTERN(void) jsimd_ycc_extxrgb_convert_neon
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
+/* SIMD Downsample */
+EXTERN(void) jsimd_h2v2_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_mmx
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+EXTERN(void) jsimd_h2v2_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+EXTERN(void) jsimd_h2v1_downsample_sse2
+        JPP((JDIMENSION image_width, int max_v_samp_factor,
+             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+             JSAMPARRAY input_data, JSAMPARRAY output_data));
+
+/* SIMD Upsample */
+EXTERN(void) jsimd_h2v2_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+EXTERN(void) jsimd_h2v2_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION output_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_fancy_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
+extern const int jconst_merged_upsample_sse2[];
+EXTERN(void) jsimd_h2v2_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+
+/* SIMD Sample Conversion */
+EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
+                                     JDIMENSION start_col,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data,
+                                      JDIMENSION start_col,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_neon JPP((JSAMPARRAY sample_data,
+                                      JDIMENSION start_col,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
+                                             JDIMENSION start_col,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
+                                           JDIMENSION start_col,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data,
+                                            JDIMENSION start_col,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
+extern const int jconst_fdct_islow_sse2[];  /* paired by name with the islow routine below */
+EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
+extern const int jconst_fdct_ifast_sse2[];  /* paired by name with the ifast routine below */
+EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
+
+EXTERN(void) jsimd_fdct_ifast_neon JPP((DCTELEM * data));
+
+EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+                                     DCTELEM * divisors,
+                                     DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
+                                      DCTELEM * divisors,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_neon JPP((JCOEFPTR coef_block,
+                                      DCTELEM * divisors,
+                                      DCTELEM * workspace));
+
+EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
+                                             FAST_FLOAT * divisors,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
+                                           FAST_FLOAT * divisors,
+                                           FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
+                                            FAST_FLOAT * divisors,
+                                            FAST_FLOAT * workspace));
+
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+
+extern const int jconst_idct_red_sse2[];
+EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_2x2_neon JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_neon JPP((void * dct_table,
+                                      JCOEFPTR coef_block,
+                                      JSAMPARRAY output_buf,
+                                      JDIMENSION output_col));
+
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_islow_sse2[];
+EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+extern const int jconst_idct_ifast_sse2[];
+EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_islow_neon JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
+EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
+                                         JCOEFPTR coef_block,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse2[];
+EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
+                                        JCOEFPTR coef_block,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION output_col));
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimdcfg.inc.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimdcfg.inc.h
new file mode 100644
index 0000000..0dacd06
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimdcfg.inc.h
@@ -0,0 +1,199 @@
+// This file generates the include file for the assembly
+// implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+#define define(var) %define _cpp_protection_##var
+#define definev(var) %define _cpp_protection_##var var
+
+;
+; -- jpeglib.h
+;
+
+definev(DCTSIZE)
+definev(DCTSIZE2)
+
+;
+; -- jmorecfg.h
+;
+
+definev(RGB_RED)
+definev(RGB_GREEN)
+definev(RGB_BLUE)
+definev(RGB_PIXELSIZE)
+
+definev(EXT_RGB_RED)
+definev(EXT_RGB_GREEN)
+definev(EXT_RGB_BLUE)
+definev(EXT_RGB_PIXELSIZE)
+
+definev(EXT_RGBX_RED)
+definev(EXT_RGBX_GREEN)
+definev(EXT_RGBX_BLUE)
+definev(EXT_RGBX_PIXELSIZE)
+
+definev(EXT_BGR_RED)
+definev(EXT_BGR_GREEN)
+definev(EXT_BGR_BLUE)
+definev(EXT_BGR_PIXELSIZE)
+
+definev(EXT_BGRX_RED)
+definev(EXT_BGRX_GREEN)
+definev(EXT_BGRX_BLUE)
+definev(EXT_BGRX_PIXELSIZE)
+
+definev(EXT_XBGR_RED)
+definev(EXT_XBGR_GREEN)
+definev(EXT_XBGR_BLUE)
+definev(EXT_XBGR_PIXELSIZE)
+
+definev(EXT_XRGB_RED)
+definev(EXT_XRGB_GREEN)
+definev(EXT_XRGB_BLUE)
+definev(EXT_XRGB_PIXELSIZE)
+
+%define RGBX_FILLER_0XFF        1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE                 byte          ; unsigned char
+%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
+
+definev(CENTERJSAMPLE)
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF                   word          ; short
+%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION              dword         ; unsigned int
+%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
+
+%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
+%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
+%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word          ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
+
+%define FAST_FLOAT              FP32            ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word          ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word          ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32          ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+definev(JSIMD_NONE)
+definev(JSIMD_MMX)
+definev(JSIMD_3DNOW)
+definev(JSIMD_SSE)
+definev(JSIMD_SSE2)
+
+; Short forms of external names, for linkers that need names unique within their first few characters.
+;
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+definev(jpeg_simd_cpu_support)
+definev(jsimd_rgb_ycc_convert_mmx)
+definev(jsimd_ycc_rgb_convert_mmx)
+definev(jconst_rgb_ycc_convert_sse2)
+definev(jsimd_rgb_ycc_convert_sse2)
+definev(jconst_ycc_rgb_convert_sse2)
+definev(jsimd_ycc_rgb_convert_sse2)
+definev(jsimd_h2v2_downsample_mmx)
+definev(jsimd_h2v1_downsample_mmx)
+definev(jsimd_h2v2_downsample_sse2)
+definev(jsimd_h2v1_downsample_sse2)
+definev(jsimd_h2v2_upsample_mmx)
+definev(jsimd_h2v1_upsample_mmx)
+definev(jsimd_h2v1_fancy_upsample_mmx)
+definev(jsimd_h2v2_fancy_upsample_mmx)
+definev(jsimd_h2v1_merged_upsample_mmx)
+definev(jsimd_h2v2_merged_upsample_mmx)
+definev(jsimd_h2v2_upsample_sse2)
+definev(jsimd_h2v1_upsample_sse2)
+definev(jconst_fancy_upsample_sse2)
+definev(jsimd_h2v1_fancy_upsample_sse2)
+definev(jsimd_h2v2_fancy_upsample_sse2)
+definev(jconst_merged_upsample_sse2)
+definev(jsimd_h2v1_merged_upsample_sse2)
+definev(jsimd_h2v2_merged_upsample_sse2)
+definev(jsimd_convsamp_mmx)
+definev(jsimd_convsamp_sse2)
+definev(jsimd_convsamp_float_3dnow)
+definev(jsimd_convsamp_float_sse)
+definev(jsimd_convsamp_float_sse2)
+definev(jsimd_fdct_islow_mmx)
+definev(jsimd_fdct_ifast_mmx)
+definev(jconst_fdct_islow_sse2)
+definev(jsimd_fdct_islow_sse2)
+definev(jconst_fdct_ifast_sse2)
+definev(jsimd_fdct_ifast_sse2)
+definev(jsimd_fdct_float_3dnow)
+definev(jconst_fdct_float_sse)
+definev(jsimd_fdct_float_sse)
+definev(jsimd_quantize_mmx)
+definev(jsimd_quantize_sse2)
+definev(jsimd_quantize_float_3dnow)
+definev(jsimd_quantize_float_sse)
+definev(jsimd_quantize_float_sse2)
+definev(jsimd_idct_2x2_mmx)
+definev(jsimd_idct_4x4_mmx)
+definev(jconst_idct_red_sse2)
+definev(jsimd_idct_2x2_sse2)
+definev(jsimd_idct_4x4_sse2)
+definev(jsimd_idct_islow_mmx)
+definev(jsimd_idct_ifast_mmx)
+definev(jconst_idct_islow_sse2)
+definev(jsimd_idct_islow_sse2)
+definev(jconst_idct_ifast_sse2)
+definev(jsimd_idct_ifast_sse2)
+definev(jsimd_idct_float_3dnow)
+definev(jconst_idct_float_sse)
+definev(jsimd_idct_float_sse)
+definev(jconst_idct_float_sse2)
+definev(jsimd_idct_float_sse2)
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimddct.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimddct.h
new file mode 100644
index 0000000..a1c7440
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jsimddct.h
@@ -0,0 +1,102 @@
+/*
+ * jsimddct.h
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * 
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ */
+
+/* Short forms of external names, for linkers that need names unique within their first few characters. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jsimd_can_convsamp                jSCanConv
+#define jsimd_can_convsamp_float          jSCanConvF
+#define jsimd_convsamp                    jSConv
+#define jsimd_convsamp_float              jSConvF
+#define jsimd_can_fdct_islow              jSCanFDCTIS
+#define jsimd_can_fdct_ifast              jSCanFDCTIF
+#define jsimd_can_fdct_float              jSCanFDCTFl
+#define jsimd_fdct_islow                  jSFDCTIS
+#define jsimd_fdct_ifast                  jSFDCTIF
+#define jsimd_fdct_float                  jSFDCTFl
+#define jsimd_can_quantize                jSCanQuant
+#define jsimd_can_quantize_float          jSCanQuantF
+#define jsimd_quantize                    jSQuant
+#define jsimd_quantize_float              jSQuantF
+#define jsimd_can_idct_2x2                jSCanIDCT22
+#define jsimd_can_idct_4x4                jSCanIDCT44
+#define jsimd_idct_2x2                    jSIDCT22
+#define jsimd_idct_4x4                    jSIDCT44
+#define jsimd_can_idct_islow              jSCanIDCTIS
+#define jsimd_can_idct_ifast              jSCanIDCTIF
+#define jsimd_can_idct_float              jSCanIDCTFl
+#define jsimd_idct_islow                  jSIDCTIS
+#define jsimd_idct_ifast                  jSIDCTIF
+#define jsimd_idct_float                  jSIDCTFl
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+EXTERN(int) jsimd_can_convsamp JPP((void));
+EXTERN(int) jsimd_can_convsamp_float JPP((void));
+
+EXTERN(void) jsimd_convsamp JPP((JSAMPARRAY sample_data,
+                                 JDIMENSION start_col,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_convsamp_float JPP((JSAMPARRAY sample_data,
+                                       JDIMENSION start_col,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_fdct_islow JPP((void));
+EXTERN(int) jsimd_can_fdct_ifast JPP((void));
+EXTERN(int) jsimd_can_fdct_float JPP((void));
+
+EXTERN(void) jsimd_fdct_islow JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_float JPP((FAST_FLOAT * data));
+
+EXTERN(int) jsimd_can_quantize JPP((void));
+EXTERN(int) jsimd_can_quantize_float JPP((void));
+
+EXTERN(void) jsimd_quantize JPP((JCOEFPTR coef_block,
+                                 DCTELEM * divisors,
+                                 DCTELEM * workspace));
+EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block,
+                                       FAST_FLOAT * divisors,
+                                       FAST_FLOAT * workspace));
+
+EXTERN(int) jsimd_can_idct_2x2 JPP((void));
+EXTERN(int) jsimd_can_idct_4x4 JPP((void));
+
+EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo,
+                                 jpeg_component_info * compptr,
+                                 JCOEFPTR coef_block,
+                                 JSAMPARRAY output_buf,
+                                 JDIMENSION output_col));
+
+EXTERN(int) jsimd_can_idct_islow JPP((void));
+EXTERN(int) jsimd_can_idct_ifast JPP((void));
+EXTERN(int) jsimd_can_idct_float JPP((void));
+
+EXTERN(void) jsimd_idct_islow JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+EXTERN(void) jsimd_idct_float JPP((j_decompress_ptr cinfo,
+                                   jpeg_component_info * compptr,
+                                   JCOEFPTR coef_block,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION output_col));
+
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jversion.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jversion.h
new file mode 100644
index 0000000..a045405
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/jversion.h
@@ -0,0 +1,36 @@
+/*
+ * jversion.h
+ *
+ * Copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 2010, D. R. Commander.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains software version identification.
+ */
+
+
+#if JPEG_LIB_VERSION >= 80    /* JPEG_LIB_VERSION 80 or newer: report release 8b */
+
+#define JVERSION	"8b  16-May-2010"
+
+#define JCOPYRIGHT	"Copyright (C) 2010, Thomas G. Lane, Guido Vollbeding"
+
+#elif JPEG_LIB_VERSION >= 70  /* JPEG_LIB_VERSION 70..79: report release 7 */
+
+#define JVERSION        "7  27-Jun-2009"
+
+#define JCOPYRIGHT      "Copyright (C) 2009, Thomas G. Lane, Guido Vollbeding"
+
+#else  /* below 70 (or undefined, which #if evaluates as 0): report release 6b */
+
+#define JVERSION	"6b  27-Mar-1998"
+
+#define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+
+#endif /* JPEG_LIB_VERSION */
+
+#define LJTCOPYRIGHT	"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+			"Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
+			"Copyright (C) 2009-2011 D. R. Commander\n" \
+			"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)"
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/platform.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/platform.h
new file mode 100644
index 0000000..f7f28af
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/platform.h
@@ -0,0 +1,11 @@
+#ifndef __JKUTILS_H__
+#define __JKUTILS_H__
+
+#include <jni.h>
+#include <unistd.h>
+#include <turbojpeg.h>
+#include <GLES2/gl2.h>
+
+typedef struct { unsigned char *bytes; } Allocation;
+
+#endif
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/tjutil.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/tjutil.h
new file mode 100644
index 0000000..bdad348
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/tjutil.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C)2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+	#ifndef __MINGW32__
+		#include <stdio.h>
+		#define snprintf(str, n, format, ...)  \
+			_snprintf_s(str, n, _TRUNCATE, format, __VA_ARGS__)
+	#endif
+	#define strcasecmp stricmp
+	#define strncasecmp strnicmp
+#endif
+
+#ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef max
+ #define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+extern double gettime(void);
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/transupp.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/transupp.h
new file mode 100644
index 0000000..57633ab
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/transupp.h
@@ -0,0 +1,217 @@
+/*
+ * transupp.h
+ *
+ * Copyright (C) 1997-2009, Thomas G. Lane, Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains declarations for image transformation routines and
+ * other utility code used by the jpegtran sample application.  These are
+ * NOT part of the core JPEG library.  But we keep these routines separate
+ * from jpegtran.c to ease the task of maintaining jpegtran-like programs
+ * that have other user interfaces.
+ *
+ * NOTE: all the routines declared here have very specific requirements
+ * about when they are to be executed during the reading and writing of the
+ * source and destination files.  See the comments in transupp.c, or see
+ * jpegtran.c for an example of correct usage.
+ */
+
+/* If you happen not to want the image transform support, disable it here */
+#ifndef TRANSFORMS_SUPPORTED
+#define TRANSFORMS_SUPPORTED 1		/* 0 disables transform code */
+#endif
+
+/*
+ * Although rotating and flipping data expressed as DCT coefficients is not
+ * hard, there is an asymmetry in the JPEG format specification for images
+ * whose dimensions aren't multiples of the iMCU size.  The right and bottom
+ * image edges are padded out to the next iMCU boundary with junk data; but
+ * no padding is possible at the top and left edges.  If we were to flip
+ * the whole image including the pad data, then pad garbage would become
+ * visible at the top and/or left, and real pixels would disappear into the
+ * pad margins --- perhaps permanently, since encoders & decoders may not
+ * bother to preserve DCT blocks that appear to be completely outside the
+ * nominal image area.  So, we have to exclude any partial iMCUs from the
+ * basic transformation.
+ *
+ * Transpose is the only transformation that can handle partial iMCUs at the
+ * right and bottom edges completely cleanly.  flip_h can flip partial iMCUs
+ * at the bottom, but leaves any partial iMCUs at the right edge untouched.
+ * Similarly flip_v leaves any partial iMCUs at the bottom edge untouched.
+ * The other transforms are defined as combinations of these basic transforms
+ * and process edge blocks in a way that preserves the equivalence.
+ *
+ * The "trim" option causes untransformable partial iMCUs to be dropped;
+ * this is not strictly lossless, but it usually gives the best-looking
+ * result for odd-size images.  Note that when this option is active,
+ * the expected mathematical equivalences between the transforms may not hold.
+ * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim
+ * followed by -rot 180 -trim trims both edges.)
+ *
+ * We also offer a lossless-crop option, which discards data outside a given
+ * image region but losslessly preserves what is inside.  Like the rotate and
+ * flip transforms, lossless crop is restricted by the JPEG format: the upper
+ * left corner of the selected region must fall on an iMCU boundary.  If this
+ * does not hold for the given crop parameters, we silently move the upper left
+ * corner up and/or left to make it so, simultaneously increasing the region
+ * dimensions to keep the lower right crop corner unchanged.  (Thus, the
+ * output image covers at least the requested region, but may cover more.)
+ *
+ * We also provide a lossless-resize option, which is kind of a lossless-crop
+ * operation in the DCT coefficient block domain - it discards higher-order
+ * coefficients and losslessly preserves lower-order coefficients of a
+ * sub-block.
+ *
+ * Rotate/flip transform, resize, and crop can be requested together in a
+ * single invocation.  The crop is applied last --- that is, the crop region
+ * is specified in terms of the destination image after transform/resize.
+ *
+ * We also offer a "force to grayscale" option, which simply discards the
+ * chrominance channels of a YCbCr image.  This is lossless in the sense that
+ * the luminance channel is preserved exactly.  It's not the same kind of
+ * thing as the rotate/flip transformations, but it's convenient to handle it
+ * as part of this package, mainly because the transformation routines have to
+ * be aware of the option to know how many components to work on.
+ */
+
+
+/* Short forms of external names for systems with brain-damaged linkers. */
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jtransform_parse_crop_spec	jTrParCrop
+#define jtransform_request_workspace	jTrRequest
+#define jtransform_adjust_parameters	jTrAdjust
+#define jtransform_execute_transform	jTrExec
+#define jtransform_perfect_transform	jTrPerfect
+#define jcopy_markers_setup		jCMrkSetup
+#define jcopy_markers_execute		jCMrkExec
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+
+/*
+ * Codes for supported types of image transformations.
+ */
+
+typedef enum {
+	JXFORM_NONE,		/* no transformation */
+	JXFORM_FLIP_H,		/* horizontal flip */
+	JXFORM_FLIP_V,		/* vertical flip */
+	JXFORM_TRANSPOSE,	/* transpose across UL-to-LR axis */
+	JXFORM_TRANSVERSE,	/* transpose across UR-to-LL axis */
+	JXFORM_ROT_90,		/* 90-degree clockwise rotation */
+	JXFORM_ROT_180,		/* 180-degree rotation */
+	JXFORM_ROT_270		/* 270-degree clockwise (or 90 ccw) */
+} JXFORM_CODE;
+
+/*
+ * Codes for crop parameters, which can individually be unspecified,
+ * positive, or negative.  (Negative width or height makes no sense, though.)
+ */
+
+typedef enum {
+	JCROP_UNSET,
+	JCROP_POS,
+	JCROP_NEG
+} JCROP_CODE;
+
+/*
+ * Transform parameters struct.
+ * NB: application must not change any elements of this struct after
+ * calling jtransform_request_workspace.
+ */
+
+typedef struct {
+  /* Options: set by caller */
+  JXFORM_CODE transform;	/* image transform operator */
+  boolean perfect;		/* if TRUE, fail if partial MCUs are requested */
+  boolean trim;			/* if TRUE, trim partial MCUs as needed */
+  boolean force_grayscale;	/* if TRUE, convert color image to grayscale */
+  boolean crop;			/* if TRUE, crop source image */
+  boolean slow_hflip;  /* For best performance, the JXFORM_FLIP_H transform
+                          normally modifies the source coefficients in place.
+                          Setting this to TRUE will instead use a slower,
+                          double-buffered algorithm, which leaves the source
+                          coefficients intact (necessary if other transformed
+                          images must be generated from the same set of
+                          coefficients). */
+
+  /* Crop parameters: application need not set these unless crop is TRUE.
+   * These can be filled in by jtransform_parse_crop_spec().
+   */
+  JDIMENSION crop_width;	/* Width of selected region */
+  JCROP_CODE crop_width_set;
+  JDIMENSION crop_height;	/* Height of selected region */
+  JCROP_CODE crop_height_set;
+  JDIMENSION crop_xoffset;	/* X offset of selected region */
+  JCROP_CODE crop_xoffset_set;	/* (negative measures from right edge) */
+  JDIMENSION crop_yoffset;	/* Y offset of selected region */
+  JCROP_CODE crop_yoffset_set;	/* (negative measures from bottom edge) */
+
+  /* Internal workspace: caller should not touch these */
+  int num_components;		/* # of components in workspace */
+  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
+  JDIMENSION output_width;	/* cropped destination dimensions */
+  JDIMENSION output_height;
+  JDIMENSION x_crop_offset;	/* destination crop offsets measured in iMCUs */
+  JDIMENSION y_crop_offset;
+  int iMCU_sample_width;	/* destination iMCU size */
+  int iMCU_sample_height;
+} jpeg_transform_info;
+
+
+#if TRANSFORMS_SUPPORTED
+
+/* Parse a crop specification (written in X11 geometry style) */
+EXTERN(boolean) jtransform_parse_crop_spec
+	JPP((jpeg_transform_info *info, const char *spec));
+/* Request any required workspace */
+EXTERN(boolean) jtransform_request_workspace
+	JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info));
+/* Adjust output image parameters */
+EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     jvirt_barray_ptr *src_coef_arrays,
+	     jpeg_transform_info *info));
+/* Execute the actual transformation, if any */
+EXTERN(void) jtransform_execute_transform
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     jvirt_barray_ptr *src_coef_arrays,
+	     jpeg_transform_info *info));
+/* Determine whether lossless transformation is perfectly
+ * possible for a specified image and transformation.
+ */
+EXTERN(boolean) jtransform_perfect_transform
+	JPP((JDIMENSION image_width, JDIMENSION image_height,
+	     int MCU_width, int MCU_height,
+	     JXFORM_CODE transform));
+
+/* jtransform_execute_transform used to be called
+ * jtransform_execute_transformation, but some compilers complain about
+ * routine names that long.  This macro is here to avoid breaking any
+ * old source code that uses the original name...
+ */
+#define jtransform_execute_transformation	jtransform_execute_transform
+
+#endif /* TRANSFORMS_SUPPORTED */
+
+
+/*
+ * Support for copying optional markers from source to destination file.
+ */
+
+typedef enum {
+	JCOPYOPT_NONE,		/* copy no optional markers */
+	JCOPYOPT_COMMENTS,	/* copy only comment (COM) markers */
+	JCOPYOPT_ALL		/* copy all optional markers */
+} JCOPY_OPTION;
+
+#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS	/* recommended default */
+
+/* Setup decompression object to save desired markers in memory */
+EXTERN(void) jcopy_markers_setup
+	JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option));
+/* Copy markers saved in the given source object to the destination object */
+EXTERN(void) jcopy_markers_execute
+	JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+	     JCOPY_OPTION option));
diff --git a/duix-sdk/src/main/cpp/third/arm/include/tjpeg/turbojpeg.h b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/turbojpeg.h
new file mode 100644
index 0000000..343788a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/tjpeg/turbojpeg.h
@@ -0,0 +1,897 @@
+/*
+ * Copyright (C)2009-2011 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TURBOJPEG_H__
+#define __TURBOJPEG_H__
+
+#if defined(_WIN32) && defined(DLLDEFINE)
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+#define DLLCALL
+
+
+/**
+ * @addtogroup TurboJPEG
+ * TurboJPEG API.  This API provides an interface for generating, decoding, and
+ * transforming planar YUV and JPEG images in memory.
+ *
+ * @{
+ */
+
+
+/**
+ * The number of chrominance subsampling options
+ */
+#define TJ_NUMSAMP 5
+
+/**
+ * Chrominance subsampling options.
+ * When an image is converted from the RGB to the YCbCr colorspace as part of
+ * the JPEG compression process, some of the Cb and Cr (chrominance) components
+ * can be discarded or averaged together to produce a smaller image with little
+ * perceptible loss of image clarity (the human eye is more sensitive to small
+ * changes in brightness than small changes in color.)  This is called
+ * "chrominance subsampling".
+ */
+enum TJSAMP
+{
+  /**
+   * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG or
+   * YUV image will contain one chrominance component for every pixel in the
+   * source image.
+   */
+  TJSAMP_444=0,
+  /**
+   * 4:2:2 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x1 block of pixels in the source image.
+   */
+  TJSAMP_422,
+  /**
+   * 4:2:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x2 block of pixels in the source image.
+   */
+  TJSAMP_420,
+  /**
+   * Grayscale.  The JPEG or YUV image will contain no chrominance components.
+   */
+  TJSAMP_GRAY,
+  /**
+   * 4:4:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 1x2 block of pixels in the source image.
+   */
+  TJSAMP_440
+};
+
+/**
+ * MCU block width (in pixels) for a given level of chrominance subsampling.
+ * MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0 
+ */
+static const int tjMCUWidth[TJ_NUMSAMP]  = {8, 16, 16, 8, 8};
+
+/**
+ * MCU block height (in pixels) for a given level of chrominance subsampling.
+ * MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0 
+ */
+static const int tjMCUHeight[TJ_NUMSAMP] = {8, 8, 16, 8, 16};
+
+
+/**
+ * The number of pixel formats
+ */
+#define TJ_NUMPF 11
+
+/**
+ * Pixel formats
+ */
+enum TJPF
+{
+  /**
+   * RGB pixel format.  The red, green, and blue components in the image are
+   * stored in 3-byte pixels in the order R, G, B from lowest to highest byte
+   * address within each pixel.
+   */
+  TJPF_RGB=0,
+  /**
+   * BGR pixel format.  The red, green, and blue components in the image are
+   * stored in 3-byte pixels in the order B, G, R from lowest to highest byte
+   * address within each pixel.
+   */
+  TJPF_BGR,
+  /**
+   * RGBX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-byte pixels in the order R, G, B from lowest to highest byte
+   * address within each pixel.  The X component is ignored when compressing
+   * and undefined when decompressing.
+   */
+  TJPF_RGBX,
+  /**
+   * BGRX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-byte pixels in the order B, G, R from lowest to highest byte
+   * address within each pixel.  The X component is ignored when compressing
+   * and undefined when decompressing.
+   */
+  TJPF_BGRX,
+  /**
+   * XBGR pixel format.  The red, green, and blue components in the image are
+   * stored in 4-byte pixels in the order R, G, B from highest to lowest byte
+   * address within each pixel.  The X component is ignored when compressing
+   * and undefined when decompressing.
+   */
+  TJPF_XBGR,
+  /**
+   * XRGB pixel format.  The red, green, and blue components in the image are
+   * stored in 4-byte pixels in the order B, G, R from highest to lowest byte
+   * address within each pixel.  The X component is ignored when compressing
+   * and undefined when decompressing.
+   */
+  TJPF_XRGB,
+  /**
+   * Grayscale pixel format.  Each 1-byte pixel represents a luminance
+   * (brightness) level from 0 to 255.
+   */
+  TJPF_GRAY,
+  /**
+   * RGBA pixel format.  This is the same as @ref TJPF_RGBX, except that when
+   * decompressing, the X component is guaranteed to be 0xFF, which can be
+   * interpreted as an opaque alpha channel.
+   */
+  TJPF_RGBA,
+  /**
+   * BGRA pixel format.  This is the same as @ref TJPF_BGRX, except that when
+   * decompressing, the X component is guaranteed to be 0xFF, which can be
+   * interpreted as an opaque alpha channel.
+   */
+  TJPF_BGRA,
+  /**
+   * ABGR pixel format.  This is the same as @ref TJPF_XBGR, except that when
+   * decompressing, the X component is guaranteed to be 0xFF, which can be
+   * interpreted as an opaque alpha channel.
+   */
+  TJPF_ABGR,
+  /**
+   * ARGB pixel format.  This is the same as @ref TJPF_XRGB, except that when
+   * decompressing, the X component is guaranteed to be 0xFF, which can be
+   * interpreted as an opaque alpha channel.
+   */
+  TJPF_ARGB
+};
+
+/**
+ * Red offset (in bytes) for a given pixel format.  This specifies the number
+ * of bytes that the red component is offset from the start of the pixel.  For
+ * instance, if a pixel of format TJPF_BGRX is stored in <tt>char pixel[]</tt>,
+ * then the red component will be <tt>pixel[tjRedOffset[TJPF_BGRX]]</tt>.
+ */
+static const int tjRedOffset[TJ_NUMPF] = {0, 2, 0, 2, 3, 1, 0, 0, 2, 3, 1};
+/**
+ * Green offset (in bytes) for a given pixel format.  This specifies the number
+ * of bytes that the green component is offset from the start of the pixel.
+ * For instance, if a pixel of format TJPF_BGRX is stored in
+ * <tt>char pixel[]</tt>, then the green component will be
+ * <tt>pixel[tjGreenOffset[TJPF_BGRX]]</tt>.
+ */
+static const int tjGreenOffset[TJ_NUMPF] = {1, 1, 1, 1, 2, 2, 0, 1, 1, 2, 2};
+/**
+ * Blue offset (in bytes) for a given pixel format.  This specifies the number
+ * of bytes that the blue component is offset from the start of the pixel.  For
+ * instance, if a pixel of format TJPF_BGRX is stored in <tt>char pixel[]</tt>,
+ * then the blue component will be <tt>pixel[tjBlueOffset[TJPF_BGRX]]</tt>.
+ */
+static const int tjBlueOffset[TJ_NUMPF] = {2, 0, 2, 0, 1, 3, 0, 2, 0, 1, 3};
+
+/**
+ * Pixel size (in bytes) for a given pixel format.
+ */
+static const int tjPixelSize[TJ_NUMPF] = {3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4};
+
+
+/**
+ * The uncompressed source/destination image is stored in bottom-up (Windows,
+ * OpenGL) order, not top-down (X11) order.
+ */
+#define TJFLAG_BOTTOMUP        2
+/**
+ * Turn off CPU auto-detection and force TurboJPEG to use MMX code (IPP and
+ * 32-bit libjpeg-turbo versions only.)
+ */
+#define TJFLAG_FORCEMMX        8
+/**
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE code (32-bit IPP
+ * and 32-bit libjpeg-turbo versions only)
+ */
+#define TJFLAG_FORCESSE       16
+/**
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE2 code (32-bit IPP
+ * and 32-bit libjpeg-turbo versions only)
+ */
+#define TJFLAG_FORCESSE2      32
+/**
+ * Turn off CPU auto-detection and force TurboJPEG to use SSE3 code (64-bit IPP
+ * version only)
+ */
+#define TJFLAG_FORCESSE3     128
+/**
+ * Use fast, inaccurate chrominance upsampling routines in the JPEG
+ * decompressor (libjpeg and libjpeg-turbo versions only)
+ */
+#define TJFLAG_FASTUPSAMPLE  256
+/**
+ * Disable buffer (re)allocation.  If passed to #tjCompress2() or
+ * #tjTransform(), this flag will cause those functions to generate an error if
+ * the JPEG image buffer is invalid or too small rather than attempting to
+ * allocate or reallocate that buffer.  This reproduces the behavior of earlier
+ * versions of TurboJPEG.
+ */
+#define TJFLAG_NOREALLOC     1024
+
+
+/**
+ * Number of transform operations
+ */
+#define TJ_NUMXOP 8
+
+/**
+ * Transform operations for #tjTransform()
+ */
+enum TJXOP
+{
+  /**
+   * Do not transform the position of the image pixels
+   */
+  TJXOP_NONE=0,
+  /**
+   * Flip (mirror) image horizontally.  This transform is imperfect if there
+   * are any partial MCU blocks on the right edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_HFLIP,
+  /**
+   * Flip (mirror) image vertically.  This transform is imperfect if there are
+   * any partial MCU blocks on the bottom edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_VFLIP,
+  /**
+   * Transpose image (flip/mirror along upper left to lower right axis.)  This
+   * transform is always perfect.
+   */
+  TJXOP_TRANSPOSE,
+  /**
+   * Transverse transpose image (flip/mirror along upper right to lower left
+   * axis.)  This transform is imperfect if there are any partial MCU blocks in
+   * the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_TRANSVERSE,
+  /**
+   * Rotate image clockwise by 90 degrees.  This transform is imperfect if
+   * there are any partial MCU blocks on the bottom edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT90,
+  /**
+   * Rotate image 180 degrees.  This transform is imperfect if there are any
+   * partial MCU blocks in the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT180,
+  /**
+   * Rotate image counter-clockwise by 90 degrees.  This transform is imperfect
+   * if there are any partial MCU blocks on the right edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT270
+};
+
+
+/**
+ * This option will cause #tjTransform() to return an error if the transform is
+ * not perfect.  Lossless transforms operate on MCU blocks, whose size depends
+ * on the level of chrominance subsampling used (see #tjMCUWidth
+ * and #tjMCUHeight.)  If the image's width or height is not evenly divisible
+ * by the MCU block size, then there will be partial MCU blocks on the right
+ * and/or bottom edges.  It is not possible to move these partial MCU blocks to
+ * the top or left of the image, so any transform that would require that is
+ * "imperfect."  If this option is not specified, then any partial MCU blocks
+ * that cannot be transformed will be left in place, which will create
+ * odd-looking strips on the right or bottom edge of the image.
+ */
+#define TJXOPT_PERFECT  1
+/**
+ * This option will cause #tjTransform() to discard any partial MCU blocks that
+ * cannot be transformed.
+ */
+#define TJXOPT_TRIM     2
+/**
+ * This option will enable lossless cropping.  See #tjTransform() for more
+ * information.
+ */
+#define TJXOPT_CROP     4
+/**
+ * This option will discard the color data in the input image and produce
+ * a grayscale output image.
+ */
+#define TJXOPT_GRAY     8
+/**
+ * This option will prevent #tjTransform() from outputting a JPEG image for
+ * this particular transform (this can be used in conjunction with a custom
+ * filter to capture the transformed DCT coefficients without transcoding
+ * them.)
+ */
+#define TJXOPT_NOOUTPUT 16
+
+
+/**
+ * Scaling factor
+ */
+typedef struct
+{
+  /**
+   * Numerator
+   */
+  int num;
+  /**
+   * Denominator
+   */
+  int denom;
+} tjscalingfactor;
+
+/**
+ * Cropping region
+ */
+typedef struct
+{
+  /**
+   * The left boundary of the cropping region.  This must be evenly divisible
+   * by the MCU block width (see #tjMCUWidth.)
+   */
+  int x;
+  /**
+   * The upper boundary of the cropping region.  This must be evenly divisible
+   * by the MCU block height (see #tjMCUHeight.)
+   */
+  int y;
+  /**
+   * The width of the cropping region. Setting this to 0 is the equivalent of
+   * setting it to the width of the source JPEG image - x.
+   */
+  int w;
+  /**
+   * The height of the cropping region. Setting this to 0 is the equivalent of
+   * setting it to the height of the source JPEG image - y.
+   */
+  int h;
+} tjregion;
+
+/**
+ * Lossless transform
+ */
+typedef struct tjtransform
+{
+  /**
+   * Cropping region
+   */
+  tjregion r;
+  /**
+   * One of the @ref TJXOP "transform operations"
+   */
+  int op;
+  /**
+   * The bitwise OR of one or more of the @ref TJXOPT_CROP "transform options"
+   */
+  int options;
+  /**
+   * Arbitrary data that can be accessed within the body of the callback
+   * function
+   */
+  void *data;
+  /**
+   * A callback function that can be used to modify the DCT coefficients
+   * after they are losslessly transformed but before they are transcoded to a
+   * new JPEG file.  This allows for custom filters or other transformations to
+   * be applied in the frequency domain.
+   *
+   * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
+   *        this pointer is not guaranteed to be valid once the callback
+   *        returns, so applications wishing to hand off the DCT coefficients
+   *        to another function or library should make a copy of them within
+   *        the body of the callback.)
+   * @param arrayRegion #tjregion structure containing the width and height of
+   *        the array pointed to by <tt>coeffs</tt> as well as its offset
+   *        relative to the component plane.  TurboJPEG implementations may
+   *        choose to split each component plane into multiple DCT coefficient
+   *        arrays and call the callback function once for each array.
+   * @param planeRegion #tjregion structure containing the width and height of
+   *        the component plane to which <tt>coeffs</tt> belongs
+   * @param componentIndex ID number of the component plane to which
+   *        <tt>coeffs</tt> belongs (Y, Cb, and Cr have, respectively, ID's of
+   *        0, 1, and 2 in typical JPEG images.)
+   * @param transformIndex ID number of the transformed image to which
+   *        <tt>coeffs</tt> belongs.  This is the same as the index of the
+   *        transform in the transforms array that was passed to
+   *        #tjTransform().
+   * @param transform a pointer to a #tjtransform structure that specifies the
+   *        parameters and/or cropping region for this transform
+   *
+   * @return 0 if the callback was successful, or -1 if an error occurred.
+   */
+  int (*customFilter)(short *coeffs, tjregion arrayRegion,
+    tjregion planeRegion, int componentIndex, int transformIndex,
+    struct tjtransform *transform);
+} tjtransform;
+
+/**
+ * TurboJPEG instance handle
+ */
+typedef void* tjhandle;
+
+
+/**
+ * Pad the given width to the nearest 32-bit boundary
+ */
+#define TJPAD(width) (((width)+3)&(~3))
+
+/**
+ * Compute the scaled value of <tt>dimension</tt> using the given scaling
+ * factor: the integer equivalent of <tt>ceil(dimension * scalingFactor)</tt>.
+ * Arguments are parenthesized; scalingFactor is evaluated more than once.
+ */
+#define TJSCALED(dimension, scalingFactor) (((dimension) * (scalingFactor).num \
+  + (scalingFactor).denom - 1) / (scalingFactor).denom)
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Create a TurboJPEG compressor instance.
+ *
+ * @return a handle to the newly-created instance, or NULL if an error
+ * occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT tjhandle DLLCALL tjInitCompress(void);
+
+
+/**
+ * Compress an RGB or grayscale image into a JPEG image.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
+ *        to be compressed
+ * @param width width (in pixels) of the source image
+ * @param pitch bytes per line of the source image.  Normally, this should be
+ *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
+ *        or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of
+ *        the image is padded to the nearest 32-bit boundary, as is the case
+ *        for Windows bitmaps.  You can also be clever and use this parameter
+ *        to skip lines, etc.  Setting this parameter to 0 is the equivalent of
+ *        setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ * @param height height (in pixels) of the source image
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ *        "Pixel formats".)
+ * @param jpegBuf address of a pointer to an image buffer that will receive the
+ *        JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer
+ *        to accommodate the size of the JPEG image.  Thus, you can choose to:
+ *        -# pre-allocate the JPEG buffer with an arbitrary size using
+ *        #tjAlloc() and let TurboJPEG grow the buffer as needed,
+ *        -# set <tt>*jpegBuf</tt> to NULL to tell TurboJPEG to allocate the
+ *        buffer for you, or
+ *        -# pre-allocate the buffer to a "worst case" size determined by
+ *        calling #tjBufSize().  This should ensure that the buffer never has
+ *        to be re-allocated (setting #TJFLAG_NOREALLOC guarantees this.)
+ *        .
+ *        If you choose option 1, <tt>*jpegSize</tt> should be set to the
+ *        size of your pre-allocated buffer.  In any case, unless you have
+ *        set #TJFLAG_NOREALLOC, you should always check <tt>*jpegBuf</tt> upon
+ *        return from this function, as it may have changed.
+ * @param jpegSize pointer to an unsigned long variable that holds the size of
+ *        the JPEG image buffer.  If <tt>*jpegBuf</tt> points to a
+ *        pre-allocated buffer, then <tt>*jpegSize</tt> should be set to the
+ *        size of the buffer.  Upon return, <tt>*jpegSize</tt> will contain the
+ *        size of the JPEG image (in bytes.)
+ * @param jpegSubsamp the level of chrominance subsampling to be used when
+ *        generating the JPEG image (see @ref TJSAMP
+ *        "Chrominance subsampling options".)
+ * @param jpegQual the image quality of the generated JPEG image (1 = worst,
+ *        100 = best)
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjCompress2(tjhandle handle, unsigned char *srcBuf,
+  int width, int pitch, int height, int pixelFormat, unsigned char **jpegBuf,
+  unsigned long *jpegSize, int jpegSubsamp, int jpegQual, int flags);
+
+
+/**
+ * The maximum size of the buffer (in bytes) required to hold a JPEG image with
+ * the given parameters.  The number of bytes returned by this function is
+ * larger than the size of the uncompressed source image.  The reason for this
+ * is that the JPEG format uses 16-bit coefficients, and it is thus possible
+ * for a very high-quality JPEG image with very high frequency content to
+ * expand rather than compress when converted to the JPEG format.  Such images
+ * represent a very rare corner case, but since there is no way to predict the
+ * size of a JPEG image prior to compression, the corner case has to be
+ * handled.
+ *
+ * @param width width of the image (in pixels)
+ * @param height height of the image (in pixels)
+ * @param jpegSubsamp the level of chrominance subsampling to be used when
+ *        generating the JPEG image (see @ref TJSAMP
+ *        "Chrominance subsampling options".)
+ *
+ * @return the maximum size of the buffer (in bytes) required to hold the
+ * image, or (unsigned long)-1 if the arguments are out of bounds.
+ */
+DLLEXPORT unsigned long DLLCALL tjBufSize(int width, int height,
+  int jpegSubsamp);
+
+
+/**
+ * The size of the buffer (in bytes) required to hold a YUV planar image with
+ * the given parameters.
+ *
+ * @param width width of the image (in pixels)
+ * @param height height of the image (in pixels)
+ * @param subsamp level of chrominance subsampling in the image (see
+ *        @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the image, or
+ * (unsigned long)-1 if the arguments are out of bounds.
+ */
+DLLEXPORT unsigned long DLLCALL tjBufSizeYUV(int width, int height,
+  int subsamp);
+
+
+/**
+ * Encode an RGB or grayscale image into a YUV planar image.  This function
+ * uses the accelerated color conversion routines in TurboJPEG's underlying
+ * codec to produce a planar YUV image that is suitable for X Video.
+ * Specifically, if the chrominance components are subsampled along the
+ * horizontal dimension, then the width of the luminance plane is padded to a
+ * multiple of 2 in the output image (likewise for the luminance height, if the
+ * chrominance components are subsampled along the vertical dimension.)  Also,
+ * each line of each plane in the output image is padded to 4 bytes.  Although
+ * this will work with any subsampling option, it is really only useful in
+ * combination with TJ_420, which produces an image compatible with the I420
+ * (AKA "YUV420P") format.
+ *
+ * @param handle a handle to a TurboJPEG compressor or transformer instance
+ * @param srcBuf pointer to an image buffer containing RGB or grayscale pixels
+ *        to be encoded
+ * @param width width (in pixels) of the source image
+ * @param pitch bytes per line of the source image.  Normally, this should be
+ *        <tt>width * #tjPixelSize[pixelFormat]</tt> if the image is unpadded,
+ *        or <tt>#TJPAD(width * #tjPixelSize[pixelFormat])</tt> if each line of
+ *        the image is padded to the nearest 32-bit boundary, as is the case
+ *        for Windows bitmaps.  You can also be clever and use this parameter
+ *        to skip lines, etc.  Setting this parameter to 0 is the equivalent of
+ *        setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ * @param height height (in pixels) of the source image
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ *        "Pixel formats".)
+ * @param dstBuf pointer to an image buffer that will receive the YUV image.
+ *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
+ *        based on the image width, height, and level of chrominance
+ *        subsampling.
+ * @param subsamp the level of chrominance subsampling to be used when
+ *        generating the YUV image (see @ref TJSAMP
+ *        "Chrominance subsampling options".)
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjEncodeYUV2(tjhandle handle,
+  unsigned char *srcBuf, int width, int pitch, int height, int pixelFormat,
+  unsigned char *dstBuf, int subsamp, int flags);
+
+
+/**
+ * Create a TurboJPEG decompressor instance.
+ *
+ * @return a handle to the newly-created instance, or NULL if an error
+ * occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT tjhandle DLLCALL tjInitDecompress(void);
+
+
+/**
+ * Retrieve information about a JPEG image without decompressing it.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ * @param jpegBuf pointer to a buffer containing a JPEG image
+ * @param jpegSize size of the JPEG image (in bytes)
+ * @param width pointer to an integer variable that will receive the width (in
+ *        pixels) of the JPEG image
+ * @param height pointer to an integer variable that will receive the height
+ *        (in pixels) of the JPEG image
+ * @param jpegSubsamp pointer to an integer variable that will receive the
+ *        level of chrominance subsampling used when compressing the JPEG image
+ *        (see @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+*/
+DLLEXPORT int DLLCALL tjDecompressHeader2(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height,
+  int *jpegSubsamp);
+
+
+/**
+ * Returns a list of fractional scaling factors that the JPEG decompressor in
+ * this implementation of TurboJPEG supports.
+ *
+ * @param numscalingfactors pointer to an integer variable that will receive
+ *        the number of elements in the list
+ *
+ * @return a pointer to a list of fractional scaling factors, or NULL if an
+ * error is encountered (see #tjGetErrorStr().)
+*/
+DLLEXPORT tjscalingfactor* DLLCALL tjGetScalingFactors(int *numscalingfactors);
+
+
+/**
+ * Decompress a JPEG image to an RGB or grayscale image.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
+ * @param jpegSize size of the JPEG image (in bytes)
+ * @param dstBuf pointer to an image buffer that will receive the decompressed
+ *        image.  This buffer should normally be <tt>pitch * scaledHeight</tt>
+ *        bytes in size, where <tt>scaledHeight</tt> can be determined by
+ *        calling #TJSCALED() with the JPEG image height and one of the scaling
+ *        factors returned by #tjGetScalingFactors().  The dstBuf pointer may
+ *        also be used to decompress into a specific region of a larger buffer.
+ * @param width desired width (in pixels) of the destination image.  If this is
+ *        smaller than the width of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired width.  If
+ *        width is set to 0, then only the height will be considered when
+ *        determining the scaled image size.
+ * @param pitch bytes per line of the destination image.  Normally, this is
+ *        <tt>scaledWidth * #tjPixelSize[pixelFormat]</tt> if the decompressed
+ *        image is unpadded, else <tt>#TJPAD(scaledWidth *
+ *        #tjPixelSize[pixelFormat])</tt> if each line of the decompressed
+ *        image is padded to the nearest 32-bit boundary, as is the case for
+ *        Windows bitmaps.  (NOTE: <tt>scaledWidth</tt> can be determined by
+ *        calling #TJSCALED() with the JPEG image width and one of the scaling
+ *        factors returned by #tjGetScalingFactors().)  You can also be clever
+ *        and use the pitch parameter to skip lines, etc.  Setting this
+ *        parameter to 0 is the equivalent of setting it to <tt>scaledWidth
+ *        * #tjPixelSize[pixelFormat]</tt>.
+ * @param height desired height (in pixels) of the destination image.  If this
+ *        is smaller than the height of the JPEG image being decompressed, then
+ *        TurboJPEG will use scaling in the JPEG decompressor to generate the
+ *        largest possible image that will fit within the desired height.  If
+ *        height is set to 0, then only the width will be considered when
+ *        determining the scaled image size.
+ * @param pixelFormat pixel format of the destination image (see @ref
+ *        TJPF "Pixel formats".)
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecompress2(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int width, int pitch, int height, int pixelFormat, int flags);
+
+
+/**
+ * Decompress a JPEG image to a YUV planar image.  This function performs JPEG
+ * decompression but leaves out the color conversion step, so a planar YUV
+ * image is generated instead of an RGB image.  The padding of the planes in
+ * this image is the same as the images generated by #tjEncodeYUV2().  Note
+ * that, if the width or height of the image is not an even multiple of the MCU
+ * block size (see #tjMCUWidth and #tjMCUHeight), then an intermediate buffer
+ * copy will be performed within TurboJPEG.
+ *
+ * @param handle a handle to a TurboJPEG decompressor or transformer instance
+ * @param jpegBuf pointer to a buffer containing the JPEG image to decompress
+ * @param jpegSize size of the JPEG image (in bytes)
+ * @param dstBuf pointer to an image buffer that will receive the YUV image.
+ *        Use #tjBufSizeYUV() to determine the appropriate size for this buffer
+ *        based on the image width, height, and level of subsampling.
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDecompressToYUV(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int flags);
+
+
+/**
+ * Create a new TurboJPEG transformer instance.
+ *
+ * @return a handle to the newly-created instance, or NULL if an error
+ * occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT tjhandle DLLCALL tjInitTransform(void);
+
+
+/**
+ * Losslessly transform a JPEG image into another JPEG image.  Lossless
+ * transforms work by moving the raw coefficients from one JPEG image structure
+ * to another without altering the values of the coefficients.  While this is
+ * typically faster than decompressing the image, transforming it, and
+ * re-compressing it, lossless transforms are not free.  Each lossless
+ * transform requires reading and Huffman decoding all of the coefficients in
+ * the source image, regardless of the size of the destination image.  Thus,
+ * this function provides a means of generating multiple transformed images
+ * from the same source or of applying multiple transformations simultaneously,
+ * in order to eliminate the need to read the source coefficients multiple
+ * times.
+ *
+ * @param handle a handle to a TurboJPEG transformer instance
+ * @param jpegBuf pointer to a buffer containing the JPEG image to transform
+ * @param jpegSize size of the JPEG image (in bytes)
+ * @param n the number of transformed JPEG images to generate
+ * @param dstBufs pointer to an array of n image buffers.  <tt>dstBufs[i]</tt>
+ *        will receive a JPEG image that has been transformed using the
+ *        parameters in <tt>transforms[i]</tt>.  TurboJPEG has the ability to
+ *        reallocate the JPEG buffer to accommodate the size of the JPEG image.
+ *        Thus, you can choose to:
+ *        -# pre-allocate the JPEG buffer with an arbitrary size using
+ *        #tjAlloc() and let TurboJPEG grow the buffer as needed,
+ *        -# set <tt>dstBufs[i]</tt> to NULL to tell TurboJPEG to allocate the
+ *        buffer for you, or
+ *        -# pre-allocate the buffer to a "worst case" size determined by
+ *        calling #tjBufSize() with the cropped width and height.  This should
+ *        ensure that the buffer never has to be re-allocated (setting
+ *        #TJFLAG_NOREALLOC guarantees this.)
+ *        .
+ *        If you choose option 1, <tt>dstSizes[i]</tt> should be set to
+ *        the size of your pre-allocated buffer.  In any case, unless you have
+ *        set #TJFLAG_NOREALLOC, you should always check <tt>dstBufs[i]</tt>
+ *        upon return from this function, as it may have changed.
+ * @param dstSizes pointer to an array of n unsigned long variables that will
+ *        receive the actual sizes (in bytes) of each transformed JPEG image.
+ *        If <tt>dstBufs[i]</tt> points to a pre-allocated buffer, then
+ *        <tt>dstSizes[i]</tt> should be set to the size of the buffer.  Upon
+ *        return, <tt>dstSizes[i]</tt> will contain the size of the JPEG image
+ *        (in bytes.)
+ * @param transforms pointer to an array of n tjtransform structures, each of
+ *        which specifies the transform parameters and/or cropping region for
+ *        the corresponding transformed output image.
+ * @param flags the bitwise OR of one or more of the @ref TJFLAG_BOTTOMUP
+ *        "flags".
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjTransform(tjhandle handle, unsigned char *jpegBuf,
+  unsigned long jpegSize, int n, unsigned char **dstBufs,
+  unsigned long *dstSizes, tjtransform *transforms, int flags);
+
+
+/**
+ * Destroy a TurboJPEG compressor, decompressor, or transformer instance.
+ *
+ * @param handle a handle to a TurboJPEG compressor, decompressor or
+ *        transformer instance
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tjGetErrorStr().)
+ */
+DLLEXPORT int DLLCALL tjDestroy(tjhandle handle);
+
+
+/**
+ * Allocate an image buffer for use with TurboJPEG.  You should always use
+ * this function to allocate the JPEG destination buffer(s) for #tjCompress2()
+ * and #tjTransform() unless you are disabling automatic buffer
+ * (re)allocation (by setting #TJFLAG_NOREALLOC.)
+ *
+ * @param bytes the number of bytes to allocate
+ *
+ * @return a pointer to a newly-allocated buffer with the specified number of
+ *         bytes
+ *
+ * @sa tjFree()
+ */
+DLLEXPORT unsigned char* DLLCALL tjAlloc(int bytes);
+
+
+/**
+ * Free an image buffer previously allocated by TurboJPEG.  You should always
+ * use this function to free JPEG destination buffer(s) that were automatically
+ * (re)allocated by #tjCompress2() or #tjTransform() or that were manually
+ * allocated using #tjAlloc().
+ *
+ * @param buffer address of the buffer to free
+ *
+ * @sa tjAlloc()
+ */
+DLLEXPORT void DLLCALL tjFree(unsigned char *buffer);
+
+
+/**
+ * Returns a descriptive error message explaining why the last command failed.
+ *
+ * @return a descriptive error message explaining why the last command failed.
+ */
+DLLEXPORT char* DLLCALL tjGetErrorStr(void);
+
+
+/* Backward compatibility functions and macros (nothing to see here) */
+#define NUMSUBOPT TJ_NUMSAMP
+#define TJ_444 TJSAMP_444
+#define TJ_422 TJSAMP_422
+#define TJ_420 TJSAMP_420
+#define TJ_411 TJSAMP_420  /* NOTE(review): expands to TJSAMP_420 despite the "411" name; presumably a legacy alias -- confirm against upstream before changing */
+#define TJ_GRAYSCALE TJSAMP_GRAY
+
+#define TJ_BGR 1
+#define TJ_BOTTOMUP TJFLAG_BOTTOMUP
+#define TJ_FORCEMMX TJFLAG_FORCEMMX
+#define TJ_FORCESSE TJFLAG_FORCESSE
+#define TJ_FORCESSE2 TJFLAG_FORCESSE2
+#define TJ_ALPHAFIRST 64
+#define TJ_FORCESSE3 TJFLAG_FORCESSE3
+#define TJ_FASTUPSAMPLE TJFLAG_FASTUPSAMPLE
+#define TJ_YUV 512
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZE(int width, int height);
+
+DLLEXPORT unsigned long DLLCALL TJBUFSIZEYUV(int width, int height,
+  int jpegSubsamp);
+
+DLLEXPORT int DLLCALL tjCompress(tjhandle handle, unsigned char *srcBuf,
+  int width, int pitch, int height, int pixelSize, unsigned char *dstBuf,
+  unsigned long *compressedSize, int jpegSubsamp, int jpegQual, int flags);
+
+DLLEXPORT int DLLCALL tjEncodeYUV(tjhandle handle,
+  unsigned char *srcBuf, int width, int pitch, int height, int pixelSize,
+  unsigned char *dstBuf, int subsamp, int flags);
+
+DLLEXPORT int DLLCALL tjDecompressHeader(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, int *width, int *height);
+
+DLLEXPORT int DLLCALL tjDecompress(tjhandle handle,
+  unsigned char *jpegBuf, unsigned long jpegSize, unsigned char *dstBuf,
+  int width, int pitch, int height, int pixelSize, int flags);
+
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jconfig.h b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jconfig.h
new file mode 100644
index 0000000..1c68fee
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jconfig.h
@@ -0,0 +1,60 @@
+/* Version ID for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+ */
+#define JPEG_LIB_VERSION  62
+
+/* libjpeg-turbo version */
+#define LIBJPEG_TURBO_VERSION  3.0.1
+
+/* libjpeg-turbo version in integer form */
+#define LIBJPEG_TURBO_VERSION_NUMBER  3000001
+
+/* Support arithmetic encoding when using 8-bit samples */
+#define C_ARITH_CODING_SUPPORTED 1
+
+/* Support arithmetic decoding when using 8-bit samples */
+#define D_ARITH_CODING_SUPPORTED 1
+
+/* Support in-memory source/destination managers */
+#define MEM_SRCDST_SUPPORTED  1
+
+/* Use accelerated SIMD routines when using 8-bit samples */
+#define WITH_SIMD 1
+
+/* This version of libjpeg-turbo supports run-time selection of data precision,
+ * so BITS_IN_JSAMPLE is no longer used to specify the data precision at build
+ * time.  However, some downstream software expects the macro to be defined.
+ * Since 12-bit data precision is an opt-in feature that requires explicitly
+ * calling 12-bit-specific libjpeg API functions and using 12-bit-specific data
+ * types, the unmodified portion of the libjpeg API still behaves as if it were
+ * built for 8-bit precision, and JSAMPLE is still literally an 8-bit data
+ * type.  Thus, it is correct to define BITS_IN_JSAMPLE to 8 here.
+ */
+#ifndef BITS_IN_JSAMPLE
+#define BITS_IN_JSAMPLE  8
+#endif
+
+#ifdef _WIN32
+
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+/* Define "boolean" as unsigned char, not int, per Windows custom */
+#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
+
+/* Define "INT32" as int, not long, per Windows custom */
+#if !(defined(_BASETSD_H_) || defined(_BASETSD_H))   /* don't conflict if basetsd.h already read */
+typedef short INT16;
+typedef signed int INT32;
+#endif
+#define XMD_H                   /* prevent jmorecfg.h from redefining it */
+
+#else
+
+/* Define if your (broken) compiler shifts signed values as if they were
+   unsigned. */
+/* #undef RIGHT_SHIFT_IS_UNSIGNED */
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jerror.h b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jerror.h
new file mode 100644
index 0000000..39362fd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jerror.h
@@ -0,0 +1,336 @@
+/*
+ * jerror.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file defines the error and message codes for the JPEG library.
+ * Edit this file to add new codes, or to translate the message strings to
+ * some other language.
+ * A set of error-reporting macros are defined too.  Some applications using
+ * the JPEG library may wish to include this file to get the error codes
+ * and/or the macros.
+ */
+
+/*
+ * To define the enum list of message codes, include this file without
+ * defining macro JMESSAGE.  To create a message string table, include it
+ * again with a suitable JMESSAGE definition (see jerror.c for an example).
+ */
+#ifndef JMESSAGE
+#ifndef JERROR_H
+/* First time through, define the enum list */
+#define JMAKE_ENUM_LIST
+#else
+/* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
+#define JMESSAGE(code, string)
+#endif /* JERROR_H */
+#endif /* JMESSAGE */
+
+#ifdef JMAKE_ENUM_LIST
+
+typedef enum {
+
+#define JMESSAGE(code, string)  code,
+
+#endif /* JMAKE_ENUM_LIST */
+
+JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
+
+/* For maintenance convenience, list is alphabetical by message code name */
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
+#endif
+JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
+JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
+JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
+JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#endif
+JMESSAGE(JERR_BAD_DCT_COEF,
+         "DCT coefficient (lossy) or spatial difference (lossless) out of range")
+JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
+JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
+JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
+JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
+JMESSAGE(JERR_BAD_LENGTH, "Bogus marker length")
+JMESSAGE(JERR_BAD_LIB_VERSION,
+         "Wrong JPEG library version: library is %d, caller expects %d")
+JMESSAGE(JERR_BAD_MCU_SIZE, "Sampling factors too large for interleaved scan")
+JMESSAGE(JERR_BAD_POOL_ID, "Invalid memory pool code %d")
+JMESSAGE(JERR_BAD_PRECISION, "Unsupported JPEG data precision %d")
+JMESSAGE(JERR_BAD_PROGRESSION,
+         "Invalid progressive/lossless parameters Ss=%d Se=%d Ah=%d Al=%d")
+JMESSAGE(JERR_BAD_PROG_SCRIPT,
+         "Invalid progressive/lossless parameters at scan script entry %d")
+JMESSAGE(JERR_BAD_SAMPLING, "Bogus sampling factors")
+JMESSAGE(JERR_BAD_SCAN_SCRIPT, "Invalid scan script at entry %d")
+JMESSAGE(JERR_BAD_STATE, "Improper call to JPEG library in state %d")
+JMESSAGE(JERR_BAD_STRUCT_SIZE,
+         "JPEG parameter struct mismatch: library thinks size is %u, caller expects %u")
+JMESSAGE(JERR_BAD_VIRTUAL_ACCESS, "Bogus virtual array access")
+JMESSAGE(JERR_BUFFER_SIZE, "Buffer passed to JPEG library is too small")
+JMESSAGE(JERR_CANT_SUSPEND, "Suspension not allowed here")
+JMESSAGE(JERR_CCIR601_NOTIMPL, "CCIR601 sampling not implemented yet")
+JMESSAGE(JERR_COMPONENT_COUNT, "Too many color components: %d, max %d")
+JMESSAGE(JERR_CONVERSION_NOTIMPL, "Unsupported color conversion request")
+JMESSAGE(JERR_DAC_INDEX, "Bogus DAC index %d")
+JMESSAGE(JERR_DAC_VALUE, "Bogus DAC value 0x%x")
+JMESSAGE(JERR_DHT_INDEX, "Bogus DHT index %d")
+JMESSAGE(JERR_DQT_INDEX, "Bogus DQT index %d")
+JMESSAGE(JERR_EMPTY_IMAGE, "Empty JPEG image (DNL not supported)")
+JMESSAGE(JERR_EMS_READ, "Read from EMS failed")
+JMESSAGE(JERR_EMS_WRITE, "Write to EMS failed")
+JMESSAGE(JERR_EOI_EXPECTED, "Didn't expect more than one scan")
+JMESSAGE(JERR_FILE_READ, "Input file read error")
+JMESSAGE(JERR_FILE_WRITE, "Output file write error --- out of disk space?")
+JMESSAGE(JERR_FRACT_SAMPLE_NOTIMPL, "Fractional sampling not implemented yet")
+JMESSAGE(JERR_HUFF_CLEN_OVERFLOW, "Huffman code size table overflow")
+JMESSAGE(JERR_HUFF_MISSING_CODE, "Missing Huffman code table entry")
+JMESSAGE(JERR_IMAGE_TOO_BIG, "Maximum supported image dimension is %u pixels")
+JMESSAGE(JERR_INPUT_EMPTY, "Empty input file")
+JMESSAGE(JERR_INPUT_EOF, "Premature end of input file")
+JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
+         "Cannot transcode due to multiple use of quantization table %d")
+JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
+JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
+JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+#endif
+JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
+JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
+JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
+JMESSAGE(JERR_NO_QUANT_TABLE, "Quantization table 0x%02x was not defined")
+JMESSAGE(JERR_NO_SOI, "Not a JPEG file: starts with 0x%02x 0x%02x")
+JMESSAGE(JERR_OUT_OF_MEMORY, "Insufficient memory (case %d)")
+JMESSAGE(JERR_QUANT_COMPONENTS,
+         "Cannot quantize more than %d color components")
+JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors")
+JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors")
+JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers")
+JMESSAGE(JERR_SOF_NO_SOS, "Invalid JPEG file structure: missing SOS marker")
+JMESSAGE(JERR_SOF_UNSUPPORTED, "Unsupported JPEG process: SOF type 0x%02x")
+JMESSAGE(JERR_SOI_DUPLICATE, "Invalid JPEG file structure: two SOI markers")
+JMESSAGE(JERR_SOS_NO_SOF, "Invalid JPEG file structure: SOS before SOF")
+JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s")
+JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file")
+JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file")
+JMESSAGE(JERR_TFILE_WRITE,
+         "Write failed on temporary file --- out of disk space?")
+JMESSAGE(JERR_TOO_LITTLE_DATA, "Application transferred too few scanlines")
+JMESSAGE(JERR_UNKNOWN_MARKER, "Unsupported marker type 0x%02x")
+JMESSAGE(JERR_VIRTUAL_BUG, "Virtual array controller messed up")
+JMESSAGE(JERR_WIDTH_OVERFLOW, "Image too wide for this implementation")
+JMESSAGE(JERR_XMS_READ, "Read from XMS failed")
+JMESSAGE(JERR_XMS_WRITE, "Write to XMS failed")
+JMESSAGE(JMSG_COPYRIGHT, JCOPYRIGHT_SHORT)
+JMESSAGE(JMSG_VERSION, JVERSION)
+JMESSAGE(JTRC_16BIT_TABLES,
+         "Caution: quantization tables are too coarse for baseline JPEG")
+JMESSAGE(JTRC_ADOBE,
+         "Adobe APP14 marker: version %d, flags 0x%04x 0x%04x, transform %d")
+JMESSAGE(JTRC_APP0, "Unknown APP0 marker (not JFIF), length %u")
+JMESSAGE(JTRC_APP14, "Unknown APP14 marker (not Adobe), length %u")
+JMESSAGE(JTRC_DAC, "Define Arithmetic Table 0x%02x: 0x%02x")
+JMESSAGE(JTRC_DHT, "Define Huffman Table 0x%02x")
+JMESSAGE(JTRC_DQT, "Define Quantization Table %d  precision %d")
+JMESSAGE(JTRC_DRI, "Define Restart Interval %u")
+JMESSAGE(JTRC_EMS_CLOSE, "Freed EMS handle %u")
+JMESSAGE(JTRC_EMS_OPEN, "Obtained EMS handle %u")
+JMESSAGE(JTRC_EOI, "End Of Image")
+JMESSAGE(JTRC_HUFFBITS, "        %3d %3d %3d %3d %3d %3d %3d %3d")
+JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d  %d")
+JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
+         "Warning: thumbnail image size does not match data length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_THUMBNAIL, "    with %d x %d thumbnail image")
+JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
+JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
+JMESSAGE(JTRC_QUANTVALS, "        %4u %4u %4u %4u %4u %4u %4u %4u")
+JMESSAGE(JTRC_QUANT_3_NCOLORS, "Quantizing to %d = %d*%d*%d colors")
+JMESSAGE(JTRC_QUANT_NCOLORS, "Quantizing to %d colors")
+JMESSAGE(JTRC_QUANT_SELECTED, "Selected %d colors for quantization")
+JMESSAGE(JTRC_RECOVERY_ACTION, "At marker 0x%02x, recovery action %d")
+JMESSAGE(JTRC_RST, "RST%d")
+JMESSAGE(JTRC_SMOOTH_NOTIMPL,
+         "Smoothing not supported with nonstandard sampling ratios")
+JMESSAGE(JTRC_SOF, "Start Of Frame 0x%02x: width=%u, height=%u, components=%d")
+JMESSAGE(JTRC_SOF_COMPONENT, "    Component %d: %dhx%dv q=%d")
+JMESSAGE(JTRC_SOI, "Start of Image")
+JMESSAGE(JTRC_SOS, "Start Of Scan: %d components")
+JMESSAGE(JTRC_SOS_COMPONENT, "    Component %d: dc=%d ac=%d")
+JMESSAGE(JTRC_SOS_PARAMS, "  Ss=%d, Se=%d, Ah=%d, Al=%d")
+JMESSAGE(JTRC_TFILE_CLOSE, "Closed temporary file %s")
+JMESSAGE(JTRC_TFILE_OPEN, "Opened temporary file %s")
+JMESSAGE(JTRC_THUMB_JPEG,
+         "JFIF extension marker: JPEG-compressed thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_PALETTE,
+         "JFIF extension marker: palette thumbnail image, length %u")
+JMESSAGE(JTRC_THUMB_RGB,
+         "JFIF extension marker: RGB thumbnail image, length %u")
+JMESSAGE(JTRC_UNKNOWN_IDS,
+         "Unrecognized component IDs %d %d %d, assuming YCbCr (lossy) or RGB (lossless)")
+JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
+JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
+JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+#if JPEG_LIB_VERSION >= 70
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+JMESSAGE(JWRN_BOGUS_PROGRESSION,
+         "Inconsistent progression sequence for component %d coefficient %d")
+JMESSAGE(JWRN_EXTRANEOUS_DATA,
+         "Corrupt JPEG data: %u extraneous bytes before marker 0x%02x")
+JMESSAGE(JWRN_HIT_MARKER, "Corrupt JPEG data: premature end of data segment")
+JMESSAGE(JWRN_HUFF_BAD_CODE, "Corrupt JPEG data: bad Huffman code")
+JMESSAGE(JWRN_JFIF_MAJOR, "Warning: unknown JFIF revision number %d.%02d")
+JMESSAGE(JWRN_JPEG_EOF, "Premature end of JPEG file")
+JMESSAGE(JWRN_MUST_RESYNC,
+         "Corrupt JPEG data: found marker 0x%02x instead of RST%d")
+JMESSAGE(JWRN_NOT_SEQUENTIAL, "Invalid SOS parameters for sequential JPEG")
+JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
+#if defined(C_ARITH_CODING_SUPPORTED) || defined(D_ARITH_CODING_SUPPORTED)
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
+#endif
+#endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
+JMESSAGE(JERR_BAD_RESTART,
+         "Invalid restart interval %d; must be an integer multiple of the number of MCUs in an MCU row (%d)")
+
+#ifdef JMAKE_ENUM_LIST
+
+  JMSG_LASTMSGCODE
+} J_MESSAGE_CODE;
+
+#undef JMAKE_ENUM_LIST
+#endif /* JMAKE_ENUM_LIST */
+
+/* Zap JMESSAGE macro so that future re-inclusions do nothing by default */
+#undef JMESSAGE
+
+
+#ifndef JERROR_H
+#define JERROR_H
+
+/* Macros to simplify using the error and trace message stuff */
+/* The first parameter is either type of cinfo pointer */
+
+/* Fatal errors: store msg_code/msg_parm, then invoke the error_exit method */
+#define ERREXIT(cinfo, code) \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
+  ((cinfo)->err->msg_code = (code), \
+   strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+
+#define MAKESTMT(stuff)         do { stuff } while (0)
+
+/* Nonfatal errors (data probably corrupt): call emit_message with level -1 */
+#define WARNMS(cinfo, code) \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+
+/* Informational/debugging messages: call emit_message with caller's level */
+#define TRACEMS(cinfo, lvl, code) \
+  ((cinfo)->err->msg_code = (code), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
+           _mp[4] = (p5); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
+           _mp[4] = (p5);  _mp[5] = (p6);  _mp[6] = (p7);  _mp[7] = (p8); \
+           (cinfo)->err->msg_code = (code); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
+  ((cinfo)->err->msg_code = (code), \
+   strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+
+#endif /* JERROR_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jmorecfg.h b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jmorecfg.h
new file mode 100644
index 0000000..89c7842
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jmorecfg.h
@@ -0,0 +1,385 @@
+/*
+ * jmorecfg.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, 2022, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file contains additional configuration options that customize the
+ * JPEG software for special applications or support machine-dependent
+ * optimizations.  Most users will not need to touch this file.
+ */
+
+
+/*
+ * Maximum number of components (color channels) allowed in JPEG image.
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+ * However, darn few applications need more than 4 channels (maybe 5 for CMYK +
+ * alpha mask).  We recommend 10 as a reasonable compromise; use 4 if you are
+ * really short on memory.  (Each allowed component costs a hundred or so
+ * bytes of storage, whether actually used in an image or not.)
+ */
+
+#define MAX_COMPONENTS  10      /* maximum number of image components */
+
+
+/*
+ * Basic data types.
+ * You may need to change these if you have a machine with unusual data
+ * type sizes; for example, "char" not 8 bits, "short" not 16 bits,
+ * or "long" not 32 bits.  We don't care whether "int" is 16 or 32 bits,
+ * but it had better be at least 16.
+ */
+
+/* Representation of a single sample (pixel element value).
+ * We frequently allocate large arrays of these, so it's important to keep
+ * them small.  But if you have memory to burn and access to char or short
+ * arrays is very slow on your hardware, you might want to change these.
+ */
+
+/* JSAMPLE should be the smallest type that will hold the values 0..255. */
+
+typedef unsigned char JSAMPLE;
+#define GETJSAMPLE(value)  ((int)(value))
+
+#define MAXJSAMPLE       255
+#define CENTERJSAMPLE    128
+
+
+/* J12SAMPLE should be the smallest type that will hold the values 0..4095. */
+
+typedef short J12SAMPLE;
+
+#define MAXJ12SAMPLE     4095
+#define CENTERJ12SAMPLE  2048
+
+
+/* J16SAMPLE should be the smallest type that will hold the values 0..65535. */
+
+typedef unsigned short J16SAMPLE;
+
+#define MAXJ16SAMPLE     65535
+#define CENTERJ16SAMPLE  32768
+
+
+/* Representation of a DCT frequency coefficient.
+ * This should be a signed value of at least 16 bits; "short" is usually OK.
+ * Again, we allocate large arrays of these, but you can change to int
+ * if you have memory to burn and "short" is really slow.
+ */
+
+typedef short JCOEF;
+
+
+/* Compressed datastreams are represented as arrays of JOCTET.
+ * These must be EXACTLY 8 bits wide, at least once they are written to
+ * external storage.  Note that when using the stdio data source/destination
+ * managers, this is also the data type passed to fread/fwrite.
+ */
+
+typedef unsigned char JOCTET;
+#define GETJOCTET(value)  (value)
+
+
+/* These typedefs are used for various table entries and so forth.
+ * They must be at least as wide as specified; but making them too big
+ * won't cost a huge amount of memory, so we don't provide special
+ * extraction code like we did for JSAMPLE.  (In other words, these
+ * typedefs live at a different point on the speed/space tradeoff curve.)
+ */
+
+/* UINT8 must hold at least the values 0..255. */
+
+typedef unsigned char UINT8;
+
+/* UINT16 must hold at least the values 0..65535. */
+
+typedef unsigned short UINT16;
+
+/* INT16 must hold at least the values -32768..32767. */
+
+#ifndef XMD_H                   /* X11/xmd.h correctly defines INT16 */
+typedef short INT16;
+#endif
+
+/* INT32 must hold at least signed 32-bit values.
+ *
+ * NOTE: The INT32 typedef dates back to libjpeg v5 (1994.)  Integers were
+ * sometimes 16-bit back then (MS-DOS), which is why INT32 is typedef'd to
+ * long.  It also wasn't common (or at least as common) in 1994 for INT32 to be
+ * defined by platform headers.  Since then, however, INT32 is defined in
+ * several other common places:
+ *
+ * Xmd.h (X11 header) typedefs INT32 to int on 64-bit platforms and long on
+ * 32-bit platforms (i.e., always a 32-bit signed type.)
+ *
+ * basetsd.h (Win32 header) typedefs INT32 to int (always a 32-bit signed type
+ * on modern platforms.)
+ *
+ * qglobal.h (Qt header) typedefs INT32 to int (always a 32-bit signed type on
+ * modern platforms.)
+ *
+ * This is a recipe for conflict, since "long" and "int" aren't always
+ * compatible types.  Since the definition of INT32 has technically been part
+ * of the libjpeg API for more than 20 years, we can't remove it, but we do not
+ * use it internally any longer.  We instead define a separate type (JLONG)
+ * for internal use, which ensures that internal behavior will always be the
+ * same regardless of any external headers that may be included.
+ */
+
+#ifndef XMD_H                   /* X11/xmd.h correctly defines INT32 */
+#ifndef _BASETSD_H_             /* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H              /* MinGW is slightly different */
+#ifndef QGLOBAL_H               /* Qt defines it in qglobal.h */
+typedef long INT32;
+#endif
+#endif
+#endif
+#endif
+
+/* Datatype used for image dimensions.  The JPEG standard only supports
+ * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
+ * "unsigned int" is sufficient on all machines.  However, if you need to
+ * handle larger images and you don't mind deviating from the spec, you
+ * can change this datatype.  (Note that changing this datatype will
+ * potentially require modifying the SIMD code.  The x86-64 SIMD extensions,
+ * in particular, assume a 32-bit JDIMENSION.)
+ */
+
+typedef unsigned int JDIMENSION;
+
+#define JPEG_MAX_DIMENSION  65500L  /* a tad under 64K to prevent overflows */
+
+
+/* These macros are used in all function definitions and extern declarations.
+ * You could modify them if you need to change function linkage conventions;
+ * in particular, you'll need to do that to make the library a Windows DLL.
+ * Another application is to make all functions global for use with debuggers
+ * or code profilers that require it.
+ */
+
+/* a function called through method pointers: */
+#define METHODDEF(type)         static type
+/* a function used only in its module: */
+#define LOCAL(type)             static type
+/* a function referenced thru EXTERNs: */
+#define GLOBAL(type)            type
+/* a reference to a GLOBAL function: */
+#define EXTERN(type)            extern type
+
+
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters.  libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, but the macro is still included because there is some
+ * software out there that uses it.
+ */
+
+#define JMETHOD(type, methodname, arglist)  type (*methodname) arglist
+
+
+/* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
+ * but again, some software relies on this macro.
+ */
+
+#undef FAR
+#define FAR
+
+
+/*
+ * On a few systems, type boolean and/or its values FALSE, TRUE may appear
+ * in standard header files.  Or you may have conflicts with application-
+ * specific header files that you want to include together with these files.
+ * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
+ */
+
+#ifndef HAVE_BOOLEAN
+typedef int boolean;
+#endif
+#ifndef FALSE                   /* in case these macros already exist */
+#define FALSE   0               /* values of boolean */
+#endif
+#ifndef TRUE
+#define TRUE    1
+#endif
+
+
+/*
+ * The remaining options affect code selection within the JPEG library,
+ * but they don't need to be visible to most applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS or JPEG_INTERNAL_OPTIONS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+#define JPEG_INTERNAL_OPTIONS
+#endif
+
+#ifdef JPEG_INTERNAL_OPTIONS
+
+
+/*
+ * These defines indicate whether to include various optional functions.
+ * Undefining some of these symbols will produce a smaller but less capable
+ * library.  Note that you can leave certain source files out of the
+ * compilation/linking process if you've #undef'd the corresponding symbols.
+ * (You may HAVE to do that if your compiler doesn't like null source files.)
+ */
+
+/* Capability options common to encoder and decoder: */
+
+#define DCT_ISLOW_SUPPORTED     /* accurate integer method */
+#define DCT_IFAST_SUPPORTED     /* less accurate int method [legacy feature] */
+#define DCT_FLOAT_SUPPORTED     /* floating-point method [legacy feature] */
+
+/* Encoder capability options: */
+
+#define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define C_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define C_LOSSLESS_SUPPORTED        /* Lossless JPEG? */
+#define ENTROPY_OPT_SUPPORTED       /* Optimization of entropy coding parms? */
+/* Note: if you selected 12-bit data precision, it is dangerous to turn off
+ * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
+ * precision, so jchuff.c normally uses entropy optimization to compute
+ * usable tables for higher precision.  If you don't want to do optimization,
+ * you'll have to supply different default Huffman tables.
+ * The exact same statements apply for progressive and lossless JPEG:
+ * the default tables don't work for progressive mode or lossless mode.
+ * (This may get fixed, however.)
+ */
+#define INPUT_SMOOTHING_SUPPORTED   /* Input image smoothing option? */
+
+/* Decoder capability options: */
+
+#define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
+#define D_PROGRESSIVE_SUPPORTED     /* Progressive JPEG? (Requires MULTISCAN)*/
+#define D_LOSSLESS_SUPPORTED        /* Lossless JPEG? */
+#define SAVE_MARKERS_SUPPORTED      /* jpeg_save_markers() needed? */
+#define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
+#define IDCT_SCALING_SUPPORTED      /* Output rescaling via IDCT? */
+#undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
+#define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
+#define QUANT_1PASS_SUPPORTED       /* 1-pass color quantization? */
+#define QUANT_2PASS_SUPPORTED       /* 2-pass color quantization? */
+
+/* more capability options later, no doubt */
+
+
+/*
+ * The RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros are a vestigial
+ * feature of libjpeg.  The idea was that, if an application developer needed
+ * to compress from/decompress to a BGR/BGRX/RGBX/XBGR/XRGB buffer, they could
+ * change these macros, rebuild libjpeg, and link their application statically
+ * with it.  In reality, few people ever did this, because there were some
+ * severe restrictions involved (cjpeg and djpeg no longer worked properly,
+ * compressing/decompressing RGB JPEGs no longer worked properly, and the color
+ * quantizer wouldn't work with pixel sizes other than 3.)  Furthermore, since
+ * all of the O/S-supplied versions of libjpeg were built with the default
+ * values of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications
+ * have come to regard these values as immutable.
+ *
+ * The libjpeg-turbo colorspace extensions provide a much cleaner way of
+ * compressing from/decompressing to buffers with arbitrary component orders
+ * and pixel sizes.  Thus, we do not support changing the values of RGB_RED,
+ * RGB_GREEN, RGB_BLUE, or RGB_PIXELSIZE.  In addition to the restrictions
+ * listed above, changing these values will also break the SIMD extensions and
+ * the regression tests.
+ */
+
+#define RGB_RED         0       /* Offset of Red in an RGB scanline element */
+#define RGB_GREEN       1       /* Offset of Green */
+#define RGB_BLUE        2       /* Offset of Blue */
+#define RGB_PIXELSIZE   3       /* JSAMPLEs per RGB scanline element */
+
+#define JPEG_NUMCS  17
+
+#define EXT_RGB_RED         0
+#define EXT_RGB_GREEN       1
+#define EXT_RGB_BLUE        2
+#define EXT_RGB_PIXELSIZE   3
+
+#define EXT_RGBX_RED        0
+#define EXT_RGBX_GREEN      1
+#define EXT_RGBX_BLUE       2
+#define EXT_RGBX_PIXELSIZE  4
+
+#define EXT_BGR_RED         2
+#define EXT_BGR_GREEN       1
+#define EXT_BGR_BLUE        0
+#define EXT_BGR_PIXELSIZE   3
+
+#define EXT_BGRX_RED        2
+#define EXT_BGRX_GREEN      1
+#define EXT_BGRX_BLUE       0
+#define EXT_BGRX_PIXELSIZE  4
+
+#define EXT_XBGR_RED        3
+#define EXT_XBGR_GREEN      2
+#define EXT_XBGR_BLUE       1
+#define EXT_XBGR_PIXELSIZE  4
+
+#define EXT_XRGB_RED        1
+#define EXT_XRGB_GREEN      2
+#define EXT_XRGB_BLUE       3
+#define EXT_XRGB_PIXELSIZE  4
+
+static const int rgb_red[JPEG_NUMCS] = {  /* red offset; presumably indexed by J_COLOR_SPACE, -1 = n/a */
+  -1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
+  EXT_BGR_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED,
+  EXT_RGBX_RED, EXT_BGRX_RED, EXT_XBGR_RED, EXT_XRGB_RED,
+  -1
+};
+
+static const int rgb_green[JPEG_NUMCS] = {  /* green offset; same indexing as rgb_red */
+  -1, -1, RGB_GREEN, -1, -1, -1, EXT_RGB_GREEN, EXT_RGBX_GREEN,
+  EXT_BGR_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN,
+  EXT_RGBX_GREEN, EXT_BGRX_GREEN, EXT_XBGR_GREEN, EXT_XRGB_GREEN,
+  -1
+};
+
+static const int rgb_blue[JPEG_NUMCS] = {  /* blue offset; same indexing as rgb_red */
+  -1, -1, RGB_BLUE, -1, -1, -1, EXT_RGB_BLUE, EXT_RGBX_BLUE,
+  EXT_BGR_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE,
+  EXT_RGBX_BLUE, EXT_BGRX_BLUE, EXT_XBGR_BLUE, EXT_XRGB_BLUE,
+  -1
+};
+
+static const int rgb_pixelsize[JPEG_NUMCS] = {  /* samples per pixel; same indexing as rgb_red */
+  -1, -1, RGB_PIXELSIZE, -1, -1, -1, EXT_RGB_PIXELSIZE, EXT_RGBX_PIXELSIZE,
+  EXT_BGR_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE,
+  EXT_RGBX_PIXELSIZE, EXT_BGRX_PIXELSIZE, EXT_XBGR_PIXELSIZE, EXT_XRGB_PIXELSIZE,
+  -1
+};
+
+/* Definitions for speed-related optimizations. */
+
+/* On some machines (notably 68000 series) "int" is 32 bits, but multiplying
+ * two 16-bit shorts is faster than multiplying two ints.  Define MULTIPLIER
+ * as short on such a machine.  MULTIPLIER must be at least 16 bits wide.
+ */
+
+#ifndef MULTIPLIER
+#ifndef WITH_SIMD
+#define MULTIPLIER  int         /* type for fastest integer multiply */
+#else
+#define MULTIPLIER  short       /* prefer 16-bit with SIMD for parallelism */
+#endif
+#endif
+
+
+/* FAST_FLOAT should be either float or double, whichever is done faster
+ * by your compiler.  (Note that this type is only used in the floating point
+ * DCT routines, so it only matters if you've defined DCT_FLOAT_SUPPORTED.)
+ */
+
+#ifndef FAST_FLOAT
+#define FAST_FLOAT  float
+#endif
+
+#endif /* JPEG_INTERNAL_OPTIONS */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jpeglib.h b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jpeglib.h
new file mode 100644
index 0000000..a59e98c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/jpeglib.h
@@ -0,0 +1,1209 @@
+/*
+ * jpeglib.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
+ * Lossless JPEG Modifications:
+ * Copyright (C) 1999, Ken Murchison.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, 2022-2023,
+             D. R. Commander.
+ * Copyright (C) 2015, Google, Inc.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file defines the application interface for the JPEG library.
+ * Most applications using the library need only include this file,
+ * and perhaps jerror.h if they want to know the exact error codes.
+ */
+
+#ifndef JPEGLIB_H
+#define JPEGLIB_H
+
+/*
+ * First we include the configuration files that record how this
+ * installation of the JPEG library is set up.  jconfig.h can be
+ * generated automatically for many systems.  jmorecfg.h contains
+ * manual configuration options that most people need not worry about.
+ */
+
+#ifndef JCONFIG_INCLUDED        /* in case jinclude.h already did */
+#include "jconfig.h"            /* widely used configuration options */
+#endif
+#include "jmorecfg.h"           /* seldom changed options */
+
+
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
+
+/* Various constants determining the sizes of things.
+ * All of these are specified by the JPEG standard, so don't change them
+ * if you want to be compatible.
+ */
+
+/* NOTE: In lossless mode, an MCU contains one or more samples rather than one
+ * or more 8x8 DCT blocks, so the term "data unit" is used to generically
+ * describe a sample in lossless mode or an 8x8 DCT block in lossy mode.  To
+ * preserve backward API/ABI compatibility, the field and macro names retain
+ * the "block" terminology.
+ */
+
+#define DCTSIZE             8   /* The basic DCT block is 8x8 samples */
+#define DCTSIZE2            64  /* DCTSIZE squared; # of elements in a block */
+#define NUM_QUANT_TBLS      4   /* Quantization tables are numbered 0..3 */
+#define NUM_HUFF_TBLS       4   /* Huffman tables are numbered 0..3 */
+#define NUM_ARITH_TBLS      16  /* Arith-coding tables are numbered 0..15 */
+#define MAX_COMPS_IN_SCAN   4   /* JPEG limit on # of components in one scan */
+#define MAX_SAMP_FACTOR     4   /* JPEG limit on sampling factors */
+/* Unfortunately, some bozo at Adobe saw no reason to be bound by the standard;
+ * the PostScript DCT filter can emit files with many more than 10 blocks/MCU.
+ * If you happen to run across such a file, you can up D_MAX_BLOCKS_IN_MCU
+ * to handle it.  We even let you do this from the jconfig.h file.  However,
+ * we strongly discourage changing C_MAX_BLOCKS_IN_MCU; just because Adobe
+ * sometimes emits noncompliant files doesn't mean you should too.
+ */
+#define C_MAX_BLOCKS_IN_MCU   10 /* compressor's limit on data units/MCU */
+#ifndef D_MAX_BLOCKS_IN_MCU
+#define D_MAX_BLOCKS_IN_MCU   10 /* decompressor's limit on data units/MCU */
+#endif
+
+
+/* Data structures for images (arrays of samples and of DCT coefficients).
+ */
+
+typedef JSAMPLE *JSAMPROW;      /* ptr to one image row of pixel samples. */
+typedef JSAMPROW *JSAMPARRAY;   /* ptr to some rows (a 2-D sample array) */
+typedef JSAMPARRAY *JSAMPIMAGE; /* a 3-D sample array: top index is color */
+
+typedef J12SAMPLE *J12SAMPROW;      /* ptr to one image row of 12-bit pixel
+                                       samples. */
+typedef J12SAMPROW *J12SAMPARRAY;   /* ptr to some 12-bit sample rows (a 2-D
+                                       12-bit sample array) */
+typedef J12SAMPARRAY *J12SAMPIMAGE; /* a 3-D 12-bit sample array: top index is
+                                       color */
+
+typedef J16SAMPLE *J16SAMPROW;      /* ptr to one image row of 16-bit pixel
+                                       samples. */
+typedef J16SAMPROW *J16SAMPARRAY;   /* ptr to some 16-bit sample rows (a 2-D
+                                       16-bit sample array) */
+typedef J16SAMPARRAY *J16SAMPIMAGE; /* a 3-D 16-bit sample array: top index is
+                                       color */
+
+typedef JCOEF JBLOCK[DCTSIZE2]; /* one block of coefficients */
+typedef JBLOCK *JBLOCKROW;      /* pointer to one row of coefficient blocks */
+typedef JBLOCKROW *JBLOCKARRAY;         /* a 2-D array of coefficient blocks */
+typedef JBLOCKARRAY *JBLOCKIMAGE;       /* a 3-D array of coefficient blocks */
+
+typedef JCOEF *JCOEFPTR;        /* useful in a couple of places */
+
+
+/* Types for JPEG compression parameters and working tables. */
+
+
+/* DCT coefficient quantization tables. */
+
+typedef struct {
+  /* This array gives the coefficient quantizers in natural array order
+   * (not the zigzag order in which they are stored in a JPEG DQT marker).
+   * CAUTION: IJG versions prior to v6a kept this array in zigzag order.
+   */
+  UINT16 quantval[DCTSIZE2];    /* quantization step for each coefficient */
+  /* This field is used only during compression.  It's initialized FALSE when
+   * the table is created, and set TRUE when it's been output to the file.
+   * You could suppress output of a table by setting this to TRUE.
+   * (See jpeg_suppress_tables for an example.)
+   */
+  boolean sent_table;           /* TRUE when table has been output */
+} JQUANT_TBL;
+
+
+/* Huffman coding tables. */
+
+typedef struct {
+  /* These two fields directly represent the contents of a JPEG DHT marker */
+  UINT8 bits[17];               /* bits[k] = # of symbols with codes of */
+                                /* length k bits; bits[0] is unused */
+  UINT8 huffval[256];           /* The symbols, in order of incr code length */
+  /* This field is used only during compression.  It's initialized FALSE when
+   * the table is created, and set TRUE when it's been output to the file.
+   * You could suppress output of a table by setting this to TRUE.
+   * (See jpeg_suppress_tables for an example.)
+   */
+  boolean sent_table;           /* TRUE when table has been output */
+} JHUFF_TBL;
+
+
+/* Basic info about one component (color channel). */
+
+typedef struct {
+  /* These values are fixed over the whole image. */
+  /* For compression, they must be supplied by parameter setup; */
+  /* for decompression, they are read from the SOF marker. */
+  int component_id;             /* identifier for this component (0..255) */
+  int component_index;          /* its index in SOF or cinfo->comp_info[] */
+  int h_samp_factor;            /* horizontal sampling factor (1..4) */
+  int v_samp_factor;            /* vertical sampling factor (1..4) */
+  int quant_tbl_no;             /* quantization table selector (0..3) */
+  /* These values may vary between scans. */
+  /* For compression, they must be supplied by parameter setup; */
+  /* for decompression, they are read from the SOS marker. */
+  /* The decompressor output side may not use these variables. */
+  int dc_tbl_no;                /* DC entropy table selector (0..3) */
+  int ac_tbl_no;                /* AC entropy table selector (0..3) */
+
+  /* Remaining fields should be treated as private by applications. */
+
+  /* These values are computed during compression or decompression startup: */
+  /* Component's size in data units.
+   * In lossy mode, any dummy blocks added to complete an MCU are not counted;
+   * therefore these values do not depend on whether a scan is interleaved or
+   * not.  In lossless mode, these are always equal to the image width and
+   * height.
+   */
+  JDIMENSION width_in_blocks;
+  JDIMENSION height_in_blocks;
+  /* Size of a data unit in samples.  Always DCTSIZE for lossy compression.
+   * For lossy decompression this is the size of the output from one DCT block,
+   * reflecting any scaling we choose to apply during the IDCT step.
+   * Values from 1 to 16 are supported.  Note that different components may
+   * receive different IDCT scalings.  In lossless mode, this is always equal
+   * to 1.
+   */
+#if JPEG_LIB_VERSION >= 70
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
+#else
+  int DCT_scaled_size;
+#endif
+  /* The downsampled dimensions are the component's actual, unpadded number
+   * of samples at the main buffer (preprocessing/compression interface), thus
+   * downsampled_width = ceil(image_width * Hi/Hmax)
+   * and similarly for height.  For lossy decompression, IDCT scaling is
+   * included, so
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_[h_]scaled_size/DCTSIZE)
+   * In lossless mode, these are always equal to the image width and height.
+   */
+  JDIMENSION downsampled_width;  /* actual width in samples */
+  JDIMENSION downsampled_height; /* actual height in samples */
+  /* This flag is used only for decompression.  In cases where some of the
+   * components will be ignored (eg grayscale output from YCbCr image),
+   * we can skip most computations for the unused components.
+   */
+  boolean component_needed;     /* do we need the value of this component? */
+
+  /* These values are computed before starting a scan of the component. */
+  /* The decompressor output side may not use these variables. */
+  int MCU_width;                /* number of data units per MCU, horizontally */
+  int MCU_height;               /* number of data units per MCU, vertically */
+  int MCU_blocks;               /* MCU_width * MCU_height */
+  int MCU_sample_width;         /* MCU width in samples, MCU_width*DCT_[h_]scaled_size */
+  int last_col_width;           /* # of non-dummy data units across in last MCU */
+  int last_row_height;          /* # of non-dummy data units down in last MCU */
+
+  /* Saved quantization table for component; NULL if none yet saved.
+   * See jdinput.c comments about the need for this information.
+   * This field is currently used only for decompression.
+   */
+  JQUANT_TBL *quant_table;
+
+  /* Private per-component storage for DCT or IDCT subsystem. */
+  void *dct_table;
+} jpeg_component_info;
+
+
+/* The script for encoding a multiple-scan file is an array of these: */
+
+typedef struct {
+  int comps_in_scan;            /* number of components encoded in this scan */
+  int component_index[MAX_COMPS_IN_SCAN]; /* their SOF/comp_info[] indexes */
+  int Ss, Se;                   /* progressive JPEG spectral selection parms
+                                   (Ss is the predictor selection value in
+                                   lossless mode) */
+  int Ah, Al;                   /* progressive JPEG successive approx. parms
+                                   (Al is the point transform value in lossless
+                                   mode) */
+} jpeg_scan_info;
+
+/* The decompressor can save APPn and COM markers in a list of these: */
+
+typedef struct jpeg_marker_struct *jpeg_saved_marker_ptr;
+
+struct jpeg_marker_struct {
+  jpeg_saved_marker_ptr next;   /* next in list, or NULL */
+  UINT8 marker;                 /* marker code: JPEG_COM, or JPEG_APP0+n */
+  unsigned int original_length; /* # bytes of data in the file */
+  unsigned int data_length;     /* # bytes of data saved at data[] */
+  JOCTET *data;                 /* the data contained in the marker */
+  /* the marker length word is not counted in data_length or original_length */
+};
+
+/* Known color spaces. */
+
+#define JCS_EXTENSIONS  1
+#define JCS_ALPHA_EXTENSIONS  1
+
+typedef enum {
+  JCS_UNKNOWN,            /* error/unspecified */
+  JCS_GRAYSCALE,          /* monochrome */
+  JCS_RGB,                /* red/green/blue as specified by the RGB_RED,
+                             RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE macros */
+  JCS_YCbCr,              /* Y/Cb/Cr (also known as YUV) */
+  JCS_CMYK,               /* C/M/Y/K */
+  JCS_YCCK,               /* Y/Cb/Cr/K */
+  JCS_EXT_RGB,            /* red/green/blue */
+  JCS_EXT_RGBX,           /* red/green/blue/x */
+  JCS_EXT_BGR,            /* blue/green/red */
+  JCS_EXT_BGRX,           /* blue/green/red/x */
+  JCS_EXT_XBGR,           /* x/blue/green/red */
+  JCS_EXT_XRGB,           /* x/red/green/blue */
+  /* When out_color_space is set to JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+     or JCS_EXT_XRGB during decompression, the X byte is undefined, and in
+     order to ensure the best performance, libjpeg-turbo can set that byte to
+     whatever value it wishes.  Use the following colorspace constants to
+     ensure that the X byte is set to 0xFF, so that it can be interpreted as an
+     opaque alpha channel. */
+  JCS_EXT_RGBA,           /* red/green/blue/alpha */
+  JCS_EXT_BGRA,           /* blue/green/red/alpha */
+  JCS_EXT_ABGR,           /* alpha/blue/green/red */
+  JCS_EXT_ARGB,           /* alpha/red/green/blue */
+  JCS_RGB565              /* 5-bit red/6-bit green/5-bit blue
+                             [decompression only] */
+} J_COLOR_SPACE;
+
+/* DCT/IDCT algorithm options (the type of the dct_method fields below). */
+
+typedef enum {
+  JDCT_ISLOW,             /* accurate integer method */
+  JDCT_IFAST,             /* less accurate integer method [legacy feature] */
+  JDCT_FLOAT              /* floating-point method [legacy feature] */
+} J_DCT_METHOD;
+
+#ifndef JDCT_DEFAULT            /* JDCT_ISLOW unless overridden in jconfig.h */
+#define JDCT_DEFAULT  JDCT_ISLOW
+#endif
+#ifndef JDCT_FASTEST            /* JDCT_IFAST unless overridden in jconfig.h */
+#define JDCT_FASTEST  JDCT_IFAST
+#endif
+
+/* Dithering options for decompression (the type of the dither_mode field). */
+
+typedef enum {
+  JDITHER_NONE,           /* no dithering */
+  JDITHER_ORDERED,        /* simple ordered dither */
+  JDITHER_FS              /* Floyd-Steinberg error diffusion dither */
+} J_DITHER_MODE;
+
+
+/* Fields common to the JPEG compression and decompression master structs. */
+
+#define jpeg_common_fields \
+  struct jpeg_error_mgr *err;   /* Error handler module */ \
+  struct jpeg_memory_mgr *mem;  /* Memory manager module */ \
+  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+  void *client_data;            /* Available for use by application */ \
+  boolean is_decompressor;      /* Lets common code tell which kind this is */ \
+  int global_state              /* Call sequence check; use site adds the ';' */
+
+/* Routines that are used by both halves of the library (compression and
+ * decompression) are declared to receive a pointer to this structure.  There
+ * are no actual instances of jpeg_common_struct, only of jpeg_compress_struct
+ * and jpeg_decompress_struct. */
+struct jpeg_common_struct {
+  jpeg_common_fields;           /* Fields common to both master struct types */
+  /* Additional fields follow in an actual jpeg_compress_struct or
+   * jpeg_decompress_struct.  All three structs must agree on these
+   * initial fields!  (This would be a lot cleaner in C++.)
+   */
+};
+
+typedef struct jpeg_common_struct *j_common_ptr;         /* either kind */
+typedef struct jpeg_compress_struct *j_compress_ptr;     /* compressor */
+typedef struct jpeg_decompress_struct *j_decompress_ptr; /* decompressor */
+
+
+/* Master record for a compression instance */
+
+struct jpeg_compress_struct {
+  jpeg_common_fields;           /* Fields shared with jpeg_decompress_struct */
+
+  /* Destination for compressed data */
+  struct jpeg_destination_mgr *dest;
+
+  /* Description of source image --- these fields must be filled in by
+   * outer application before starting compression.  in_color_space must
+   * be correct before you can even call jpeg_set_defaults().
+   */
+
+  JDIMENSION image_width;       /* input image width */
+  JDIMENSION image_height;      /* input image height */
+  int input_components;         /* # of color components in input image */
+  J_COLOR_SPACE in_color_space; /* colorspace of input image */
+
+  double input_gamma;           /* image gamma of input image */
+
+  /* Compression parameters --- these fields must be set before calling
+   * jpeg_start_compress().  We recommend calling jpeg_set_defaults() to
+   * initialize everything to reasonable defaults, then changing anything
+   * the application specifically wants to change.  That way you won't get
+   * burnt when new parameters are added.  Also note that there are several
+   * helper routines to simplify changing parameters.
+   */
+
+#if JPEG_LIB_VERSION >= 70
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;        /* scaled JPEG image width */
+  JDIMENSION jpeg_height;       /* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+#endif /* JPEG_LIB_VERSION >= 70 */
+
+  int data_precision;           /* bits of precision in image data */
+
+  int num_components;           /* # of color components in JPEG image */
+  J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
+
+  jpeg_component_info *comp_info;
+  /* comp_info[i] describes component that appears i'th in SOF */
+
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+#if JPEG_LIB_VERSION >= 70
+  int q_scale_factor[NUM_QUANT_TBLS];
+#endif /* JPEG_LIB_VERSION >= 70 */
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */
+
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  /* ptrs to Huffman coding tables, or NULL if not defined */
+
+  UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
+  UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
+  UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
+
+  int num_scans;                /* # of entries in scan_info array */
+  const jpeg_scan_info *scan_info; /* script for multi-scan file, or NULL */
+  /* The default value of scan_info is NULL, which causes a single-scan
+   * sequential JPEG file to be emitted.  To create a multi-scan file,
+   * set num_scans and scan_info to point to an array of scan definitions.
+   */
+
+  boolean raw_data_in;          /* TRUE=caller supplies downsampled data */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
+  boolean optimize_coding;      /* TRUE=optimize entropy encoding parms */
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
+#if JPEG_LIB_VERSION >= 70
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
+#endif /* JPEG_LIB_VERSION >= 70 */
+  int smoothing_factor;         /* 1..100, or 0 for no input smoothing */
+  J_DCT_METHOD dct_method;      /* DCT algorithm selector */
+
+  /* The restart interval can be specified in absolute MCUs by setting
+   * restart_interval, or in MCU rows by setting restart_in_rows
+   * (in which case the correct restart_interval will be figured
+   * for each scan).
+   */
+  unsigned int restart_interval; /* MCUs per restart, or 0 for no restart */
+  int restart_in_rows;          /* if > 0, MCU rows per restart interval */
+
+  /* Parameters controlling emission of special markers. */
+
+  boolean write_JFIF_header;    /* should a JFIF marker be written? */
+  UINT8 JFIF_major_version;     /* JFIF version number to write: major part */
+  UINT8 JFIF_minor_version;     /* ... and minor part */
+  /* The three density values below are not used by the JPEG code, merely */
+  /* copied into the JFIF APP0 marker.  density_unit can be 0 for unknown, */
+  /* 1 for dots/inch, or 2 for dots/cm.  Note that the pixel aspect */
+  /* ratio is defined by X_density/Y_density even when density_unit=0. */
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean write_Adobe_marker;   /* should an Adobe marker be written? */
+
+  /* State variable: index of next scanline to be written to
+   * jpeg_write_scanlines().  Application may use this to control its
+   * processing loop, e.g., "while (next_scanline < image_height)".
+   */
+
+  JDIMENSION next_scanline;     /* 0 .. image_height-1  */
+
+  /* Remaining fields are known throughout compressor, but generally
+   * should not be touched by a surrounding application.
+   */
+
+  /*
+   * These fields are computed during compression startup
+   */
+  boolean progressive_mode;     /* TRUE if scan script uses progressive mode */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
+#endif /* JPEG_LIB_VERSION >= 70 */
+
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows to be input to coefficient or
+                                   difference controller */
+  /* The coefficient or difference controller receives data in units of MCU
+   * rows as defined for fully interleaved scans (whether the JPEG file is
+   * interleaved or not).  In lossy mode, there are v_samp_factor * DCTSIZE
+   * sample rows of each component in an "iMCU" (interleaved MCU) row.  In
+   * lossless mode, total_iMCU_rows is always equal to the image height.
+   */
+
+  /*
+   * These fields are valid during any one scan.
+   * They describe the components and MCUs actually appearing in the scan.
+   */
+  int comps_in_scan;            /* # of JPEG components in this scan */
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+  /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
+
+  int blocks_in_MCU;            /* # of data units per MCU */
+  int MCU_membership[C_MAX_BLOCKS_IN_MCU];
+  /* MCU_membership[i] is index in cur_comp_info of component owning */
+  /* i'th data unit in an MCU */
+
+  int Ss, Se, Ah, Al;           /* progressive/lossless JPEG parameters for
+                                   scan */
+
+#if JPEG_LIB_VERSION >= 80
+  int block_size;               /* the basic DCT block size: 1..16 */
+  const int *natural_order;     /* natural-order position array */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) */
+#endif /* JPEG_LIB_VERSION >= 80 */
+
+  /*
+   * Links to compression subobjects (methods and private variables of modules)
+   */
+  struct jpeg_comp_master *master;
+  struct jpeg_c_main_controller *main;
+  struct jpeg_c_prep_controller *prep;
+  struct jpeg_c_coef_controller *coef;
+  struct jpeg_marker_writer *marker;
+  struct jpeg_color_converter *cconvert;
+  struct jpeg_downsampler *downsample;
+  struct jpeg_forward_dct *fdct;
+  struct jpeg_entropy_encoder *entropy;
+  jpeg_scan_info *script_space; /* workspace for jpeg_simple_progression */
+  int script_space_size;        /* presumably the size of script_space, in
+                                   entries -- TODO confirm against users */
+};
+
+
+/* Master record for a decompression instance */
+
+struct jpeg_decompress_struct {
+  jpeg_common_fields;           /* Fields shared with jpeg_compress_struct */
+
+  /* Source of compressed data */
+  struct jpeg_source_mgr *src;
+
+  /* Basic description of image --- filled in by jpeg_read_header(). */
+  /* Application may inspect these values to decide how to process image. */
+
+  JDIMENSION image_width;       /* nominal image width (from SOF marker) */
+  JDIMENSION image_height;      /* nominal image height */
+  int num_components;           /* # of color components in JPEG image */
+  J_COLOR_SPACE jpeg_color_space; /* colorspace of JPEG image */
+
+  /* Decompression processing parameters --- these fields must be set before
+   * calling jpeg_start_decompress().  Note that jpeg_read_header() initializes
+   * them to default values.
+   */
+
+  J_COLOR_SPACE out_color_space; /* colorspace for output */
+
+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  double output_gamma;          /* image gamma wanted in output */
+
+  boolean buffered_image;       /* TRUE=multiple output passes */
+  boolean raw_data_out;         /* TRUE=downsampled data wanted */
+
+  J_DCT_METHOD dct_method;      /* IDCT algorithm selector */
+  boolean do_fancy_upsampling;  /* TRUE=apply fancy upsampling */
+  boolean do_block_smoothing;   /* TRUE=apply interblock smoothing */
+
+  boolean quantize_colors;      /* TRUE=colormapped output wanted */
+  /* the following are ignored if not quantize_colors: */
+  J_DITHER_MODE dither_mode;    /* type of color dithering to use */
+  boolean two_pass_quantize;    /* TRUE=use two-pass color quantization */
+  int desired_number_of_colors; /* max # colors to use in created colormap */
+  /* these are significant only in buffered-image mode: */
+  boolean enable_1pass_quant;   /* enable future use of 1-pass quantizer */
+  boolean enable_external_quant;/* enable future use of external colormap */
+  boolean enable_2pass_quant;   /* enable future use of 2-pass quantizer */
+
+  /* Description of actual output image that will be returned to application.
+   * These fields are computed by jpeg_start_decompress().
+   * You can also use jpeg_calc_output_dimensions() to determine these values
+   * in advance of calling jpeg_start_decompress().
+   */
+
+  JDIMENSION output_width;      /* scaled image width */
+  JDIMENSION output_height;     /* scaled image height */
+  int out_color_components;     /* # of color components in out_color_space */
+  int output_components;        /* # of color components returned */
+  /* output_components is 1 (a colormap index) when quantizing colors;
+   * otherwise it equals out_color_components.
+   */
+  int rec_outbuf_height;        /* min recommended height of scanline buffer */
+  /* If the buffer passed to jpeg_read_scanlines() is less than this many rows
+   * high, space and time will be wasted due to unnecessary data copying.
+   * Usually rec_outbuf_height will be 1 or 2, at most 4.
+   */
+
+  /* When quantizing colors, the output colormap is described by these fields.
+   * The application can supply a colormap by setting colormap non-NULL before
+   * calling jpeg_start_decompress; otherwise a colormap is created during
+   * jpeg_start_decompress or jpeg_start_output.
+   * The map has out_color_components rows and actual_number_of_colors columns.
+   */
+  int actual_number_of_colors;  /* number of entries in use */
+  JSAMPARRAY colormap;          /* The color map as a 2-D pixel array.
+                                   If data_precision is 12 or 16, then this is
+                                   actually a J12SAMPARRAY or a J16SAMPARRAY,
+                                   so callers must type-cast it in order to
+                                   read/write 12-bit or 16-bit samples from/to
+                                   the array. */
+
+  /* State variables: these variables indicate the progress of decompression.
+   * The application may examine these but must not modify them.
+   */
+
+  /* Row index of next scanline to be read from jpeg_read_scanlines().
+   * Application may use this to control its processing loop, e.g.,
+   * "while (output_scanline < output_height)".
+   */
+  JDIMENSION output_scanline;   /* 0 .. output_height-1  */
+
+  /* Current input scan number and number of iMCU rows completed in scan.
+   * These indicate the progress of the decompressor input side.
+   */
+  int input_scan_number;        /* Number of SOS markers seen so far */
+  JDIMENSION input_iMCU_row;    /* Number of iMCU rows completed */
+
+  /* The "output scan number" is the notional scan being displayed by the
+   * output side.  The decompressor will not allow output scan/row number
+   * to get ahead of input scan/row, but it can fall arbitrarily far behind.
+   */
+  int output_scan_number;       /* Nominal scan number being displayed */
+  JDIMENSION output_iMCU_row;   /* Number of iMCU rows read */
+
+  /* Current progression status.  coef_bits[c][i] indicates the precision
+   * with which component c's DCT coefficient i (in zigzag order) is known.
+   * It is -1 when no data has yet been received, otherwise it is the point
+   * transform (shift) value for the most recent scan of the coefficient
+   * (thus, 0 at completion of the progression).
+   * This pointer is NULL when reading a non-progressive file.
+   */
+  int (*coef_bits)[DCTSIZE2];   /* -1 or current Al value for each coef */
+
+  /* Internal JPEG parameters --- the application usually need not look at
+   * these fields.  Note that the decompressor output side may not use
+   * any parameters that can change between scans.
+   */
+
+  /* Quantization and Huffman tables are carried forward across input
+   * datastreams when processing abbreviated JPEG datastreams.
+   */
+
+  JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS];
+  /* ptrs to coefficient quantization tables, or NULL if not defined */
+
+  JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
+  /* ptrs to Huffman coding tables, or NULL if not defined */
+
+  /* These parameters are never carried across datastreams, since they
+   * are given in SOF/SOS markers or defined to be reset by SOI.
+   */
+
+  int data_precision;           /* bits of precision in image data */
+
+  jpeg_component_info *comp_info;
+  /* comp_info[i] describes component that appears i'th in SOF */
+
+#if JPEG_LIB_VERSION >= 80
+  boolean is_baseline;          /* TRUE if Baseline SOF0 encountered */
+#endif /* JPEG_LIB_VERSION >= 80 */
+  boolean progressive_mode;     /* TRUE if SOFn specifies progressive mode */
+  boolean arith_code;           /* TRUE=arithmetic coding, FALSE=Huffman */
+
+  UINT8 arith_dc_L[NUM_ARITH_TBLS]; /* L values for DC arith-coding tables */
+  UINT8 arith_dc_U[NUM_ARITH_TBLS]; /* U values for DC arith-coding tables */
+  UINT8 arith_ac_K[NUM_ARITH_TBLS]; /* Kx values for AC arith-coding tables */
+
+  unsigned int restart_interval; /* MCUs per restart interval, or 0 for no restart */
+
+  /* These fields record data obtained from optional markers recognized by
+   * the JPEG library.
+   */
+  boolean saw_JFIF_marker;      /* TRUE iff a JFIF APP0 marker was found */
+  /* Data copied from JFIF marker; only valid if saw_JFIF_marker is TRUE: */
+  UINT8 JFIF_major_version;     /* JFIF version number: major part */
+  UINT8 JFIF_minor_version;     /* ... and minor part */
+  UINT8 density_unit;           /* JFIF code for pixel size units */
+  UINT16 X_density;             /* Horizontal pixel density */
+  UINT16 Y_density;             /* Vertical pixel density */
+  boolean saw_Adobe_marker;     /* TRUE iff an Adobe APP14 marker was found */
+  UINT8 Adobe_transform;        /* Color transform code from Adobe marker */
+
+  boolean CCIR601_sampling;     /* TRUE=first samples are cosited */
+
+  /* Aside from the specific data retained from APPn markers known to the
+   * library, the uninterpreted contents of any or all APPn and COM markers
+   * can be saved in a list for examination by the application.
+   */
+  jpeg_saved_marker_ptr marker_list; /* Head of list of saved markers */
+
+  /* Remaining fields are known throughout decompressor, but generally
+   * should not be touched by a surrounding application.
+   */
+
+  /*
+   * These fields are computed during decompression startup
+   */
+  int max_h_samp_factor;        /* largest h_samp_factor */
+  int max_v_samp_factor;        /* largest v_samp_factor */
+
+#if JPEG_LIB_VERSION >= 70
+  int min_DCT_h_scaled_size;    /* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;    /* smallest DCT_v_scaled_size of any component */
+#else /* JPEG_LIB_VERSION < 70 */
+  int min_DCT_scaled_size;      /* smallest DCT_scaled_size of any component */
+#endif /* JPEG_LIB_VERSION >= 70 */
+
+  JDIMENSION total_iMCU_rows;   /* # of iMCU rows in image */
+  /* The coefficient or difference controller's input and output progress is
+   * measured in units of "iMCU" (interleaved MCU) rows.  These are the same as
+   * MCU rows in fully interleaved JPEG scans, but are used whether the scan is
+   * interleaved or not.  In lossy mode, we define an iMCU row as v_samp_factor
+   * DCT block rows of each component.  Therefore, the IDCT output contains
+   * v_samp_factor*DCT_[v_]scaled_size sample rows of a component per iMCU row.
+   * In lossless mode, total_iMCU_rows is always equal to the image height.
+   */
+
+  JSAMPLE *sample_range_limit;  /* table for fast range-limiting.
+                                   If data_precision is 12 or 16, then this is
+                                   actually a J12SAMPLE pointer or a J16SAMPLE
+                                   pointer, so callers must type-cast it in
+                                   order to read 12-bit or 16-bit samples from
+                                   the array. */
+
+  /*
+   * These fields are valid during any one scan.
+   * They describe the components and MCUs actually appearing in the scan.
+   * Note that the decompressor output side must not use these fields.
+   */
+  int comps_in_scan;            /* # of JPEG components in this scan */
+  jpeg_component_info *cur_comp_info[MAX_COMPS_IN_SCAN];
+  /* *cur_comp_info[i] describes component that appears i'th in SOS */
+
+  JDIMENSION MCUs_per_row;      /* # of MCUs across the image */
+  JDIMENSION MCU_rows_in_scan;  /* # of MCU rows in the image */
+
+  int blocks_in_MCU;            /* # of data units per MCU */
+  int MCU_membership[D_MAX_BLOCKS_IN_MCU];
+  /* MCU_membership[i] is index in cur_comp_info of component owning */
+  /* i'th data unit in an MCU */
+
+  int Ss, Se, Ah, Al;           /* progressive/lossless JPEG parameters for
+                                   scan */
+
+#if JPEG_LIB_VERSION >= 80
+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;               /* the basic DCT block size: 1..16 */
+  const int *natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;                   /* min( Se, DCTSIZE2-1 ) for entropy decode */
+#endif /* JPEG_LIB_VERSION >= 80 */
+
+  /* This field is shared between entropy decoder and marker parser.
+   * It is either zero or the code of a JPEG marker that has been
+   * read from the data source, but has not yet been processed.
+   */
+  int unread_marker;            /* 0, or code of the not-yet-processed marker */
+
+  /*
+   * Links to decompression subobjects (methods, private variables of modules)
+   */
+  struct jpeg_decomp_master *master;
+  struct jpeg_d_main_controller *main;
+  struct jpeg_d_coef_controller *coef;
+  struct jpeg_d_post_controller *post;
+  struct jpeg_input_controller *inputctl;
+  struct jpeg_marker_reader *marker;
+  struct jpeg_entropy_decoder *entropy;
+  struct jpeg_inverse_dct *idct;
+  struct jpeg_upsampler *upsample;
+  struct jpeg_color_deconverter *cconvert;
+  struct jpeg_color_quantizer *cquantize;
+};
+
+
+/* "Object" declarations for JPEG modules that may be supplied or called
+ * directly by the surrounding application.
+ * As with all objects in the JPEG library, these structs only define the
+ * publicly visible methods and state variables of a module.  Additional
+ * private fields may exist after the public ones.
+ */
+
+
+/* Error handler object: method pointers, saved message state, message tables */
+
+struct jpeg_error_mgr {
+  /* Error exit handler: does not return to caller */
+  void (*error_exit) (j_common_ptr cinfo);
+  /* Conditionally emit a trace or warning message */
+  void (*emit_message) (j_common_ptr cinfo, int msg_level);
+  /* Routine that actually outputs a trace or error message */
+  void (*output_message) (j_common_ptr cinfo);
+  /* Format a message string for the most recent JPEG error or message */
+  void (*format_message) (j_common_ptr cinfo, char *buffer);
+#define JMSG_LENGTH_MAX  200    /* recommended size of format_message buffer */
+  /* Reset error state variables at start of a new image */
+  void (*reset_error_mgr) (j_common_ptr cinfo);
+
+  /* The message ID code and any parameters are saved here.
+   * A message can have one string parameter or up to 8 int parameters.
+   */
+  int msg_code;
+#define JMSG_STR_PARM_MAX  80   /* size of the string parameter buffer below */
+  union {
+    int i[8];                   /* the int parameters (up to 8) */
+    char s[JMSG_STR_PARM_MAX];  /* or the single string parameter */
+  } msg_parm;
+
+  /* Standard state variables for error facility */
+
+  int trace_level;              /* max msg_level that will be displayed */
+
+  /* For recoverable corrupt-data errors, we emit a warning message,
+   * but keep going unless emit_message chooses to abort.  emit_message
+   * should count warnings in num_warnings.  The surrounding application
+   * can check for bad data by seeing if num_warnings is nonzero at the
+   * end of processing.
+   */
+  long num_warnings;            /* number of corrupt-data warnings */
+
+  /* These fields point to the table(s) of error message strings.
+   * An application can change the table pointer to switch to a different
+   * message list (typically, to change the language in which errors are
+   * reported).  Some applications may wish to add additional error codes
+   * that will be handled by the JPEG library error mechanism; the second
+   * table pointer is used for this purpose.
+   *
+   * First table includes all errors generated by JPEG library itself.
+   * Error code 0 is reserved for a "no such error string" message.
+   */
+  const char * const *jpeg_message_table; /* Library errors */
+  int last_jpeg_message;    /* Table contains strings 0..last_jpeg_message */
+  /* Second table can be added by application (see cjpeg/djpeg for example).
+   * It contains strings numbered first_addon_message..last_addon_message.
+   */
+  const char * const *addon_message_table; /* Non-library errors */
+  int first_addon_message;      /* code for first string in addon table */
+  int last_addon_message;       /* code for last string in addon table */
+};
+
+
+/* Progress monitor object: one method pointer plus pass/work-unit counters */
+
+struct jpeg_progress_mgr {
+  void (*progress_monitor) (j_common_ptr cinfo);
+
+  long pass_counter;            /* work units completed in this pass */
+  long pass_limit;              /* total number of work units in this pass */
+  int completed_passes;         /* passes completed so far */
+  int total_passes;             /* total number of passes expected */
+};
+
+
+/* Data destination object for compression: buffer state + 3 method pointers */
+
+struct jpeg_destination_mgr {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+
+  void (*init_destination) (j_compress_ptr cinfo);
+  boolean (*empty_output_buffer) (j_compress_ptr cinfo);
+  void (*term_destination) (j_compress_ptr cinfo);
+};
+
+
+/* Data source object for decompression: buffer state + 5 method pointers */
+
+struct jpeg_source_mgr {
+  const JOCTET *next_input_byte; /* => next byte to read from buffer */
+  size_t bytes_in_buffer;       /* # of bytes remaining in buffer */
+
+  void (*init_source) (j_decompress_ptr cinfo);
+  boolean (*fill_input_buffer) (j_decompress_ptr cinfo);
+  void (*skip_input_data) (j_decompress_ptr cinfo, long num_bytes);
+  boolean (*resync_to_restart) (j_decompress_ptr cinfo, int desired);
+  void (*term_source) (j_decompress_ptr cinfo);
+};
+
+
+/* Memory manager object.
+ * Allocates "small" objects (a few K total), "large" objects (tens of K),
+ * and "really big" objects (virtual arrays with backing store if needed).
+ * The memory manager does not allow individual objects to be freed; rather,
+ * each created object is assigned to a pool, and whole pools can be freed
+ * at once.  This is faster and more convenient than remembering exactly what
+ * to free, especially where malloc()/free() are not too speedy.
+ * NB: alloc routines never return NULL.  If they do not succeed, they exit
+ * to error_exit instead.
+ */
+
+#define JPOOL_PERMANENT  0      /* lasts until master record is destroyed */
+#define JPOOL_IMAGE      1      /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS   2      /* number of pool IDs defined above */
+
+typedef struct jvirt_sarray_control *jvirt_sarray_ptr; /* virtual sample array */
+typedef struct jvirt_barray_control *jvirt_barray_ptr; /* virtual block array */
+
+
+struct jpeg_memory_mgr {
+  /* Method pointers */
+  void *(*alloc_small) (j_common_ptr cinfo, int pool_id, size_t sizeofobject);
+  void *(*alloc_large) (j_common_ptr cinfo, int pool_id,
+                        size_t sizeofobject);
+  /* If cinfo->data_precision is 12 or 16, then the alloc_sarray method and
+   * the access_virt_sarray method actually return a J12SAMPARRAY or a
+   * J16SAMPARRAY, so callers must type-cast the return value in order to
+   * read/write 12-bit or 16-bit samples from/to the array.
+   */
+  JSAMPARRAY (*alloc_sarray) (j_common_ptr cinfo, int pool_id,
+                              JDIMENSION samplesperrow, JDIMENSION numrows);
+  JBLOCKARRAY (*alloc_barray) (j_common_ptr cinfo, int pool_id,
+                               JDIMENSION blocksperrow, JDIMENSION numrows);
+  jvirt_sarray_ptr (*request_virt_sarray) (j_common_ptr cinfo, int pool_id,
+                                           boolean pre_zero,
+                                           JDIMENSION samplesperrow,
+                                           JDIMENSION numrows,
+                                           JDIMENSION maxaccess);
+  jvirt_barray_ptr (*request_virt_barray) (j_common_ptr cinfo, int pool_id,
+                                           boolean pre_zero,
+                                           JDIMENSION blocksperrow,
+                                           JDIMENSION numrows,
+                                           JDIMENSION maxaccess);
+  void (*realize_virt_arrays) (j_common_ptr cinfo);
+  JSAMPARRAY (*access_virt_sarray) (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+                                    JDIMENSION start_row, JDIMENSION num_rows,
+                                    boolean writable);
+  JBLOCKARRAY (*access_virt_barray) (j_common_ptr cinfo, jvirt_barray_ptr ptr,
+                                     JDIMENSION start_row, JDIMENSION num_rows,
+                                     boolean writable);
+  void (*free_pool) (j_common_ptr cinfo, int pool_id);
+  void (*self_destruct) (j_common_ptr cinfo);
+
+  /* Limit on memory allocation for this JPEG object.  (Note that this is
+   * merely advisory, not a guaranteed maximum; it only affects the space
+   * used for virtual-array buffers.)  May be changed by outer application
+   * after creating the JPEG object.
+   */
+  long max_memory_to_use;
+
+  /* Maximum allocation request accepted by alloc_large. */
+  long max_alloc_chunk;
+};
+
+
+/* Routine signature for application-supplied marker processing methods.
+ * The marker code is not passed, since it is stored in cinfo->unread_marker.
+ */
+typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
+
+
+/* Originally, this macro was used as a way of defining function prototypes
+ * for both modern compilers as well as older compilers that did not support
+ * prototype parameters.  libjpeg-turbo has never supported these older,
+ * non-ANSI compilers, so the macro now just expands to its argument; it is
+ * still included because there is some software out there that uses it.
+ */
+
+#define JPP(arglist)    arglist
+
+
+/* Default error-management setup; takes and returns a jpeg_error_mgr pointer */
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
+
+/* Initialization of JPEG compression and decompression objects.
+ * jpeg_create_compress() and jpeg_create_decompress() are the exported
+ * names that applications should call.  These expand to calls on
+ * jpeg_CreateCompress and jpeg_CreateDecompress with additional information
+ * (JPEG_LIB_VERSION and the struct size) passed for version mismatch checking.
+ * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
+ */
+#define jpeg_create_compress(cinfo) \
+  jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                      (size_t)sizeof(struct jpeg_compress_struct))
+#define jpeg_create_decompress(cinfo) \
+  jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+                                 size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                                   size_t structsize);
+/* Destruction of JPEG compression and decompression objects */
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
+
+/* Standard data source and destination managers: stdio streams. */
+/* The caller must open the file beforehand and close it afterward. */
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
+
+/* Standard data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                           unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+                          const unsigned char *inbuffer, unsigned long insize);
+
+/* Default parameter setup for compression */
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
+/* Compression parameter setup aids */
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+                                 J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                              boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                                     boolean force_baseline);
+#if JPEG_LIB_VERSION >= 70
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+                                  boolean force_baseline);
+#endif
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                                  const unsigned int *basic_table,
+                                  int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_enable_lossless(j_compress_ptr cinfo,
+                                  int predictor_selection_value,
+                                  int point_transform);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
+
+/* Main entry points for compression */
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+                                 boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+                                        JSAMPARRAY scanlines,
+                                        JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_write_scanlines(j_compress_ptr cinfo,
+                                          J12SAMPARRAY scanlines,
+                                          JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg16_write_scanlines(j_compress_ptr cinfo,
+                                          J16SAMPARRAY scanlines,
+                                          JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
+
+#if JPEG_LIB_VERSION >= 70
+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
+#endif
+
+/* Replaces jpeg_write_scanlines when writing raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                       JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_write_raw_data(j_compress_ptr cinfo,
+                                         J12SAMPIMAGE data,
+                                         JDIMENSION num_lines);
+
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+                               const JOCTET *dataptr, unsigned int datalen);
+/* Same, but piecemeal. */
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                                 unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
+
+/* Alternate compression function: just write an abbreviated table file */
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+                                    const JOCTET *icc_data_ptr,
+                                    unsigned int icc_data_len);
+
+
+/* Decompression startup: read start of JPEG datastream to see what's there */
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
+/* Return value is one of: */
+#define JPEG_SUSPENDED           0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK           1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY  2 /* Found valid table-specs-only datastream */
+/* If you pass require_image = TRUE (normal case), you need not check for
+ * a TABLES_ONLY return code; an abbreviated file will cause an error exit.
+ * JPEG_SUSPENDED is only possible if you use a data source module that can
+ * give a suspension return (the stdio source module doesn't).
+ */
+
+/* Main entry points for decompression */
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+                                       JSAMPARRAY scanlines,
+                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg12_read_scanlines(j_decompress_ptr cinfo,
+                                         J12SAMPARRAY scanlines,
+                                         JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg16_read_scanlines(j_decompress_ptr cinfo,
+                                         J16SAMPARRAY scanlines,
+                                         JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+                                       JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg12_skip_scanlines(j_decompress_ptr cinfo,
+                                         JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                JDIMENSION *width);
+EXTERN(void) jpeg12_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                  JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
+
+/* Replaces jpeg_read_scanlines when reading raw downsampled data. */
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                      JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg12_read_raw_data(j_decompress_ptr cinfo,
+                                        J12SAMPIMAGE data,
+                                        JDIMENSION max_lines);
+
+/* Additional entry points for buffered-image mode. */
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
+/* Return value is one of: */
+/* #define JPEG_SUSPENDED       0    Suspended due to lack of input data */
+#define JPEG_REACHED_SOS        1 /* Reached start of new scan */
+#define JPEG_REACHED_EOI        2 /* Reached end of image */
+#define JPEG_ROW_COMPLETED      3 /* Completed one iMCU row */
+#define JPEG_SCAN_COMPLETED     4 /* Completed last iMCU row of a scan */
+
+/* Precalculate output dimensions for current decompression parameters. */
+#if JPEG_LIB_VERSION >= 80
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
+#endif
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
+
+/* Control saving of COM and APPn markers into marker_list. */
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                               unsigned int length_limit);
+
+/* Install a special processing method for COM or APPn markers. */
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+                                       int marker_code,
+                                       jpeg_marker_parser_method routine);
+
+/* Read or write raw DCT coefficients --- useful for lossless transcoding. */
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+                                     jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                           j_compress_ptr dstinfo);
+
+/* If you choose to abort compression or decompression before completing
+ * jpeg_finish_(de)compress, then you need to clean up to release memory,
+ * temporary files, etc.  You can just call jpeg_destroy_(de)compress
+ * if you're done with the JPEG object, but if you want to clean it up and
+ * reuse it, call this:
+ */
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
+
+/* Generic versions of jpeg_abort and jpeg_destroy that work on either
+ * flavor of JPEG object.  These may be more convenient in some places.
+ */
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
+
+/* Default restart-marker-resync procedure for use by data source modules */
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+                                      JOCTET **icc_data_ptr,
+                                      unsigned int *icc_data_len);
+
+
+/* These marker codes are exported since applications and data source modules
+ * are likely to want to use them.
+ */
+
+#define JPEG_RST0       0xD0    /* RST0 marker code */
+#define JPEG_EOI        0xD9    /* EOI marker code */
+#define JPEG_APP0       0xE0    /* APP0 marker code */
+#define JPEG_COM        0xFE    /* COM marker code */
+
+
+/* If we have a brain-damaged compiler that emits warnings (or worse, errors)
+ * for structure definitions that are never filled in, keep it quiet by
+ * supplying dummy definitions for the various substructures.
+ */
+
+#ifdef INCOMPLETE_TYPES_BROKEN
+#ifndef JPEG_INTERNALS          /* will be defined in jpegint.h */
+struct jvirt_sarray_control { long dummy; };
+struct jvirt_barray_control { long dummy; };
+struct jpeg_comp_master { long dummy; };
+struct jpeg_c_main_controller { long dummy; };
+struct jpeg_c_prep_controller { long dummy; };
+struct jpeg_c_coef_controller { long dummy; };
+struct jpeg_marker_writer { long dummy; };
+struct jpeg_color_converter { long dummy; };
+struct jpeg_downsampler { long dummy; };
+struct jpeg_forward_dct { long dummy; };
+struct jpeg_entropy_encoder { long dummy; };
+struct jpeg_decomp_master { long dummy; };
+struct jpeg_d_main_controller { long dummy; };
+struct jpeg_d_coef_controller { long dummy; };
+struct jpeg_d_post_controller { long dummy; };
+struct jpeg_input_controller { long dummy; };
+struct jpeg_marker_reader { long dummy; };
+struct jpeg_entropy_decoder { long dummy; };
+struct jpeg_inverse_dct { long dummy; };
+struct jpeg_upsampler { long dummy; };
+struct jpeg_color_deconverter { long dummy; };
+struct jpeg_color_quantizer { long dummy; };
+#endif /* JPEG_INTERNALS */
+#endif /* INCOMPLETE_TYPES_BROKEN */
+
+
+/*
+ * The JPEG library modules define JPEG_INTERNALS before including this file.
+ * The internal structure declarations are read only when that is true.
+ * Applications using the library should not include jpegint.h, but may wish
+ * to include jerror.h.
+ */
+
+#ifdef JPEG_INTERNALS
+#include "jpegint.h"            /* fetch private declarations */
+#include "jerror.h"             /* fetch error codes too */
+#endif
+
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+}
+#endif
+#endif
+
+#endif /* JPEGLIB_H */
diff --git a/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/turbojpeg.h b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/turbojpeg.h
new file mode 100644
index 0000000..12efcbc
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/arm/include/turbojpeg/turbojpeg.h
@@ -0,0 +1,2286 @@
+/*
+ * Copyright (C)2009-2015, 2017, 2020-2023 D. R. Commander.
+ *                                         All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __TURBOJPEG_H__
+#define __TURBOJPEG_H__
+
+#include <stddef.h>
+
+#if defined(_WIN32) && defined(DLLDEFINE)
+#define DLLEXPORT  __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+#define DLLCALL
+
+
+/**
+ * @addtogroup TurboJPEG
+ * TurboJPEG API.  This API provides an interface for generating, decoding, and
+ * transforming planar YUV and JPEG images in memory.
+ *
+ * @anchor YUVnotes
+ * YUV Image Format Notes
+ * ----------------------
+ * Technically, the JPEG format uses the YCbCr colorspace (which is, strictly
+ * speaking, not a colorspace but a color transform), but per the convention
+ * of the digital video community, the TurboJPEG API uses "YUV" to refer to an
+ * image format consisting of Y, Cb, and Cr image planes.
+ *
+ * Each plane is simply a 2D array of bytes, each byte representing the value
+ * of one of the components (Y, Cb, or Cr) at a particular location in the
+ * image.  The width and height of each plane are determined by the image
+ * width, height, and level of chrominance subsampling.  The luminance plane
+ * width is the image width padded to the nearest multiple of the horizontal
+ * subsampling factor (1 in the case of 4:4:4, grayscale, 4:4:0, or 4:4:1; 2 in
+ * the case of 4:2:2 or 4:2:0; 4 in the case of 4:1:1.)  Similarly, the
+ * luminance plane height is the image height padded to the nearest multiple of
+ * the vertical subsampling factor (1 in the case of 4:4:4, 4:2:2, grayscale,
+ * or 4:1:1; 2 in the case of 4:2:0 or 4:4:0; 4 in the case of 4:4:1.)  This is
+ * irrespective of any additional padding that may be specified as an argument
+ * to the various YUV functions.  The chrominance plane width is equal to the
+ * luminance plane width divided by the horizontal subsampling factor, and the
+ * chrominance plane height is equal to the luminance plane height divided by
+ * the vertical subsampling factor.
+ *
+ * For example, if the source image is 35 x 35 pixels and 4:2:2 subsampling is
+ * used, then the luminance plane would be 36 x 35 bytes, and each of the
+ * chrominance planes would be 18 x 35 bytes.  If you specify a row alignment
+ * of 4 bytes on top of this, then the luminance plane would be 36 x 35 bytes,
+ * and each of the chrominance planes would be 20 x 35 bytes.
+ *
+ * @{
+ */
+
+
+/**
+ * The number of initialization options
+ */
+#define TJ_NUMINIT  3
+
+/**
+ * Initialization options.
+ */
+enum TJINIT {
+  /**
+   * Initialize the TurboJPEG instance for compression.
+   */
+  TJINIT_COMPRESS,
+  /**
+   * Initialize the TurboJPEG instance for decompression.
+   */
+  TJINIT_DECOMPRESS,
+  /**
+   * Initialize the TurboJPEG instance for lossless transformation (both
+   * compression and decompression.)
+   */
+  TJINIT_TRANSFORM
+};
+
+
+/**
+ * The number of chrominance subsampling options
+ */
+#define TJ_NUMSAMP  7
+
+/**
+ * Chrominance subsampling options.
+ * When pixels are converted from RGB to YCbCr (see #TJCS_YCbCr) or from CMYK
+ * to YCCK (see #TJCS_YCCK) as part of the JPEG compression process, some of
+ * the Cb and Cr (chrominance) components can be discarded or averaged together
+ * to produce a smaller image with little perceptible loss of image clarity.
+ * (The human eye is more sensitive to small changes in brightness than to
+ * small changes in color.)  This is called "chrominance subsampling".
+ */
+enum TJSAMP {
+  /**
+   * 4:4:4 chrominance subsampling (no chrominance subsampling).  The JPEG or
+   * YUV image will contain one chrominance component for every pixel in the
+   * source image.
+   */
+  TJSAMP_444,
+  /**
+   * 4:2:2 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x1 block of pixels in the source image.
+   */
+  TJSAMP_422,
+  /**
+   * 4:2:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 2x2 block of pixels in the source image.
+   */
+  TJSAMP_420,
+  /**
+   * Grayscale.  The JPEG or YUV image will contain no chrominance components.
+   */
+  TJSAMP_GRAY,
+  /**
+   * 4:4:0 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 1x2 block of pixels in the source image.
+   *
+   * @note 4:4:0 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_440,
+  /**
+   * 4:1:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 4x1 block of pixels in the source image.
+   * JPEG images compressed with 4:1:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:1:1 is better able to reproduce sharp
+   * horizontal features.
+   *
+   * @note 4:1:1 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_411,
+  /**
+   * 4:4:1 chrominance subsampling.  The JPEG or YUV image will contain one
+   * chrominance component for every 1x4 block of pixels in the source image.
+   * JPEG images compressed with 4:4:1 subsampling will be almost exactly the
+   * same size as those compressed with 4:2:0 subsampling, and in the
+   * aggregate, both subsampling methods produce approximately the same
+   * perceptual quality.  However, 4:4:1 is better able to reproduce sharp
+   * vertical features.
+   *
+   * @note 4:4:1 subsampling is not fully accelerated in libjpeg-turbo.
+   */
+  TJSAMP_441,
+  /**
+   * Unknown subsampling.  The JPEG image uses an unusual type of chrominance
+   * subsampling.  Such images can be decompressed into packed-pixel images,
+   * but they cannot be
+   * - decompressed into planar YUV images,
+   * - losslessly transformed if #TJXOPT_CROP is specified, or
+   * - partially decompressed using a cropping region.
+   */
+  TJSAMP_UNKNOWN = -1
+};
+
+/**
+ * MCU block width (in pixels) for a given level of chrominance subsampling.
+ * Indexed by #TJSAMP (not valid for #TJSAMP_UNKNOWN).  MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
+ * - 8x32 for 4:4:1
+ */
+static const int tjMCUWidth[TJ_NUMSAMP]  = { 8, 16, 16, 8, 8, 32, 8 };
+
+/**
+ * MCU block height (in pixels) for a given level of chrominance subsampling.
+ * Indexed by #TJSAMP (not valid for #TJSAMP_UNKNOWN).  MCU block sizes:
+ * - 8x8 for no subsampling or grayscale
+ * - 16x8 for 4:2:2
+ * - 8x16 for 4:4:0
+ * - 16x16 for 4:2:0
+ * - 32x8 for 4:1:1
+ * - 8x32 for 4:4:1
+ */
+static const int tjMCUHeight[TJ_NUMSAMP] = { 8, 8, 16, 8, 16, 8, 32 };
+
+
+/**
+ * The number of pixel formats
+ */
+#define TJ_NUMPF  12
+
+/**
+ * Pixel formats
+ */
+enum TJPF {
+  /**
+   * RGB pixel format.  The red, green, and blue components in the image are
+   * stored in 3-sample pixels in the order R, G, B from lowest to highest
+   * memory address within each pixel.
+   */
+  TJPF_RGB,
+  /**
+   * BGR pixel format.  The red, green, and blue components in the image are
+   * stored in 3-sample pixels in the order B, G, R from lowest to highest
+   * memory address within each pixel.
+   */
+  TJPF_BGR,
+  /**
+   * RGBX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order R, G, B from lowest to highest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_RGBX,
+  /**
+   * BGRX pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order B, G, R from lowest to highest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_BGRX,
+  /**
+   * XBGR pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order R, G, B from highest to lowest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_XBGR,
+  /**
+   * XRGB pixel format.  The red, green, and blue components in the image are
+   * stored in 4-sample pixels in the order B, G, R from highest to lowest
+   * memory address within each pixel.  The X component is ignored when
+   * compressing and undefined when decompressing.
+   */
+  TJPF_XRGB,
+  /**
+   * Grayscale pixel format.  Each 1-sample pixel represents a luminance
+   * (brightness) level from 0 to the maximum sample value (255 for 8-bit
+   * samples, 4095 for 12-bit samples, and 65535 for 16-bit samples.)
+   */
+  TJPF_GRAY,
+  /**
+   * RGBA pixel format.  This is the same as @ref TJPF_RGBX, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_RGBA,
+  /**
+   * BGRA pixel format.  This is the same as @ref TJPF_BGRX, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_BGRA,
+  /**
+   * ABGR pixel format.  This is the same as @ref TJPF_XBGR, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_ABGR,
+  /**
+   * ARGB pixel format.  This is the same as @ref TJPF_XRGB, except that when
+   * decompressing, the X component is guaranteed to be equal to the maximum
+   * sample value, which can be interpreted as an opaque alpha channel.
+   */
+  TJPF_ARGB,
+  /**
+   * CMYK pixel format.  Unlike RGB, which is an additive color model used
+   * primarily for display, CMYK (Cyan/Magenta/Yellow/Key) is a subtractive
+   * color model used primarily for printing.  In the CMYK color model, the
+   * value of each color component typically corresponds to an amount of cyan,
+   * magenta, yellow, or black ink that is applied to a white background.  In
+   * order to convert between CMYK and RGB, it is necessary to use a color
+   * management system (CMS.)  A CMS will attempt to map colors within the
+   * printer's gamut to perceptually similar colors in the display's gamut and
+   * vice versa, but the mapping is typically not 1:1 or reversible, nor can it
+   * be defined with a simple formula.  Thus, such a conversion is out of scope
+   * for a codec library.  However, the TurboJPEG API allows for compressing
+   * packed-pixel CMYK images into YCCK JPEG images (see #TJCS_YCCK) and
+   * decompressing YCCK JPEG images into packed-pixel CMYK images.
+   */
+  TJPF_CMYK,
+  /**
+   * Unknown pixel format.  Currently this is only used by #tj3LoadImage8(),
+   * #tj3LoadImage12(), and #tj3LoadImage16().
+   */
+  TJPF_UNKNOWN = -1
+};
+
+/**
+ * Red offset (in samples) for a given pixel format.  This specifies the number
+ * of samples that the red component is offset from the start of the pixel.
+ * For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is stored
+ * in `unsigned char pixel[]`, then the red component will be
+ * `pixel[tjRedOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a red component.
+ */
+static const int tjRedOffset[TJ_NUMPF] = {
+  0, 2, 0, 2, 3, 1, -1, 0, 2, 3, 1, -1
+};
+/**
+ * Green offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the green component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is
+ * stored in `unsigned char pixel[]`, then the green component will be
+ * `pixel[tjGreenOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a green component.
+ */
+static const int tjGreenOffset[TJ_NUMPF] = {
+  1, 1, 1, 1, 2, 2, -1, 1, 1, 2, 2, -1
+};
+/**
+ * Blue offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the blue component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRX is
+ * stored in `unsigned char pixel[]`, then the blue component will be
+ * `pixel[tjBlueOffset[TJPF_BGRX]]`.  This will be -1 if the pixel format does
+ * not have a blue component.
+ */
+static const int tjBlueOffset[TJ_NUMPF] = {
+  2, 0, 2, 0, 1, 3, -1, 2, 0, 1, 3, -1
+};
+/**
+ * Alpha offset (in samples) for a given pixel format.  This specifies the
+ * number of samples that the alpha component is offset from the start of the
+ * pixel.  For instance, if an 8-bit-per-component pixel of format TJPF_BGRA is
+ * stored in `unsigned char pixel[]`, then the alpha component will be
+ * `pixel[tjAlphaOffset[TJPF_BGRA]]`.  This will be -1 if the pixel format does
+ * not have an alpha component.
+ */
+static const int tjAlphaOffset[TJ_NUMPF] = {
+  -1, -1, -1, -1, -1, -1, -1, 3, 3, 0, 0, -1
+};
+/**
+ * Pixel size (in samples) for a given pixel format, indexed by #TJPF value
+ */
+static const int tjPixelSize[TJ_NUMPF] = {
+  3, 3, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4
+};
+
+
+/**
+ * The number of JPEG colorspaces
+ */
+#define TJ_NUMCS  5
+
+/**
+ * JPEG colorspaces
+ */
+enum TJCS {
+  /**
+   * RGB colorspace.  When compressing the JPEG image, the R, G, and B
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  RGB JPEG images can be
+   * compressed from and decompressed to packed-pixel images with any of the
+   * extended RGB or grayscale pixel formats, but they cannot be compressed
+   * from or decompressed to planar YUV images.
+   */
+  TJCS_RGB,
+  /**
+   * YCbCr colorspace.  YCbCr is not an absolute colorspace but rather a
+   * mathematical transformation of RGB designed solely for storage and
+   * transmission.  YCbCr images must be converted to RGB before they can
+   * actually be displayed.  In the YCbCr colorspace, the Y (luminance)
+   * component represents the black & white portion of the original image, and
+   * the Cb and Cr (chrominance) components represent the color portion of the
+   * original image.  Originally, the analog equivalent of this transformation
+   * allowed the same signal to drive both black & white and color televisions,
+   * but JPEG images use YCbCr primarily because it allows the color data to be
+   * optionally subsampled for the purposes of reducing network or disk usage.
+   * YCbCr is the most common JPEG colorspace, and YCbCr JPEG images can be
+   * compressed from and decompressed to packed-pixel images with any of the
+   * extended RGB or grayscale pixel formats.  YCbCr JPEG images can also be
+   * compressed from and decompressed to planar YUV images.
+   */
+  TJCS_YCbCr,
+  /**
+   * Grayscale colorspace.  The JPEG image retains only the luminance data (Y
+   * component), and any color data from the source image is discarded.
+   * Grayscale JPEG images can be compressed from and decompressed to
+   * packed-pixel images with any of the extended RGB or grayscale pixel
+   * formats, or they can be compressed from and decompressed to planar YUV
+   * images.
+   */
+  TJCS_GRAY,
+  /**
+   * CMYK colorspace.  When compressing the JPEG image, the C, M, Y, and K
+   * components in the source image are reordered into image planes, but no
+   * colorspace conversion or subsampling is performed.  CMYK JPEG images can
+   * only be compressed from and decompressed to packed-pixel images with the
+   * CMYK pixel format.
+   */
+  TJCS_CMYK,
+  /**
+   * YCCK colorspace.  YCCK (AKA "YCbCrK") is not an absolute colorspace but
+   * rather a mathematical transformation of CMYK designed solely for storage
+   * and transmission.  It is to CMYK as YCbCr is to RGB.  CMYK pixels can be
+   * reversibly transformed into YCCK, and as with YCbCr, the chrominance
+   * components in the YCCK pixels can be subsampled without incurring major
+   * perceptual loss.  YCCK JPEG images can only be compressed from and
+   * decompressed to packed-pixel images with the CMYK pixel format.
+   */
+  TJCS_YCCK
+};
+
+
+/**
+ * The number of parameters (NOTE(review): defined with an empty expansion)
+ */
+#define TJ_NUMPARAM
+
+/**
+ * Parameters
+ */
+enum TJPARAM {
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  TJPARAM_MAXPIXELS = -1,
+#endif
+  /**
+   * Error handling behavior
+   *
+   * **Value**
+   * - `0` *[default]* Allow the current compression/decompression/transform
+   * operation to complete unless a fatal error is encountered.
+   * - `1` Immediately discontinue the current
+   * compression/decompression/transform operation if a warning (non-fatal
+   * error) occurs.
+   */
+  TJPARAM_STOPONWARNING,
+  /**
+   * Row order in packed-pixel source/destination images
+   *
+   * **Value**
+   * - `0` *[default]* top-down (X11) order
+   * - `1` bottom-up (Windows, OpenGL) order
+   */
+  TJPARAM_BOTTOMUP,
+  /**
+   * JPEG destination buffer (re)allocation [compression, lossless
+   * transformation]
+   *
+   * **Value**
+   * - `0` *[default]* Attempt to allocate or reallocate the JPEG destination
+   * buffer as needed.
+   * - `1` Generate an error if the JPEG destination buffer is invalid or too
+   * small.
+   */
+  TJPARAM_NOREALLOC,
+  /**
+   * Perceptual quality of lossy JPEG images [compression only]
+   *
+   * **Value**
+   * - `1`-`100` (`1` = worst quality but best compression, `100` = best
+   * quality but worst compression) *[no default; must be explicitly
+   * specified]*
+   */
+  TJPARAM_QUALITY,
+  /**
+   * Chrominance subsampling level
+   *
+   * The JPEG or YUV image uses (decompression, decoding) or will use (lossy
+   * compression, encoding) the specified level of chrominance subsampling.
+   *
+   * **Value**
+   * - One of the @ref TJSAMP "chrominance subsampling options" *[no default;
+   * must be explicitly specified for lossy compression, encoding, and
+   * decoding]*
+   */
+  TJPARAM_SUBSAMP,
+  /**
+   * JPEG width (in pixels) [decompression only, read-only]
+   */
+  TJPARAM_JPEGWIDTH,
+  /**
+   * JPEG height (in pixels) [decompression only, read-only]
+   */
+  TJPARAM_JPEGHEIGHT,
+  /**
+   * JPEG data precision (bits per sample) [decompression only, read-only]
+   *
+   * The JPEG image uses the specified number of bits per sample.
+   *
+   * **Value**
+   * - `8`, `12`, or `16`
+   *
+   * 12-bit data precision implies #TJPARAM_OPTIMIZE unless #TJPARAM_ARITHMETIC
+   * is set.
+   */
+  TJPARAM_PRECISION,
+  /**
+   * JPEG colorspace
+   *
+   * The JPEG image uses (decompression) or will use (lossy compression) the
+   * specified colorspace.
+   *
+   * **Value**
+   * - One of the @ref TJCS "JPEG colorspaces" *[default for lossy compression:
+   * automatically selected based on the subsampling level and pixel format]*
+   */
+  TJPARAM_COLORSPACE,
+  /**
+   * Chrominance upsampling algorithm [lossy decompression only]
+   *
+   * **Value**
+   * - `0` *[default]* Use smooth upsampling when decompressing a JPEG image
+   * that was compressed using chrominance subsampling.  This creates a smooth
+   * transition between neighboring chrominance components in order to reduce
+   * upsampling artifacts in the decompressed image.
+   * - `1` Use the fastest chrominance upsampling algorithm available, which
+   * may combine upsampling with color conversion.
+   */
+  TJPARAM_FASTUPSAMPLE,
+  /**
+   * DCT/IDCT algorithm [lossy compression and decompression]
+   *
+   * **Value**
+   * - `0` *[default]* Use the most accurate DCT/IDCT algorithm available.
+   * - `1` Use the fastest DCT/IDCT algorithm available.
+   *
+   * This parameter is provided mainly for backward compatibility with libjpeg,
+   * which historically implemented several different DCT/IDCT algorithms
+   * because of performance limitations with 1990s CPUs.  In the libjpeg-turbo
+   * implementation of the TurboJPEG API:
+   * - The "fast" and "accurate" DCT/IDCT algorithms perform similarly on
+   * modern x86/x86-64 CPUs that support AVX2 instructions.
+   * - The "fast" algorithm is generally only about 5-15% faster than the
+   * "accurate" algorithm on other types of CPUs.
+   * - The difference in accuracy between the "fast" and "accurate" algorithms
+   * is the most pronounced at JPEG quality levels above 90 and tends to be
+   * more pronounced with decompression than with compression.
+   * - The "fast" algorithm degrades and is not fully accelerated for JPEG
+   * quality levels above 97, so it will be slower than the "accurate"
+   * algorithm.
+   */
+  TJPARAM_FASTDCT,
+  /**
+   * Optimized baseline entropy coding [lossy compression only]
+   *
+   * **Value**
+   * - `0` *[default]* The JPEG image will use the default Huffman tables.
+   * - `1` Optimal Huffman tables will be computed for the JPEG image.  For
+   * lossless transformation, this can also be specified using
+   * #TJXOPT_OPTIMIZE.
+   *
+   * Optimized baseline entropy coding will improve compression slightly
+   * (generally 5% or less), but it will reduce compression performance
+   * considerably.
+   */
+  TJPARAM_OPTIMIZE,
+  /**
+   * Progressive entropy coding
+   *
+   * **Value**
+   * - `0` *[default for compression, lossless transformation]* The lossy JPEG
+   * image uses (decompression) or will use (compression, lossless
+   * transformation) baseline entropy coding.
+   * - `1` The lossy JPEG image uses (decompression) or will use (compression,
+   * lossless transformation) progressive entropy coding.  For lossless
+   * transformation, this can also be specified using #TJXOPT_PROGRESSIVE.
+   *
+   * Progressive entropy coding will generally improve compression relative to
+   * baseline entropy coding, but it will reduce compression and decompression
+   * performance considerably.  Can be combined with #TJPARAM_ARITHMETIC.
+   * Implies #TJPARAM_OPTIMIZE unless #TJPARAM_ARITHMETIC is also set.
+   */
+  TJPARAM_PROGRESSIVE,
+  /**
+   * Progressive JPEG scan limit for lossy JPEG images [decompression, lossless
+   * transformation]
+   *
+   * Setting this parameter will cause the decompression and transform
+   * functions to return an error if the number of scans in a progressive JPEG
+   * image exceeds the specified limit.  The primary purpose of this is to
+   * allow security-critical applications to guard against an exploit of the
+   * progressive JPEG format described in
+   * <a href="https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf" target="_blank">this report</a>.
+   *
+   * **Value**
+   * - maximum number of progressive JPEG scans that the decompression and
+   * transform functions will process *[default: `0` (no limit)]*
+   *
+   * @see #TJPARAM_PROGRESSIVE
+   */
+  TJPARAM_SCANLIMIT,
+  /**
+   * Arithmetic entropy coding
+   *
+   * **Value**
+   * - `0` *[default for compression, lossless transformation]* The lossy JPEG
+   * image uses (decompression) or will use (compression, lossless
+   * transformation) Huffman entropy coding.
+   * - `1` The lossy JPEG image uses (decompression) or will use (compression,
+   * lossless transformation) arithmetic entropy coding.  For lossless
+   * transformation, this can also be specified using #TJXOPT_ARITHMETIC.
+   *
+   * Arithmetic entropy coding will generally improve compression relative to
+   * Huffman entropy coding, but it will reduce compression and decompression
+   * performance considerably.  Can be combined with #TJPARAM_PROGRESSIVE.
+   */
+  TJPARAM_ARITHMETIC,
+  /**
+   * Lossless JPEG
+   *
+   * **Value**
+   * - `0` *[default for compression]* The JPEG image is (decompression) or
+   * will be (compression) lossy/DCT-based.
+   * - `1` The JPEG image is (decompression) or will be (compression)
+   * lossless/predictive.
+   *
+   * In most cases, compressing and decompressing lossless JPEG images is
+   * considerably slower than compressing and decompressing lossy JPEG images.
+   * Also note that the following features are not available with lossless JPEG
+   * images:
+   * - Colorspace conversion (lossless JPEG images always use #TJCS_RGB,
+   * #TJCS_GRAY, or #TJCS_CMYK, depending on the pixel format of the source
+   * image)
+   * - Chrominance subsampling (lossless JPEG images always use #TJSAMP_444)
+   * - JPEG quality selection
+   * - DCT/IDCT algorithm selection
+   * - Progressive entropy coding
+   * - Arithmetic entropy coding
+   * - Compression from/decompression to planar YUV images
+   * - Decompression scaling
+   * - Lossless transformation
+   *
+   * @see #TJPARAM_LOSSLESSPSV, #TJPARAM_LOSSLESSPT
+   */
+  TJPARAM_LOSSLESS,
+  /**
+   * Lossless JPEG predictor selection value (PSV)
+   *
+   * **Value**
+   * - `1`-`7` *[default for compression: `1`]*
+   *
+   * @see #TJPARAM_LOSSLESS
+   */
+  TJPARAM_LOSSLESSPSV,
+  /**
+   * Lossless JPEG point transform (Pt)
+   *
+   * **Value**
+   * - `0` through ***precision*** *- 1*, where ***precision*** is the JPEG
+   * data precision in bits *[default for compression: `0`]*
+   *
+   * A point transform value of `0` is necessary in order to generate a fully
+   * lossless JPEG image.  (A non-zero point transform value right-shifts the
+   * input samples by the specified number of bits, which is effectively a form
+   * of lossy color quantization.)
+   *
+   * @see #TJPARAM_LOSSLESS, #TJPARAM_PRECISION
+   */
+  TJPARAM_LOSSLESSPT,
+  /**
+   * JPEG restart marker interval in MCU blocks (lossy) or samples (lossless)
+   * [compression only]
+   *
+   * The nature of entropy coding is such that a corrupt JPEG image cannot
+   * be decompressed beyond the point of corruption unless it contains restart
+   * markers.  A restart marker stops and restarts the entropy coding algorithm
+   * so that, if a JPEG image is corrupted, decompression can resume at the
+   * next marker.  Thus, adding more restart markers improves the fault
+   * tolerance of the JPEG image, but adding too many restart markers can
+   * adversely affect the compression ratio and performance.
+   *
+   * **Value**
+   * - the number of MCU blocks or samples between each restart marker
+   * *[default: `0` (no restart markers)]*
+   *
+   * Setting this parameter to a non-zero value sets #TJPARAM_RESTARTROWS to 0.
+   */
+  TJPARAM_RESTARTBLOCKS,
+  /**
+   * JPEG restart marker interval in MCU rows (lossy) or sample rows (lossless)
+   * [compression only]
+   *
+   * See #TJPARAM_RESTARTBLOCKS for a description of restart markers.
+   *
+   * **Value**
+   * - the number of MCU rows or sample rows between each restart marker
+   * *[default: `0` (no restart markers)]*
+   *
+   * Setting this parameter to a non-zero value sets #TJPARAM_RESTARTBLOCKS to
+   * 0.
+   */
+  TJPARAM_RESTARTROWS,
+  /**
+   * JPEG horizontal pixel density
+   *
+   * **Value**
+   * - The JPEG image has (decompression) or will have (compression) the
+   * specified horizontal pixel density *[default for compression: `1`]*.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value of #TJPARAM_DENSITYUNITS
+   * is `2`.
+   *
+   * @see TJPARAM_DENSITYUNITS
+   */
+  TJPARAM_XDENSITY,
+  /**
+   * JPEG vertical pixel density
+   *
+   * **Value**
+   * - The JPEG image has (decompression) or will have (compression) the
+   * specified vertical pixel density *[default for compression: `1`]*.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value of #TJPARAM_DENSITYUNITS
+   * is `2`.
+   *
+   * @see TJPARAM_DENSITYUNITS
+   */
+  TJPARAM_YDENSITY,
+  /**
+   * JPEG pixel density units
+   *
+   * **Value**
+   * - `0` *[default for compression]* The pixel density of the JPEG image is
+   * expressed (decompression) or will be expressed (compression) in unknown
+   * units.
+   * - `1` The pixel density of the JPEG image is expressed (decompression) or
+   * will be expressed (compression) in units of pixels/inch.
+   * - `2` The pixel density of the JPEG image is expressed (decompression) or
+   * will be expressed (compression) in units of pixels/cm.
+   *
+   * This value is stored in or read from the JPEG header.  It does not affect
+   * the contents of the JPEG image.  Note that this parameter is set by
+   * #tj3LoadImage8() when loading a Windows BMP file that contains pixel
+   * density information, and the value of this parameter is stored to a
+   * Windows BMP file by #tj3SaveImage8() if the value is `2`.
+   *
+   * @see TJPARAM_XDENSITY, TJPARAM_YDENSITY
+   */
+  TJPARAM_DENSITYUNITS
+};
+
+
+/**
+ * The number of error codes
+ */
+#define TJ_NUMERR  2
+
+/**
+ * Error codes
+ */
+enum TJERR {
+  /**
+   * The error was non-fatal and recoverable, but the destination image may
+   * still be corrupt.
+   */
+  TJERR_WARNING,
+  /**
+   * The error was fatal and non-recoverable.
+   */
+  TJERR_FATAL
+};
+
+
+/**
+ * The number of transform operations
+ */
+#define TJ_NUMXOP  8
+
+/**
+ * Transform operations for #tj3Transform()
+ */
+enum TJXOP {
+  /**
+   * Do not transform the position of the image pixels
+   */
+  TJXOP_NONE,
+  /**
+   * Flip (mirror) image horizontally.  This transform is imperfect if there
+   * are any partial MCU blocks on the right edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_HFLIP,
+  /**
+   * Flip (mirror) image vertically.  This transform is imperfect if there are
+   * any partial MCU blocks on the bottom edge (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_VFLIP,
+  /**
+   * Transpose image (flip/mirror along upper left to lower right axis.)  This
+   * transform is always perfect.
+   */
+  TJXOP_TRANSPOSE,
+  /**
+   * Transverse transpose image (flip/mirror along upper right to lower left
+   * axis.)  This transform is imperfect if there are any partial MCU blocks in
+   * the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_TRANSVERSE,
+  /**
+   * Rotate image clockwise by 90 degrees.  This transform is imperfect if
+   * there are any partial MCU blocks on the bottom edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT90,
+  /**
+   * Rotate image 180 degrees.  This transform is imperfect if there are any
+   * partial MCU blocks in the image (see #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT180,
+  /**
+   * Rotate image counter-clockwise by 90 degrees.  This transform is imperfect
+   * if there are any partial MCU blocks on the right edge (see
+   * #TJXOPT_PERFECT.)
+   */
+  TJXOP_ROT270
+};
+
+
+/**
+ * This option will cause #tj3Transform() to return an error if the transform
+ * is not perfect.  Lossless transforms operate on MCU blocks, whose size
+ * depends on the level of chrominance subsampling used (see #tjMCUWidth and
+ * #tjMCUHeight.)  If the image's width or height is not evenly divisible by
+ * the MCU block size, then there will be partial MCU blocks on the right
+ * and/or bottom edges.  It is not possible to move these partial MCU blocks to
+ * the top or left of the image, so any transform that would require that is
+ * "imperfect."  If this option is not specified, then any partial MCU blocks
+ * that cannot be transformed will be left in place, which will create
+ * odd-looking strips on the right or bottom edge of the image.
+ */
+#define TJXOPT_PERFECT  (1 << 0)
+/**
+ * This option will cause #tj3Transform() to discard any partial MCU blocks
+ * that cannot be transformed.
+ */
+#define TJXOPT_TRIM  (1 << 1)
+/**
+ * This option will enable lossless cropping.  See #tj3Transform() for more
+ * information.
+ */
+#define TJXOPT_CROP  (1 << 2)
+/**
+ * This option will discard the color data in the source image and produce a
+ * grayscale destination image.
+ */
+#define TJXOPT_GRAY  (1 << 3)
+/**
+ * This option will prevent #tj3Transform() from outputting a JPEG image for
+ * this particular transform.  (This can be used in conjunction with a custom
+ * filter to capture the transformed DCT coefficients without transcoding
+ * them.)
+ */
+#define TJXOPT_NOOUTPUT  (1 << 4)
+/**
+ * This option will enable progressive entropy coding in the JPEG image
+ * generated by this particular transform.  Progressive entropy coding will
+ * generally improve compression relative to baseline entropy coding (the
+ * default), but it will reduce decompression performance considerably.
+ * Can be combined with #TJXOPT_ARITHMETIC.  Implies #TJXOPT_OPTIMIZE unless
+ * #TJXOPT_ARITHMETIC is also specified.
+ */
+#define TJXOPT_PROGRESSIVE  (1 << 5)
+/**
+ * This option will prevent #tj3Transform() from copying any extra markers
+ * (including EXIF and ICC profile data) from the source image to the
+ * destination image.
+ */
+#define TJXOPT_COPYNONE  (1 << 6)
+/**
+ * This option will enable arithmetic entropy coding in the JPEG image
+ * generated by this particular transform.  Arithmetic entropy coding will
+ * generally improve compression relative to Huffman entropy coding (the
+ * default), but it will reduce decompression performance considerably.  Can be
+ * combined with #TJXOPT_PROGRESSIVE.
+ */
+#define TJXOPT_ARITHMETIC  (1 << 7)
+/**
+ * This option will enable optimized baseline entropy coding in the JPEG image
+ * generated by this particular transform.  Optimized baseline entropy coding
+ * will improve compression slightly (generally 5% or less.)
+ */
+#define TJXOPT_OPTIMIZE  (1 << 8)
+
+
+/**
+ * Scaling factor
+ */
+typedef struct {
+  /**
+   * Numerator
+   */
+  int num;
+  /**
+   * Denominator
+   */
+  int denom;
+} tjscalingfactor;
+
+/**
+ * Cropping region
+ */
+typedef struct {
+  /**
+   * The left boundary of the cropping region.  This must be evenly divisible
+   * by the MCU block width (see #tjMCUWidth.)
+   */
+  int x;
+  /**
+   * The upper boundary of the cropping region.  For lossless transformation,
+   * this must be evenly divisible by the MCU block height (see #tjMCUHeight.)
+   */
+  int y;
+  /**
+   * The width of the cropping region.  Setting this to 0 is the equivalent of
+   * setting it to the width of the source JPEG image - x.
+   */
+  int w;
+  /**
+   * The height of the cropping region.  Setting this to 0 is the equivalent of
+   * setting it to the height of the source JPEG image - y.
+   */
+  int h;
+} tjregion;
+
+/**
+ * A #tjregion structure that specifies no cropping
+ */
+static const tjregion TJUNCROPPED = { 0, 0, 0, 0 };
+
+/**
+ * Lossless transform
+ */
+typedef struct tjtransform {
+  /**
+   * Cropping region
+   */
+  tjregion r;
+  /**
+   * One of the @ref TJXOP "transform operations"
+   */
+  int op;
+  /**
+   * The bitwise OR of one or more of the @ref TJXOPT_ARITHMETIC
+   * "transform options"
+   */
+  int options;
+  /**
+   * Arbitrary data that can be accessed within the body of the callback
+   * function
+   */
+  void *data;
+  /**
+   * A callback function that can be used to modify the DCT coefficients after
+   * they are losslessly transformed but before they are transcoded to a new
+   * JPEG image.  This allows for custom filters or other transformations to be
+   * applied in the frequency domain.
+   *
+   * @param coeffs pointer to an array of transformed DCT coefficients.  (NOTE:
+   * this pointer is not guaranteed to be valid once the callback returns, so
+   * applications wishing to hand off the DCT coefficients to another function
+   * or library should make a copy of them within the body of the callback.)
+   *
+   * @param arrayRegion #tjregion structure containing the width and height of
+   * the array pointed to by `coeffs` as well as its offset relative to the
+   * component plane.  TurboJPEG implementations may choose to split each
+   * component plane into multiple DCT coefficient arrays and call the callback
+   * function once for each array.
+   *
+   * @param planeRegion #tjregion structure containing the width and height of
+   * the component plane to which `coeffs` belongs
+   *
+   * @param componentID ID number of the component plane to which `coeffs`
+   * belongs.  (Y, Cb, and Cr have, respectively, ID's of 0, 1, and 2 in
+   * typical JPEG images.)
+   *
+   * @param transformID ID number of the transformed image to which `coeffs`
+   * belongs.  This is the same as the index of the transform in the
+   * `transforms` array that was passed to #tj3Transform().
+   *
+   * @param transform a pointer to a #tjtransform structure that specifies the
+   * parameters and/or cropping region for this transform
+   *
+   * @return 0 if the callback was successful, or -1 if an error occurred.
+   */
+  int (*customFilter) (short *coeffs, tjregion arrayRegion,
+                       tjregion planeRegion, int componentID, int transformID,
+                       struct tjtransform *transform);
+} tjtransform;
+
+/**
+ * TurboJPEG instance handle
+ */
+typedef void *tjhandle;
+
+
+/**
+ * Compute the scaled value of `dimension` using the given scaling factor.
+ * This macro performs the integer equivalent of `ceil(dimension *
+ * scalingFactor)`.  `scalingFactor` may be any #tjscalingfactor expression.
+ */
+#define TJSCALED(dimension, scalingFactor) \
+  (((dimension) * (scalingFactor).num + (scalingFactor).denom - 1) / \
+   (scalingFactor).denom)
+
+/**
+ * A #tjscalingfactor structure that specifies a scaling factor of 1/1 (no
+ * scaling)
+ */
+static const tjscalingfactor TJUNSCALED = { 1, 1 };
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Create a new TurboJPEG instance.
+ *
+ * @param initType one of the @ref TJINIT "initialization options"
+ *
+ * @return a handle to the newly-created instance, or NULL if an error occurred
+ * (see #tj3GetErrorStr().)
+ */
+DLLEXPORT tjhandle tj3Init(int initType);
+
+
+/**
+ * Set the value of a parameter.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param param one of the @ref TJPARAM "parameters"
+ *
+ * @param value value of the parameter (refer to @ref TJPARAM
+ * "parameter documentation")
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3Set(tjhandle handle, int param, int value);
+
+
+/**
+ * Get the value of a parameter.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param param one of the @ref TJPARAM "parameters"
+ *
+ * @return the value of the specified parameter, or -1 if the value is unknown.
+ */
+DLLEXPORT int tj3Get(tjhandle handle, int param);
+
+
+/**
+ * Compress an 8-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * an 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB, grayscale,
+ * or CMYK source image to be compressed.  This buffer should normally be
+ * `pitch * height` samples in size.  However, you can also use this parameter
+ * to compress from a specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch samples per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to compress from a specific region of a larger buffer.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Compress8(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char **jpegBuf, size_t *jpegSize);
+
+/**
+ * Compress a 12-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * a 12-bit-per-sample JPEG image.
+ *
+ * \details \copydetails tj3Compress8()
+ */
+DLLEXPORT int tj3Compress12(tjhandle handle, const short *srcBuf, int width,
+                            int pitch, int height, int pixelFormat,
+                            unsigned char **jpegBuf, size_t *jpegSize);
+
+/**
+ * Compress a 16-bit-per-sample packed-pixel RGB, grayscale, or CMYK image into
+ * a 16-bit-per-sample lossless JPEG image.
+ *
+ * \details \copydetails tj3Compress8()
+ */
+DLLEXPORT int tj3Compress16(tjhandle handle, const unsigned short *srcBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            unsigned char **jpegBuf, size_t *jpegSize);
+
+
+/**
+ * Compress an 8-bit-per-sample unified planar YUV image into an
+ * 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a unified planar YUV source
+ * image to be compressed.  The size of this buffer should match the value
+ * returned by #tj3YUVBufSize() for the given image width, height, row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes should be stored sequentially in the
+ * buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param width width (in pixels) of the source image.  If the width is not an
+ * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed.
+ *
+ * @param align row alignment (in bytes) of the source image (must be a power
+ * of 2.)  Setting this parameter to n indicates that each row in each plane of
+ * the source image is padded to the nearest multiple of n bytes
+ * (1 = unpadded.)
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * an even multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed.
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3CompressFromYUV8(tjhandle handle,
+                                  const unsigned char *srcBuf, int width,
+                                  int align, int height,
+                                  unsigned char **jpegBuf, size_t *jpegSize);
+
+
+/**
+ * Compress a set of 8-bit-per-sample Y, U (Cb), and V (Cr) image planes into
+ * an 8-bit-per-sample JPEG image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if compressing a grayscale image) that contain a YUV
+ * source image to be compressed.  These planes can be contiguous or
+ * non-contiguous in memory.  The size of each plane should match the value
+ * returned by #tj3YUVPlaneSize() for the given image width, height, strides,
+ * and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for more details.
+ *
+ * @param width width (in pixels) of the source image.  If the width is not an
+ * even multiple of the MCU block width (see #tjMCUWidth), then an intermediate
+ * buffer copy will be performed.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective plane widths.  You
+ * can adjust the strides in order to specify an arbitrary amount of row
+ * padding in each plane or to create a JPEG image from a subregion of a larger
+ * planar YUV image.
+ *
+ * @param height height (in pixels) of the source image.  If the height is not
+ * an even multiple of the MCU block height (see #tjMCUHeight), then an
+ * intermediate buffer copy will be performed.
+ *
+ * @param jpegBuf address of a pointer to a byte buffer that will receive the
+ * JPEG image.  TurboJPEG has the ability to reallocate the JPEG buffer to
+ * accommodate the size of the JPEG image.  Thus, you can choose to:
+ * -# pre-allocate the JPEG buffer with an arbitrary size using #tj3Alloc() and
+ * let TurboJPEG grow the buffer as needed,
+ * -# set `*jpegBuf` to NULL to tell TurboJPEG to allocate the buffer for you,
+ * or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize().  This should ensure that the buffer never has to be
+ * re-allocated.  (Setting #TJPARAM_NOREALLOC guarantees that it won't be.)
+ * .
+ * If you choose option 1, then `*jpegSize` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `*jpegBuf` upon return from this function, as it may
+ * have changed.
+ *
+ * @param jpegSize pointer to a size_t variable that holds the size of the JPEG
+ * buffer.  If `*jpegBuf` points to a pre-allocated buffer, then `*jpegSize`
+ * should be set to the size of the buffer.  Upon return, `*jpegSize` will
+ * contain the size of the JPEG image (in bytes.)  If `*jpegBuf` points to a
+ * JPEG buffer that is being reused from a previous call to one of the JPEG
+ * compression functions, then `*jpegSize` is ignored.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3CompressFromYUVPlanes8(tjhandle handle,
+                                        const unsigned char * const *srcPlanes,
+                                        int width, const int *strides,
+                                        int height, unsigned char **jpegBuf,
+                                        size_t *jpegSize);
+
+
+/**
+ * The maximum size of the buffer (in bytes) required to hold a JPEG image with
+ * the given parameters.  The number of bytes returned by this function is
+ * larger than the size of the uncompressed source image.  The reason for this
+ * is that the JPEG format uses 16-bit coefficients, so it is possible for a
+ * very high-quality source image with very high-frequency content to expand
+ * rather than compress when converted to the JPEG format.  Such images
+ * represent very rare corner cases, but since there is no way to predict the
+ * size of a JPEG image prior to compression, the corner cases have to be
+ * handled.
+ *
+ * @param width width (in pixels) of the image
+ *
+ * @param height height (in pixels) of the image
+ *
+ * @param jpegSubsamp the level of chrominance subsampling to be used when
+ * generating the JPEG image (see @ref TJSAMP
+ * "Chrominance subsampling options".)  #TJSAMP_UNKNOWN is treated like
+ * #TJSAMP_444, since a buffer large enough to hold a JPEG image with no
+ * subsampling should also be large enough to hold a JPEG image with an
+ * arbitrary level of subsampling.  Note that lossless JPEG images always
+ * use #TJSAMP_444.
+ *
+ * @return the maximum size of the buffer (in bytes) required to hold the
+ * image, or 0 if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3JPEGBufSize(int width, int height, int jpegSubsamp);
+
+
+/**
+ * The size of the buffer (in bytes) required to hold a unified planar YUV
+ * image with the given parameters.
+ *
+ * @param width width (in pixels) of the image
+ *
+ * @param align row alignment (in bytes) of the image (must be a power of 2.)
+ * Setting this parameter to n specifies that each row in each plane of the
+ * image will be padded to the nearest multiple of n bytes (1 = unpadded.)
+ *
+ * @param height height (in pixels) of the image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the image, or 0
+ * if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3YUVBufSize(int width, int align, int height, int subsamp);
+
+
+/**
+ * The size of the buffer (in bytes) required to hold a YUV image plane with
+ * the given parameters.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image.  NOTE: this is the width of
+ * the whole image, not the plane width.
+ *
+ * @param stride bytes per row in the image plane.  Setting this to 0 is the
+ * equivalent of setting it to the plane width.
+ *
+ * @param height height (in pixels) of the YUV image.  NOTE: this is the height
+ * of the whole image, not the plane height.
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the size of the buffer (in bytes) required to hold the YUV image
+ * plane, or 0 if the arguments are out of bounds.
+ */
+DLLEXPORT size_t tj3YUVPlaneSize(int componentID, int width, int stride,
+                                 int height, int subsamp);
+
+
+/**
+ * The plane width of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane width.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param width width (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane width of a YUV image plane with the given parameters, or 0
+ * if the arguments are out of bounds.
+ */
+DLLEXPORT int tj3YUVPlaneWidth(int componentID, int width, int subsamp);
+
+
+/**
+ * The plane height of a YUV image plane with the given parameters.  Refer to
+ * @ref YUVnotes "YUV Image Format Notes" for a description of plane height.
+ *
+ * @param componentID ID number of the image plane (0 = Y, 1 = U/Cb, 2 = V/Cr)
+ *
+ * @param height height (in pixels) of the YUV image
+ *
+ * @param subsamp level of chrominance subsampling in the image (see
+ * @ref TJSAMP "Chrominance subsampling options".)
+ *
+ * @return the plane height of a YUV image plane with the given parameters, or
+ * 0 if the arguments are out of bounds.
+ */
+DLLEXPORT int tj3YUVPlaneHeight(int componentID, int height, int subsamp);
+
+
+/**
+ * Encode an 8-bit-per-sample packed-pixel RGB or grayscale image into an
+ * 8-bit-per-sample unified planar YUV image.  This function performs color
+ * conversion (which is accelerated in the libjpeg-turbo implementation) but
+ * does not execute any of the other steps in the JPEG compression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB or grayscale
+ * source image to be encoded.  This buffer should normally be `pitch * height`
+ * bytes in size.  However, you can also use this parameter to encode from a
+ * specific region of a larger buffer.
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch bytes per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to encode from a specific region of a larger packed-pixel image.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param dstBuf pointer to a buffer that will receive the unified planar YUV
+ * image.  Use #tj3YUVBufSize() to determine the appropriate size for this
+ * buffer based on the image width, height, row alignment, and level of
+ * chrominance subsampling (see #TJPARAM_SUBSAMP.)  The Y, U (Cb), and V (Cr)
+ * image planes will be stored sequentially in the buffer.  (Refer to
+ * @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV image (must be a power of
+ * 2.)  Setting this parameter to n will cause each row in each plane of the
+ * YUV image to be padded to the nearest multiple of n bytes (1 = unpadded.)
+ * To generate images suitable for X Video, `align` should be set to 4.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3EncodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            unsigned char *dstBuf, int align);
+
+
+/**
+ * Encode an 8-bit-per-sample packed-pixel RGB or grayscale image into separate
+ * 8-bit-per-sample Y, U (Cb), and V (Cr) image planes.  This function performs
+ * color conversion (which is accelerated in the libjpeg-turbo implementation)
+ * but does not execute any of the other steps in the JPEG compression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * compression
+ *
+ * @param srcBuf pointer to a buffer containing a packed-pixel RGB or grayscale
+ * source image to be encoded.  This buffer should normally be `pitch * height`
+ * bytes in size.  However, you can also use this parameter to encode from a
+ * specific region of a larger buffer.
+ *
+ *
+ * @param width width (in pixels) of the source image
+ *
+ * @param pitch bytes per row in the source image.  Normally this should be
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>, if the image is unpadded.
+ * (Setting this parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can also use this
+ * parameter to specify the row alignment/padding of the source image, to skip
+ * rows, or to encode from a specific region of a larger packed-pixel image.
+ *
+ * @param height height (in pixels) of the source image
+ *
+ * @param pixelFormat pixel format of the source image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if generating a grayscale image) that will receive the
+ * encoded image.  These planes can be contiguous or non-contiguous in memory.
+ * Use #tj3YUVPlaneSize() to determine the appropriate size for each plane
+ * based on the image width, height, strides, and level of chrominance
+ * subsampling (see #TJPARAM_SUBSAMP.)  Refer to @ref YUVnotes
+ * "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV image.  Setting the stride for any
+ * plane to 0 is the same as setting it to the plane width (see @ref YUVnotes
+ * "YUV Image Format Notes".)  If `strides` is NULL, then the strides for all
+ * planes will be set to their respective plane widths.  You can adjust the
+ * strides in order to add an arbitrary amount of row padding to each plane or
+ * to encode an RGB or grayscale image into a subregion of a larger planar YUV
+ * image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3EncodeYUVPlanes8(tjhandle handle, const unsigned char *srcBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat, unsigned char **dstPlanes,
+                                  int *strides);
+
+
+/**
+ * Retrieve information about a JPEG image without decompressing it, or prime
+ * the decompressor with quantization and Huffman tables.  If a JPEG image is
+ * passed to this function, then the @ref TJPARAM "parameters" that describe
+ * the JPEG image will be set when the function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing a JPEG image or an
+ * "abbreviated table specification" (AKA "tables-only") datastream.  Passing a
+ * tables-only datastream to this function primes the decompressor with
+ * quantization and Huffman tables that can be used when decompressing
+ * subsequent "abbreviated image" datastreams.  This is useful, for instance,
+ * when decompressing video streams in which all frames share the same
+ * quantization and Huffman tables.
+ *
+ * @param jpegSize size of the JPEG image or tables-only datastream (in bytes)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressHeader(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize);
+
+
+/**
+ * Returns a list of fractional scaling factors that the JPEG decompressor
+ * supports.
+ *
+ * @param numScalingFactors pointer to an integer variable that will receive
+ * the number of elements in the list
+ *
+ * @return a pointer to a list of fractional scaling factors, or NULL if an
+ * error is encountered (see #tj3GetErrorStr().)
+ */
+DLLEXPORT tjscalingfactor *tj3GetScalingFactors(int *numScalingFactors);
+
+
+/**
+ * Set the scaling factor for subsequent lossy decompression operations.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param scalingFactor #tjscalingfactor structure that specifies a fractional
+ * scaling factor that the decompressor supports (see #tj3GetScalingFactors()),
+ * or <tt>#TJUNSCALED</tt> for no scaling.  Decompression scaling is a function
+ * of the IDCT algorithm, so scaling factors are generally limited to multiples
+ * of 1/8.  If the entire JPEG image will be decompressed, then the width and
+ * height of the scaled destination image can be determined by calling
+ * #TJSCALED() with the JPEG width and height (see #TJPARAM_JPEGWIDTH and
+ * #TJPARAM_JPEGHEIGHT) and the specified scaling factor.  When decompressing
+ * into a planar YUV image, an intermediate buffer copy will be performed if
+ * the width or height of the scaled destination image is not an even multiple
+ * of the MCU block size (see #tjMCUWidth and #tjMCUHeight.)  Note that
+ * decompression scaling is not available (and the specified scaling factor is
+ * ignored) when decompressing lossless JPEG images (see #TJPARAM_LOSSLESS),
+ * since the IDCT algorithm is not used with those images.  Note also that
+ * #TJPARAM_FASTDCT is ignored when decompression scaling is enabled.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SetScalingFactor(tjhandle handle,
+                                  tjscalingfactor scalingFactor);
+
+
+/**
+ * Set the cropping region for partially decompressing a lossy JPEG image into
+ * a packed-pixel image.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param croppingRegion #tjregion structure that specifies a subregion of the
+ * JPEG image to decompress, or <tt>#TJUNCROPPED</tt> for no cropping.  The
+ * left boundary of the cropping region must be evenly divisible by the scaled
+ * MCU block width-- <tt>#TJSCALED(#tjMCUWidth[subsamp], scalingFactor)</tt>,
+ * where `subsamp` is the level of chrominance subsampling in the JPEG image
+ * (see #TJPARAM_SUBSAMP) and `scalingFactor` is the decompression scaling
+ * factor (see #tj3SetScalingFactor().)  The cropping region should be
+ * specified relative to the scaled image dimensions.  Unless `croppingRegion`
+ * is <tt>#TJUNCROPPED</tt>, the JPEG header must be read (see
+ * #tj3DecompressHeader()) prior to calling this function.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SetCroppingRegion(tjhandle handle, tjregion croppingRegion);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into an 8-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.  The @ref TJPARAM "parameters"
+ * that describe the JPEG image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel
+ * decompressed image.  This buffer should normally be
+ * `pitch * destinationHeight` samples in size.  However, you can also use this
+ * parameter to decompress into a specific region of a larger buffer.  NOTE:
+ * If the JPEG image is lossy, then `destinationHeight` is either the scaled
+ * JPEG height (see #TJSCALED(), #TJPARAM_JPEGHEIGHT, and
+ * #tj3SetScalingFactor()) or the height of the cropping region (see
+ * #tj3SetCroppingRegion().)  If the JPEG image is lossless, then
+ * `destinationHeight` is the JPEG height.
+ *
+ * @param pitch samples per row in the destination image.  Normally this should
+ * be set to <tt>destinationWidth * #tjPixelSize[pixelFormat]</tt>, if the
+ * destination image should be unpadded.  (Setting this parameter to 0 is the
+ * equivalent of setting it to
+ * <tt>destinationWidth * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decompress into a specific region of
+ * a larger buffer.  NOTE: If the JPEG image is lossy, then `destinationWidth`
+ * is either the scaled JPEG width (see #TJSCALED(), #TJPARAM_JPEGWIDTH, and
+ * #tj3SetScalingFactor()) or the width of the cropping region (see
+ * #tj3SetCroppingRegion().)  If the JPEG image is lossless, then
+ * `destinationWidth` is the JPEG width.
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref
+ * TJPF "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Decompress8(tjhandle handle, const unsigned char *jpegBuf,
+                             size_t jpegSize, unsigned char *dstBuf, int pitch,
+                             int pixelFormat);
+
+/**
+ * Decompress a 12-bit-per-sample JPEG image into a 12-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.
+ *
+ * \details \copydetails tj3Decompress8()
+ */
+DLLEXPORT int tj3Decompress12(tjhandle handle, const unsigned char *jpegBuf,
+                              size_t jpegSize, short *dstBuf, int pitch,
+                              int pixelFormat);
+
+/**
+ * Decompress a 16-bit-per-sample lossless JPEG image into a 16-bit-per-sample
+ * packed-pixel RGB, grayscale, or CMYK image.
+ *
+ * \details \copydetails tj3Decompress8()
+ */
+DLLEXPORT int tj3Decompress16(tjhandle handle, const unsigned char *jpegBuf,
+                              size_t jpegSize, unsigned short *dstBuf,
+                              int pitch, int pixelFormat);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into an 8-bit-per-sample unified
+ * planar YUV image.  This function performs JPEG decompression but leaves out
+ * the color conversion step, so a planar YUV image is generated instead of a
+ * packed-pixel image.  The @ref TJPARAM "parameters" that describe the JPEG
+ * image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstBuf pointer to a buffer that will receive the unified planar YUV
+ * decompressed image.  Use #tj3YUVBufSize() to determine the appropriate size
+ * for this buffer based on the scaled JPEG width and height (see #TJSCALED(),
+ * #TJPARAM_JPEGWIDTH, #TJPARAM_JPEGHEIGHT, and #tj3SetScalingFactor()), row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes will be stored sequentially in the
+ * buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV image (must be a power of
+ * 2.)  Setting this parameter to n will cause each row in each plane of the
+ * YUV image to be padded to the nearest multiple of n bytes (1 = unpadded.)
+ * To generate images suitable for X Video, `align` should be set to 4.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressToYUV8(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  size_t jpegSize,
+                                  unsigned char *dstBuf, int align);
+
+
+/**
+ * Decompress an 8-bit-per-sample JPEG image into separate 8-bit-per-sample Y,
+ * U (Cb), and V (Cr) image planes.  This function performs JPEG decompression
+ * but leaves out the color conversion step, so a planar YUV image is generated
+ * instead of a packed-pixel image.  The @ref TJPARAM "parameters" that
+ * describe the JPEG image will be set when this function returns.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG image to
+ * decompress
+ *
+ * @param jpegSize size of the JPEG image (in bytes)
+ *
+ * @param dstPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decompressing a grayscale image) that will receive
+ * the decompressed image.  These planes can be contiguous or non-contiguous in
+ * memory.  Use #tj3YUVPlaneSize() to determine the appropriate size for each
+ * plane based on the scaled JPEG width and height (see #TJSCALED(),
+ * #TJPARAM_JPEGWIDTH, #TJPARAM_JPEGHEIGHT, and #tj3SetScalingFactor()),
+ * strides, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  Refer
+ * to @ref YUVnotes "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV image.  Setting the stride for any
+ * plane to 0 is the same as setting it to the scaled plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective scaled plane widths.
+ * You can adjust the strides in order to add an arbitrary amount of row
+ * padding to each plane or to decompress the JPEG image into a subregion of a
+ * larger planar YUV image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecompressToYUVPlanes8(tjhandle handle,
+                                        const unsigned char *jpegBuf,
+                                        size_t jpegSize,
+                                        unsigned char **dstPlanes,
+                                        int *strides);
+
+
+/**
+ * Decode an 8-bit-per-sample unified planar YUV image into an 8-bit-per-sample
+ * packed-pixel RGB or grayscale image.  This function performs color
+ * conversion (which is accelerated in the libjpeg-turbo implementation) but
+ * does not execute any of the other steps in the JPEG decompression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param srcBuf pointer to a buffer containing a unified planar YUV source
+ * image to be decoded.  The size of this buffer should match the value
+ * returned by #tj3YUVBufSize() for the given image width, height, row
+ * alignment, and level of chrominance subsampling (see #TJPARAM_SUBSAMP.)  The
+ * Y, U (Cb), and V (Cr) image planes should be stored sequentially in the
+ * source buffer.  (Refer to @ref YUVnotes "YUV Image Format Notes".)
+ *
+ * @param align row alignment (in bytes) of the YUV source image (must be a
+ * power of 2.)  Setting this parameter to n indicates that each row in each
+ * plane of the YUV source image is padded to the nearest multiple of n bytes
+ * (1 = unpadded.)
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel decoded
+ * image.  This buffer should normally be `pitch * height` bytes in size.
+ * However, you can also use this parameter to decode into a specific region of
+ * a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per row in the destination image.  Normally this should
+ * be set to <tt>width * #tjPixelSize[pixelFormat]</tt>, if the destination
+ * image should be unpadded.  (Setting this parameter to 0 is the equivalent of
+ * setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decode into a specific region of a
+ * larger buffer.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecodeYUV8(tjhandle handle, const unsigned char *srcBuf,
+                            int align, unsigned char *dstBuf, int width,
+                            int pitch, int height, int pixelFormat);
+
+
+/**
+ * Decode a set of 8-bit-per-sample Y, U (Cb), and V (Cr) image planes into an
+ * 8-bit-per-sample packed-pixel RGB or grayscale image.  This function
+ * performs color conversion (which is accelerated in the libjpeg-turbo
+ * implementation) but does not execute any of the other steps in the JPEG
+ * decompression process.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * decompression
+ *
+ * @param srcPlanes an array of pointers to Y, U (Cb), and V (Cr) image planes
+ * (or just a Y plane, if decoding a grayscale image) that contain a YUV image
+ * to be decoded.  These planes can be contiguous or non-contiguous in memory.
+ * The size of each plane should match the value returned by #tj3YUVPlaneSize()
+ * for the given image width, height, strides, and level of chrominance
+ * subsampling (see #TJPARAM_SUBSAMP.)  Refer to @ref YUVnotes
+ * "YUV Image Format Notes" for more details.
+ *
+ * @param strides an array of integers, each specifying the number of bytes per
+ * row in the corresponding plane of the YUV source image.  Setting the stride
+ * for any plane to 0 is the same as setting it to the plane width (see
+ * @ref YUVnotes "YUV Image Format Notes".)  If `strides` is NULL, then the
+ * strides for all planes will be set to their respective plane widths.  You
+ * can adjust the strides in order to specify an arbitrary amount of row
+ * padding in each plane or to decode a subregion of a larger planar YUV image.
+ *
+ * @param dstBuf pointer to a buffer that will receive the packed-pixel decoded
+ * image.  This buffer should normally be `pitch * height` bytes in size.
+ * However, you can also use this parameter to decode into a specific region of
+ * a larger buffer.
+ *
+ * @param width width (in pixels) of the source and destination images
+ *
+ * @param pitch bytes per row in the destination image.  Normally this should
+ * be set to <tt>width * #tjPixelSize[pixelFormat]</tt>, if the destination
+ * image should be unpadded.  (Setting this parameter to 0 is the equivalent of
+ * setting it to <tt>width * #tjPixelSize[pixelFormat]</tt>.)  However, you can
+ * also use this parameter to specify the row alignment/padding of the
+ * destination image, to skip rows, or to decode into a specific region of a
+ * larger buffer.
+ *
+ * @param height height (in pixels) of the source and destination images
+ *
+ * @param pixelFormat pixel format of the destination image (see @ref TJPF
+ * "Pixel formats".)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3DecodeYUVPlanes8(tjhandle handle,
+                                  const unsigned char * const *srcPlanes,
+                                  const int *strides, unsigned char *dstBuf,
+                                  int width, int pitch, int height,
+                                  int pixelFormat);
+
+
+/**
+ * Losslessly transform a JPEG image into another JPEG image.  Lossless
+ * transforms work by moving the raw DCT coefficients from one JPEG image
+ * structure to another without altering the values of the coefficients.  While
+ * this is typically faster than decompressing the image, transforming it, and
+ * re-compressing it, lossless transforms are not free.  Each lossless
+ * transform requires reading and performing entropy decoding on all of the
+ * coefficients in the source image, regardless of the size of the destination
+ * image.  Thus, this function provides a means of generating multiple
+ * transformed images from the same source or applying multiple transformations
+ * simultaneously, in order to eliminate the need to read the source
+ * coefficients multiple times.
+ *
+ * @param handle handle to a TurboJPEG instance that has been initialized for
+ * lossless transformation
+ *
+ * @param jpegBuf pointer to a byte buffer containing the JPEG source image to
+ * transform
+ *
+ * @param jpegSize size of the JPEG source image (in bytes)
+ *
+ * @param n the number of transformed JPEG images to generate
+ *
+ * @param dstBufs pointer to an array of n byte buffers.  `dstBufs[i]` will
+ * receive a JPEG image that has been transformed using the parameters in
+ * `transforms[i]`.  TurboJPEG has the ability to reallocate the JPEG
+ * destination buffer to accommodate the size of the transformed JPEG image.
+ * Thus, you can choose to:
+ * -# pre-allocate the JPEG destination buffer with an arbitrary size using
+ * #tj3Alloc() and let TurboJPEG grow the buffer as needed,
+ * -# set `dstBufs[i]` to NULL to tell TurboJPEG to allocate the buffer for
+ * you, or
+ * -# pre-allocate the buffer to a "worst case" size determined by calling
+ * #tj3JPEGBufSize() with the transformed or cropped width and height and the
+ * level of subsampling used in the source image.  Under normal circumstances,
+ * this should ensure that the buffer never has to be re-allocated.  (Setting
+ * #TJPARAM_NOREALLOC guarantees that it won't be.)  Note, however, that there
+ * are some rare cases (such as transforming images with a large amount of
+ * embedded EXIF or ICC profile data) in which the transformed JPEG image will
+ * be larger than the worst-case size, and #TJPARAM_NOREALLOC cannot be used in
+ * those cases.
+ * .
+ * If you choose option 1, then `dstSizes[i]` should be set to the size of your
+ * pre-allocated buffer.  In any case, unless you have set #TJPARAM_NOREALLOC,
+ * you should always check `dstBufs[i]` upon return from this function, as it
+ * may have changed.
+ *
+ * @param dstSizes pointer to an array of n size_t variables that will receive
+ * the actual sizes (in bytes) of each transformed JPEG image.  If `dstBufs[i]`
+ * points to a pre-allocated buffer, then `dstSizes[i]` should be set to the
+ * size of the buffer.  Upon return, `dstSizes[i]` will contain the size of the
+ * transformed JPEG image (in bytes.)
+ *
+ * @param transforms pointer to an array of n #tjtransform structures, each of
+ * which specifies the transform parameters and/or cropping region for the
+ * corresponding transformed JPEG image.
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr()
+ * and #tj3GetErrorCode().)
+ */
+DLLEXPORT int tj3Transform(tjhandle handle, const unsigned char *jpegBuf,
+                           size_t jpegSize, int n, unsigned char **dstBufs,
+                           size_t *dstSizes, const tjtransform *transforms);
+
+
+/**
+ * Destroy a TurboJPEG instance.
+ *
+ * @param handle handle to a TurboJPEG instance.  If the handle is NULL, then
+ * this function has no effect.
+ */
+DLLEXPORT void tj3Destroy(tjhandle handle);
+
+
+/**
+ * Allocate a byte buffer for use with TurboJPEG.  You should always use this
+ * function to allocate the JPEG destination buffer(s) for the compression and
+ * transform functions unless you are disabling automatic buffer (re)allocation
+ * (by setting #TJPARAM_NOREALLOC.)
+ *
+ * @param bytes the number of bytes to allocate
+ *
+ * @return a pointer to a newly-allocated buffer with the specified number of
+ * bytes.
+ *
+ * @see tj3Free()
+ */
+DLLEXPORT void *tj3Alloc(size_t bytes);
+
+
+/**
+ * Load an 8-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param filename name of a file containing a packed-pixel image in Windows
+ * BMP or PBMPLUS (PPM/PGM) format.  Windows BMP files require 8-bit-per-sample
+ * data precision.  If the data precision of the PBMPLUS file does not match
+ * the target data precision, then upconverting or downconverting will be
+ * performed.
+ *
+ * @param width pointer to an integer variable that will receive the width (in
+ * pixels) of the packed-pixel image
+ *
+ * @param align row alignment (in samples) of the packed-pixel buffer to be
+ * returned (must be a power of 2.)  Setting this parameter to n will cause all
+ * rows in the buffer to be padded to the nearest multiple of n samples
+ * (1 = unpadded.)
+ *
+ * @param height pointer to an integer variable that will receive the height
+ * (in pixels) of the packed-pixel image
+ *
+ * @param pixelFormat pointer to an integer variable that specifies or will
+ * receive the pixel format of the packed-pixel buffer.  The behavior of this
+ * function will vary depending on the value of `*pixelFormat` passed to the
+ * function:
+ * - @ref TJPF_UNKNOWN : The packed-pixel buffer returned by this function will
+ * use the optimal pixel format for the file type, and `*pixelFormat` will
+ * contain the ID of that pixel format upon successful return from this
+ * function.
+ * - @ref TJPF_GRAY : Only PGM files and 8-bit-per-pixel BMP files with a
+ * grayscale colormap can be loaded.
+ * - @ref TJPF_CMYK : The RGB or grayscale pixels stored in the file will be
+ * converted using a quick & dirty algorithm that is suitable only for testing
+ * purposes.  (Proper conversion between CMYK and other formats requires a
+ * color management system.)
+ * - Other @ref TJPF "pixel formats" : The packed-pixel buffer will use the
+ * specified pixel format, and pixel format conversion will be performed if
+ * necessary.
+ *
+ * @return a pointer to a newly-allocated buffer containing the packed-pixel
+ * image, converted to the chosen pixel format and with the chosen row
+ * alignment, or NULL if an error occurred (see #tj3GetErrorStr().)  This
+ * buffer should be freed using #tj3Free().
+ */
+DLLEXPORT unsigned char *tj3LoadImage8(tjhandle handle, const char *filename,
+                                       int *width, int align, int *height,
+                                       int *pixelFormat);
+
+/**
+ * Load a 12-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * \details \copydetails tj3LoadImage8()
+ */
+DLLEXPORT short *tj3LoadImage12(tjhandle handle, const char *filename,
+                                int *width, int align, int *height,
+                                int *pixelFormat);
+
+/**
+ * Load a 16-bit-per-sample packed-pixel image from disk into memory.
+ *
+ * \details \copydetails tj3LoadImage8()
+ */
+DLLEXPORT unsigned short *tj3LoadImage16(tjhandle handle, const char *filename,
+                                         int *width, int align, int *height,
+                                         int *pixelFormat);
+
+
+/**
+ * Save an 8-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @param filename name of a file to which to save the packed-pixel image.  The
+ * image will be stored in Windows BMP or PBMPLUS (PPM/PGM) format, depending
+ * on the file extension.  Windows BMP files require 8-bit-per-sample data
+ * precision.
+ *
+ * @param buffer pointer to a buffer containing a packed-pixel RGB, grayscale,
+ * or CMYK image to be saved
+ *
+ * @param width width (in pixels) of the packed-pixel image
+ *
+ * @param pitch samples per row in the packed-pixel image.  Setting this
+ * parameter to 0 is the equivalent of setting it to
+ * <tt>width * #tjPixelSize[pixelFormat]</tt>.
+ *
+ * @param height height (in pixels) of the packed-pixel image
+ *
+ * @param pixelFormat pixel format of the packed-pixel image (see @ref TJPF
+ * "Pixel formats".)  If this parameter is set to @ref TJPF_GRAY, then the
+ * image will be stored in PGM or 8-bit-per-pixel (indexed color) BMP format.
+ * Otherwise, the image will be stored in PPM or 24-bit-per-pixel BMP format.
+ * If this parameter is set to @ref TJPF_CMYK, then the CMYK pixels will be
+ * converted to RGB using a quick & dirty algorithm that is suitable only for
+ * testing purposes.  (Proper conversion between CMYK and other formats
+ * requires a color management system.)
+ *
+ * @return 0 if successful, or -1 if an error occurred (see #tj3GetErrorStr().)
+ */
+DLLEXPORT int tj3SaveImage8(tjhandle handle, const char *filename,
+                            const unsigned char *buffer, int width, int pitch,
+                            int height, int pixelFormat);
+
+/**
+ * Save a 12-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * \details \copydetails tj3SaveImage8()
+ */
+DLLEXPORT int tj3SaveImage12(tjhandle handle, const char *filename,
+                             const short *buffer, int width, int pitch,
+                             int height, int pixelFormat);
+
+/**
+ * Save a 16-bit-per-sample packed-pixel image from memory to disk.
+ *
+ * \details \copydetails tj3SaveImage8()
+ */
+DLLEXPORT int tj3SaveImage16(tjhandle handle, const char *filename,
+                             const unsigned short *buffer, int width,
+                             int pitch, int height, int pixelFormat);
+
+
+/**
+ * Free a byte buffer previously allocated by TurboJPEG.  You should always use
+ * this function to free JPEG destination buffer(s) that were automatically
+ * (re)allocated by the compression and transform functions or that were
+ * manually allocated using #tj3Alloc().
+ *
+ * @param buffer address of the buffer to free.  If the address is NULL, then
+ * this function has no effect.
+ *
+ * @see tj3Alloc()
+ */
+DLLEXPORT void tj3Free(void *buffer);
+
+
+/**
+ * Returns a descriptive error message explaining why the last command failed.
+ *
+ * @param handle handle to a TurboJPEG instance, or NULL if the error was
+ * generated by a global function (but note that retrieving the error message
+ * for a global function is thread-safe only on platforms that support
+ * thread-local storage.)
+ *
+ * @return a descriptive error message explaining why the last command failed.
+ */
+DLLEXPORT char *tj3GetErrorStr(tjhandle handle);
+
+
+/**
+ * Returns a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ *
+ * @param handle handle to a TurboJPEG instance
+ *
+ * @return a code indicating the severity of the last error.  See
+ * @ref TJERR "Error codes".
+ */
+DLLEXPORT int tj3GetErrorCode(tjhandle handle);
+
+
+/* Backward compatibility functions and macros (nothing to see here) */
+
+/* TurboJPEG 1.0+ (legacy constant aliases first, then legacy function prototypes) */
+
+#define NUMSUBOPT  TJ_NUMSAMP
+#define TJ_444  TJSAMP_444
+#define TJ_422  TJSAMP_422
+#define TJ_420  TJSAMP_420
+#define TJ_411  TJSAMP_420  /* NOTE: expands to TJSAMP_420, i.e. the same value as TJ_420 */
+#define TJ_GRAYSCALE  TJSAMP_GRAY
+
+#define TJ_BGR  1
+#define TJ_BOTTOMUP  TJFLAG_BOTTOMUP  /* the TJFLAG_* names used by these aliases are #defined further down, in the 1.2+ section */
+#define TJ_FORCEMMX  TJFLAG_FORCEMMX
+#define TJ_FORCESSE  TJFLAG_FORCESSE
+#define TJ_FORCESSE2  TJFLAG_FORCESSE2
+#define TJ_ALPHAFIRST  64
+#define TJ_FORCESSE3  TJFLAG_FORCESSE3
+#define TJ_FASTUPSAMPLE  TJFLAG_FASTUPSAMPLE
+
+#define TJPAD(width)  (((width) + 3) & (~3))  /* rounds width up to the next multiple of 4 */
+
+DLLEXPORT unsigned long TJBUFSIZE(int width, int height);
+
+DLLEXPORT int tjCompress(tjhandle handle, unsigned char *srcBuf, int width,
+                         int pitch, int height, int pixelSize,
+                         unsigned char *dstBuf, unsigned long *compressedSize,
+                         int jpegSubsamp, int jpegQual, int flags);
+
+DLLEXPORT int tjDecompress(tjhandle handle, unsigned char *jpegBuf,
+                           unsigned long jpegSize, unsigned char *dstBuf,
+                           int width, int pitch, int height, int pixelSize,
+                           int flags);
+
+DLLEXPORT int tjDecompressHeader(tjhandle handle, unsigned char *jpegBuf,
+                                 unsigned long jpegSize, int *width,
+                                 int *height);
+
+DLLEXPORT int tjDestroy(tjhandle handle);
+
+DLLEXPORT char *tjGetErrorStr(void);
+
+DLLEXPORT tjhandle tjInitCompress(void);
+
+DLLEXPORT tjhandle tjInitDecompress(void);
+
+/* TurboJPEG 1.1+ */
+
+#define TJ_YUV  512
+
+DLLEXPORT unsigned long TJBUFSIZEYUV(int width, int height, int jpegSubsamp);
+
+DLLEXPORT int tjDecompressHeader2(tjhandle handle, unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp);
+
+DLLEXPORT int tjDecompressToYUV(tjhandle handle, unsigned char *jpegBuf,
+                                unsigned long jpegSize, unsigned char *dstBuf,
+                                int flags);
+
+DLLEXPORT int tjEncodeYUV(tjhandle handle, unsigned char *srcBuf, int width,
+                          int pitch, int height, int pixelSize,
+                          unsigned char *dstBuf, int subsamp, int flags);
+
+/* TurboJPEG 1.2+ */
+
+#define TJFLAG_BOTTOMUP  2
+#define TJFLAG_FORCEMMX  8
+#define TJFLAG_FORCESSE  16
+#define TJFLAG_FORCESSE2  32
+#define TJFLAG_FORCESSE3  128
+#define TJFLAG_FASTUPSAMPLE  256
+#define TJFLAG_NOREALLOC  1024
+
+DLLEXPORT unsigned char *tjAlloc(int bytes);
+
+DLLEXPORT unsigned long tjBufSize(int width, int height, int jpegSubsamp);
+
+DLLEXPORT unsigned long tjBufSizeYUV(int width, int height, int subsamp);
+
+DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          unsigned char **jpegBuf, unsigned long *jpegSize,
+                          int jpegSubsamp, int jpegQual, int flags);
+
+DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags);
+
+DLLEXPORT int tjEncodeYUV2(tjhandle handle, unsigned char *srcBuf, int width,
+                           int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int subsamp, int flags);
+
+DLLEXPORT void tjFree(unsigned char *buffer);
+
+DLLEXPORT tjscalingfactor *tjGetScalingFactors(int *numscalingfactors);
+
+DLLEXPORT tjhandle tjInitTransform(void);
+
+DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, int n,
+                            unsigned char **dstBufs, unsigned long *dstSizes,
+                            tjtransform *transforms, int flags);
+
+/* TurboJPEG 1.2.1+ */
+
+#define TJFLAG_FASTDCT  2048
+#define TJFLAG_ACCURATEDCT  4096
+
+/* TurboJPEG 1.4+ */
+
+DLLEXPORT unsigned long tjBufSizeYUV2(int width, int align, int height,
+                                      int subsamp);
+
+DLLEXPORT int tjCompressFromYUV(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int align, int height, int subsamp,
+                                unsigned char **jpegBuf,
+                                unsigned long *jpegSize, int jpegQual,
+                                int flags);
+
+DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
+                                      const unsigned char **srcPlanes,
+                                      int width, const int *strides,
+                                      int height, int subsamp,
+                                      unsigned char **jpegBuf,
+                                      unsigned long *jpegSize, int jpegQual,
+                                      int flags);
+
+DLLEXPORT int tjDecodeYUV(tjhandle handle, const unsigned char *srcBuf,
+                          int align, int subsamp, unsigned char *dstBuf,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
+
+DLLEXPORT int tjDecodeYUVPlanes(tjhandle handle,
+                                const unsigned char **srcPlanes,
+                                const int *strides, int subsamp,
+                                unsigned char *dstBuf, int width, int pitch,
+                                int height, int pixelFormat, int flags);
+
+DLLEXPORT int tjDecompressHeader3(tjhandle handle,
+                                  const unsigned char *jpegBuf,
+                                  unsigned long jpegSize, int *width,
+                                  int *height, int *jpegSubsamp,
+                                  int *jpegColorspace);
+
+DLLEXPORT int tjDecompressToYUV2(tjhandle handle, const unsigned char *jpegBuf,
+                                 unsigned long jpegSize, unsigned char *dstBuf,
+                                 int width, int align, int height, int flags);
+
+DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
+                                      const unsigned char *jpegBuf,
+                                      unsigned long jpegSize,
+                                      unsigned char **dstPlanes, int width,
+                                      int *strides, int height, int flags);
+
+DLLEXPORT int tjEncodeYUV3(tjhandle handle, const unsigned char *srcBuf,
+                           int width, int pitch, int height, int pixelFormat,
+                           unsigned char *dstBuf, int align, int subsamp,
+                           int flags);
+
+DLLEXPORT int tjEncodeYUVPlanes(tjhandle handle, const unsigned char *srcBuf,
+                                int width, int pitch, int height,
+                                int pixelFormat, unsigned char **dstPlanes,
+                                int *strides, int subsamp, int flags);
+
+DLLEXPORT int tjPlaneHeight(int componentID, int height, int subsamp);
+
+DLLEXPORT unsigned long tjPlaneSizeYUV(int componentID, int width, int stride,
+                                       int height, int subsamp);
+
+DLLEXPORT int tjPlaneWidth(int componentID, int width, int subsamp);
+
+/* TurboJPEG 2.0+ */
+
+#define TJFLAG_STOPONWARNING  8192
+#define TJFLAG_PROGRESSIVE  16384
+
+DLLEXPORT int tjGetErrorCode(tjhandle handle);
+
+DLLEXPORT char *tjGetErrorStr2(tjhandle handle);
+
+DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
+                                     int align, int *height, int *pixelFormat,
+                                     int flags);
+
+DLLEXPORT int tjSaveImage(const char *filename, unsigned char *buffer,
+                          int width, int pitch, int height, int pixelFormat,
+                          int flags);
+
+/* TurboJPEG 2.1+ */
+
+#define TJFLAG_LIMITSCANS  32768
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// Alignment requested for every buffer allocated by fastMalloc(): 64 with NCNN_AVX512, 32 with NCNN_AVX, otherwise 16
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// Some optimized kernels may read slightly past the end of a buffer inside their loops,
+// because the data load for the next iteration is commonly interleaved with arithmetic instructions.
+// Allocating this many extra bytes keeps such over-reads from failing with SEGV_ACCERR.
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the next multiple of n bytes (a pointer that is already aligned is returned unchanged)
+// ptr Pointer to align
+// n Alignment size that must be a power of two; defaults to sizeof(_Tp)
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n); // add n-1, then mask with -n, which clears the low bits when n is a power of two
+}
+
+// Rounds a buffer size up to a multiple of n bytes
+// Returns the smallest value that is greater than or equal to sz and divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n; // same add-then-mask rounding as alignPtr()
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // allocates size bytes aligned to NCNN_MALLOC_ALIGN; release the result with fastFree()
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // NOTE: unlike the other branches, NCNN_MALLOC_OVERREAD is not added here
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0; // a non-zero return code means failure; report it to the caller as a null pointer
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // generic fallback: room for one saved pointer, alignment slack and the over-read padding
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN); // skip one pointer slot, then round up to the alignment
+    adata[-1] = udata; // remember the raw malloc() pointer just before the aligned block; fastFree() reads it back
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr) // releases a buffer obtained from fastMalloc(); a null ptr is ignored
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1]; // raw malloc() pointer that fastMalloc() stored just before the aligned block
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// NCNN_XADD(addr, delta): exchange-add used for reference counters; adds delta to *addr and yields the previous value
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, no atomic instruction is used
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic exchange-add on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7 (clang is excluded by the condition above)
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: none of the compiler checks above matched, so a plain read-modify-write is used
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS is off: plain, non-atomic version
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract interface: subclasses supply the allocate/release pair
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // pure virtual; overridden by PoolAllocator and UnlockedPoolAllocator in this header
+    virtual void fastFree(void* ptr) = 0; // pure virtual counterpart of fastMalloc()
+};
+
+class PoolAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class UnlockedPoolAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator // NOTE(review): the name suggests a PoolAllocator without internal locking - confirm in the implementation
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // record type returned by VkAllocator::fastMalloc(size_t)
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by the allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified internally by command functions (hence mutable)
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // record type returned by VkAllocator::fastMalloc(w, h, c, elemsize, elempack)
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by the allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by the allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified internally by command functions (hence mutable)
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified internally by command functions
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base of the Vk*Allocator classes declared below
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer overload; pure virtual
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image overload; pure virtual
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected: // creation helpers available to subclasses only
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // preferred_block_size defaults to 16M
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkWeightAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // preferred_block_size defaults to 8M
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkStagingAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkWeightStagingAllocatorPrivate; // forward declaration only; the type is not defined in this header
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator // overrides the four pure virtuals of VkAllocator; clear() is not overridden here
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator // only declared when NCNN_VULKAN and NCNN_PLATFORM_API are on and __ANDROID_API__ >= 26
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&); // copy constructor and assignment are private, so outside code cannot copy the allocator
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init(); // NOTE(review): separate init step returning int; success/failure convention is not visible in this header - confirm
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb; // NOTE(review): presumably the _hb passed to the constructor - confirm in the implementation
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/benchmark.h
new file mode 100644
index 0000000..3d5c0cd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/benchmark.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// returns the current timestamp in milliseconds
+NCNN_EXPORT double get_current_time();
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end); // NOTE(review): start/end are presumably get_current_time() values - confirm
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end); // overload that also receives the layer's input and output blobs
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // creates an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as output
+    int producer;
+    // index of the layer that needs this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/c_api.h
new file mode 100644
index 0000000..b7435f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/c_api.h
@@ -0,0 +1,327 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis; /* opaque pointer; what it points to is not visible in this header */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* function pointer: receives the handle and a size, returns void* */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr); /* function pointer: receives the handle and a pointer, returns nothing */
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16)) /* ORs X with Y shifted left by 16 bits. NOTE(review): X and Y are not parenthesized in the expansion, so pass only simple expressions (e.g. the constants above) */
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis; /* opaque pointer; what it points to is not visible in this header */
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* member only exists when NCNN_STRING is enabled */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* NOTE(review): presumably returns the number of bytes read - confirm against the implementation */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis; /* opaque pointer; what it points to is not visible in this header */
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* function pointer taking w only; returns a mat handle */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type); /* function pointer taking w and h; returns a mat handle */
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type); /* function pointer taking w, h and c; returns a mat handle */
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t; /* handle type: pointer to the struct declared below */
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis; /* opaque pointer; what it points to is not visible in this header */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd); /* function pointer receiving a paramdict handle */
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb); /* function pointer receiving a modelbin handle */
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* single bottom mat in, single top mat written through the pointer */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* NOTE(review): n / n2 look like the lengths of bottom_blobs / top_blobs - confirm */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* one mat serves as both bottom and top */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* NOTE(review): n looks like the length of bottom_top_blobs - confirm */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata); /* callback type: receives userdata, returns a layer handle */
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata); /* callback type: receives a layer handle and userdata */
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t; /* handle type: pointer to the struct declared below */
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata; /* same type as the userdata argument of the two callback typedefs */
+    ncnn_net_custom_layer_factory_t next; /* pointer to another factory record; NOTE(review): looks like a singly linked list - confirm */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t; /* handle type: pointer to the struct declared below */
+struct __ncnn_net_t
+{
+    void* pthis; /* opaque pointer; what it points to is not visible in this header */
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* pointer to a custom layer factory record */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute // only declared when NCNN_VULKAN is enabled
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt); // Mat -> VkMat
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); // Mat -> VkImageMat
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt); // VkMat -> Mat
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt); // VkImageMat -> Mat
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt); // record_clone is overloaded for eight src/dst type pairs
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // buffer bindings only
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); // image bindings only
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // buffer and image bindings, VkMat dispatcher
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); // buffer and image bindings, VkImageMat dispatcher
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher); // buffer and image bindings, Mat dispatcher
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // NOTE(review): meaning of the int result is not documented in this header - confirm
+
+    int reset(); // NOTE(review): meaning of the int result is not documented in this header - confirm
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results); // results are written into the caller-supplied vector
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev; // same type as the constructor argument
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // const pointer to VkComputePrivate, which is only forward-declared in this header
+};
+
+class VkTransferPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT VkTransfer // only declared when NCNN_VULKAN is enabled
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // Mat -> VkMat; flatten defaults to true
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); // Mat -> VkImageMat
+
+    int submit_and_wait(); // NOTE(review): meaning of the int result is not documented in this header - confirm
+
+protected:
+    const VulkanDevice* vkdev; // same type as the constructor argument
+
+private:
+    VkTransferPrivate* const d; // const pointer to the forward-declared VkTransferPrivate
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/cpu.h
new file mode 100644
index 0000000..0f748f3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/cpu.h
@@ -0,0 +1,169 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of cpus that can be enabled/disabled individually; storage is platform-specific (see members)
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask; // member used on Windows builds that are not MinGW
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // member used on Android and Linux; cpu_set_t comes from <sched.h>
+#endif
+#if __APPLE__
+    unsigned int policy; // member used on Apple platforms
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper; base class with virtual scan/read/reference methods
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parses plain param text
+    // returns 1 if the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // reads binary param and model data
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // gets a reference to model data
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // only declared when NCNN_STDIO is enabled
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read
+
+private:
+    DataReaderFromStdio(const DataReaderFromStdio&); // copy constructor is private, so outside code cannot copy
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&); // copy assignment is private as well
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // takes the pointer by reference; NOTE(review): presumably advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read
+    virtual size_t reference(size_t size, const void** buf) const; // overrides DataReader::reference
+
+private:
+    DataReaderFromMemory(const DataReaderFromMemory&); // copy constructor is private, so outside code cannot copy
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&); // copy assignment is private as well
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared private class
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // requires NCNN_PLATFORM_API and __ANDROID_API__ >= 9
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset); // AAsset comes from <android/asset_manager.h>
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read
+
+private:
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&); // copy constructor is private, so outside code cannot copy
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&); // copy assignment is private as well
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/gpu.h
new file mode 100644
index 0000000..2ef4927
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/gpu.h
@@ -0,0 +1,359 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance: create / destroy the gpu instance
+NCNN_EXPORT int create_gpu_instance();
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability (extern int flags, defined outside this header)
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_external_memory_capabilities (extern function pointer, defined outside this header)
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2 (extern function pointers)
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2 (extern function pointers)
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface (extern function pointers)
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface (only declared for Android API level 26 and above)
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix (extern function pointer)
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info: gpu count and the index of the default gpu
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // forward declaration only; the type is opaque in this header
+class NCNN_EXPORT GpuInfo // description of one gpu; every accessor declared below is const
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // 0 = discrete gpu      (codes returned by type() below)
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family indices and queue counts for compute / graphics / transfer
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+
+    // extension capability (int per device extension; NOTE(review): presumably 0 when unsupported - confirm)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_queue_family_foreign() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private:
+    GpuInfo(const GpuInfo&); // copy construction and assignment are private
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // grants create_gpu_instance() access to the private members
+    GpuInfoPrivate* const d; // private implementation pointer; the pointer itself is const
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); // returns a const reference; device_index defaults to the default gpu index
+
+class VkAllocator; // forward declarations of types used only by pointer/reference below
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // opaque implementation type
+class NCNN_EXPORT VulkanDevice // per-device handle: const helper methods plus public extension function pointers
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // not explicit: an int converts implicitly to VulkanDevice
+    ~VulkanDevice(); // non-virtual destructor
+
+    const GpuInfo& info; // public reference member; NOTE(review): presumably the GpuInfo for device_index - confirm
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helper for creating pipeline (results are written through the trailing pointer parameter)
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // paired with reclaim_queue()
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device (each acquire_* has a matching reclaim_*)
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one overload per VkMat / VkImageMat source-destination combination
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (public function-pointer members; NOTE(review): presumably filled by init_device_extension() - confirm)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer (members exist only for Android API level 26 and above)
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension
+    int init_device_extension();
+
+private:
+    VulkanDevice(const VulkanDevice&); // copy construction and assignment are private
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // private implementation pointer; the pointer itself is const
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); // device_index defaults to the default gpu index; NOTE(review): ownership of the returned pointer is not visible here - confirm
+
+// online spirv compilation: overloads take a source string, a (data, size) buffer, or a shader type index; spirv is a non-const reference
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv
+class NCNN_EXPORT ShaderInfo // plain data holder: every member is a public int or int array
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // 16 is large enough I think ... (fixed capacity; entries use the codes listed above)
+
+    int reserved_0; // reserved_0..reserved_3: no meaning is documented in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); // shader_info is a non-const reference; NOTE(review): presumably filled from the spirv data - confirm
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer.h
new file mode 100644
index 0000000..d02f65b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer.h
@@ -0,0 +1,214 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#include <math.h>
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer // polymorphic class: the load / pipeline / forward hooks below are all virtual
+{
+public:
+    // empty
+    Layer();
+    // virtual destructor
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific clean
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // reserved flags: no meaning is documented in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set (stored as an int; NOTE(review): presumably a bitmask - confirm)
+    int featmask;
+
+public:
+    // implement inference
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; // multi-blob overload
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; // single-blob overload
+
+    // implement inplace inference
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const; // multi-blob overload
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; // single-blob overload
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference (VkMat overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference (VkImageMat overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference (VkMat overloads)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference (VkImageMat overloads)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    std::string type;
+    // layer name (member exists only when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function types: creator(userdata) returns a Layer*, destroyer(layer, userdata) returns nothing
+typedef Layer* (*layer_creator_func)(void*);
+typedef void (*layer_destroyer_func)(Layer*, void*);
+
+struct layer_registry_entry // registry record: optional type name plus a creator function
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // like layer_registry_entry, with an added destroyer and userdata pointer
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the value passed to creator/destroyer as their void* argument - confirm
+};
+
+#if NCNN_STRING
+// get layer type from type name (returns an int index)
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name (only declared when NCNN_STRING is enabled)
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type (int index overload, always declared)
+NCNN_EXPORT Layer* create_layer(int index);
+
+#define DEFINE_LAYER_CREATOR(name) /* defines name##_layer_creator, which returns `new name` */ \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+
+#define DEFINE_LAYER_DESTROYER(name) /* defines name##_layer_destroyer, which deletes the given layer */ \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the namespace scopes the unscoped enum, so enumerators are written LayerShaderType::xxx
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list is pulled in from this header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..f11cab9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,370 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+absval = 0,
+absval_pack4 = 1,
+absval_pack8 = 2,
+batchnorm = 3,
+batchnorm_pack4 = 4,
+batchnorm_pack8 = 5,
+concat = 6,
+concat_pack4 = 7,
+concat_pack4to1 = 8,
+concat_pack8 = 9,
+concat_pack8to1 = 10,
+concat_pack8to4 = 11,
+convolution = 12,
+convolution_1x1s1d1 = 13,
+convolution_3x3s1d1_winograd23_transform_input = 14,
+convolution_3x3s1d1_winograd23_transform_output = 15,
+convolution_3x3s1d1_winograd43_transform_input = 16,
+convolution_3x3s1d1_winograd43_transform_output = 17,
+convolution_3x3s1d1_winograd_gemm = 18,
+convolution_gemm = 19,
+convolution_pack1to4 = 20,
+convolution_pack1to4_1x1s1d1 = 21,
+convolution_pack1to4_3x3s1d1_winograd_gemm = 22,
+convolution_pack1to4_gemm = 23,
+convolution_pack1to8 = 24,
+convolution_pack1to8_1x1s1d1 = 25,
+convolution_pack1to8_3x3s1d1_winograd_gemm = 26,
+convolution_pack1to8_gemm = 27,
+convolution_pack4 = 28,
+convolution_pack4_1x1s1d1 = 29,
+convolution_pack4_1x1s1d1_cm_16_8_8 = 30,
+convolution_pack4_3x3s1d1_winograd23_transform_input = 31,
+convolution_pack4_3x3s1d1_winograd23_transform_output = 32,
+convolution_pack4_3x3s1d1_winograd43_transform_input = 33,
+convolution_pack4_3x3s1d1_winograd43_transform_output = 34,
+convolution_pack4_3x3s1d1_winograd_gemm = 35,
+convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8 = 36,
+convolution_pack4_gemm = 37,
+convolution_pack4_gemm_cm_16_8_8 = 38,
+convolution_pack4to1 = 39,
+convolution_pack4to1_1x1s1d1 = 40,
+convolution_pack4to1_3x3s1d1_winograd_gemm = 41,
+convolution_pack4to1_gemm = 42,
+convolution_pack4to8 = 43,
+convolution_pack4to8_1x1s1d1 = 44,
+convolution_pack4to8_3x3s1d1_winograd_gemm = 45,
+convolution_pack4to8_gemm = 46,
+convolution_pack8 = 47,
+convolution_pack8_1x1s1d1 = 48,
+convolution_pack8_3x3s1d1_winograd23_transform_input = 49,
+convolution_pack8_3x3s1d1_winograd23_transform_output = 50,
+convolution_pack8_3x3s1d1_winograd43_transform_input = 51,
+convolution_pack8_3x3s1d1_winograd43_transform_output = 52,
+convolution_pack8_3x3s1d1_winograd_gemm = 53,
+convolution_pack8_gemm = 54,
+convolution_pack8to1 = 55,
+convolution_pack8to1_1x1s1d1 = 56,
+convolution_pack8to1_3x3s1d1_winograd_gemm = 57,
+convolution_pack8to1_gemm = 58,
+convolution_pack8to4 = 59,
+convolution_pack8to4_1x1s1d1 = 60,
+convolution_pack8to4_3x3s1d1_winograd_gemm = 61,
+convolution_pack8to4_gemm = 62,
+crop = 63,
+crop_pack1to4 = 64,
+crop_pack1to8 = 65,
+crop_pack4 = 66,
+crop_pack4to1 = 67,
+crop_pack4to8 = 68,
+crop_pack8 = 69,
+crop_pack8to1 = 70,
+crop_pack8to4 = 71,
+deconvolution = 72,
+deconvolution_col2im = 73,
+deconvolution_gemm = 74,
+deconvolution_pack1to4 = 75,
+deconvolution_pack1to4_gemm = 76,
+deconvolution_pack1to8 = 77,
+deconvolution_pack1to8_gemm = 78,
+deconvolution_pack4 = 79,
+deconvolution_pack4_col2im = 80,
+deconvolution_pack4_gemm = 81,
+deconvolution_pack4_gemm_cm_16_8_8 = 82,
+deconvolution_pack4to1 = 83,
+deconvolution_pack4to1_gemm = 84,
+deconvolution_pack4to8 = 85,
+deconvolution_pack4to8_gemm = 86,
+deconvolution_pack8 = 87,
+deconvolution_pack8_col2im = 88,
+deconvolution_pack8_gemm = 89,
+deconvolution_pack8to1 = 90,
+deconvolution_pack8to1_gemm = 91,
+deconvolution_pack8to4 = 92,
+deconvolution_pack8to4_gemm = 93,
+dropout = 94,
+dropout_pack4 = 95,
+dropout_pack8 = 96,
+eltwise = 97,
+eltwise_pack4 = 98,
+eltwise_pack8 = 99,
+elu = 100,
+elu_pack4 = 101,
+elu_pack8 = 102,
+flatten = 103,
+flatten_pack1to4 = 104,
+flatten_pack1to8 = 105,
+flatten_pack4 = 106,
+flatten_pack4to8 = 107,
+flatten_pack8 = 108,
+innerproduct = 109,
+innerproduct_gemm = 110,
+innerproduct_gemm_wp1to4 = 111,
+innerproduct_gemm_wp1to8 = 112,
+innerproduct_gemm_wp4 = 113,
+innerproduct_gemm_wp4to1 = 114,
+innerproduct_gemm_wp4to8 = 115,
+innerproduct_gemm_wp8 = 116,
+innerproduct_gemm_wp8to1 = 117,
+innerproduct_gemm_wp8to4 = 118,
+innerproduct_pack1to4 = 119,
+innerproduct_pack1to8 = 120,
+innerproduct_pack4 = 121,
+innerproduct_pack4to1 = 122,
+innerproduct_pack4to8 = 123,
+innerproduct_pack8 = 124,
+innerproduct_pack8to1 = 125,
+innerproduct_pack8to4 = 126,
+innerproduct_reduce_sum8 = 127,
+innerproduct_reduce_sum8_pack4 = 128,
+innerproduct_reduce_sum8_pack8 = 129,
+innerproduct_sum8 = 130,
+innerproduct_sum8_pack1to4 = 131,
+innerproduct_sum8_pack1to8 = 132,
+innerproduct_sum8_pack4 = 133,
+innerproduct_sum8_pack4to1 = 134,
+innerproduct_sum8_pack4to8 = 135,
+innerproduct_sum8_pack8 = 136,
+innerproduct_sum8_pack8to1 = 137,
+innerproduct_sum8_pack8to4 = 138,
+lrn_norm = 139,
+lrn_norm_across_channel_pack4 = 140,
+lrn_norm_across_channel_pack8 = 141,
+lrn_norm_within_channel_pack4 = 142,
+lrn_norm_within_channel_pack8 = 143,
+lrn_square_pad = 144,
+lrn_square_pad_across_channel_pack4 = 145,
+lrn_square_pad_across_channel_pack8 = 146,
+lrn_square_pad_within_channel_pack4 = 147,
+lrn_square_pad_within_channel_pack8 = 148,
+pooling = 149,
+pooling_adaptive = 150,
+pooling_adaptive_pack4 = 151,
+pooling_adaptive_pack8 = 152,
+pooling_global = 153,
+pooling_global_pack4 = 154,
+pooling_global_pack8 = 155,
+pooling_pack4 = 156,
+pooling_pack8 = 157,
+prelu = 158,
+prelu_pack4 = 159,
+prelu_pack8 = 160,
+relu = 161,
+relu_pack4 = 162,
+relu_pack8 = 163,
+reshape = 164,
+reshape_pack1to4 = 165,
+reshape_pack1to8 = 166,
+reshape_pack4 = 167,
+reshape_pack4to1 = 168,
+reshape_pack4to8 = 169,
+reshape_pack8 = 170,
+reshape_pack8to1 = 171,
+reshape_pack8to4 = 172,
+scale = 173,
+scale_pack4 = 174,
+scale_pack8 = 175,
+sigmoid = 176,
+sigmoid_pack4 = 177,
+sigmoid_pack8 = 178,
+slice = 179,
+slice_pack1to4 = 180,
+slice_pack1to8 = 181,
+slice_pack4 = 182,
+slice_pack4to8 = 183,
+slice_pack8 = 184,
+softmax_div_sum = 185,
+softmax_div_sum_pack4 = 186,
+softmax_div_sum_pack8 = 187,
+softmax_exp_sub_max = 188,
+softmax_exp_sub_max_pack4 = 189,
+softmax_exp_sub_max_pack8 = 190,
+softmax_reduce_max = 191,
+softmax_reduce_max_pack4 = 192,
+softmax_reduce_max_pack8 = 193,
+softmax_reduce_sum = 194,
+softmax_reduce_sum_pack4 = 195,
+softmax_reduce_sum_pack8 = 196,
+tanh = 197,
+tanh_pack4 = 198,
+tanh_pack8 = 199,
+binaryop = 200,
+binaryop_broadcast = 201,
+binaryop_broadcast_a1_pack4 = 202,
+binaryop_broadcast_a1_pack8 = 203,
+binaryop_broadcast_b1_pack4 = 204,
+binaryop_broadcast_b1_pack8 = 205,
+binaryop_broadcast_pack4 = 206,
+binaryop_broadcast_pack8 = 207,
+binaryop_pack4 = 208,
+binaryop_pack8 = 209,
+unaryop = 210,
+unaryop_pack4 = 211,
+unaryop_pack8 = 212,
+convolutiondepthwise = 213,
+convolutiondepthwise_group = 214,
+convolutiondepthwise_group_pack1to4 = 215,
+convolutiondepthwise_group_pack1to8 = 216,
+convolutiondepthwise_group_pack4 = 217,
+convolutiondepthwise_group_pack4to1 = 218,
+convolutiondepthwise_group_pack4to8 = 219,
+convolutiondepthwise_group_pack8 = 220,
+convolutiondepthwise_group_pack8to1 = 221,
+convolutiondepthwise_group_pack8to4 = 222,
+convolutiondepthwise_pack4 = 223,
+convolutiondepthwise_pack8 = 224,
+padding = 225,
+padding_3d = 226,
+padding_3d_pack4 = 227,
+padding_3d_pack8 = 228,
+padding_pack1to4 = 229,
+padding_pack1to8 = 230,
+padding_pack4 = 231,
+padding_pack4to1 = 232,
+padding_pack4to8 = 233,
+padding_pack8 = 234,
+padding_pack8to1 = 235,
+padding_pack8to4 = 236,
+normalize_coeffs = 237,
+normalize_coeffs_pack4 = 238,
+normalize_coeffs_pack8 = 239,
+normalize_norm = 240,
+normalize_norm_pack4 = 241,
+normalize_norm_pack8 = 242,
+normalize_reduce_sum4_fp16_to_fp32 = 243,
+normalize_reduce_sum4_fp16_to_fp32_pack4 = 244,
+normalize_reduce_sum4_fp16_to_fp32_pack8 = 245,
+normalize_reduce_sum4_fp32 = 246,
+normalize_reduce_sum4_fp32_pack4 = 247,
+normalize_reduce_sum4_fp32_pack8 = 248,
+permute = 249,
+permute_pack1to4 = 250,
+permute_pack1to8 = 251,
+permute_pack4 = 252,
+permute_pack4to1 = 253,
+permute_pack4to8 = 254,
+permute_pack8 = 255,
+permute_pack8to1 = 256,
+permute_pack8to4 = 257,
+priorbox = 258,
+priorbox_mxnet = 259,
+interp = 260,
+interp_bicubic = 261,
+interp_bicubic_coeffs = 262,
+interp_bicubic_pack4 = 263,
+interp_bicubic_pack8 = 264,
+interp_pack4 = 265,
+interp_pack8 = 266,
+deconvolutiondepthwise = 267,
+deconvolutiondepthwise_group = 268,
+deconvolutiondepthwise_group_pack1to4 = 269,
+deconvolutiondepthwise_group_pack1to8 = 270,
+deconvolutiondepthwise_group_pack4 = 271,
+deconvolutiondepthwise_group_pack4to1 = 272,
+deconvolutiondepthwise_group_pack4to8 = 273,
+deconvolutiondepthwise_group_pack8 = 274,
+deconvolutiondepthwise_group_pack8to1 = 275,
+deconvolutiondepthwise_group_pack8to4 = 276,
+deconvolutiondepthwise_pack4 = 277,
+deconvolutiondepthwise_pack8 = 278,
+shufflechannel = 279,
+shufflechannel_pack4 = 280,
+shufflechannel_pack8 = 281,
+instancenorm_coeffs = 282,
+instancenorm_coeffs_pack4 = 283,
+instancenorm_coeffs_pack8 = 284,
+instancenorm_norm = 285,
+instancenorm_norm_pack4 = 286,
+instancenorm_norm_pack8 = 287,
+instancenorm_reduce_mean = 288,
+instancenorm_reduce_mean_pack4 = 289,
+instancenorm_reduce_mean_pack8 = 290,
+instancenorm_reduce_sum4_fp16_to_fp32 = 291,
+instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 292,
+instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 293,
+instancenorm_reduce_sum4_fp32 = 294,
+instancenorm_reduce_sum4_fp32_pack4 = 295,
+instancenorm_reduce_sum4_fp32_pack8 = 296,
+instancenorm_sub_mean_square = 297,
+instancenorm_sub_mean_square_pack4 = 298,
+instancenorm_sub_mean_square_pack8 = 299,
+clip = 300,
+clip_pack4 = 301,
+clip_pack8 = 302,
+reorg = 303,
+reorg_pack1to4 = 304,
+reorg_pack1to8 = 305,
+reorg_pack4 = 306,
+reorg_pack4to8 = 307,
+reorg_pack8 = 308,
+packing = 309,
+packing_fp16_to_fp32 = 310,
+packing_fp32_to_fp16 = 311,
+packing_pack1to4 = 312,
+packing_pack1to4_fp16_to_fp32 = 313,
+packing_pack1to4_fp32_to_fp16 = 314,
+packing_pack1to8 = 315,
+packing_pack1to8_fp16_to_fp32 = 316,
+packing_pack1to8_fp32_to_fp16 = 317,
+packing_pack4 = 318,
+packing_pack4_fp16_to_fp32 = 319,
+packing_pack4_fp32_to_fp16 = 320,
+packing_pack4to1 = 321,
+packing_pack4to1_fp16_to_fp32 = 322,
+packing_pack4to1_fp32_to_fp16 = 323,
+packing_pack4to8 = 324,
+packing_pack4to8_fp16_to_fp32 = 325,
+packing_pack4to8_fp32_to_fp16 = 326,
+packing_pack8 = 327,
+packing_pack8_fp16_to_fp32 = 328,
+packing_pack8_fp32_to_fp16 = 329,
+packing_pack8to1 = 330,
+packing_pack8to1_fp16_to_fp32 = 331,
+packing_pack8to1_fp32_to_fp16 = 332,
+packing_pack8to4 = 333,
+packing_pack8to4_fp16_to_fp32 = 334,
+packing_pack8to4_fp32_to_fp16 = 335,
+cast_fp16_to_fp32 = 336,
+cast_fp16_to_fp32_pack4 = 337,
+cast_fp16_to_fp32_pack8 = 338,
+cast_fp32_to_fp16 = 339,
+cast_fp32_to_fp16_pack4 = 340,
+cast_fp32_to_fp16_pack8 = 341,
+hardsigmoid = 342,
+hardsigmoid_pack4 = 343,
+hardsigmoid_pack8 = 344,
+hardswish = 345,
+hardswish_pack4 = 346,
+hardswish_pack8 = 347,
+pixelshuffle = 348,
+pixelshuffle_pack4 = 349,
+pixelshuffle_pack4to1 = 350,
+pixelshuffle_pack8 = 351,
+pixelshuffle_pack8to1 = 352,
+pixelshuffle_pack8to4 = 353,
+deepcopy = 354,
+deepcopy_pack4 = 355,
+deepcopy_pack8 = 356,
+mish = 357,
+mish_pack4 = 358,
+mish_pack8 = 359,
+swish = 360,
+swish_pack4 = 361,
+swish_pack8 = 362,
+convert_ycbcr = 363,
+vulkan_activation = 364,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType {
+enum LayerType // enumerators are pulled in from the generated layer_type_enum.h included below
+{
+#include "layer_type_enum.h"
+    CustomBit = (1 << 8), // bit 8 (value 256); NOTE(review): presumably marks custom layer type ids - confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..581d589
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/layer_type_enum.h
@@ -0,0 +1,103 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the matrix with up to four extents (w, h, d, c); the rank is kept in dims
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec (w)
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image (w, h)
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim (w, h, c)
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube (w, h, d, c)
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec (w) with explicit elempack
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image (w, h) with explicit elempack
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim (w, h, c) with explicit elempack
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube (w, h, d, c) with explicit elempack
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy constructor
+    Mat(const Mat& m);
+    // external vec (w) over a caller-supplied data pointer
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image (w, h) over a caller-supplied data pointer
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim (w, h, c) over a caller-supplied data pointer
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube (w, h, d, c) over a caller-supplied data pointer
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec (w) with explicit elempack
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image (w, h) with explicit elempack
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim (w, h, c) with explicit elempack
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube (w, h, d, c) with explicit elempack
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // destructor (release)
+    ~Mat();
+    // copy assignment
+    Mat& operator=(const Mat& m);
+    // set all (fill), with overloads for scalar and platform SIMD vector types
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like another Mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like a VkMat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16, // conversions below are encoded as from | (to << PIXEL_CONVERT_SHIFT)
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+#if NCNN_VULKAN
+
+// the matrix with up to four extents (w, h, d, c), vulkan device buffer version
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec (w)
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (w) with explicit elempack
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (w, h) with explicit elempack
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (w, h, c) with explicit elempack
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (w, h, d, c) with explicit elempack
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy constructor
+    VkMat(const VkMat& m);
+    // external vec (w) over a caller-supplied VkBufferMemory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image (w, h) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim (w, h, c) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube (w, h, d, c) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec (w) with explicit elempack
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image (w, h) with explicit elempack
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim (w, h, c) with explicit elempack
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube (w, h, d, c) with explicit elempack
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // destructor (release)
+    ~VkMat();
+    // copy assignment
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like another VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+class NCNN_EXPORT VkImageMat
+{
+public:
+    // empty
+    VkImageMat();
+    // vec (w)
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (w) with explicit elempack
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (w, h) with explicit elempack
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (w, h, c) with explicit elempack
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (w, h, d, c) with explicit elempack
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy constructor
+    VkImageMat(const VkImageMat& m);
+    // external vec (w) over a caller-supplied VkImageMemory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image (w, h) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim (w, h, c) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube (w, h, d, c) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec (w) with explicit elempack
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image (w, h) with explicit elempack
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim (w, h, c) with explicit elempack
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube (w, h, d, c) with explicit elempack
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // destructor (release)
+    ~VkImageMat();
+    // copy assignment
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like another VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants; each union shares storage between its int/float (and uint32_t) members
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize from srcw x srch to w x h, c1..c4 variants
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with source and destination stride(bytes-per-row) parameters
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation (1-8, pictured below), e.g. 6 means rotating from orientation 6 to 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with source and destination stride(bytes-per-row) parameters
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float's bit pattern as an integer and keep only the bits above the low 16 (plain truncation)
+    union
+    {
+        float as_float;
+        unsigned int as_bits;
+    } pun;
+    pun.as_float = value;
+    return pun.as_bits >> 16;
+}
+// convert brain half to float
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // the 16 input bits become the upper half of the float's bit pattern; the lower 16 bits are zero
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = (unsigned int)value << 16; // widen first: value would otherwise promote to int and a set top bit would be shifted into the sign bit
+    return tmp.f;
+}
+
+// mat process
+enum BorderType // border handling modes; NOTE(review): presumably the `type` argument of copy_make_border / copy_make_border_3d - confirm
+{
+    BORDER_CONSTANT = 0, // name suggests the border is filled with a constant value - confirm against the implementation
+    BORDER_REPLICATE = 1, // name suggests edge values are repeated outward - confirm against the implementation
+    BORDER_REFLECT = 2, // name suggests the content is mirrored at the edge - confirm against the implementation
+    BORDER_TRANSPARENT = -233, // same -233 the warpaffine comments in this header describe as "transparent border color"
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat() // default constructor: an empty Mat with every field zeroed and no storage attached
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator) // 1-D allocating constructor: start from the zeroed empty state, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator) // 2-D allocating constructor: zeroed state, then create(w, h, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) // 3-D allocating constructor: zeroed state, then create(w, h, c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator) // 4-D allocating constructor: zeroed state, then create(w, h, d, c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) // 1-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) // 2-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // 3-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // 4-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m) // shallow copy: copies every field of m, so both objects point at the same data and counter
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref(); // bump the shared counter (addref() does nothing when refcount is null)
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator) // 1-D wrapper over caller-provided memory; refcount(0) means release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // single channel: the step is just the width
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) // 2-D wrapper over caller-provided memory, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widened to size_t before multiplying
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // 3-D wrapper over caller-provided memory, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // per-channel step in elements: byte size passed through alignSize(..., 16) - presumably rounds up to a 16-byte multiple, confirm
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // 4-D wrapper over caller-provided memory, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // per-channel step covers w*h*d elements, passed through alignSize(..., 16) on the byte size
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 1-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // single channel: the step is just the width
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 2-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widened to size_t before multiplying
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 3-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // per-channel step in elements, byte size passed through alignSize(..., 16)
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 4-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // per-channel step covers w*h*d elements, byte size passed through alignSize(..., 16)
+}
+
+NCNN_FORCEINLINE Mat::~Mat() // destructor: all cleanup is delegated to release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    const int count = (int)total();
+    float* dst = (float*)data;
+
+    int k = 0;
+#if __ARM_NEON
+    const float32x4_t _splat = vdupq_n_f32(_v);
+    for (; k + 3 < count; k += 4)
+    {
+        // one 4-lane store per iteration while at least four elements remain
+        vst1q_f32(dst + k, _splat);
+    }
+#endif // __ARM_NEON
+    for (; k < count; k++)
+    {
+        dst[k] = _v;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    const int count = (int)total();
+    int* dst = (int*)data;
+
+    int k = 0;
+#if __ARM_NEON
+    const int32x4_t _splat = vdupq_n_s32(_v);
+    for (; k + 3 < count; k += 4)
+    {
+        // one 4-lane store per iteration while at least four elements remain
+        vst1q_s32(dst + k, _splat);
+    }
+#endif // __ARM_NEON
+    for (; k < count; k++)
+    {
+        dst[k] = _v;
+    }
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v) // writes the 4-float vector _v total() times; total() therefore counts 4-float groups here (presumably elempack 4 - confirm)
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f32(ptr, _v);
+        ptr += 4; // advance by the four floats just stored
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v) // writes the 4 x 16-bit vector _v total() times
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_u16(ptr, _v);
+        ptr += 4; // advance by the four 16-bit values just stored
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v) // writes the 4 x int32 vector _v total() times
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v);
+        ptr += 4; // advance by the four ints just stored
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1) // writes the pair (_v0, _v1) - eight ints - total() times
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v0); // first four ints of the group
+        vst1q_s32(ptr + 4, _v1); // last four ints of the group
+        ptr += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v) // writes the 4 x fp16 vector _v total() times
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_f16(ptr, _v);
+        ptr += 4; // advance by the four halves just stored
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v) // writes the 8 x fp16 vector _v total() times
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f16(ptr, _v);
+        ptr += 8; // advance by the eight halves just stored
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v) // writes the 16-float vector _v total() times using unaligned stores
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm512_storeu_ps(ptr, _v);
+        ptr += 16; // advance by the sixteen floats just stored
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i) // writes the 8-float vector _v total() times; _i is ignored
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm256_storeu_ps(ptr, _v);
+        ptr += 8; // advance by the eight floats just stored
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // writes the 4-float vector _v total() times using unaligned stores
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_storeu_ps(ptr, _v);
+        ptr += 4; // advance by the four floats just stored
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v) // writes the 128-bit pattern _v total() times, eight unsigned shorts (16 bytes) per step
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_storeu_si128((__m128i*)ptr, _v); // unaligned store, matching the __m128 overload: a Mat wrapping external or offset memory need not be 16-byte aligned
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v) // MIPS MSA: writes the 4-float vector _v total() times
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __msa_st_w((v4i32)_v, ptr, 0); // the vector is cast to v4i32 for the word-store intrinsic
+        ptr += 4; // advance by the four floats just stored
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // LoongArch LSX: writes the 4-float vector _v total() times
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __lsx_vst(_v, ptr, 0);
+        ptr += 4; // advance by the four floats just stored
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) // RISC-V vector: writes _v total() times, packn floats per store
+{
+    const int packn = cpu_riscv_vlenb() / 4; // lanes per store; assumes cpu_riscv_vlenb() reports the vector length in bytes - TODO confirm
+    const size_t vl = vsetvl_e32m1(packn);
+
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse32_v_f32m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) // RISC-V vector: writes _v total() times, packn 16-bit values per store
+{
+    const int packn = cpu_riscv_vlenb() / 2; // lanes per store; assumes cpu_riscv_vlenb() reports the vector length in bytes - TODO confirm
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_u16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) // RISC-V vector: writes _v total() times, packn bytes per store
+{
+    const int packn = cpu_riscv_vlenb() / 1; // lanes per store; assumes cpu_riscv_vlenb() reports the vector length in bytes - TODO confirm
+    const size_t vl = vsetvl_e8m1(packn);
+
+    int size = (int)total();
+    signed char* ptr = (signed char*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse8_v_i8m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) // RISC-V vector with zfh: writes _v total() times, packn halves per store
+{
+    const int packn = cpu_riscv_vlenb() / 2; // lanes per store; assumes cpu_riscv_vlenb() reports the vector length in bytes - TODO confirm
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_f16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    // generic fallback: treat the storage as total() values of type T and assign each one
+    T* const dst = (T*)data;
+    const int count = (int)total();
+    for (int k = 0; k != count && k < count; ++k)
+        dst[k] = _v;
+    // (no vector fast path in this overload)
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m) // shallow assignment: afterwards *this has the same field values as m, sharing its data and counter
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference BEFORE dropping the old one, so data shared by both objects cannot be freed in between
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // drop whatever *this referenced before
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref() // adds one to the shared counter via NCNN_XADD
+{
+    if (refcount) // a null refcount (empty Mat or wrapper over external memory) is left alone
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release()
+{
+    // give up this object's reference; storage is freed only when NCNN_XADD reports 1 (presumably the pre-decrement count - confirm)
+    const bool last_owner = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_owner)
+    {
+        if (!allocator)
+            fastFree(data);
+        else
+            allocator->fastFree(data);
+    }
+
+    // put every field except allocator back to the empty state
+    refcount = 0;
+    data = 0;
+    cstep = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    elempack = 0;
+    elemsize = 0;
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    return !(data != 0 && total() != 0); // empty when there is no data pointer or total() is zero
+}
+
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    return (size_t)c * cstep; // channel count times per-channel step
+}
+
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize * 8) / elempack; // elemsize in bits divided by elempack; 0 when elempack is 0
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    switch (dims) // data-less Mat (null pointer) with elempack folded into the outermost dimension
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + elemsize * _c * cstep; // byte address of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // slices of a 4-D channel are packed w*h apart
+    return view;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + elemsize * _c * cstep; // byte address of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // slices of a 4-D channel are packed w*h apart
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    return Mat(w, h, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // w x h view starting z slices in
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    return Mat(w, h, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // w x h view starting z slices in
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    return (float*)((unsigned char*)data + elemsize * ((size_t)w * y)); // start of row y, rows being w elements apart
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    return (const float*)((unsigned char*)data + elemsize * ((size_t)w * y)); // start of row y, read-only
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    return (T*)((unsigned char*)data + elemsize * ((size_t)w * y)); // start of row y, viewed as T
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    return (const T*)((unsigned char*)data + elemsize * ((size_t)w * y)); // start of row y, viewed as const T
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    Mat span(w, h, d, channels, (unsigned char*)data + elemsize * _c * cstep, elemsize, elempack, allocator); // view starting at channel _c
+    span.dims = dims; // the view keeps the dimensionality of the source
+    return span;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat span(w, h, d, channels, (unsigned char*)data + elemsize * _c * cstep, elemsize, elempack, allocator); // view starting at channel _c
+    span.dims = dims; // the view keeps the dimensionality of the source
+    return span;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    Mat slab(w, h, depths, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // view starting z slices in
+    slab.cstep = (size_t)w * h; // slices are packed w*h apart, replacing the step the constructor computed
+    return slab;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat slab(w, h, depths, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // view starting z slices in
+    slab.cstep = (size_t)w * h; // slices are packed w*h apart, replacing the step the constructor computed
+    return slab;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    return Mat(w, rows, (unsigned char*)data + elemsize * ((size_t)w * y), elemsize, elempack, allocator); // w x rows view starting at row y
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    return Mat(w, rows, (unsigned char*)data + elemsize * ((size_t)w * y), elemsize, elempack, allocator); // w x rows view starting at row y
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    return Mat(n, (unsigned char*)data + elemsize * x, elemsize, elempack, allocator); // 1-D view of n elements starting at element x
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    return Mat(n, (unsigned char*)data + elemsize * x, elemsize, elempack, allocator); // 1-D view of n elements starting at element x
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*() // conversion operator: exposes the raw data pointer cast to T*
+{
+    return (T*)data;
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const // const counterpart: exposes the raw data pointer cast to const T*
+{
+    return (const T*)data;
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    return *((float*)data + i); // i-th float counted from the data pointer, no bounds check
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    return *((const float*)data + i); // i-th float counted from the data pointer, no bounds check
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat() // default constructor: an empty VkMat with every field zeroed and no buffer attached
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) // 1-D allocating constructor: start from the zeroed empty state, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // 2-D allocating constructor: zeroed state, then create(w, h, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // 3-D allocating constructor: zeroed state, then create(w, h, c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // 4-D allocating constructor: zeroed state, then create(w, h, d, c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D allocating constructor with an explicit elempack forwarded to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m) // shallow copy: both objects end up pointing at the same buffer and counter
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // bump the shared counter when refcount is non-null
+
+    cstep = m.cstep; // cstep is copied in the body rather than in the initializer list
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 1-D wrapper over an existing buffer; refcount(0) means release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 2-D wrapper over an existing buffer, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying so large w*h cannot overflow int (same form as the Mat constructors)
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 3-D wrapper over an existing buffer, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // size_t arithmetic throughout; byte size passed through alignSize(..., 16), then back to elements
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 4-D wrapper over an existing buffer, elempack fixed to 1, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // size_t arithmetic throughout; per-channel step covers w*h*d elements
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying so large w*h cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // size_t arithmetic throughout; byte size passed through alignSize(..., 16), then back to elements
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D wrapper with caller-specified elempack, not reference counted
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // size_t arithmetic throughout; per-channel step covers w*h*d elements
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat() // destructor: all cleanup is delegated to release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m) // shallow assignment: afterwards *this has the same field values as m, sharing its buffer and counter
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference BEFORE dropping the old one, so a buffer shared by both objects cannot be freed in between
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // drop whatever *this referenced before
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const // host-side Mat over the mapped buffer memory; built with refcount 0 and a null allocator, so it does not own the memory
+{
+    if (!allocator || !data || !allocator->mappable) // default-constructed/released VkMat or non-mappable allocator: return an empty Mat instead of dereferencing null
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // unsupported dims value
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const // host address of this VkMat's bytes: data->mapped_ptr plus data->offset, or 0 when unavailable
+{
+    if (!allocator || !data || !allocator->mappable) // guard both dereferences below: a default-constructed or released VkMat has no allocator/buffer
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset;
+}
+
+NCNN_FORCEINLINE void VkMat::addref() // adds one to the shared counter via NCNN_XADD
+{
+    if (refcount) // a null refcount (empty VkMat or wrapper over an external buffer) is left alone
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release()
+{
+    // give up one reference; only the holder that observes a count of 1 frees the buffer
+    // (assumes NCNN_XADD returns the value before the decrement - TODO confirm)
+    // external (refcount == 0) data is never freed here
+    const bool last_holder = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_holder && allocator && data)
+        allocator->fastFree(data);
+
+    // detach from the storage; allocator is not reset here
+    data = 0;
+    refcount = 0;
+
+    // return to the empty shape
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    return !data || !total(); // no storage attached, or zero elements
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return c * cstep; // channel count times the per-channel step
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // bits per packed lane; 0 when elempack is 0
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    // shape-only Mat: (void*)0 is passed as the data pointer and the
+    // outermost axis is multiplied by elempack; any other dims gives Mat()
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: return Mat();
+    }
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    return data->buffer; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return data->offset; // same offset mapped_ptr() adds to the mapping base; data must be non-null
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return data->capacity; // capacity recorded in the buffer memory record; data must be non-null
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // empty mat: no storage, no counter, no allocator, dims == 0
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 1-D request to create()
+    create(_w, _elemsize, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 2-D request to create()
+    create(_w, _h, _elemsize, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 3-D request to create()
+    create(_w, _h, _c, _elemsize, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 4-D request to create()
+    create(_w, _h, _d, _c, _elemsize, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 1-D request (explicit elempack) to create()
+    create(_w, _elemsize, _elempack, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 2-D request (explicit elempack) to create()
+    create(_w, _h, _elemsize, _elempack, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 3-D request (explicit elempack) to create()
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // start from the empty state, then hand the 4-D request (explicit elempack) to create()
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // create() is defined outside this section
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{ // copies m's fields so both mats point at the same image memory
+    addref(); // bumps the shared counter, if m has one
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // 1-D mat over caller-provided image memory, elempack fixed to 1; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // 2-D mat over caller-provided image memory, elempack fixed to 1; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // 3-D mat over caller-provided image memory, elempack fixed to 1; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // 4-D mat over caller-provided image memory, elempack fixed to 1; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // 1-D mat over caller-provided image memory with explicit elempack; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // 2-D mat over caller-provided image memory with explicit elempack; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // 3-D mat over caller-provided image memory with explicit elempack; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // 4-D mat over caller-provided image memory with explicit elempack; refcount 0 means release() will not free _data
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release(); // drops this mat's reference; release() frees the image only for the last counted owner
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (&m != this) // self-assignment leaves everything untouched
+    {
+        // take the reference on m's image first, then let go of the current one
+        if (m.refcount != 0)
+            NCNN_XADD(m.refcount, 1);
+        release();
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+
+        // storage, shared with m rather than copied
+        data = m.data;
+        refcount = m.refcount;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+        allocator = m.allocator;
+    }
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    // builds a Mat from mapped_ptr() with this mat's shape; returns an empty Mat when nothing can be mapped
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // null guards: a default-constructed mat has neither pointer
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+    return Mat(); // dims outside 1..4
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // empty/released mats have null pointers; check before use
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // mapping base advanced by the image's bind offset
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (!refcount) return; // no counter attached (e.g. external data): nothing to bump
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    // give up one reference; only the holder that observes a count of 1 frees the image
+    // (assumes NCNN_XADD returns the value before the decrement - TODO confirm)
+    // external (refcount == 0) data is never freed here
+    const bool last_holder = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_holder && allocator && data)
+        allocator->fastFree(data);
+
+    // detach from the storage; allocator is not reset here
+    data = 0;
+    refcount = 0;
+
+    // return to the empty shape
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return !data || !total(); // no storage attached, or zero elements
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return (size_t)w * h * d * c; // widen first: the product used to be computed in int and could overflow
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // bits per packed lane; 0 when elempack is 0
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    // shape-only Mat: (void*)0 is passed as the data pointer and the
+    // outermost axis is multiplied by elempack; any other dims gives Mat()
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: return Mat();
+    }
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview; // no null check: data must be non-null when this is called
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/modelbin.h
new file mode 100644
index 0000000..15d2b9c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin // abstract weight source: subclasses must implement the 1-D load()
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // values accepted by the `type` argument of every load() overload:
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D blob (w); pure virtual
+    virtual Mat load(int w, int type) const = 0;
+    // load a 2-D blob (w, h); virtual but not pure
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob (w, h, c); virtual but not pure
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob (w, h, d, c); virtual but not pure
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin // ModelBin constructed from a DataReader
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // overrides the pure virtual 1-D load() of ModelBin
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // copy operations are private: not copyable from outside
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to the forward-declared private class; its layout is not visible here
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin // ModelBin constructed from an array of Mat
+{
+public:
+    // construct from weight blob array (no element count is passed alongside the pointer)
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // overrides the pure virtual 1-D load() of ModelBin
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // copy operations are private: not copyable from outside
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to the forward-declared private class; its layout is not visible here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE /* with NCNN_STATIC_DEFINE set, both visibility macros expand to nothing */
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library (same expansion as when building it) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED /* each macro below is only defined when the includer has not defined it already */
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED: compiled out, so this header never defines NCNN_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/net.h
new file mode 100644
index 0000000..9407042
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/net.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // construct an empty net
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const; // NOTE(review): presumably the device chosen via set_vulkan_device() - confirm
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr); // plain-text param variant, only declared when NCNN_STRING is enabled
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the overloads above, which return 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must stay alive while the net is in use
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenience: load network structure from an android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenience: load network structure from an android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenience: load network weight data from an android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const; // read-only access to the blob list
+    const std::vector<Layer*>& layers() const; // read-only access to the layer list
+
+    std::vector<Blob>& mutable_blobs(); // non-const counterpart of blobs()
+    std::vector<Layer*>& mutable_layers(); // non-const counterpart of layers()
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+
+private:
+    Net(const Net&); // copy operations are private: a Net cannot be copied from outside the class
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d; // pointer to the forward-declared NetPrivate; its layout is not visible here
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable); // per-extractor switch; declared only in vulkan builds
+
+    void set_blob_vkallocator(VkAllocator* allocator); // vulkan counterpart of set_blob_allocator()
+
+    void set_workspace_vkallocator(VkAllocator* allocator); // vulkan counterpart of set_workspace_allocator()
+
+    void set_staging_vkallocator(VkAllocator* allocator); // allocator for staging memory
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name (buffer-backed gpu mat)
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name (buffer-backed gpu mat)
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name (image-backed gpu mat)
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name (image-backed gpu mat)
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index (buffer-backed gpu mat)
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index (buffer-backed gpu mat)
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index (image-backed gpu mat)
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index (image-backed gpu mat)
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // lets Net::create_extractor() reach the protected constructor below
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the forward-declared ExtractorPrivate; its layout is not visible here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/option.h
new file mode 100644
index 0000000..3fda808
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/option.h
@@ -0,0 +1,153 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // bundle of public tuning fields; the only member function declared is the default constructor
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // not documented in this header
+
+    // subgroup options
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // reserved slot; no meaning is documented in this header
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // not documented in this header
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    bool use_reserved_6; // reserved slots 6..11; no meaning is documented in this header
+    bool use_reserved_7;
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // parameters addressed by integer id, readable as int, float or Mat
+{
+public:
+    // construct an empty dictionary
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get the type recorded for parameter id (the type codes are not listed in this header)
+    int type(int id) const;
+
+    // get int, with def as the caller-supplied default
+    int get(int id, int def) const;
+    // get float, with def as the caller-supplied default
+    float get(int id, float def) const;
+    // get array, with def as the caller-supplied default
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net may call the protected clear()/load_param*() members below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // pointer to the forward-declared ParamDictPrivate; its layout is not visible here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // declared only when NCNN_VULKAN is enabled; tied to one VulkanDevice
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); // all three arguments default to 4
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // from a caller-supplied SPIR-V blob
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // from a shader type index plus options
+
+public:
+    VkShaderModule shader_module() const; // const accessors for the vulkan handles of this pipeline
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module); // protected setters matching the accessors above
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // public device pointer; same type as the constructor argument
+
+private:
+    Pipeline(const Pipeline&); // copy operations are private: a Pipeline cannot be copied from outside the class
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // pointer to the forward-declared PipelinePrivate; its layout is not visible here
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // private inheritance: the Pipeline interface is not exposed to users
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with an explicit target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute gets access to the privately inherited Pipeline members
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // same names as the create() arguments; value meanings are not documented in this header
+    int rotate_from;
+    bool need_resize; // NOTE(review): presumably set by the create() overload that takes a target size - confirm
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // state is held behind the opaque PipelineCachePrivate pointer d
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+    // Overload taking a raw spv_data buffer (spv_data / spv_data_size); the Vk* pointers are presumably outputs - confirm in pipelinecache.cpp
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+    // Overload taking a shader_type_index and an Option instead of a raw spv_data buffer
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private: // copy construction and copy assignment are declared private, so outside code cannot copy a cache
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d;
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/platform.h
new file mode 100644
index 0000000..a353fd1
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/platform.h
@@ -0,0 +1,285 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0
+#define NCNN_THREADS 1
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 1
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_AVX 0
+#define NCNN_XOP 0
+#define NCNN_FMA 0
+#define NCNN_F16C 0
+#define NCNN_AVX2 0
+#define NCNN_AVXVNNI 0
+#define NCNN_AVX512 0
+#define NCNN_AVX512VNNI 0
+#define NCNN_AVX512BF16 0
+#define NCNN_AVX512FP16 0
+#define NCNN_VFPV4 1
+#if __aarch64__
+#define NCNN_ARM82 1
+#define NCNN_ARM82DOT 1
+#define NCNN_ARM82FP16FML 1
+#define NCNN_ARM84BF16 1
+#define NCNN_ARM84I8MM 1
+#define NCNN_ARM86SVE 1
+#define NCNN_ARM86SVE2 1
+#define NCNN_ARM86SVEBF16 1
+#define NCNN_ARM86SVEI8MM 1
+#define NCNN_ARM86SVEF32MM 1
+#endif // __aarch64__
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1
+
+#define NCNN_VERSION_STRING "1.0.20221128"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Win32 (non-MinGW) variant: exclusive lock built on an SRWLOCK
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // empty: no cleanup call is made for the SRWLOCK
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes srwlock to the OS directly
+    // NOTE SRWLock is available from windows vista
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Win32 (non-MinGW) variant wrapping a CONDITION_VARIABLE
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // empty: no cleanup call is made
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // waits with no timeout on the mutex's SRW lock
+    void broadcast() { WakeAllConditionVariable(&condvar); }
+    void signal() { WakeConditionVariable(&condvar); }
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // forward declaration; defined as a friend inside Thread below
+class NCNN_EXPORT Thread // Win32 (non-MinGW) variant: starts the thread in the constructor via _beginthreadex
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); }
+    ~Thread() {} // empty: the handle is only closed by join()
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); }
+private:
+    friend unsigned __stdcall start_wrapper(void* args) // adapts the _beginthreadex entry signature to the void*(*)(void*) callback
+    {
+        Thread* t = (Thread*)args;
+        t->_start(t->_args); // the callback's return value is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Win32 (non-MinGW) variant: one TLS slot holding a void*
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the TlsAlloc result is stored without being checked
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread variant: thin wrapper over pthread_mutex_t
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0 = no attribute object passed
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); } // return codes of the pthread calls are ignored
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes the raw pthread_mutex_t
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread variant: thin wrapper over pthread_cond_t
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // 0 = no attribute object passed
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // untimed wait on the given Mutex
+    void broadcast() { pthread_cond_broadcast(&cond); }
+    void signal() { pthread_cond_signal(&cond); }
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread variant: starts the thread in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // pthread_create's result is not checked
+    ~Thread() {} // empty: neither joins nor detaches
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread variant: one pthread key holding a void*
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // 0 = no destructor callback registered for stored values
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // NCNN_THREADS disabled: same interface, every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // does nothing
+    void unlock() {} // does nothing
+};
+
+class NCNN_EXPORT ConditionVariable // NCNN_THREADS disabled: same interface, every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately
+    void broadcast() {}
+    void signal() {}
+};
+
+class NCNN_EXPORT Thread // NCNN_THREADS disabled: placeholder with the same interface
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the start callback is never invoked
+    ~Thread() {}
+    void join() {} // returns immediately
+};
+
+class NCNN_EXPORT ThreadLocalStorage // NCNN_THREADS disabled: a single plain pointer stands in for the slot
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data; // the one stored value (not per-thread in this variant)
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scope guard: locks in the constructor, unlocks in the destructor
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // reference to the guarded Mutex; it must outlive the guard
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO
+#include <stdio.h> // fprintf/stderr are used by both NCNN_LOGE variants below, so include it for both
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else // NCNN_STDIO disabled: NCNN_LOGE expands to nothing
+#define NCNN_LOGE(...)
+#endif // NCNN_STDIO
+
+
+#if NCNN_FORCE_INLINE // pick the strongest inline hint the compiler offers
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang's predefined macro is lowercase; __CLANG__ is never defined
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else // unknown compiler: plain inline
+    #define NCNN_FORCEINLINE inline
+#endif
+#else // NCNN_FORCE_INLINE disabled: plain inline
+    #define NCNN_FORCEINLINE inline
+#endif // NCNN_FORCE_INLINE
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..55ede15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleocv.h
@@ -0,0 +1,501 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp> // generic case: plain conversion of the int to _Tp, no clamping
+static inline _Tp saturate_cast(int v)
+{
+    return _Tp(v);
+}
+template<> // uchar specialization: clamps v into [0, UCHAR_MAX]
+inline uchar saturate_cast<uchar>(int v)
+{
+    return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); // unsigned compare catches negatives and overflow in one test
+}
+
+template<typename _Tp> // fixed four-element value holder; unspecified elements are zero
+struct Scalar_
+{
+    Scalar_() // all four elements zero
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // first element set, remaining three zero
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2) // first three elements set, fourth zero
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3) // all four elements set
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+    // element access; i is not range-checked
+    const _Tp operator[](const int i) const
+    {
+        return v[i];
+    }
+    // non-const overload also returns by value, so it cannot be used to assign an element
+    _Tp operator[](const int i)
+    {
+        return v[i];
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp> // 2D point with public x and y
+struct Point_
+{
+    Point_() // origin
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+    // conversion to another element type; saturate_cast takes an int, so each coordinate is converted to int on the way
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp> // 2D extent with public width and height
+struct Size_
+{
+    Size_() // zero-sized
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+    // conversion to another element type; saturate_cast takes an int, so each field is converted to int on the way
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp> // axis-aligned rectangle stored as origin (x, y) plus width and height
+struct Rect_
+{
+    Rect_() // all fields zero
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+    // conversion to another element type; saturate_cast takes an int, so each field is converted to int on the way
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area: width * height, computed in _Tp
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp> // in-place intersection of a with b
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    _Tp x1 = std::max(a.x, b.x), y1 = std::max(a.y, b.y); // top-left corner of the overlap
+    a.width = std::min(a.x + a.width, b.x + b.width) - x1; // must run before a.x is overwritten below
+    a.height = std::min(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    if (a.width <= 0 || a.height <= 0) // no overlap: collapse to the all-zero rectangle
+        a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp> // in-place bounding box of a and b; an empty operand is not treated specially
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    _Tp x1 = std::min(a.x, b.x), y1 = std::min(a.y, b.y); // top-left corner of the bounding box
+    a.width = std::max(a.x + a.width, b.x + b.width) - x1; // must run before a.x is overwritten below
+    a.height = std::max(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    return a;
+}
+
+template<typename _Tp> // intersection of a and b as a new rectangle; operands are left untouched
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> c = a; // work on a copy
+    return c &= b;
+}
+
+template<typename _Tp> // bounding box of a and b as a new rectangle; operands are left untouched
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> c = a; // work on a copy
+    return c |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // Mat stores this flag value directly as its channel count c
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE: same numeric value as CV_8UC4, so Mat cannot tell the two apart
+
+struct NCNN_EXPORT Mat // minimal reference-counted pixel buffer; c holds the per-pixel element count
+{
+    Mat() // empty matrix: no data, no refcount
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // allocating constructor; flags is stored as c by create()
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy: shares m's buffer and bumps its reference count (no pixel copy)
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+    // wraps caller-provided memory: refcount stays 0, so release() never frees _data
+    Mat(int _rows, int _cols, int flags, void* _data)
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: shares m's buffer; m's count is bumped before our old buffer is released
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+    // fill: writes s[0..c-1] into every pixel; nothing happens when total() is 0
+    Mat& operator=(const Scalar& s)
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+    // (re)allocate: drops the current buffer, then allocates total() bytes plus a trailing refcount
+    void create(int _rows, int _cols, int flags)
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2; // round up to a multiple of 4
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize); // counter lives right after the pixel bytes
+            *refcount = 1;
+        }
+    }
+    // drop this reference and reset every field; frees only when NCNN_XADD reports 1 (presumably the pre-decrement count)
+    void release()
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+    // deep copy into a freshly allocated Mat of the same rows/cols/c
+    Mat clone() const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+    // true when there is no buffer or the element count is zero
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+    // type() and channels() both return c
+    int type() const
+    {
+        return c;
+    }
+    // the product is evaluated in int and then converted to size_t
+    size_t total() const
+    {
+        return cols * rows * c;
+    }
+    // start of row y, assuming rows are stored contiguously at cols * c elements each; y is not range-checked
+    const uchar* ptr(int y) const
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+    // typed variants: the offset y * cols * c is applied in units of _Tp, not bytes
+    template<typename _Tp>
+    const _Tp* ptr(int y) const
+    {
+        return (const _Tp*)data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)data + y * cols * c;
+    }
+
+    // roi: returns a newly allocated copy of the region; roi is not checked against this Mat's bounds
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c; // first byte of the region in source row sy
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c); // one region row per iteration
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c;
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T> // larger of a and b by operator<; returns a when neither is less than the other
+const T& max(const T& a, const T& b)
+{
+    return (a < b) ? b : a;
+}
+
+template<typename T> // smaller of a and b by operator>; returns a when a is not greater than b
+const T& min(const T& a, const T& b)
+{
+    return (a > b) ? b : a;
+}
+
+template<typename T> // exchanges a and b through one copy construction and two copy assignments
+void swap(T& a, T& b)
+{
+    T temp(a); // keep a's old value
+    a = b;
+    b = temp;
+}
+
+template<typename T1, typename T2> // two-member aggregate with public first and second
+struct pair
+{
+    pair() // value-initializes both members
+        : first(), second()
+    {
+    }
+    pair(const T1& t1, const T2& t2) // copies both arguments
+        : first(t1), second(t2)
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2> // equal when both members compare equal
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2> // lexicographic order: first decides, second breaks ties; uses only operator<
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2> // negation of operator==
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x == y);
+}
+template<typename T1, typename T2> // operator< with the operands swapped
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return y < x;
+}
+template<typename T1, typename T2> // true unless y < x
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(y < x);
+}
+template<typename T1, typename T2> // true unless x < y
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2> // builds a pair<T1, T2> holding copies of t1 and t2
+pair<T1, T2> make_pair(const T1& t1, const T2& t2)
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T> // doubly linked list node used by list and iter_list
+struct node
+{
+    node* prev_; // previous node, 0 when there is none
+    node* next_; // next node, 0 when there is none
+    T data_;
+
+    node() // unlinked node with a value-initialized payload
+        : prev_(0), next_(0), data_()
+    {
+    }
+    node(const T& t) // unlinked node holding a copy of t
+        : prev_(0), next_(0), data_(t)
+    {
+    }
+};
+
+template<typename T> // bidirectional iterator over list<T>: a thin wrapper around a node pointer
+struct iter_list
+{
+    iter_list() // iterator holding a null node pointer
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list() // the iterator does not own the node
+    {
+    }
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+    // element access; curr_ is dereferenced without a null check
+    T& operator*()
+    {
+        return curr_->data_;
+    }
+    T* operator->()
+    {
+        return &(curr_->data_);
+    }
+    // iterators compare equal when they point at the same node
+    bool operator==(const iter_list& i)
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i)
+    {
+        return curr_ != i.curr_;
+    }
+    // only the prefix forms of ++ and -- are provided
+    iter_list& operator++()
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--()
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_;
+};
+
+template<typename T> // doubly linked list; tail_ is a permanent sentinel node that serves as end()
+struct list
+{
+    typedef iter_list<T> iterator;
+
+    list() // empty list: head_ and tail_ both point at the sentinel
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // after clear() head_ is the sentinel again
+    }
+    list(const list& l) // deep copy: fresh sentinel, then every element of l is appended
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+    // copy assignment: self-assignment is a no-op; otherwise clear and re-append l's elements
+    list& operator=(const list& l)
+    {
+        if (this == &l)
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+    // removes every element; the sentinel node is kept
+    void clear()
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+    // removes the first element; does nothing on an empty list
+    void pop_front()
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_; // advance first, then free the old head through prev_
+            delete head_->prev_;
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const // equals end() when the list is empty
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // iterator at the sentinel node
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+    // appends a copy of t just before the sentinel
+    void push_back(const T& t)
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // tail_ still references the sentinel, so it is not lost
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t);
+            temp->prev_ = tail_->prev_;
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+    // removes the element at pos and returns an iterator to its successor; end() is returned unchanged
+    iter_list<T> erase(iter_list<T> pos)
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos;
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // step back so pos stays valid while temp is unlinked
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos; // now lands on temp's former successor
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_;
+    node<T>* tail_;
+    size_t count_;
+};
+
+template<typename T> // comparison functor forwarding to operator>
+struct greater
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y); // true when x is greater than y
+    }
+};
+
+template<typename T> // comparison functor forwarding to operator<
+struct less
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y); // true when x is less than y
+    }
+};
+
+template<typename RandomAccessIter, typename Compare> // orders [first, middle) as the comp-smallest elements of [first, last)
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // one bubble pass moves the smallest of [i, last) into slot i; [first, i) is already final, so stop at i
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+            {
+                swap(*j, *(j - 1));
+            }
+        }
+    }
+}
+
+template<typename T>
+struct vector
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T())
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v)
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0);
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T())
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+                new (&data_[i]) T(value);
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+                data_[i].~T();
+            // try_alloc() only zero-fills freshly allocated storage, yet string::c_str()
+            // depends on the byte after the last element being 0. Wipe the vacated
+            // slots so a shrunken (or reassigned) buffer does not expose its old tail.
+            memset(static_cast<void*>(data_ + new_size), 0, (size_ - new_size) * sizeof(T));
+        }
+        size_ = new_size;
+    }
+
+    void clear()
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_;
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t)
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e)
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            //the same vector
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin();
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T));
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b;
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos)
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        memset(static_cast<void*>(&data_[--size_]), 0, sizeof(T)); // zero the vacated slot: memmove left a stale copy of the old last element there
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+    void try_alloc(size_t new_size)
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T));
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_);
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char>
+{
+    // An empty string owns no buffer (data_ == 0); c_str() maps that case to "" so the
+    // comparison operators below never pass a null pointer to strcmp().
+    string() {}
+    string(const char* str)
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) memcpy(data_, str, len); // resize(0) on a fresh string leaves data_ null
+    }
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : "";
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0;
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1)
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string str(str1);
+    str.insert(str.end(), str2.begin(), str2.end());
+    return str;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..e7a7e8e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix build with old vulkan sdk
+
+#if VK_HEADER_VERSION < 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release"
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn )
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..53b9fae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.8.3...3.23)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets)
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "")
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;Vulkan::Vulkan;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration.
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..abb2dd6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON)
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN ON)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP)
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB)
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "")
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..2ae00de
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/arm64-v8a/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20221128
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// we have some optimized kernels that may overread buffer a bit in loop
+// it is common to interleave next-loop data load with arithmetic instructions
+// allocating more bytes keeps us safe from SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Aligns a pointer to the specified number of bytes
+// ptr Aligned pointer
+// n Alignment size that must be a power of two
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n;
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size)
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
+    adata[-1] = udata;
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr)
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1];
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// exchange-add operation for atomic operations on reference counters
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0;
+    virtual void fastFree(void* ptr) = 0;
+};
+
+class PoolAllocatorPrivate;
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&);
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d;
+};
+
+class UnlockedPoolAllocatorPrivate;
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d;
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base: the buffer and image fastMalloc/fastFree overloads are pure virtual
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr); // NOTE(review): int result, presumably 0 on success - confirm
+    virtual int invalidate(VkBufferMemory* ptr); // NOTE(review): int result, presumably 0 on success - confirm
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image overload
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected: // helpers that return raw Vulkan handles; not part of the public interface
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate;
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // default 16 MiB
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private: // copy operations are declared private, so callers cannot copy this allocator
+    VkBlobAllocator(const VkBlobAllocator&);
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+class VkWeightAllocatorPrivate;
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // default 8 MiB
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private: // copy operations are declared private, so callers cannot copy this allocator
+    VkWeightAllocator(const VkWeightAllocator&);
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+class VkStagingAllocatorPrivate;
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio (scr), range 0 ~ 1
+    // default scr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private: // copy operations are declared private, so callers cannot copy this allocator
+    VkStagingAllocator(const VkStagingAllocator&);
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+class VkWeightStagingAllocatorPrivate;
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private: // copy operations are declared private, so callers cannot copy this allocator
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private: // copy operations are declared private, so callers cannot copy this allocator
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init(); // NOTE(review): int result, presumably 0 on success - confirm
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb; // NOTE(review): presumably the _hb passed to the constructor; ownership is not visible here
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/benchmark.h
new file mode 100644
index 0000000..3d5c0cd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/benchmark.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get the current timestamp in ms
+NCNN_EXPORT double get_current_time();
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // empty
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer which produces this blob as output
+    int producer;
+    // index of the layer which needs this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/c_api.h
new file mode 100644
index 0000000..b7435f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/c_api.h
@@ -0,0 +1,327 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ allocator object - confirm in the implementation */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* allocation callback: (allocator handle, size) -> pointer */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr); /* release callback: (allocator handle, pointer) */
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) ((X) | ((Y) << 16)) /* X in the low bits, Y shifted left by 16; arguments are parenthesized so compound expressions expand safely */
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ DataReader object - confirm in the implementation */
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* only present when NCNN_STRING is enabled */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* NOTE(review): presumably returns the number of bytes read - confirm */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ ModelBin object - confirm in the implementation */
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* 1-D load callback */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type); /* 2-D load callback */
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type); /* 3-D load callback */
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ Layer object - confirm in the implementation */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* single input blob, single output blob */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* NOTE(review): n and n2 presumably count bottom_blobs and top_blobs respectively - confirm */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* single blob used as both input and output */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* NOTE(review): n presumably counts bottom_top_blobs - confirm */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata; /* NOTE(review): presumably forwarded as the userdata argument of creator/destroyer - confirm */
+    ncnn_net_custom_layer_factory_t next; /* pointer to another factory record of the same type */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ Net object - confirm in the implementation */
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* NOTE(review): presumably the first registered factory record, chained through its next field - confirm */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public: // NOTE(review): the record_* methods return void; presumably they only queue work that submit_and_wait() executes - confirm
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // NOTE(review): int result, presumably 0 on success - confirm
+
+    int reset(); // NOTE(review): int result, presumably 0 on success - confirm
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev;
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public: // NOTE(review): record_upload returns void; presumably it only queues work that submit_and_wait() executes - confirm
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait(); // NOTE(review): int result, presumably 0 on success - confirm
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    VkTransferPrivate* const d; // const pointer to implementation state; the pointee type is only forward-declared above
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/cpu.h
new file mode 100644
index 0000000..0f748f3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/cpu.h
@@ -0,0 +1,169 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of cpu indices; taken by set_cpu_thread_affinity() and returned by get_cpu_thread_affinity_mask()
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public: // storage is platform specific; a platform matching none of the guards below gets no data member
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask; // Windows (non-MinGW) builds
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // Android / Linux builds; cpu_set_t comes from <sched.h> included above
+#endif
+#if __APPLE__
+    unsigned int policy; // Apple builds
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper: polymorphic base class of the DataReaderFrom* classes declared below
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (only declared when NCNN_STRING is enabled)
+    // return 1 if scan success
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data into buf, size is the requested byte count
+    // return bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get model data reference (in this header only DataReaderFromMemory overrides it)
+    // return bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader constructed from a stdio FILE*
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp); // NOTE(review): ownership of fp is not stated here - confirm who must fclose it
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not overridden by this class
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a reader
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared DataReaderFromStdioPrivate
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader constructed from an in-memory byte pointer
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // takes the pointer by reference; NOTE(review): presumably advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const; // the only reference() override among the readers in this header
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a reader
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared DataReaderFromMemoryPrivate
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader constructed from an Android AAsset*
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset); // AAsset comes from <android/asset_manager.h> included above
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not overridden by this class
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a reader
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared DataReaderFromAndroidAssetPrivate
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/gpu.h
new file mode 100644
index 0000000..2ef4927
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/gpu.h
@@ -0,0 +1,359 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+NCNN_EXPORT int create_gpu_instance();
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT GpuInfo // per-device query object; every accessor is const; get_gpu_info() below returns one by device index
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info: versions, vendor/device ids, device name and pipeline cache uuid
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family indices and queue counts for compute / graphics / transfer
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+
+    // extension capability (these return int, unlike the bool feature queries above)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_queue_family_foreign() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a GpuInfo
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // grants create_gpu_instance() access to the private members
+    GpuInfoPrivate* const d; // const pointer to the forward-declared GpuInfoPrivate
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator; // forward declarations: these types are only used by pointer or reference below
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate;
+class NCNN_EXPORT VulkanDevice // per-gpu device object; get_gpu_device() below returns a pointer to one by device index
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index());
+    ~VulkanDevice();
+
+    const GpuInfo& info; // NOTE(review): presumably bound to the GpuInfo of device_index - confirm in the implementation
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helper for creating pipeline (results are written through the trailing pointer parameter)
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // NOTE(review): presumably paired with reclaim_queue() - confirm
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device (acquire_* / reclaim_* come in pairs for blob and staging allocators)
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one overload per VkMat / VkImageMat combination of src and dst
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (this and the following groups are public function pointer members, one group per extension)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension (NOTE(review): presumably fills the function pointer members above - confirm)
+    int init_device_extension();
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a VulkanDevice
+    VulkanDevice(const VulkanDevice&);
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the forward-declared VulkanDevicePrivate
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv; resolve_shader_info() below takes one by non-const reference as its output parameter
+class NCNN_EXPORT ShaderInfo
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // binding type codes stored in binding_types:
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // 0 = null; 16 is large enough I think ...
+
+    int reserved_0; // reserved_0..reserved_3: no meaning is given for these in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer.h
new file mode 100644
index 0000000..d02f65b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer.h
@@ -0,0 +1,214 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#include <math.h>
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer // base class with virtual hooks; create_layer() and layer_creator_func below return Layer*
+{
+public:
+    // empty
+    Layer();
+    // virtual destructor
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific clean
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public: // capability flags, all plain public bool members
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // support_reserved_*: no meaning is given for these flags in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set
+    int featmask;
+
+public:
+    // implement inference (Mat blobs; vector overload first, then single-blob overload)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference (the blob arguments are non-const references)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference (VkMat overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference (VkImageMat overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference (VkMat overloads)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference (VkImageMat overloads)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    std::string type;
+    // layer name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function
+typedef Layer* (*layer_creator_func)(void*);
+typedef void (*layer_destroyer_func)(Layer*, void*);
+
+struct layer_registry_entry // pairs an optional layer type name with its creator function
+{
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator; // Layer* (*)(void*), see the layer_creator_func typedef above
+};
+
+struct custom_layer_registry_entry // same fields as layer_registry_entry, plus a destroyer and a userdata pointer
+{
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator; // Layer* (*)(void*), see the layer_creator_func typedef above
+    layer_destroyer_func destroyer; // void (*)(Layer*, void*), see the layer_destroyer_func typedef above
+    void* userdata; // NOTE(review): presumably the void* handed to creator / destroyer - confirm
+};
+
+#if NCNN_STRING
+// get layer type from type name
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type
+NCNN_EXPORT Layer* create_layer(int index);
+
+#define DEFINE_LAYER_CREATOR(name) /* defines name##_layer_creator(), whose signature matches layer_creator_func */ \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+
+#define DEFINE_LAYER_DESTROYER(name) /* defines name##_layer_destroyer(), whose signature matches layer_destroyer_func */ \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // wraps the enum so enumerators are spelled LayerShaderType::<name>
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list comes from the cmake-generated header (absval = 0, ...)
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..f11cab9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,370 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+absval = 0,
+absval_pack4 = 1,
+absval_pack8 = 2,
+batchnorm = 3,
+batchnorm_pack4 = 4,
+batchnorm_pack8 = 5,
+concat = 6,
+concat_pack4 = 7,
+concat_pack4to1 = 8,
+concat_pack8 = 9,
+concat_pack8to1 = 10,
+concat_pack8to4 = 11,
+convolution = 12,
+convolution_1x1s1d1 = 13,
+convolution_3x3s1d1_winograd23_transform_input = 14,
+convolution_3x3s1d1_winograd23_transform_output = 15,
+convolution_3x3s1d1_winograd43_transform_input = 16,
+convolution_3x3s1d1_winograd43_transform_output = 17,
+convolution_3x3s1d1_winograd_gemm = 18,
+convolution_gemm = 19,
+convolution_pack1to4 = 20,
+convolution_pack1to4_1x1s1d1 = 21,
+convolution_pack1to4_3x3s1d1_winograd_gemm = 22,
+convolution_pack1to4_gemm = 23,
+convolution_pack1to8 = 24,
+convolution_pack1to8_1x1s1d1 = 25,
+convolution_pack1to8_3x3s1d1_winograd_gemm = 26,
+convolution_pack1to8_gemm = 27,
+convolution_pack4 = 28,
+convolution_pack4_1x1s1d1 = 29,
+convolution_pack4_1x1s1d1_cm_16_8_8 = 30,
+convolution_pack4_3x3s1d1_winograd23_transform_input = 31,
+convolution_pack4_3x3s1d1_winograd23_transform_output = 32,
+convolution_pack4_3x3s1d1_winograd43_transform_input = 33,
+convolution_pack4_3x3s1d1_winograd43_transform_output = 34,
+convolution_pack4_3x3s1d1_winograd_gemm = 35,
+convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8 = 36,
+convolution_pack4_gemm = 37,
+convolution_pack4_gemm_cm_16_8_8 = 38,
+convolution_pack4to1 = 39,
+convolution_pack4to1_1x1s1d1 = 40,
+convolution_pack4to1_3x3s1d1_winograd_gemm = 41,
+convolution_pack4to1_gemm = 42,
+convolution_pack4to8 = 43,
+convolution_pack4to8_1x1s1d1 = 44,
+convolution_pack4to8_3x3s1d1_winograd_gemm = 45,
+convolution_pack4to8_gemm = 46,
+convolution_pack8 = 47,
+convolution_pack8_1x1s1d1 = 48,
+convolution_pack8_3x3s1d1_winograd23_transform_input = 49,
+convolution_pack8_3x3s1d1_winograd23_transform_output = 50,
+convolution_pack8_3x3s1d1_winograd43_transform_input = 51,
+convolution_pack8_3x3s1d1_winograd43_transform_output = 52,
+convolution_pack8_3x3s1d1_winograd_gemm = 53,
+convolution_pack8_gemm = 54,
+convolution_pack8to1 = 55,
+convolution_pack8to1_1x1s1d1 = 56,
+convolution_pack8to1_3x3s1d1_winograd_gemm = 57,
+convolution_pack8to1_gemm = 58,
+convolution_pack8to4 = 59,
+convolution_pack8to4_1x1s1d1 = 60,
+convolution_pack8to4_3x3s1d1_winograd_gemm = 61,
+convolution_pack8to4_gemm = 62,
+crop = 63,
+crop_pack1to4 = 64,
+crop_pack1to8 = 65,
+crop_pack4 = 66,
+crop_pack4to1 = 67,
+crop_pack4to8 = 68,
+crop_pack8 = 69,
+crop_pack8to1 = 70,
+crop_pack8to4 = 71,
+deconvolution = 72,
+deconvolution_col2im = 73,
+deconvolution_gemm = 74,
+deconvolution_pack1to4 = 75,
+deconvolution_pack1to4_gemm = 76,
+deconvolution_pack1to8 = 77,
+deconvolution_pack1to8_gemm = 78,
+deconvolution_pack4 = 79,
+deconvolution_pack4_col2im = 80,
+deconvolution_pack4_gemm = 81,
+deconvolution_pack4_gemm_cm_16_8_8 = 82,
+deconvolution_pack4to1 = 83,
+deconvolution_pack4to1_gemm = 84,
+deconvolution_pack4to8 = 85,
+deconvolution_pack4to8_gemm = 86,
+deconvolution_pack8 = 87,
+deconvolution_pack8_col2im = 88,
+deconvolution_pack8_gemm = 89,
+deconvolution_pack8to1 = 90,
+deconvolution_pack8to1_gemm = 91,
+deconvolution_pack8to4 = 92,
+deconvolution_pack8to4_gemm = 93,
+dropout = 94,
+dropout_pack4 = 95,
+dropout_pack8 = 96,
+eltwise = 97,
+eltwise_pack4 = 98,
+eltwise_pack8 = 99,
+elu = 100,
+elu_pack4 = 101,
+elu_pack8 = 102,
+flatten = 103,
+flatten_pack1to4 = 104,
+flatten_pack1to8 = 105,
+flatten_pack4 = 106,
+flatten_pack4to8 = 107,
+flatten_pack8 = 108,
+innerproduct = 109,
+innerproduct_gemm = 110,
+innerproduct_gemm_wp1to4 = 111,
+innerproduct_gemm_wp1to8 = 112,
+innerproduct_gemm_wp4 = 113,
+innerproduct_gemm_wp4to1 = 114,
+innerproduct_gemm_wp4to8 = 115,
+innerproduct_gemm_wp8 = 116,
+innerproduct_gemm_wp8to1 = 117,
+innerproduct_gemm_wp8to4 = 118,
+innerproduct_pack1to4 = 119,
+innerproduct_pack1to8 = 120,
+innerproduct_pack4 = 121,
+innerproduct_pack4to1 = 122,
+innerproduct_pack4to8 = 123,
+innerproduct_pack8 = 124,
+innerproduct_pack8to1 = 125,
+innerproduct_pack8to4 = 126,
+innerproduct_reduce_sum8 = 127,
+innerproduct_reduce_sum8_pack4 = 128,
+innerproduct_reduce_sum8_pack8 = 129,
+innerproduct_sum8 = 130,
+innerproduct_sum8_pack1to4 = 131,
+innerproduct_sum8_pack1to8 = 132,
+innerproduct_sum8_pack4 = 133,
+innerproduct_sum8_pack4to1 = 134,
+innerproduct_sum8_pack4to8 = 135,
+innerproduct_sum8_pack8 = 136,
+innerproduct_sum8_pack8to1 = 137,
+innerproduct_sum8_pack8to4 = 138,
+lrn_norm = 139,
+lrn_norm_across_channel_pack4 = 140,
+lrn_norm_across_channel_pack8 = 141,
+lrn_norm_within_channel_pack4 = 142,
+lrn_norm_within_channel_pack8 = 143,
+lrn_square_pad = 144,
+lrn_square_pad_across_channel_pack4 = 145,
+lrn_square_pad_across_channel_pack8 = 146,
+lrn_square_pad_within_channel_pack4 = 147,
+lrn_square_pad_within_channel_pack8 = 148,
+pooling = 149,
+pooling_adaptive = 150,
+pooling_adaptive_pack4 = 151,
+pooling_adaptive_pack8 = 152,
+pooling_global = 153,
+pooling_global_pack4 = 154,
+pooling_global_pack8 = 155,
+pooling_pack4 = 156,
+pooling_pack8 = 157,
+prelu = 158,
+prelu_pack4 = 159,
+prelu_pack8 = 160,
+relu = 161,
+relu_pack4 = 162,
+relu_pack8 = 163,
+reshape = 164,
+reshape_pack1to4 = 165,
+reshape_pack1to8 = 166,
+reshape_pack4 = 167,
+reshape_pack4to1 = 168,
+reshape_pack4to8 = 169,
+reshape_pack8 = 170,
+reshape_pack8to1 = 171,
+reshape_pack8to4 = 172,
+scale = 173,
+scale_pack4 = 174,
+scale_pack8 = 175,
+sigmoid = 176,
+sigmoid_pack4 = 177,
+sigmoid_pack8 = 178,
+slice = 179,
+slice_pack1to4 = 180,
+slice_pack1to8 = 181,
+slice_pack4 = 182,
+slice_pack4to8 = 183,
+slice_pack8 = 184,
+softmax_div_sum = 185,
+softmax_div_sum_pack4 = 186,
+softmax_div_sum_pack8 = 187,
+softmax_exp_sub_max = 188,
+softmax_exp_sub_max_pack4 = 189,
+softmax_exp_sub_max_pack8 = 190,
+softmax_reduce_max = 191,
+softmax_reduce_max_pack4 = 192,
+softmax_reduce_max_pack8 = 193,
+softmax_reduce_sum = 194,
+softmax_reduce_sum_pack4 = 195,
+softmax_reduce_sum_pack8 = 196,
+tanh = 197,
+tanh_pack4 = 198,
+tanh_pack8 = 199,
+binaryop = 200,
+binaryop_broadcast = 201,
+binaryop_broadcast_a1_pack4 = 202,
+binaryop_broadcast_a1_pack8 = 203,
+binaryop_broadcast_b1_pack4 = 204,
+binaryop_broadcast_b1_pack8 = 205,
+binaryop_broadcast_pack4 = 206,
+binaryop_broadcast_pack8 = 207,
+binaryop_pack4 = 208,
+binaryop_pack8 = 209,
+unaryop = 210,
+unaryop_pack4 = 211,
+unaryop_pack8 = 212,
+convolutiondepthwise = 213,
+convolutiondepthwise_group = 214,
+convolutiondepthwise_group_pack1to4 = 215,
+convolutiondepthwise_group_pack1to8 = 216,
+convolutiondepthwise_group_pack4 = 217,
+convolutiondepthwise_group_pack4to1 = 218,
+convolutiondepthwise_group_pack4to8 = 219,
+convolutiondepthwise_group_pack8 = 220,
+convolutiondepthwise_group_pack8to1 = 221,
+convolutiondepthwise_group_pack8to4 = 222,
+convolutiondepthwise_pack4 = 223,
+convolutiondepthwise_pack8 = 224,
+padding = 225,
+padding_3d = 226,
+padding_3d_pack4 = 227,
+padding_3d_pack8 = 228,
+padding_pack1to4 = 229,
+padding_pack1to8 = 230,
+padding_pack4 = 231,
+padding_pack4to1 = 232,
+padding_pack4to8 = 233,
+padding_pack8 = 234,
+padding_pack8to1 = 235,
+padding_pack8to4 = 236,
+normalize_coeffs = 237,
+normalize_coeffs_pack4 = 238,
+normalize_coeffs_pack8 = 239,
+normalize_norm = 240,
+normalize_norm_pack4 = 241,
+normalize_norm_pack8 = 242,
+normalize_reduce_sum4_fp16_to_fp32 = 243,
+normalize_reduce_sum4_fp16_to_fp32_pack4 = 244,
+normalize_reduce_sum4_fp16_to_fp32_pack8 = 245,
+normalize_reduce_sum4_fp32 = 246,
+normalize_reduce_sum4_fp32_pack4 = 247,
+normalize_reduce_sum4_fp32_pack8 = 248,
+permute = 249,
+permute_pack1to4 = 250,
+permute_pack1to8 = 251,
+permute_pack4 = 252,
+permute_pack4to1 = 253,
+permute_pack4to8 = 254,
+permute_pack8 = 255,
+permute_pack8to1 = 256,
+permute_pack8to4 = 257,
+priorbox = 258,
+priorbox_mxnet = 259,
+interp = 260,
+interp_bicubic = 261,
+interp_bicubic_coeffs = 262,
+interp_bicubic_pack4 = 263,
+interp_bicubic_pack8 = 264,
+interp_pack4 = 265,
+interp_pack8 = 266,
+deconvolutiondepthwise = 267,
+deconvolutiondepthwise_group = 268,
+deconvolutiondepthwise_group_pack1to4 = 269,
+deconvolutiondepthwise_group_pack1to8 = 270,
+deconvolutiondepthwise_group_pack4 = 271,
+deconvolutiondepthwise_group_pack4to1 = 272,
+deconvolutiondepthwise_group_pack4to8 = 273,
+deconvolutiondepthwise_group_pack8 = 274,
+deconvolutiondepthwise_group_pack8to1 = 275,
+deconvolutiondepthwise_group_pack8to4 = 276,
+deconvolutiondepthwise_pack4 = 277,
+deconvolutiondepthwise_pack8 = 278,
+shufflechannel = 279,
+shufflechannel_pack4 = 280,
+shufflechannel_pack8 = 281,
+instancenorm_coeffs = 282,
+instancenorm_coeffs_pack4 = 283,
+instancenorm_coeffs_pack8 = 284,
+instancenorm_norm = 285,
+instancenorm_norm_pack4 = 286,
+instancenorm_norm_pack8 = 287,
+instancenorm_reduce_mean = 288,
+instancenorm_reduce_mean_pack4 = 289,
+instancenorm_reduce_mean_pack8 = 290,
+instancenorm_reduce_sum4_fp16_to_fp32 = 291,
+instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 292,
+instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 293,
+instancenorm_reduce_sum4_fp32 = 294,
+instancenorm_reduce_sum4_fp32_pack4 = 295,
+instancenorm_reduce_sum4_fp32_pack8 = 296,
+instancenorm_sub_mean_square = 297,
+instancenorm_sub_mean_square_pack4 = 298,
+instancenorm_sub_mean_square_pack8 = 299,
+clip = 300,
+clip_pack4 = 301,
+clip_pack8 = 302,
+reorg = 303,
+reorg_pack1to4 = 304,
+reorg_pack1to8 = 305,
+reorg_pack4 = 306,
+reorg_pack4to8 = 307,
+reorg_pack8 = 308,
+packing = 309,
+packing_fp16_to_fp32 = 310,
+packing_fp32_to_fp16 = 311,
+packing_pack1to4 = 312,
+packing_pack1to4_fp16_to_fp32 = 313,
+packing_pack1to4_fp32_to_fp16 = 314,
+packing_pack1to8 = 315,
+packing_pack1to8_fp16_to_fp32 = 316,
+packing_pack1to8_fp32_to_fp16 = 317,
+packing_pack4 = 318,
+packing_pack4_fp16_to_fp32 = 319,
+packing_pack4_fp32_to_fp16 = 320,
+packing_pack4to1 = 321,
+packing_pack4to1_fp16_to_fp32 = 322,
+packing_pack4to1_fp32_to_fp16 = 323,
+packing_pack4to8 = 324,
+packing_pack4to8_fp16_to_fp32 = 325,
+packing_pack4to8_fp32_to_fp16 = 326,
+packing_pack8 = 327,
+packing_pack8_fp16_to_fp32 = 328,
+packing_pack8_fp32_to_fp16 = 329,
+packing_pack8to1 = 330,
+packing_pack8to1_fp16_to_fp32 = 331,
+packing_pack8to1_fp32_to_fp16 = 332,
+packing_pack8to4 = 333,
+packing_pack8to4_fp16_to_fp32 = 334,
+packing_pack8to4_fp32_to_fp16 = 335,
+cast_fp16_to_fp32 = 336,
+cast_fp16_to_fp32_pack4 = 337,
+cast_fp16_to_fp32_pack8 = 338,
+cast_fp32_to_fp16 = 339,
+cast_fp32_to_fp16_pack4 = 340,
+cast_fp32_to_fp16_pack8 = 341,
+hardsigmoid = 342,
+hardsigmoid_pack4 = 343,
+hardsigmoid_pack8 = 344,
+hardswish = 345,
+hardswish_pack4 = 346,
+hardswish_pack8 = 347,
+pixelshuffle = 348,
+pixelshuffle_pack4 = 349,
+pixelshuffle_pack4to1 = 350,
+pixelshuffle_pack8 = 351,
+pixelshuffle_pack8to1 = 352,
+pixelshuffle_pack8to4 = 353,
+deepcopy = 354,
+deepcopy_pack4 = 355,
+deepcopy_pack8 = 356,
+mish = 357,
+mish_pack4 = 358,
+mish_pack8 = 359,
+swish = 360,
+swish_pack4 = 361,
+swish_pack8 = 362,
+convert_ycbcr = 363,
+vulkan_activation = 364,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // wraps the unscoped enum so enumerators are reached as LayerType::<name>
+enum LayerType
+{
+#include "layer_type_enum.h" // enumerator list lives in this header, which is marked as auto-generated by cmake
+    CustomBit = (1 << 8), // value 256; NOTE(review): presumably a flag bit kept clear of the generated indices - confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..581d589
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/layer_type_enum.h
@@ -0,0 +1,103 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the matrix container; its shape is held in w, h, d, c and its rank in dims
+class NCNN_EXPORT Mat
+{
+public:
+    // construct empty
+    Mat();
+    // vec (w)
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image (w, h)
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim (w, h, c)
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube (w, h, d, c)
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec (explicit elemsize and elempack)
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image (explicit elemsize and elempack)
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim (explicit elemsize and elempack)
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube (explicit elemsize and elempack)
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy constructor
+    Mat(const Mat& m);
+    // external vec (caller supplies the data pointer)
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image (caller supplies the data pointer)
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim (caller supplies the data pointer)
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube (caller supplies the data pointer)
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec (caller supplies the data pointer)
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image (caller supplies the data pointer)
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim (caller supplies the data pointer)
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube (caller supplies the data pointer)
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // destructor
+    ~Mat();
+    // copy assignment
+    Mat& operator=(const Mat& m);
+    // set all elements; overloads below exist per scalar type and per SIMD vector type
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy, returned as a new mat
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from another mat into this one, in place
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape to vec (w)
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape to image (w, h)
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape to dim (w, h, c)
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape to cube (w, h, d, c)
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec (w)
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image (w, h)
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim (w, h, c)
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube (w, h, d, c)
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec (explicit elemsize and elempack)
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image (explicit elemsize and elempack)
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim (explicit elemsize and elempack)
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube (explicit elemsize and elempack)
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like another Mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like a VkMat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference (by channel, depth or row index)
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference (start index plus count)
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data through an implicit pointer conversion
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access to a float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16,
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride (bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data, resized to a specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data, resized to a specific size, with stride (bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from a pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from a pixel data roi with stride (bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from a pixel data roi, resized to a specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from a pixel data roi, resized to a specific size, with stride (bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride (bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data, resized to a specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data, resized to a specific size, with stride (bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from an android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from an android Bitmap, resized to a specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from an android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from an android Bitmap roi, resized to a specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to an android Bitmap, resized to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values; pass 0 to skip either step
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the per-channel step between channels - confirm against mat.cpp
+};
+
+#if NCNN_VULKAN
+
+// the matrix container, vulkan version; data is a VkBufferMemory* and the shape is held in w, h, d, c
+class NCNN_EXPORT VkMat
+{
+public:
+    // construct empty
+    VkMat();
+    // vec (w)
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (explicit elempack)
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (explicit elempack)
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (explicit elempack)
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (explicit elempack)
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy constructor
+    VkMat(const VkMat& m);
+    // external vec (caller supplies the VkBufferMemory)
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec (caller supplies the VkBufferMemory)
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube (caller supplies the VkBufferMemory)
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // destructor
+    ~VkMat();
+    // copy assignment
+    VkMat& operator=(const VkMat& m);
+    // allocate vec (w)
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image (w, h)
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim (w, h, c)
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube (w, h, d, c)
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec (explicit elempack)
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image (explicit elempack)
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim (explicit elempack)
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube (explicit elempack)
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like another VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped access, as a Mat or as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level buffer reference
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the per-channel step between channels - confirm against mat.cpp
+};
+
+class NCNN_EXPORT VkImageMat
+{
+public:
+    // empty
+    VkImageMat();
+    // vec
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a host Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like another VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped: the image memory exposed as a Mat / as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level vulkan handles
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenience constructor from an android hardware buffer image allocator
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated (external) memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c; // note: this class declares no cstep member
+};
+
+// value types for vulkan specialization constant and push constant (this union and vk_constant_type)
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type // int/float views only; presumably the push constant value type - confirm at use sites
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameter
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation (EXIF numbering, pictured below), e.g. 6 means rotating from orientation 6 to orientation 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameter
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set type to -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set type to -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set type to -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half (bfloat16) by truncation
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // the result is the upper 16 bits of the float32 bit pattern; the lower 16 are dropped
+    union
+    {
+        float f;
+        unsigned int u;
+    } bits;
+    bits.f = value;
+    return (unsigned short)(bits.u >> 16);
+}
+// convert brain half to float: the 16 input bits become the upper half of the float32 bit pattern
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16, widen before shifting: value would promote to signed int and bit 15 would land in the sign bit
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+enum BorderType // presumably the values accepted by the int type parameter of copy_make_border* - confirm
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233, // the same -233 the warpaffine_bilinear_* comments name for a transparent border
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat() // empty: every field zeroed, no storage and no refcount
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator) // vec
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator) // image
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) // dim
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator) // cube
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) // packed vec
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) // packed image
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // packed dim
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // packed cube
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // every field is zeroed above before create() runs
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m) // copy: field-wise copy of m, the buffer itself is shared
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref(); // bump the shared counter when there is one (it is NULL for empty or external data)
+}
+
+// external vec: wraps _data with a NULL refcount, so release() never frees it; cstep = w
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// external image: wraps _data with a NULL refcount, so release() never frees it; cstep = w * h
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), cstep((size_t)_w * _h)
+{
+}
+
+// external dim: wraps _data with a NULL refcount; cstep is w * h rounded up so each channel spans a multiple of 16 bytes
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// external cube: wraps _data with a NULL refcount; cstep is w * h * d rounded up so each channel spans a multiple of 16 bytes
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+// external packed vec: wraps _data with a NULL refcount, so release() never frees it; cstep = w
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// external packed image: wraps _data with a NULL refcount, so release() never frees it; cstep = w * h
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), cstep((size_t)_w * _h)
+{
+}
+
+// external packed dim: wraps _data with a NULL refcount; cstep is w * h rounded up so each channel spans a multiple of 16 bytes
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// external packed cube: wraps _data with a NULL refcount; cstep is w * h * d rounded up so each channel spans a multiple of 16 bytes
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    release(); // drops this reference; the buffer is freed only when it was the last counted one
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    const int count = (int)total();
+    float* dst = (float*)data;
+
+    int done = 0;
+#if __ARM_NEON
+    // bulk path: broadcast _v into a q register and store 4 floats per step
+    const float32x4_t _vq = vdupq_n_f32(_v);
+    while (done + 3 < count)
+    {
+        vst1q_f32(dst + done, _vq);
+        done += 4;
+    }
+#endif // __ARM_NEON
+    // scalar tail (the whole fill when NEON is not compiled in)
+    while (done < count)
+        dst[done++] = _v;
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    const int count = (int)total();
+    int* dst = (int*)data;
+
+    int done = 0;
+#if __ARM_NEON
+    // bulk path: broadcast _v into a q register and store 4 ints per step
+    const int32x4_t _vq = vdupq_n_s32(_v);
+    while (done + 3 < count)
+    {
+        vst1q_s32(dst + done, _vq);
+        done += 4;
+    }
+#endif // __ARM_NEON
+    // scalar tail (the whole fill when NEON is not compiled in)
+    while (done < count)
+        dst[done++] = _v;
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    // one 4-float store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1q_f32(dst, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    // one 4 x 16-bit store per element counted by total()
+    const int count = (int)total();
+    unsigned short* dst = (unsigned short*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1_u16(dst, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    // one 4-int store per element counted by total()
+    const int count = (int)total();
+    int* dst = (int*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1q_s32(dst, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    // each element counted by total() takes 8 ints: _v0 first, then _v1
+    const int count = (int)total();
+    int* dst = (int*)data;
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        vst1q_s32(dst, _v0);
+        vst1q_s32(dst + 4, _v1);
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    // one 4 x fp16 store per element counted by total()
+    const int count = (int)total();
+    __fp16* dst = (__fp16*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1_f16(dst, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    // one 8 x fp16 store per element counted by total()
+    const int count = (int)total();
+    __fp16* dst = (__fp16*)data;
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        vst1q_f16(dst, _v);
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    // one unaligned 16-float store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 16)
+    {
+        _mm512_storeu_ps(dst, _v);
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // _i is a dummy: old gcc cannot overload on __m128 vs __m256 alone,
+    // so the extra int gives this overload a distinct mangled symbol
+    (void)_i;
+    // one unaligned 8-float store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        _mm256_storeu_ps(dst, _v);
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    // one unaligned 4-float store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        _mm_storeu_ps(dst, _v);
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    // 8 x 16-bit per element counted by total(); storeu (as in the __m128 overload) since external data has no 16-byte alignment guarantee
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++, ptr += 8)
+    {
+        _mm_storeu_si128((__m128i*)ptr, _v);
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    // one 4-float MSA store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        __msa_st_w((v4i32)_v, dst, 0);
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    // one 4-float LSX store per element counted by total()
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        __lsx_vst(_v, dst, 0);
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    // floats per store: cpu_riscv_vlenb() / 4 (presumably register bytes over 4-byte floats)
+    const int lanes = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(lanes);
+
+    const int count = (int)total();
+    float* dst = (float*)data;
+    for (int n = 0; n < count; n++, dst += lanes)
+    {
+        vse32_v_f32m1(dst, _v, vl);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    // 16-bit lanes per store: cpu_riscv_vlenb() / 2 (presumably register bytes over 2-byte lanes)
+    const int lanes = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(lanes);
+
+    const int count = (int)total();
+    unsigned short* dst = (unsigned short*)data;
+    for (int n = 0; n < count; n++, dst += lanes)
+    {
+        vse16_v_u16m1(dst, _v, vl);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    // 8-bit lanes per store: cpu_riscv_vlenb() / 1 (presumably one lane per register byte)
+    const int lanes = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(lanes);
+
+    const int count = (int)total();
+    signed char* dst = (signed char*)data;
+    for (int n = 0; n < count; n++, dst += lanes)
+    {
+        vse8_v_i8m1(dst, _v, vl);
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    const int lanes = cpu_riscv_vlenb() / 2;
+    const size_t vlen = vsetvl_e16m1(lanes);
+    // each element is written as one vector store, then the cursor advances by `lanes` halves
+    const int count = (int)total();
+    __fp16* dst = (__fp16*)data;
+    for (int k = 0; k < count; k++)
+    {
+        vse16_v_f16m1(dst, _v, vlen);
+        dst += lanes;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    // generic fallback: view the buffer as T[] and assign _v to each of the total() slots
+    T* const dst = (T*)data;
+    const int count = (int)total();
+    for (int k = 0; k != count && count > 0; ++k)
+        dst[k] = _v;
+
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    // shallow, reference-counted assignment: afterwards both mats point at m's buffer
+    if (this != &m)
+    {
+        // take the new reference first, then drop whatever this mat held
+        if (m.refcount) NCNN_XADD(m.refcount, 1);
+        release();
+
+        // storage description
+        allocator = m.allocator;
+        refcount = m.refcount;
+        data = m.data;
+        elempack = m.elempack;
+        elemsize = m.elemsize;
+
+        // geometry
+        cstep = m.cstep;
+        dims = m.dims;
+        c = m.c;
+        d = m.d;
+        h = m.h;
+        w = m.w;
+    }
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref()
+{
+    // a null refcount means there is no counter to bump
+    if (refcount) NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release()
+{
+    // drop one reference; the buffer is freed only when NCNN_XADD returns 1 (taken to mean this was the last reference)
+    const bool last_owner = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_owner)
+    {
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+    // reset to the empty state whether or not anything was freed (allocator is not reset here)
+    refcount = 0;
+    data = 0;
+
+    elempack = 0;
+    elemsize = 0;
+
+    cstep = 0;
+    dims = 0;
+    c = 0;
+    d = 0;
+    h = 0;
+    w = 0;
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    return !(data != 0 && total() != 0); // true when there is no buffer or no elements
+}
+
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    return c * cstep; // channel count times per-channel step, in elements
+}
+
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize * 8) / elempack; // element size in bits divided by the pack count; 0 when elempack is 0
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    // data-less Mat (null pointer) carrying this mat's extents, with elempack folded into the outermost axis
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize; // byte address of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1; // a channel has one dimension fewer than its parent
+    if (dims == 4) view.cstep = (size_t)w * h; // for a 4-D parent the view's step is one w*h plane
+    return view;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize; // byte address of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1; // a channel has one dimension fewer than its parent
+    if (dims == 4) view.cstep = (size_t)w * h; // for a 4-D parent the view's step is one w*h plane
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    return (float*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    return (const float*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    return (T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    return (const T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims;
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    return (T*)data;
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    return (const T*)data;
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    return ((float*)data)[i];
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    return ((const float*)data)[i];
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref();
+
+    cstep = m.cstep;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // wrap external buffer memory (refcount 0); widen before multiplying so large extents cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel byte size rounded via alignSize(..., 16), in elements; size_t math avoids int overflow in w * h
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // channel byte size rounded via alignSize(..., 16), in elements; size_t math avoids int overflow in w * h * d
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // wrap external buffer memory (refcount 0); widen before multiplying so large extents cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel byte size rounded via alignSize(..., 16), in elements; size_t math avoids int overflow in w * h
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // channel byte size rounded via alignSize(..., 16), in elements; size_t math avoids int overflow in w * h * d
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    // shallow, reference-counted assignment: afterwards both mats point at m's buffer memory
+    if (this != &m)
+    {
+        // take the new reference first, then drop whatever this mat held
+        if (m.refcount) NCNN_XADD(m.refcount, 1);
+        release();
+
+        // storage description
+        allocator = m.allocator;
+        refcount = m.refcount;
+        data = m.data;
+        elempack = m.elempack;
+        elemsize = m.elemsize;
+
+        // geometry
+        cstep = m.cstep;
+        dims = m.dims;
+        c = m.c;
+        d = m.d;
+        h = m.h;
+        w = m.w;
+    }
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable)
+        return Mat();
+    // empty/released mats (null allocator or data) and non-mappable memory yield an empty Mat
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+    // otherwise wrap the mapped pointer in a Mat of matching rank; a null allocator is passed to Mat
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable)
+        return 0;
+    // null for empty/released mats or non-mappable memory; else mapped base address plus this mat's offset
+    return (unsigned char*)data->mapped_ptr + data->offset;
+}
+
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release()
+{
+    // give up one reference; the buffer is handed back to the allocator only when NCNN_XADD returns 1
+    const bool free_now = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (free_now && allocator && data)
+    {
+        allocator->fastFree(data);
+    }
+
+    // return the fields to the empty state (allocator is not reset here)
+    refcount = 0;
+    data = 0;
+
+    elempack = 0;
+    elemsize = 0;
+
+    // geometry
+    cstep = 0;
+
+    dims = 0;
+    c = 0;
+    d = 0;
+    h = 0;
+    w = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return cstep * c;
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0;
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    return data->buffer;
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return data->offset;
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return data->capacity;
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref();
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    // shallow, reference-counted assignment: afterwards both mats point at m's image memory
+    if (this != &m)
+    {
+        // take the new reference first, then drop whatever this mat held
+        if (m.refcount) NCNN_XADD(m.refcount, 1);
+        release();
+        // storage description
+        allocator = m.allocator;
+        refcount = m.refcount;
+        data = m.data;
+        elempack = m.elempack;
+        elemsize = m.elemsize;
+
+        // geometry
+        dims = m.dims;
+        c = m.c;
+        d = m.d;
+        h = m.h;
+        w = m.w;
+    }
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr)
+        return Mat();
+    // empty/released mats (null allocator or data), non-mappable or unmapped memory yield an empty Mat
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+    // otherwise wrap the mapped pointer in a Mat of matching rank; a null allocator is passed to Mat
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr)
+        return 0;
+    // null for empty/released mats or unmapped memory; else mapped base address plus the image's bind offset
+    return (unsigned char*)data->mapped_ptr + data->bind_offset;
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    // give up one reference; the image is handed back to the allocator only when NCNN_XADD returns 1
+    const bool free_now = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (free_now && allocator && data)
+    {
+        allocator->fastFree(data);
+    }
+
+    // return the fields to the empty state (allocator is not reset here)
+    refcount = 0;
+    data = 0;
+
+    elempack = 0;
+    elemsize = 0;
+
+    // geometry
+    dims = 0;
+    c = 0;
+    d = 0;
+    h = 0;
+    w = 0;
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return (size_t)w * h * d * c; // product of all four extents; widened first so it cannot overflow int
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image;
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview;
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/modelbin.h
new file mode 100644
index 0000000..15d2b9c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // element type codes accepted by the `type` argument of every load() overload
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load vec (width w); pure virtual, so every subclass must provide it
+    virtual Mat load(int w, int type) const = 0;
+    // load image (w, h)
+    virtual Mat load(int w, int h, int type) const;
+    // load dim (w, h, c)
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load cube (w, h, d, c)
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+    // load vec; overrides the pure virtual ModelBin::load(int, int)
+    virtual Mat load(int w, int type) const;
+
+private: // copy construction and assignment are declared private, so outside code cannot copy this class
+    ModelBinFromDataReader(const ModelBinFromDataReader&);
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private: // implementation pointer; its type is only forward-declared in this header
+    ModelBinFromDataReaderPrivate* const d;
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
+{
+public:
+    // construct from an array of weight blobs
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+    // load vec; overrides the pure virtual ModelBin::load(int, int)
+    virtual Mat load(int w, int type) const;
+
+private: // copy construction and assignment are declared private, so outside code cannot copy this class
+    ModelBinFromMatArray(const ModelBinFromMatArray&);
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private: // implementation pointer; its type is only forward-declared in this header
+    ModelBinFromMatArrayPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/net.h
new file mode 100644
index 0000000..9407042
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/net.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // construct an empty net
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // options; can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle; ownership is not transferred
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer by layer type name
+    // returns 0 on success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer by layer type index
+    // returns 0 on success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // returns 0 on success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // returns 0 on success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // returns 0 on success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // returns the number of bytes consumed
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must be kept alive while it is in use
+    // memory pointer must be 32-bit aligned
+    // returns the number of bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenience: load network structure from an android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenience: load network structure from an android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenience: load network weight data from an android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from this network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+
+private:
+    Net(const Net&);
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d;
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blobs will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will override the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // returns 0 on success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // returns 0 on success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // returns 0 on success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // returns 0 on success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name
+    // returns 0 on success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name
+    // returns 0 on success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name
+    // returns 0 on success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name
+    // returns 0 on success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // returns 0 on success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index
+    // returns 0 on success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index
+    // returns 0 on success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index
+    // returns 0 on success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const;
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/option.h
new file mode 100644
index 0000000..3fda808
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/option.h
@@ -0,0 +1,153 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // settings object whose fields are all public data members; NOTE(review): layout presumably must match the prebuilt ncnn library - confirm before reordering or resizing fields
+{
+public:
+    // construct with default values (documented defaults are noted on the individual fields below)
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan counterpart of blob_allocator)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan counterpart of workspace_allocator)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // NOTE(review): undocumented in this header; presumably selects a pack8 layout for gpu shaders - confirm
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // NOTE(review): presumably an unused placeholder kept so later fields keep their position - confirm
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // NOTE(review): undocumented in this header - TODO describe what enabling it changes
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    bool use_reserved_6; // use_reserved_6 .. use_reserved_11: presumably spare flags for future options - confirm
+    bool use_reserved_7;
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // parameter store addressed by integer id; values are int, float or array (Mat)
+{
+public:
+    // construct an empty dictionary
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type of the parameter stored under id (int code; its meaning is defined by the implementation)
+    int type(int id) const;
+
+    // get int; def is presumably returned when id holds no value - confirm
+    int get(int id, int def) const;
+    // get float; def is presumably returned when id holds no value - confirm
+    float get(int id, float def) const;
+    // get array; def is presumably returned when id holds no value - confirm
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net is granted access to the protected clear/load members below
+
+    void clear();
+
+    int load_param(const DataReader& dr); // NOTE(review): presumably the text param format, returning 0 on success - confirm
+    int load_param_bin(const DataReader& dr); // NOTE(review): presumably the binary param format, returning 0 on success - confirm
+
+private:
+    ParamDictPrivate* const d; // pointer to the forward-declared implementation class; the pointer itself is const
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // holds the vulkan handles of one pipeline: shader module, layouts, pipeline and descriptor update template
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); // defaults to a 4x4x4 request
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // build from caller-supplied SPIR-V words
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // build from a shader selected by index
+
+public:
+    VkShaderModule shader_module() const; // read-only accessors for the handles owned by this pipeline
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module); // setters are protected: only subclasses can install handles directly
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // device passed at construction; public data member
+
+private:
+    Pipeline(const Pipeline&); // copy operations are declared private, so outside code cannot copy a Pipeline
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // pointer to the forward-declared implementation class
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // privately inherits Pipeline, so the base interface is hidden from users of this class
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt); // overload without an explicit target size
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with target_width / target_height
+    void destroy();
+
+    friend class VkCompute; // VkCompute may reach the privately inherited Pipeline members
+
+protected:
+    int create_shader_module(const Option& opt); // helper steps; presumably invoked from create() - confirm
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // NOTE(review): presumably mirrors the type_to argument of create() - confirm
+    int rotate_from; // NOTE(review): presumably mirrors the rotate_from argument of create() - confirm
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // hands out vulkan pipeline objects for a device; NOTE(review): presumably reuses previously built ones - confirm
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // variant keyed by caller-supplied SPIR-V data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer and reference parameters from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // variant keyed by shader index plus options
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer and reference parameters from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, // writes the module and its ShaderInfo through the trailing parameters
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations, // takes an existing module, outputs the remaining handles
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev; // device passed at construction
+
+private:
+    PipelineCache(const PipelineCache&); // copy operations are declared private, so outside code cannot copy the cache
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // pointer to the forward-declared implementation class
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/platform.h
new file mode 100644
index 0000000..95e8a25
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/platform.h
@@ -0,0 +1,285 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0
+#define NCNN_THREADS 1
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 1
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_AVX 0
+#define NCNN_XOP 0
+#define NCNN_FMA 0
+#define NCNN_F16C 0
+#define NCNN_AVX2 0
+#define NCNN_AVXVNNI 0
+#define NCNN_AVX512 0
+#define NCNN_AVX512VNNI 0
+#define NCNN_AVX512BF16 0
+#define NCNN_AVX512FP16 0
+#define NCNN_VFPV4 1
+#if __aarch64__
+#define NCNN_ARM82 1
+#define NCNN_ARM82DOT 1
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#endif // __aarch64__
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1
+
+#define NCNN_VERSION_STRING "1.0.20221128"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // win32 variant: wraps an SRWLOCK that is always acquired in exclusive mode
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // empty: no cleanup call is made for srwlock
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes srwlock to the sleep call
+    // NOTE SRWLock is available from windows vista
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // win32 variant: wraps a CONDITION_VARIABLE used together with Mutex's SRWLOCK
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // empty: no cleanup call is made for condvar
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // waits without timeout; the return value is ignored
+    void broadcast() { WakeAllConditionVariable(&condvar); } // wake every waiter
+    void signal() { WakeConditionVariable(&condvar); } // wake a single waiter
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // adapter with the signature _beginthreadex expects; defined as a friend inside Thread
+class NCNN_EXPORT Thread // win32 variant: starts the thread in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // _start/_args are stored before the thread is created because start_wrapper reads them through this
+    ~Thread() {} // empty: the handle is only closed by join()
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } // waits without timeout, then closes the handle
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args; // args is the Thread* passed as this in the constructor
+        t->_start(t->_args); // the user function's return value is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // win32 variant: one TLS slot holding a void* per thread
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the result of TlsAlloc is stored without being checked
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread variant: wraps a pthread_mutex_t created with default attributes
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0 = default mutex attributes; the return code is ignored
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes mutex to pthread_cond_wait
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread variant: wraps a pthread_cond_t created with default attributes
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // 0 = default attributes; the return code is ignored
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout and no predicate loop here; callers presumably re-check their condition after waking - confirm
+    void broadcast() { pthread_cond_broadcast(&cond); } // wake every waiter
+    void signal() { pthread_cond_signal(&cond); } // wake at least one waiter
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread variant: starts the thread in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // default thread attributes; the pthread_create return code is ignored
+    ~Thread() {} // empty: the thread is neither joined nor detached here, only join() waits for it
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread variant: one pthread key holding a void* per thread
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // 0 = no destructor callback for stored values; the return code is ignored
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // variant used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // does nothing
+    void unlock() {} // does nothing
+};
+
+class NCNN_EXPORT ConditionVariable // variant used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately without waiting
+    void broadcast() {} // does nothing
+    void signal() {} // does nothing
+};
+
+class NCNN_EXPORT Thread // variant used when NCNN_THREADS is 0: keeps the interface but never runs anything
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // both arguments are ignored; the start function is never called
+    ~Thread() {}
+    void join() {} // returns immediately
+};
+
+class NCNN_EXPORT ThreadLocalStorage // variant used when NCNN_THREADS is 0: a single stored pointer
+{
+public:
+    ThreadLocalStorage() { data = 0; } // starts out holding a null pointer
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scope guard: locks the mutex on construction and unlocks it on destruction
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } // only a reference is kept, so the Mutex must outlive the guard
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex;
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO // NCNN_LOGE(fmt, ...): printf-style error log; expands to nothing when NCNN_STDIO is 0
+#include <stdio.h> // both variants below call fprintf(stderr, ...), so include it before either is defined
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else
+#define NCNN_LOGE(...)
+#endif
+
+
+#if NCNN_FORCE_INLINE // NCNN_FORCEINLINE: strongest "always inline" spelling available, plain inline otherwise
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang predefines __clang__ in lower case; the former __CLANG__ test could never be true
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..55ede15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleocv.h
@@ -0,0 +1,501 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) // generic version: plain conversion to _Tp, no clamping
+{
+    return _Tp(v);
+}
+template<>
+inline uchar saturate_cast<uchar>(int v) // uchar specialization: clamps v into [0, UCHAR_MAX]
+{
+    return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); // the unsigned compare rejects negatives and values above UCHAR_MAX in one test
+}
+
+template<typename _Tp>
+struct Scalar_ // fixed array of four _Tp values; components not given to a constructor are set to 0
+{
+    Scalar_()
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // not explicit, so a single _Tp converts implicitly to Scalar_
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    const _Tp operator[](const int i) const // returns a copy; i is not range-checked
+    {
+        return v[i];
+    }
+
+    _Tp operator[](const int i) // also returns a copy, so elements cannot be assigned through operator[]
+    {
+        return v[i];
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2d point with public x / y members
+{
+    Point_()
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const // implicit conversion to another element type, each coordinate going through saturate_cast
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // width / height pair with public members
+{
+    Size_()
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const // implicit conversion to another element type, each member going through saturate_cast
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as origin (x, y) plus width and height
+{
+    Rect_()
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const // implicit conversion to another element type, each member going through saturate_cast
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area (width * height, computed in _Tp)
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // in-place intersection: a becomes the overlap of a and b
+{
+    _Tp x1 = std::max(a.x, b.x), y1 = std::max(a.y, b.y); // top-left corner of the overlap
+    a.width = std::min(a.x + a.width, b.x + b.width) - x1; // uses the old a.x, which is only overwritten further down
+    a.height = std::min(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    if (a.width <= 0 || a.height <= 0) // no overlap: reset to the all-zero rectangle
+        a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // in-place union: a becomes the bounding box of a and b
+{
+    _Tp x1 = std::min(a.x, b.x), y1 = std::min(a.y, b.y); // top-left corner of the bounding box
+    a.width = std::max(a.x + a.width, b.x + b.width) - x1; // uses the old a.x, which is only overwritten further down
+    a.height = std::max(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    return a; // unlike operator&=, there is no special case for empty rectangles
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // intersection returned as a new rectangle; a and b are untouched
+{
+    Rect_<_Tp> c = a;
+    return c &= b; // delegates to operator&= on the copy
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // bounding box returned as a new rectangle; a and b are untouched
+{
+    Rect_<_Tp> c = a;
+    return c |= b; // delegates to operator|= on the copy
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4
+
+struct NCNN_EXPORT Mat // minimal image container: rows x cols pixels of c bytes each, shared between copies through an int reference counter
+{
+    Mat()
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // allocating constructor; flags is stored as c (see create)
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy (shares the buffer and bumps the reference counter; no pixel data is copied)
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-owned memory: refcount stays 0, so release() never frees _data
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign (shares m's buffer; the previously held buffer is released)
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1); // take the new reference before dropping the old one
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fill: writes s[0..c-1] into every pixel; does nothing when total() is 0
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++) // Scalar holds 4 components, so this relies on c <= 4
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drops the current buffer and allocates a new one of _rows * _cols * flags bytes
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2; // round the byte count up to a multiple of 4
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount)); // the result is not checked for null
+            refcount = (int*)(((uchar*)data) + totalsize); // the counter lives right after the padded pixel data
+            *refcount = 1;
+        }
+    }
+
+    void release() // drops this reference and resets every member to 0
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1) // NCNN_XADD presumably returns the value before the add, so 1 means last owner - confirm
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a newly allocated Mat; an empty Mat clones to an empty Mat
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const // true when there is no buffer or any of cols / rows / c is 0
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const // returns c
+    {
+        return c;
+    }
+
+    int type() const // also returns c: type and channel count are the same number in this implementation
+    {
+        return c;
+    }
+
+    size_t total() const // cols * rows * c, multiplied as int before the conversion to size_t
+    {
+        return cols * rows * c;
+    }
+
+    const uchar* ptr(int y) const // start of row y; y is not range-checked
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y) // start of row y; y is not range-checked
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // NOTE(review): the offset y * cols * c is counted in _Tp elements, not bytes - confirm this is intended when sizeof(_Tp) > 1
+    {
+        return (const _Tp*)data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y) // NOTE(review): the offset y * cols * c is counted in _Tp elements, not bytes - confirm this is intended when sizeof(_Tp) > 1
+    {
+        return (_Tp*)data + y * cols * c;
+    }
+
+    // roi (returns a newly allocated copy of the region, not a view; roi is not checked against the image bounds)
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++) // copy one row of the region per iteration
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c; // bytes per pixel as used by total() and ptr(); set from the flags constructor argument
+};
+
+enum ImreadModes // values accepted by the flags parameter of imread
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR); // flags defaults to IMREAD_COLOR
+
+enum ImwriteFlags // NOTE(review): presumably used as keys inside the params vector of imwrite - confirm
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>()); // params defaults to an empty vector
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0); // delay defaults to 0
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0); // only declared when NCNN_PIXEL is enabled
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum // NOTE(review): FILLED is presumably passed as thickness to request a filled shape - confirm
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1); // rectangle given by two corner points
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1); // rectangle given as a Rect
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum // the only font face constant declared in this header
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1); // color is taken by value here, unlike the other drawing functions
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine); // baseLine is an output pointer
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// C-linkage entry points of the minimal runtime; the omp_* names follow the OpenMP runtime API.
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+// NOTE(review): the kmp_* pair presumably mirrors the LLVM/Intel block-time extension - confirm units with the implementation.
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// global allocation functions (declared by hand because <new> is not included in this branch)
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement forms: take a caller-supplied address ptr
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// matching deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// sized deallocation functions, only declared for C++14 and later
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T>
+const T& max(const T& a, const T& b) // yields b only when a < b, otherwise a
+{
+    return (a < b) ? b : a;
+}
+
+template<typename T>
+const T& min(const T& a, const T& b) // yields b only when b < a, otherwise a
+{
+    return (b < a) ? b : a; // needs operator< only, matching max() and the pair comparisons in this header
+}
+
+template<typename T>
+void swap(T& a, T& b) // exchanges the values of a and b
+{
+    T old_a(a); // keep a's value before it is overwritten
+    a = b;
+    b = old_a;
+}
+
+template<typename T1, typename T2>
+struct pair // simple struct holding one T1 (first) and one T2 (second)
+{
+    pair()
+        : first(), second() // value-initialise both members
+    {
+    }
+    pair(const T1& t1, const T2& t2)
+        : first(t1), second(t2) // copy both members from the arguments
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y) // equal when both members compare equal
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y) // lexicographic: first decides, second breaks ties
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) // negation of operator==
+{
+    return !(x == y);
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) // operator< with the operands swapped
+{
+    return y < x;
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y) // true unless y < x
+{
+    return !(y < x);
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y) // true unless x < y
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // builds a pair holding copies of t1 and t2
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T>
+struct node // one link of the doubly linked list: neighbour pointers plus the stored value
+{
+    node* prev_;
+    node* next_;
+    T data_;
+
+    node()
+        : prev_(0), next_(0), data_() // unlinked node with a value-initialised payload
+    {
+    }
+    node(const T& t)
+        : prev_(0), next_(0), data_(t) // unlinked node holding a copy of t
+    {
+    }
+};
+
+template<typename T>
+struct iter_list // bidirectional iterator over list<T>; wraps a pointer to the current node
+{
+    iter_list()
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list()
+    {
+    }
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    T& operator*() const // const-qualified: dereferencing does not modify the iterator itself
+    {
+        return curr_->data_;
+    }
+    T* operator->() const
+    {
+        return &(curr_->data_);
+    }
+
+    bool operator==(const iter_list& i) const // const-qualified so const iterators and temporaries can be compared
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i) const
+    {
+        return curr_ != i.curr_;
+    }
+
+    iter_list& operator++() // pre-increment: step to the next node
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--() // pre-decrement: step to the previous node
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_;
+};
+
+template<typename T>
+struct list // doubly linked list that always keeps one extra sentinel node at tail_
+{
+    typedef iter_list<T> iterator;
+
+    list()
+    {
+        head_ = new node<T>(); // sentinel; head_ == tail_ while the list is empty
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // after clear() head_ is the sentinel again
+    }
+    list(const list& l)
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i) // append a copy of every element of l
+        {
+            push_back(*i);
+        }
+    }
+
+    list& operator=(const list& l)
+    {
+        if (this == &l) // self-assignment: nothing to do
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+
+    void clear() // removes every element; the sentinel stays allocated
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+
+    void pop_front() // removes the first element; does nothing on an empty list
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_;
+            delete head_->prev_; // the old head, still reachable through the new head's prev_
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // iterator at the sentinel, one past the last element
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t just before the sentinel
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // first element becomes the new head
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t);
+            temp->prev_ = tail_->prev_; // link between the current last element and the sentinel
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+
+    iter_list<T> erase(iter_list<T> pos) // unlinks and deletes *pos; returns the iterator after it (pos itself if pos == end())
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos;
+                temp->next_->prev_ = 0; // the successor becomes the new head
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos;
+                temp->next_->prev_ = temp->prev_; // bridge the neighbours around temp
+                temp->prev_->next_ = temp->next_;
+                ++pos;
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_;
+    node<T>* tail_;
+    size_t count_;
+};
+
+template<typename T>
+struct greater // comparison functor wrapping operator>
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // comparison functor wrapping operator<
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y);
+    }
+};
+
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // Puts the (middle - first) best elements by comp, in order, at the front. [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // bubble the best remaining element down to i; [first, i) is already final, so the pass stops at i
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+            {
+                swap(*j, *(j - 1));
+            }
+        }
+    }
+}
+
+template<typename T>
+struct vector // contiguous growable array; elements are relocated bitwise (memmove) when storage grows
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T())
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v)
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size()); // default-construct the slots first, then copy-assign each element
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0); // destroy the current elements before taking over v's contents
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grows with copies of value or destroys the surplus tail
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value);
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroys all elements and releases the storage
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // storage was allocated as a char array in try_alloc()
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const // no bounds checking
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // inserts copies of [b, e) before pos; pos must point into this vector
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            // source is this very vector: copy from a snapshot, since the storage below may move
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin();
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff; // re-derive pos because try_alloc() may have replaced data_
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); // shift the tail up to open a gap
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            new (pos) T(*b); // gap slots hold no live object (stale bits or raw memory), so construct instead of assigning
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos) // destroys *pos, closes the gap and returns the same position
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+    void try_alloc(size_t new_size) // replaces the storage with 2 * new_size zero-filled slots whenever the test below fires
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // NOTE(review): string::c_str() appears to rely on this zero fill for its terminator - confirm before changing the growth rule
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_); // bitwise relocation, no copy constructors run
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char> // character vector with C-string style accessors and comparisons
+{
+    string()
+    {
+    }
+    string(const char* str)
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) memcpy(data_, str, len); // resize(0) allocates nothing, so data_ is still null for ""
+    }
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : ""; // a string that never held characters has a null data_
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0; // go through c_str() so empty strings never hand NULL to strcmp
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends the characters of str1
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string joined(str1); // start from a copy of the left operand
+    joined += str2;      // operator+= inserts str2's characters at the end
+    return joined;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..e7a7e8e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix the build with old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70 // supplied only for Vulkan headers older than version 70: subgroup properties, maintenance3 and descriptor-set-layout-support declarations
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; // KHR-suffixed aliases of the two structs above
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80 // supplied only for Vulkan headers older than version 80: 8-bit storage features and the render-pass-2 (KHR) structures and entry points
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95 // supplied only for Vulkan headers older than version 95: float16/int8 shader feature struct
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97 // supplied only for Vulkan headers older than version 97: memory-budget properties struct
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS]; // one entry per possible memory heap
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // supplied only for Vulkan headers older than version 101: NV cooperative-matrix enums, structs and query entry point
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release"
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+# Record the target and its library file for the existence check that ncnn.cmake runs after including this file.
+list(APPEND _cmake_import_check_targets ncnn )
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..53b9fae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH) # matched by cmake_policy(POP) on the early-return path and at the end of the file
+cmake_policy(VERSION 2.8.3...3.23)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets) # every target already exists: nothing left to import
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "")
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) # directory of this file (lib/cmake/ncnn)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) # up to lib/cmake
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) # up to lib
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) # up to the install prefix
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;Vulkan::Vulkan;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration.
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake") # picks up per-configuration files such as ncnn-release.cmake
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..abb2dd6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON) # feature switches consulted by the dependency lookups below
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN ON)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP) # not REQUIRED: a missing OpenMP package does not stop configuration here
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB) # not entered in this package because NCNN_SHARED_LIB is set to ON above
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "") # NOTE(review): empty here, so the includes below would resolve from the filesystem root if this branch ever ran
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) # defines the imported ncnn target
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..2ae00de
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20221128
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// alignment in bytes requested for every buffer allocated by fastMalloc below
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// some optimized kernels may read slightly past the end of a buffer inside a loop,
+// because it is common to interleave the next iteration's loads with arithmetic instructions;
+// allocating this many extra bytes keeps those over-reads from causing SEGV_ACCERR failures
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the next multiple of n bytes and returns the aligned pointer
+// ptr Pointer to align
+// n Alignment size that must be a power of two; defaults to sizeof(_Tp)
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n); // add n-1, then clear the low bits with the mask -n
+}
+
+// Rounds a buffer size up to a multiple of n bytes
+// The function returns the minimum number that is greater than or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n; // add n-1, then clear the low bits with the mask -n
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // allocates size bytes with NCNN_MALLOC_ALIGN alignment; pair with fastFree
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // note: this branch does not add NCNN_MALLOC_OVERREAD
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD)) // a non-zero result is treated as failure
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // fallback: over-allocate so the block can be aligned by hand
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN); // skip one pointer slot, then round up to the alignment
+    adata[-1] = udata; // stash the raw malloc pointer just before the aligned block so fastFree can recover it
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr) // releases a buffer obtained from fastMalloc; a null ptr is ignored
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1]; // raw malloc pointer that fastMalloc stored just before the aligned block
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// NCNN_XADD(addr, delta): exchange-add used for reference counters; adds delta to *addr and yields the previous value
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, not atomic
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic exchange-add on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else // NOTE(review): the literal 4 below presumably stands in for __ATOMIC_ACQ_REL when the macro is missing - confirm
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7, using the __atomic builtins
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: no known atomic primitive, so fall back to a plain add
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta) // threads disabled: plain non-atomic add that returns the old value
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract interface for host-memory allocators
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // pure virtual: subclasses provide the allocation
+    virtual void fastFree(void* ptr) = 0; // pure virtual: subclasses provide the release
+};
+
+class PoolAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // overrides Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // overrides Allocator::fastFree
+
+private:
+    PoolAllocator(const PoolAllocator&); // copy operations are private - presumably to forbid copying
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+class UnlockedPoolAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // overrides Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // overrides Allocator::fastFree
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&); // copy operations are private - presumably to forbid copying
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // record describing one Vulkan buffer allocation; all members are public
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by the allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally (hence mutable)
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // record describing one Vulkan image allocation; all members are public
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying image info assigned by the allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by the allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally (hence mutable)
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base for Vulkan allocators; buffer and image fastMalloc/fastFree are pure virtual
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer allocation, implemented by subclasses
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image allocation, implemented by subclasses
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); // protected helpers available to subclasses
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // default preferred block size: 16M bytes
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&); // copy operations are private - presumably to forbid copying
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+class VkWeightAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // default preferred block size: 8M bytes
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&); // copy operations are private - presumably to forbid copying
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+class VkStagingAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio, valid range 0 ~ 1
+    // default ratio = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&); // copy operations are private - presumably to forbid copying
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+class VkWeightStagingAllocatorPrivate; // forward declaration only; the definition is not in this header
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&); // copy operations are private - presumably to forbid copying
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // opaque pointer to the private implementation object
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator // only declared when NCNN_PLATFORM_API is set and __ANDROID_API__ >= 26
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer overload
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image overload
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&); // copy operations are private - presumably to forbid copying
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init(); // NOTE(review): return-code convention is not visible in this header - confirm against the implementation
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb; // presumably the buffer passed to the constructor as _hb - confirm
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/benchmark.h
new file mode 100644
index 0000000..3d5c0cd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/benchmark.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get now timestamp in ms
+NCNN_EXPORT double get_current_time();
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob // describes one blob by its producer layer, consumer layer and shape hint
+{
+public:
+    // constructs an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as its output
+    int producer;
+    // index of the layer that needs this blob as its input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/c_api.h
new file mode 100644
index 0000000..b7435f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/c_api.h
@@ -0,0 +1,327 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t; /* handle type: pointer to the struct below */
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis; /* opaque pointer; presumably the underlying C++ allocator object - TODO confirm */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* allocation callback; takes the handle as its first argument */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr); /* release callback; takes the handle as its first argument */
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t; /* handle type: pointer to the struct below */
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis; /* opaque pointer; presumably the underlying C++ data reader object - TODO confirm */
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* formatted-read callback; only present when NCNN_STRING is enabled */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* raw-read callback into buf; presumably returns the byte count read - confirm */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t; /* handle type: pointer to the struct below */
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis; /* opaque pointer; presumably the underlying C++ model-bin object - TODO confirm */
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* load callbacks for 1, 2 and 3 dimensions; each returns a mat handle */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t; /* handle type: pointer to the struct below */
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis; /* opaque pointer; presumably the underlying C++ layer object - TODO confirm */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd); /* callback receiving the layer's parameter dictionary */
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb); /* callback receiving the layer's model-bin handle */
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* one input blob, one output blob */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* n input blobs, n2 output blobs */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* one blob used as both input and output */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* n blobs used as both input and output */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function types: a creator returns a new layer handle, a destroyer receives one */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t; /* handle type: pointer to the struct below */
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata; /* caller-supplied pointer; both callback types take a userdata argument */
+    ncnn_net_custom_layer_factory_t next; /* pointer to another factory record; presumably chains records into a list - confirm */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t; /* handle type: pointer to the struct below */
+struct __ncnn_net_t
+{
+    void* pthis; /* opaque pointer; presumably the underlying C++ net object - TODO confirm */
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* custom layer factory record(s) attached to this net */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute // exported class built from a VulkanDevice; exposes record_* calls plus submit_and_wait()/reset()
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev); // explicit: no implicit conversion from a VulkanDevice pointer
+    virtual ~VkCompute();
+
+public: // record_* overloads: one declaration per (source type, destination type) combination
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // NOTE(review): int result is presumably a status code (0 = success?) - confirm against the ncnn sources
+
+    int reset(); // NOTE(review): same int status convention assumed as submit_and_wait() - confirm
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev; // NOTE(review): presumably the device handed to the constructor; ownership not visible here
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // const pointer to the forward-declared implementation class VkComputePrivate
+};
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer // exported class built from a VulkanDevice; declares upload recording and submit_and_wait() only
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev); // explicit: no implicit conversion from a VulkanDevice pointer
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true; its effect is not visible in this header
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait(); // NOTE(review): int result is presumably a status code (0 = success?) - confirm against the ncnn sources
+
+protected:
+    const VulkanDevice* vkdev; // NOTE(review): presumably the device handed to the constructor; ownership not visible here
+
+private:
+    VkTransferPrivate* const d; // const pointer to the forward-declared implementation class VkTransferPrivate
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/cpu.h
new file mode 100644
index 0000000..0f748f3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/cpu.h
@@ -0,0 +1,169 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // per-cpu enable/disable set; the backing data member differs per platform (see the #if blocks in this class)
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask; // storage used on Windows builds that are not MinGW
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // storage used on Android/Linux; the type comes from <sched.h>
+#endif
+#if __APPLE__
+    unsigned int policy; // storage used on Apple targets; NOTE(review): meaning of the value is not visible in this header
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper
+class NCNN_EXPORT DataReader // base class: scan/read/reference are all virtual so derived readers can supply the data source
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (only declared when NCNN_STRING is enabled)
+    // returns 1 when the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data into buf
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get a reference to model data through *buf
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate;
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader constructed from a stdio FILE*; overrides scan() and read()
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp); // NOTE(review): whether the reader closes fp is not visible in this header - confirm
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy the reader
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared implementation class
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate;
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader constructed from an in-memory byte pointer; overrides scan(), read() and reference()
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // pointer is taken by non-const reference; NOTE(review): presumably advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy the reader
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared implementation class
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate;
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader constructed from an Android AAsset*; overrides scan() and read()
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset); // NOTE(review): whether the reader closes the asset is not visible in this header - confirm
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy the reader
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared implementation class
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/gpu.h
new file mode 100644
index 0000000..2ef4927
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/gpu.h
@@ -0,0 +1,359 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+NCNN_EXPORT int create_gpu_instance();
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate;
+class NCNN_EXPORT GpuInfo // read-only description of one gpu: every query below is a const accessor
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device handle
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties of the physical device
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const; // NOTE(review): mutable pointer from a const method; length presumably VK_UUID_SIZE - confirm
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+
+    // extension capability (int, not bool; NOTE(review): presumably 0 = unsupported, non-zero = supported/version - confirm)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_queue_family_foreign() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a GpuInfo
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // friendship gives create_gpu_instance() access to the private members, including d
+    GpuInfoPrivate* const d; // const pointer to the forward-declared implementation class GpuInfoPrivate
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator;
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate;
+class NCNN_EXPORT VulkanDevice // per-device helper object selected by gpu index; all member functions declared here are const except init_device_extension()
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // not explicit; the default index is evaluated via get_default_gpu_index()
+    ~VulkanDevice();
+
+    const GpuInfo& info; // reference member; NOTE(review): presumably bound to the GpuInfo of device_index - confirm
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helpers for creating a pipeline; results are written through the trailing pointer parameter
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // paired with reclaim_queue()
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device (acquire_* calls are paired with the matching reclaim_* calls)
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one overload per (buffer/image source, buffer/image destination) combination
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (public function-pointer members; NOTE(review): presumably null when the extension is unavailable - confirm)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension (NOTE(review): presumably fills the function-pointer members above - confirm)
+    int init_device_extension();
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy a VulkanDevice
+    VulkanDevice(const VulkanDevice&);
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the forward-declared implementation class VulkanDevicePrivate
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv
+class NCNN_EXPORT ShaderInfo // plain record of public int fields; taken by non-const reference by resolve_shader_info()
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // binding type codes:
+    // 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // 16 is large enough I think ...
+
+    int reserved_0; // reserved_0..reserved_3: purpose not visible in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer.h
new file mode 100644
index 0000000..d02f65b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer.h
@@ -0,0 +1,214 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#include <math.h>
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer // interface for one network layer: virtual load/setup/forward hooks, capability flags and blob wiring
+{
+public:
+    // default constructor, takes no arguments
+    Layer();
+    // virtual destructor, so a derived layer can be deleted through a Layer*
+    virtual ~Layer();
+
+    // load layer specific parameters from the parsed dict pd
+    // return 0 on success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from the model binary mb
+    // return 0 on success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 on success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup, the counterpart of create_pipeline
+    // return 0 on success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // the layer has one input and one output blob
+    bool one_blob_only;
+
+    // the layer supports inplace inference (see forward_inplace)
+    bool support_inplace;
+
+    // the layer supports vulkan compute
+    bool support_vulkan;
+
+    // the layer accepts input blobs with packed storage
+    bool support_packing;
+
+    // the layer accepts bf16 storage
+    bool support_bf16_storage;
+
+    // the layer accepts fp16 storage
+    bool support_fp16_storage;
+
+    // the layer accepts int8 storage
+    bool support_int8_storage;
+
+    // the layer supports shader image storage
+    bool support_image_storage;
+
+    // the layer supports shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // support_reserved_*: spare flag slots, no meaning assigned in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set (NOTE(review): presumably a bit mask; the bit meanings are not defined in this header)
+    int featmask;
+
+public:
+    // implement inference from bottom (input) blobs to top (output) blobs; vector overload for blob lists, Mat overload for a single pair
+    // return 0 on success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference: the same blob(s) serve as both input and output
+    // return 0 on success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blobs from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs
+    // return 0 on success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs
+    // return 0 on success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 on success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 on success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // vulkan device pointer, assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data pointer, opaque to this header
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // indexes of the blobs this layer needs as input
+    std::vector<int> bottoms;
+    // indexes of the blobs this layer produces as output
+    std::vector<int> tops;
+    // shape hints for the bottom (input) and top (output) blobs
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function pointer types; the trailing void* is the userdata argument (named so in DEFINE_LAYER_CREATOR / DEFINE_LAYER_DESTROYER)
+typedef Layer* (*layer_creator_func)(void*); // creates a layer and returns it
+typedef void (*layer_destroyer_func)(Layer*, void*); // receives the layer to destroy
+
+struct layer_registry_entry // registry record pairing a layer type name (present only when NCNN_STRING is enabled) with its creator
+{
+#if NCNN_STRING
+    // layer type name
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry: function that creates the layer
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // same fields as layer_registry_entry, plus a destroyer and a userdata pointer
+{
+#if NCNN_STRING
+    // layer type name
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry: function that creates the layer
+    layer_creator_func creator;
+    layer_destroyer_func destroyer; // function that destroys a layer of this type
+    void* userdata; // NOTE(review): presumably the void* handed to creator and destroyer - confirm
+};
+
+#if NCNN_STRING
+// get layer type index from type name (NOTE(review): the value returned for an unknown name is not stated here)
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name; only declared when NCNN_STRING is enabled
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type index; declared regardless of NCNN_STRING
+NCNN_EXPORT Layer* create_layer(int index);
+// DEFINE_LAYER_CREATOR(name) expands to name##_layer_creator(void*), which returns `new name` as ::ncnn::Layer*; the userdata argument is ignored
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+// DEFINE_LAYER_DESTROYER(name) expands to name##_layer_destroyer(::ncnn::Layer*, void*), which deletes the layer; the userdata argument is ignored
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the enum sits in a same-named namespace, so enumerators are written LayerShaderType::relu etc.
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list is pulled in textually from the cmake-generated header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..f11cab9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,370 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+absval = 0,
+absval_pack4 = 1,
+absval_pack8 = 2,
+batchnorm = 3,
+batchnorm_pack4 = 4,
+batchnorm_pack8 = 5,
+concat = 6,
+concat_pack4 = 7,
+concat_pack4to1 = 8,
+concat_pack8 = 9,
+concat_pack8to1 = 10,
+concat_pack8to4 = 11,
+convolution = 12,
+convolution_1x1s1d1 = 13,
+convolution_3x3s1d1_winograd23_transform_input = 14,
+convolution_3x3s1d1_winograd23_transform_output = 15,
+convolution_3x3s1d1_winograd43_transform_input = 16,
+convolution_3x3s1d1_winograd43_transform_output = 17,
+convolution_3x3s1d1_winograd_gemm = 18,
+convolution_gemm = 19,
+convolution_pack1to4 = 20,
+convolution_pack1to4_1x1s1d1 = 21,
+convolution_pack1to4_3x3s1d1_winograd_gemm = 22,
+convolution_pack1to4_gemm = 23,
+convolution_pack1to8 = 24,
+convolution_pack1to8_1x1s1d1 = 25,
+convolution_pack1to8_3x3s1d1_winograd_gemm = 26,
+convolution_pack1to8_gemm = 27,
+convolution_pack4 = 28,
+convolution_pack4_1x1s1d1 = 29,
+convolution_pack4_1x1s1d1_cm_16_8_8 = 30,
+convolution_pack4_3x3s1d1_winograd23_transform_input = 31,
+convolution_pack4_3x3s1d1_winograd23_transform_output = 32,
+convolution_pack4_3x3s1d1_winograd43_transform_input = 33,
+convolution_pack4_3x3s1d1_winograd43_transform_output = 34,
+convolution_pack4_3x3s1d1_winograd_gemm = 35,
+convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8 = 36,
+convolution_pack4_gemm = 37,
+convolution_pack4_gemm_cm_16_8_8 = 38,
+convolution_pack4to1 = 39,
+convolution_pack4to1_1x1s1d1 = 40,
+convolution_pack4to1_3x3s1d1_winograd_gemm = 41,
+convolution_pack4to1_gemm = 42,
+convolution_pack4to8 = 43,
+convolution_pack4to8_1x1s1d1 = 44,
+convolution_pack4to8_3x3s1d1_winograd_gemm = 45,
+convolution_pack4to8_gemm = 46,
+convolution_pack8 = 47,
+convolution_pack8_1x1s1d1 = 48,
+convolution_pack8_3x3s1d1_winograd23_transform_input = 49,
+convolution_pack8_3x3s1d1_winograd23_transform_output = 50,
+convolution_pack8_3x3s1d1_winograd43_transform_input = 51,
+convolution_pack8_3x3s1d1_winograd43_transform_output = 52,
+convolution_pack8_3x3s1d1_winograd_gemm = 53,
+convolution_pack8_gemm = 54,
+convolution_pack8to1 = 55,
+convolution_pack8to1_1x1s1d1 = 56,
+convolution_pack8to1_3x3s1d1_winograd_gemm = 57,
+convolution_pack8to1_gemm = 58,
+convolution_pack8to4 = 59,
+convolution_pack8to4_1x1s1d1 = 60,
+convolution_pack8to4_3x3s1d1_winograd_gemm = 61,
+convolution_pack8to4_gemm = 62,
+crop = 63,
+crop_pack1to4 = 64,
+crop_pack1to8 = 65,
+crop_pack4 = 66,
+crop_pack4to1 = 67,
+crop_pack4to8 = 68,
+crop_pack8 = 69,
+crop_pack8to1 = 70,
+crop_pack8to4 = 71,
+deconvolution = 72,
+deconvolution_col2im = 73,
+deconvolution_gemm = 74,
+deconvolution_pack1to4 = 75,
+deconvolution_pack1to4_gemm = 76,
+deconvolution_pack1to8 = 77,
+deconvolution_pack1to8_gemm = 78,
+deconvolution_pack4 = 79,
+deconvolution_pack4_col2im = 80,
+deconvolution_pack4_gemm = 81,
+deconvolution_pack4_gemm_cm_16_8_8 = 82,
+deconvolution_pack4to1 = 83,
+deconvolution_pack4to1_gemm = 84,
+deconvolution_pack4to8 = 85,
+deconvolution_pack4to8_gemm = 86,
+deconvolution_pack8 = 87,
+deconvolution_pack8_col2im = 88,
+deconvolution_pack8_gemm = 89,
+deconvolution_pack8to1 = 90,
+deconvolution_pack8to1_gemm = 91,
+deconvolution_pack8to4 = 92,
+deconvolution_pack8to4_gemm = 93,
+dropout = 94,
+dropout_pack4 = 95,
+dropout_pack8 = 96,
+eltwise = 97,
+eltwise_pack4 = 98,
+eltwise_pack8 = 99,
+elu = 100,
+elu_pack4 = 101,
+elu_pack8 = 102,
+flatten = 103,
+flatten_pack1to4 = 104,
+flatten_pack1to8 = 105,
+flatten_pack4 = 106,
+flatten_pack4to8 = 107,
+flatten_pack8 = 108,
+innerproduct = 109,
+innerproduct_gemm = 110,
+innerproduct_gemm_wp1to4 = 111,
+innerproduct_gemm_wp1to8 = 112,
+innerproduct_gemm_wp4 = 113,
+innerproduct_gemm_wp4to1 = 114,
+innerproduct_gemm_wp4to8 = 115,
+innerproduct_gemm_wp8 = 116,
+innerproduct_gemm_wp8to1 = 117,
+innerproduct_gemm_wp8to4 = 118,
+innerproduct_pack1to4 = 119,
+innerproduct_pack1to8 = 120,
+innerproduct_pack4 = 121,
+innerproduct_pack4to1 = 122,
+innerproduct_pack4to8 = 123,
+innerproduct_pack8 = 124,
+innerproduct_pack8to1 = 125,
+innerproduct_pack8to4 = 126,
+innerproduct_reduce_sum8 = 127,
+innerproduct_reduce_sum8_pack4 = 128,
+innerproduct_reduce_sum8_pack8 = 129,
+innerproduct_sum8 = 130,
+innerproduct_sum8_pack1to4 = 131,
+innerproduct_sum8_pack1to8 = 132,
+innerproduct_sum8_pack4 = 133,
+innerproduct_sum8_pack4to1 = 134,
+innerproduct_sum8_pack4to8 = 135,
+innerproduct_sum8_pack8 = 136,
+innerproduct_sum8_pack8to1 = 137,
+innerproduct_sum8_pack8to4 = 138,
+lrn_norm = 139,
+lrn_norm_across_channel_pack4 = 140,
+lrn_norm_across_channel_pack8 = 141,
+lrn_norm_within_channel_pack4 = 142,
+lrn_norm_within_channel_pack8 = 143,
+lrn_square_pad = 144,
+lrn_square_pad_across_channel_pack4 = 145,
+lrn_square_pad_across_channel_pack8 = 146,
+lrn_square_pad_within_channel_pack4 = 147,
+lrn_square_pad_within_channel_pack8 = 148,
+pooling = 149,
+pooling_adaptive = 150,
+pooling_adaptive_pack4 = 151,
+pooling_adaptive_pack8 = 152,
+pooling_global = 153,
+pooling_global_pack4 = 154,
+pooling_global_pack8 = 155,
+pooling_pack4 = 156,
+pooling_pack8 = 157,
+prelu = 158,
+prelu_pack4 = 159,
+prelu_pack8 = 160,
+relu = 161,
+relu_pack4 = 162,
+relu_pack8 = 163,
+reshape = 164,
+reshape_pack1to4 = 165,
+reshape_pack1to8 = 166,
+reshape_pack4 = 167,
+reshape_pack4to1 = 168,
+reshape_pack4to8 = 169,
+reshape_pack8 = 170,
+reshape_pack8to1 = 171,
+reshape_pack8to4 = 172,
+scale = 173,
+scale_pack4 = 174,
+scale_pack8 = 175,
+sigmoid = 176,
+sigmoid_pack4 = 177,
+sigmoid_pack8 = 178,
+slice = 179,
+slice_pack1to4 = 180,
+slice_pack1to8 = 181,
+slice_pack4 = 182,
+slice_pack4to8 = 183,
+slice_pack8 = 184,
+softmax_div_sum = 185,
+softmax_div_sum_pack4 = 186,
+softmax_div_sum_pack8 = 187,
+softmax_exp_sub_max = 188,
+softmax_exp_sub_max_pack4 = 189,
+softmax_exp_sub_max_pack8 = 190,
+softmax_reduce_max = 191,
+softmax_reduce_max_pack4 = 192,
+softmax_reduce_max_pack8 = 193,
+softmax_reduce_sum = 194,
+softmax_reduce_sum_pack4 = 195,
+softmax_reduce_sum_pack8 = 196,
+tanh = 197,
+tanh_pack4 = 198,
+tanh_pack8 = 199,
+binaryop = 200,
+binaryop_broadcast = 201,
+binaryop_broadcast_a1_pack4 = 202,
+binaryop_broadcast_a1_pack8 = 203,
+binaryop_broadcast_b1_pack4 = 204,
+binaryop_broadcast_b1_pack8 = 205,
+binaryop_broadcast_pack4 = 206,
+binaryop_broadcast_pack8 = 207,
+binaryop_pack4 = 208,
+binaryop_pack8 = 209,
+unaryop = 210,
+unaryop_pack4 = 211,
+unaryop_pack8 = 212,
+convolutiondepthwise = 213,
+convolutiondepthwise_group = 214,
+convolutiondepthwise_group_pack1to4 = 215,
+convolutiondepthwise_group_pack1to8 = 216,
+convolutiondepthwise_group_pack4 = 217,
+convolutiondepthwise_group_pack4to1 = 218,
+convolutiondepthwise_group_pack4to8 = 219,
+convolutiondepthwise_group_pack8 = 220,
+convolutiondepthwise_group_pack8to1 = 221,
+convolutiondepthwise_group_pack8to4 = 222,
+convolutiondepthwise_pack4 = 223,
+convolutiondepthwise_pack8 = 224,
+padding = 225,
+padding_3d = 226,
+padding_3d_pack4 = 227,
+padding_3d_pack8 = 228,
+padding_pack1to4 = 229,
+padding_pack1to8 = 230,
+padding_pack4 = 231,
+padding_pack4to1 = 232,
+padding_pack4to8 = 233,
+padding_pack8 = 234,
+padding_pack8to1 = 235,
+padding_pack8to4 = 236,
+normalize_coeffs = 237,
+normalize_coeffs_pack4 = 238,
+normalize_coeffs_pack8 = 239,
+normalize_norm = 240,
+normalize_norm_pack4 = 241,
+normalize_norm_pack8 = 242,
+normalize_reduce_sum4_fp16_to_fp32 = 243,
+normalize_reduce_sum4_fp16_to_fp32_pack4 = 244,
+normalize_reduce_sum4_fp16_to_fp32_pack8 = 245,
+normalize_reduce_sum4_fp32 = 246,
+normalize_reduce_sum4_fp32_pack4 = 247,
+normalize_reduce_sum4_fp32_pack8 = 248,
+permute = 249,
+permute_pack1to4 = 250,
+permute_pack1to8 = 251,
+permute_pack4 = 252,
+permute_pack4to1 = 253,
+permute_pack4to8 = 254,
+permute_pack8 = 255,
+permute_pack8to1 = 256,
+permute_pack8to4 = 257,
+priorbox = 258,
+priorbox_mxnet = 259,
+interp = 260,
+interp_bicubic = 261,
+interp_bicubic_coeffs = 262,
+interp_bicubic_pack4 = 263,
+interp_bicubic_pack8 = 264,
+interp_pack4 = 265,
+interp_pack8 = 266,
+deconvolutiondepthwise = 267,
+deconvolutiondepthwise_group = 268,
+deconvolutiondepthwise_group_pack1to4 = 269,
+deconvolutiondepthwise_group_pack1to8 = 270,
+deconvolutiondepthwise_group_pack4 = 271,
+deconvolutiondepthwise_group_pack4to1 = 272,
+deconvolutiondepthwise_group_pack4to8 = 273,
+deconvolutiondepthwise_group_pack8 = 274,
+deconvolutiondepthwise_group_pack8to1 = 275,
+deconvolutiondepthwise_group_pack8to4 = 276,
+deconvolutiondepthwise_pack4 = 277,
+deconvolutiondepthwise_pack8 = 278,
+shufflechannel = 279,
+shufflechannel_pack4 = 280,
+shufflechannel_pack8 = 281,
+instancenorm_coeffs = 282,
+instancenorm_coeffs_pack4 = 283,
+instancenorm_coeffs_pack8 = 284,
+instancenorm_norm = 285,
+instancenorm_norm_pack4 = 286,
+instancenorm_norm_pack8 = 287,
+instancenorm_reduce_mean = 288,
+instancenorm_reduce_mean_pack4 = 289,
+instancenorm_reduce_mean_pack8 = 290,
+instancenorm_reduce_sum4_fp16_to_fp32 = 291,
+instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 292,
+instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 293,
+instancenorm_reduce_sum4_fp32 = 294,
+instancenorm_reduce_sum4_fp32_pack4 = 295,
+instancenorm_reduce_sum4_fp32_pack8 = 296,
+instancenorm_sub_mean_square = 297,
+instancenorm_sub_mean_square_pack4 = 298,
+instancenorm_sub_mean_square_pack8 = 299,
+clip = 300,
+clip_pack4 = 301,
+clip_pack8 = 302,
+reorg = 303,
+reorg_pack1to4 = 304,
+reorg_pack1to8 = 305,
+reorg_pack4 = 306,
+reorg_pack4to8 = 307,
+reorg_pack8 = 308,
+packing = 309,
+packing_fp16_to_fp32 = 310,
+packing_fp32_to_fp16 = 311,
+packing_pack1to4 = 312,
+packing_pack1to4_fp16_to_fp32 = 313,
+packing_pack1to4_fp32_to_fp16 = 314,
+packing_pack1to8 = 315,
+packing_pack1to8_fp16_to_fp32 = 316,
+packing_pack1to8_fp32_to_fp16 = 317,
+packing_pack4 = 318,
+packing_pack4_fp16_to_fp32 = 319,
+packing_pack4_fp32_to_fp16 = 320,
+packing_pack4to1 = 321,
+packing_pack4to1_fp16_to_fp32 = 322,
+packing_pack4to1_fp32_to_fp16 = 323,
+packing_pack4to8 = 324,
+packing_pack4to8_fp16_to_fp32 = 325,
+packing_pack4to8_fp32_to_fp16 = 326,
+packing_pack8 = 327,
+packing_pack8_fp16_to_fp32 = 328,
+packing_pack8_fp32_to_fp16 = 329,
+packing_pack8to1 = 330,
+packing_pack8to1_fp16_to_fp32 = 331,
+packing_pack8to1_fp32_to_fp16 = 332,
+packing_pack8to4 = 333,
+packing_pack8to4_fp16_to_fp32 = 334,
+packing_pack8to4_fp32_to_fp16 = 335,
+cast_fp16_to_fp32 = 336,
+cast_fp16_to_fp32_pack4 = 337,
+cast_fp16_to_fp32_pack8 = 338,
+cast_fp32_to_fp16 = 339,
+cast_fp32_to_fp16_pack4 = 340,
+cast_fp32_to_fp16_pack8 = 341,
+hardsigmoid = 342,
+hardsigmoid_pack4 = 343,
+hardsigmoid_pack8 = 344,
+hardswish = 345,
+hardswish_pack4 = 346,
+hardswish_pack8 = 347,
+pixelshuffle = 348,
+pixelshuffle_pack4 = 349,
+pixelshuffle_pack4to1 = 350,
+pixelshuffle_pack8 = 351,
+pixelshuffle_pack8to1 = 352,
+pixelshuffle_pack8to4 = 353,
+deepcopy = 354,
+deepcopy_pack4 = 355,
+deepcopy_pack8 = 356,
+mish = 357,
+mish_pack4 = 358,
+mish_pack8 = 359,
+swish = 360,
+swish_pack4 = 361,
+swish_pack8 = 362,
+convert_ycbcr = 363,
+vulkan_activation = 364,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // the enum sits in a same-named namespace, so enumerators are written LayerType::Convolution etc.
+enum LayerType
+{
+#include "layer_type_enum.h" // cmake-generated enumerator list, textually included (AbsVal = 0 ... GridSample = 97 in this build)
+    CustomBit = (1 << 8), // 256, above every generated index; NOTE(review): presumably flags custom layer indexes - confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..581d589
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/layer_type_enum.h
@@ -0,0 +1,103 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the three dimension matrix
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy
+    Mat(const Mat& m);
+    // external vec
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif //__loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16,
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+#if NCNN_VULKAN
+
+// the buffer-backed matrix for vulkan: holds a VkBufferMemory* plus shape (w, h, d, c) and element layout
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec (w)
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (w) with an explicit elempack
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (w, h) with an explicit elempack
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (w, h, c) with an explicit elempack
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (w, h, d, c) with an explicit elempack
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec (w) over a caller-supplied VkBufferMemory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image (w, h) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim (w, h, c) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube (w, h, d, c) over a caller-supplied VkBufferMemory
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec (w) with an explicit elempack
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image (w, h) with an explicit elempack
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim (w, h, c) with an explicit elempack
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube (w, h, d, c) with an explicit elempack
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec (w)
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image (w, h)
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim (w, h, c)
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube (w, h, d, c)
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec (w) with an explicit elempack
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image (w, h) with an explicit elempack
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim (w, h, c) with an explicit elempack
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube (w, h, d, c) with an explicit elempack
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like the given Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like the given VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like the given VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped access, either wrapped as a Mat or as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only, returned as a Mat
+    Mat shape() const;
+
+    // low-level reference: the VkBuffer handle plus the offset and capacity accessors
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+class NCNN_EXPORT VkImageMat
+{
+public:
+    // empty
+    VkImageMat();
+    // vec (w)
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (w) with an explicit elempack
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (w, h) with an explicit elempack
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (w, h, c) with an explicit elempack
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (w, h, d, c) with an explicit elempack
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec (w) over a caller-supplied VkImageMemory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image (w, h) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim (w, h, c) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube (w, h, d, c) over a caller-supplied VkImageMemory
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec (w) with an explicit elempack
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image (w, h) with an explicit elempack
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim (w, h, c) with an explicit elempack
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube (w, h, d, c) with an explicit elempack
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec (w)
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image (w, h)
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim (w, h, c)
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube (w, h, d, c)
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec (w) with an explicit elempack
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image (w, h) with an explicit elempack
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim (w, h, c) with an explicit elempack
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube (w, h, d, c) with an explicit elempack
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like the given Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like the given VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like the given VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped access, either wrapped as a Mat or as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only, returned as a Mat
+    Mat shape() const;
+
+    // low-level reference: the VkImage handle and its VkImageView
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer, through its dedicated image allocator
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants: one slot viewed through different members
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;   // the slot viewed as a signed integer
+    float f; // the same slot viewed as a float
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameter
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation (1-8, pictured below), e.g. 6 means rotating from orientation 6 to orientation 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameter
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // 16 : 16 split, keep only the upper half of the float bit pattern (plain truncation, no rounding)
+    union
+    {
+        float f;
+        unsigned int u;
+    } bits;
+    bits.f = value;
+    return (unsigned short)(bits.u >> 16);
+}
+// convert brain half to float
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16, widen to unsigned first so the top bit never shifts into the sign bit of a promoted int
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+enum BorderType
+{
+    BORDER_CONSTANT = 0,       // presumably fills the border with the caller's v value - confirm in copy_make_border
+    BORDER_REPLICATE = 1,      // presumably repeats the edge element - confirm in copy_make_border
+    BORDER_REFLECT = 2,        // presumably mirrors elements across the edge - confirm in copy_make_border
+    BORDER_TRANSPARENT = -233, // same -233 value the warpaffine_bilinear_* comments give for a transparent border
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat() // empty: every member starts at 0 and the body does nothing further
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator); // vec: start from the all-zero (empty) state, then delegate to create()
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator); // image: start from the all-zero (empty) state, then delegate to create()
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // dim: start from the all-zero (empty) state, then delegate to create()
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // cube: start from the all-zero (empty) state, then delegate to create()
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // packed vec: start from the all-zero (empty) state, then delegate to create()
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m) // copy constructor: copies pointers and shape fields, not the element buffer
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref(); // bumps the shared counter; a no-op when refcount is null
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator) // wraps caller-provided memory, 1-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // refcount stays null, so release() will never free _data
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) // wraps caller-provided memory, 2-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widened to size_t before the multiply
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // wraps caller-provided memory, 3-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements; alignSize presumably rounds the byte size up to a multiple of 16 - confirm
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // wraps caller-provided memory, 4-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // same stride computation with depth included
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 1-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // refcount stays null, so release() will never free _data
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 2-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widened to size_t before the multiply
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 3-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements after alignSize(..., 16) on the byte size
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 4-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // same stride computation with depth included
+}
+
+NCNN_FORCEINLINE Mat::~Mat() // destructor: drops this object's reference
+{
+    release(); // frees the buffer only if this was the last counted reference
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v) // writes _v into total() consecutive floats starting at data
+{
+    int size = (int)total(); // NOTE(review): size_t is narrowed to int here
+    float* ptr = (float*)data;
+
+    int i = 0;
+#if __ARM_NEON
+    float32x4_t _c = vdupq_n_f32(_v); // _v replicated into all four lanes
+    for (; i + 3 < size; i += 4) // vector path: four floats per store
+    {
+        vst1q_f32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++) // scalar tail (the whole range when NEON is not compiled in)
+    {
+        *ptr++ = _v;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v) // writes _v into total() consecutive ints starting at data
+{
+    int size = (int)total(); // NOTE(review): size_t is narrowed to int here
+    int* ptr = (int*)data;
+
+    int i = 0;
+#if __ARM_NEON
+    int32x4_t _c = vdupq_n_s32(_v); // _v replicated into all four lanes
+    for (; i + 3 < size; i += 4) // vector path: four ints per store
+    {
+        vst1q_s32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++) // scalar tail (the whole range when NEON is not compiled in)
+    {
+        *ptr++ = _v;
+    }
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v) // stores the whole vector total() times, i.e. 4 floats per counted element
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f32(ptr, _v);
+        ptr += 4; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v) // stores 4 x 16-bit lanes per counted element
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_u16(ptr, _v);
+        ptr += 4; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v) // stores 4 x 32-bit ints per counted element
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v);
+        ptr += 4; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1) // stores _v0 then _v1 (8 ints) per counted element
+{
+    int size = (int)total();
+    int* ptr = (int*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v0); // low four ints
+        vst1q_s32(ptr + 4, _v1); // high four ints
+        ptr += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v) // stores 4 half-precision lanes per counted element
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1_f16(ptr, _v);
+        ptr += 4; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v) // stores 8 half-precision lanes per counted element
+{
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vst1q_f16(ptr, _v);
+        ptr += 8; // advance by one full vector
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v) // stores 16 floats per counted element, unaligned store
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm512_storeu_ps(ptr, _v);
+        ptr += 16; // advance by one full vector
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i) // stores 8 floats per counted element; _i is unused
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i; // silences the unused-parameter warning
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm256_storeu_ps(ptr, _v);
+        ptr += 8; // advance by one full vector
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // stores 4 floats per counted element, unaligned store
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_storeu_ps(ptr, _v);
+        ptr += 4; // advance by one full vector
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v) // stores the 128-bit integer vector (8 x 16-bit slots) per counted element
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        _mm_storeu_si128((__m128i*)ptr, _v); // unaligned store, like the float overloads: data may be a view or external buffer that is not 16-byte aligned
+        ptr += 8; // advance by one full vector
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v) // MIPS MSA: stores 4 floats per counted element
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __msa_st_w((v4i32)_v, ptr, 0); // the float vector is cast to v4i32 for the word-store intrinsic
+        ptr += 4; // advance by one full vector
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // LoongArch LSX: stores 4 floats per counted element
+{
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        __lsx_vst(_v, ptr, 0); // store at ptr with offset 0
+        ptr += 4; // advance by one full vector
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) // RISC-V vector: stores packn floats per counted element
+{
+    const int packn = cpu_riscv_vlenb() / 4; // presumably vector length in bytes / 4 = number of 32-bit lanes - confirm
+    const size_t vl = vsetvl_e32m1(packn);
+
+    int size = (int)total();
+    float* ptr = (float*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse32_v_f32m1(ptr, _v, vl);
+        ptr += packn; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) // RISC-V vector: stores packn 16-bit values per counted element
+{
+    const int packn = cpu_riscv_vlenb() / 2; // presumably the number of 16-bit lanes - confirm
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_u16m1(ptr, _v, vl);
+        ptr += packn; // advance by one full vector
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) // RISC-V vector: stores packn bytes per counted element
+{
+    const int packn = cpu_riscv_vlenb() / 1; // presumably the number of 8-bit lanes - confirm
+    const size_t vl = vsetvl_e8m1(packn);
+
+    int size = (int)total();
+    signed char* ptr = (signed char*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse8_v_i8m1(ptr, _v, vl);
+        ptr += packn; // advance by one full vector
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) // RISC-V vector + zfh: stores packn half floats per counted element
+{
+    const int packn = cpu_riscv_vlenb() / 2; // presumably the number of 16-bit lanes - confirm
+    const size_t vl = vsetvl_e16m1(packn);
+
+    int size = (int)total();
+    __fp16* ptr = (__fp16*)data;
+    for (int i = 0; i < size; i++)
+    {
+        vse16_v_f16m1(ptr, _v, vl);
+        ptr += packn; // advance by one full vector
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v) // generic fallback: treats data as T[] and assigns _v to total() entries
+{
+    int size = (int)total(); // NOTE(review): size_t is narrowed to int here
+    T* ptr = (T*)data;
+    for (int i = 0; i < size; i++)
+    {
+        ptr[i] = _v;
+    }
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m) // assignment: copies pointers and shape fields, not the element buffer
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take a reference on m's buffer first ...
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // ... then drop whatever this object held and zero its fields
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref() // adds one to the shared counter
+{
+    if (refcount) // null refcount (empty or wrapping external memory): nothing to count
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release() // drops this object's reference and resets its fields to zero
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably NCNN_XADD returns the pre-decrement value, so 1 means last reference - confirm
+    {
+        if (allocator)
+            allocator->fastFree(data); // buffer goes back to the custom allocator
+        else
+            fastFree(data); // no allocator: free function fastFree (declared elsewhere)
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0; // the allocator member is the one field left untouched
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const // true when there is no data pointer or no elements
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t Mat::total() const // element count including per-channel stride padding
+{
+    return cstep * c; // channel stride times channel count
+}
+
+NCNN_FORCEINLINE int Mat::elembits() const // bits per packed lane: elemsize * 8 / elempack
+{
+    return elempack ? static_cast<int>(elemsize * 8) / elempack : 0; // 0 when elempack is 0, which avoids dividing by zero
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const // returns a Mat with a null data pointer that carries only the dimensions
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0); // elempack is multiplied into the outermost dimension
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat(); // any other dims value: default-constructed Mat
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c) // view of channel _c with no refcount (does not own the memory)
+{
+    Mat m(w, h, d, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator); // starts cstep * _c elements into data; d lands in the wrapper's c field
+    m.dims = dims - 1; // the view has one dimension fewer than the parent
+    if (dims == 4)
+        m.cstep = (size_t)w * h; // replaces the stride computed by the wrapping constructor with plain w * h
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const // const overload, same computation
+{
+    Mat m(w, h, d, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims - 1;
+    if (dims == 4)
+        m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z) // w x h view with no refcount, starting w * h * z elements into data
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const // const overload, same computation
+{
+    return Mat(w, h, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y) // pointer to row y as float; rows are w elements of elemsize bytes each
+{
+    return (float*)((unsigned char*)data + (size_t)w * y * elemsize); // byte offset computed in size_t
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const // const overload
+{
+    return (const float*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y) // same byte offset, result cast to T*
+{
+    return (T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const // const overload of the templated accessor
+{
+    return (const T*)((unsigned char*)data + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels) // view (no refcount) of `channels` channels starting at channel _c
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator); // the 4-D wrapper sets dims to 4 ...
+    m.dims = dims; // ... so restore the parent's actual dims
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const // const overload, same computation
+{
+    Mat m(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    m.dims = dims;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths) // view (no refcount) of `depths` slices starting at slice z
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator); // 3-D wrapper: depths lands in the c field
+    m.cstep = (size_t)w * h; // replaces the stride computed by the wrapping constructor with plain w * h
+    return m;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const // const overload, same computation
+{
+    Mat m(w, h, depths, (unsigned char*)data + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    m.cstep = (size_t)w * h;
+    return m;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows) // w x rows view with no refcount, starting at row y
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const // const overload, same computation
+{
+    return Mat(w, rows, (unsigned char*)data + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n) // 1-D view (no refcount) of n elements starting at element x
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const // const overload, same computation
+{
+    return Mat(n, (unsigned char*)data + x * elemsize, elemsize, elempack, allocator);
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*() // conversion to a raw pointer: data cast to T*, no type or size check
+{
+    return (T*)data;
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const // const overload of the raw-pointer conversion
+{
+    return (const T*)data;
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i) // i-th float counted from data; no bounds check
+{
+    return ((float*)data)[i];
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const // const overload
+{
+    return ((const float*)data)[i];
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat() // default constructor: every member zeroed, so empty() is true
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w only
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator); // members start zeroed; create() (declared elsewhere) does the actual setup
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h, c
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h, d, c
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w only, with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h, c with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h, d, c with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m) // copy constructor: copies pointers and shape fields, not the buffer
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // bumps the shared counter; a no-op when refcount is null
+
+    cstep = m.cstep; // assigned in the body rather than in the initializer list
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing buffer memory, 1-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // refcount stays null, so release() will never free _data
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing buffer memory, 2-D, elempack 1; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying, as the matching Mat constructor does, so w * h cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing buffer memory, 3-D, elempack 1; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements; widen first so w * h is not evaluated in int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing buffer memory, 4-D, elempack 1; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // channel stride in elements; widen first so w * h * d is not evaluated in int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // wraps existing buffer memory, 1-D, explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // refcount stays null, so release() will never free _data
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // wraps existing buffer memory, 2-D, explicit elempack; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying, as the matching Mat constructor does, so w * h cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // wraps existing buffer memory, 3-D, explicit elempack; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements; widen first so w * h is not evaluated in int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // wraps existing buffer memory, 4-D, explicit elempack; refcount stays null so release() never frees _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // channel stride in elements; widen first so w * h * d is not evaluated in int
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat() // destructor: drops this object's reference
+{
+    release(); // frees the buffer only if this was the last counted reference
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m) // assignment: copies pointers and shape fields, not the buffer
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take a reference on m's buffer first ...
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // ... then drop whatever this object held and zero its fields
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const // CPU-side Mat over the mapped buffer, or an empty Mat if it cannot be mapped
+{
+    if (!allocator->mappable) // NOTE(review): allocator is dereferenced without a null check
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0); // trailing 0: the returned Mat has no allocator and no refcount
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // any other dims value
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const // host address of this VkMat's data, or null if the allocator is not mappable
+{
+    if (!allocator->mappable) // NOTE(review): allocator is dereferenced without a null check
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset; // base mapping plus this buffer's byte offset
+}
+
+NCNN_FORCEINLINE void VkMat::addref() // adds one to the shared counter
+{
+    if (refcount) // null refcount: nothing to count
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release() // drops this object's reference and resets its fields to zero
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably NCNN_XADD returns the pre-decrement value, so 1 means last reference - confirm
+    {
+        if (allocator && data) // unlike Mat::release there is no fallback free without an allocator
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0; // the allocator member is the one field left untouched
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const // true when there is no buffer or no elements
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const // element count including per-channel stride padding
+{
+    return cstep * c; // channel stride times channel count
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const // bits per packed lane: elemsize * 8 / elempack
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0; // 0 when elempack is 0, which avoids dividing by zero
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const // returns a CPU Mat with a null data pointer that carries only the dimensions
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0); // elempack is multiplied into the outermost dimension
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat(); // any other dims value: default-constructed Mat
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const // returns the buffer field of data
+{
+    return data->buffer; // NOTE(review): data is dereferenced without a null check
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const // returns the offset field of data
+{
+    return data->offset; // NOTE(review): data is dereferenced without a null check
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const // returns the capacity field of data
+{
+    return data->capacity; // NOTE(review): data is dereferenced without a null check
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat() // default constructor: every listed member zeroed, so empty() is true
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w only
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator); // members start zeroed; create() (declared elsewhere) does the actual setup
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h, c
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // allocating overload: w, h, d, c
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w only, with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h, c with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // w, h, d, c with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // forwards all arguments unchanged
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m) // copy constructor: copies pointers and shape fields, not the image
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // bumps the shared counter; a no-op when refcount is null
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing image memory, 1-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+} // refcount stays null, so release() will never free _data
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing image memory, 2-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing image memory, 3-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps existing image memory, 4-D, elempack 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D wrapper with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat() // destructor: drops this object's reference
+{
+    release(); // frees the image only if this was the last counted reference
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m) // assignment: copies pointers and shape fields, not the image
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take a reference on m's image first ...
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // ... then drop whatever this object held and zero its fields
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const // CPU-side Mat over the mapped image memory, or an empty Mat if it is not mapped
+{
+    if (!allocator->mappable || !data->mapped_ptr) // NOTE(review): allocator and data are dereferenced without null checks
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0); // trailing 0: the returned Mat has no allocator and no refcount
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // any other dims value
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const // host address of this image's memory, or null if it is not mapped
+{
+    if (!allocator->mappable || !data->mapped_ptr) // NOTE(review): allocator and data are dereferenced without null checks
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // base mapping plus this image's bind offset
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref() // adds one to the shared counter
+{
+    if (refcount) // null refcount: nothing to count
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release() // drops this object's reference and resets its fields to zero
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably NCNN_XADD returns the pre-decrement value, so 1 means last reference - confirm
+    {
+        if (allocator && data) // freed only through the allocator; no fallback path
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0; // the allocator member is the one field left untouched
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const // true when there is no image memory or no elements
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const // element count: product of all four dimensions
+{
+    return (size_t)w * h * d * c; // multiply in size_t; the plain int product could overflow before being widened to the return type
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const // bits per packed lane: elemsize * 8 / elempack
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0; // 0 when elempack is 0, which avoids dividing by zero
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const // returns a CPU Mat with a null data pointer that carries only the dimensions
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0); // elempack is multiplied into the outermost dimension
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat(); // any other dims value: default-constructed Mat
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const // returns the image field of data
+{
+    return data->image; // NOTE(review): data is dereferenced without a null check
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const // returns the imageview field of data
+{
+    return data->imageview; // NOTE(review): data is dereferenced without a null check
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/modelbin.h
new file mode 100644
index 0000000..15d2b9c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin // abstract weight loader: only the 1-D load() is pure virtual
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // element type
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load vec
+    virtual Mat load(int w, int type) const = 0; // subclasses must implement this overload
+    // load image
+    virtual Mat load(int w, int h, int type) const; // overridable; implemented outside this header
+    // load dim
+    virtual Mat load(int w, int h, int c, int type) const; // overridable; implemented outside this header
+    // load cube
+    virtual Mat load(int w, int h, int d, int c, int type) const; // overridable; implemented outside this header
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin // ModelBin constructed from a DataReader
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // implements the pure-virtual 1-D load
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // copy constructor is private, so outside code cannot copy
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&); // copy assignment is private as well
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // opaque pointer to the forward-declared private implementation class
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin // ModelBin constructed from an array of Mat
+{
+public:
+    // construct from weight blob array
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // implements the pure-virtual 1-D load
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // copy constructor is private, so outside code cannot copy
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&); // copy assignment is private as well
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // opaque pointer to the forward-declared private implementation class
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library: mark the symbol with default visibility */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library: same definition as the building case above */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/net.h
new file mode 100644
index 0000000..9407042
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/net.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // inference options; they can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle; ownership is not transferred
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the file loaders above, which return 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must be kept alive while it is in use
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenient load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenient load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenient load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output blob indexes, and their names when NCNN_STRING is enabled
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor; // Extractor may use the protected lookup helpers below
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+
+private: // copy constructor and assignment are declared private, so outside code cannot copy
+    Net(const Net&);
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will override the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name (VkMat variant)
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name (VkMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name (VkImageMat variant)
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name (VkImageMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index (VkMat variant)
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index (VkMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index (VkImageMat variant)
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index (VkImageMat variant, takes the VkCompute command object)
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // grants Net::create_extractor() access to the protected constructor
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/option.h
new file mode 100644
index 0000000..3fda808
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/option.h
@@ -0,0 +1,153 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache (vulkan)
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference (fp16/int8 packed, storage and arithmetic)
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8;
+
+    // subgroup options
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // reserved field; no meaning is documented in this header
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator;
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution (one switch per winograd variant)
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    bool use_reserved_6; // reserved fields 6..11; no meaning is documented in this header
+    bool use_reserved_7;
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict
+{
+public:
+    // construct an empty dictionary
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get the type of parameter id
+    int type(int id) const;
+
+    // get int parameter id; def is a caller-supplied fallback (presumably returned when id is unset - confirm)
+    int get(int id, int def) const;
+    // get float parameter id, same def convention
+    float get(int id, float def) const;
+    // get array parameter id, same def convention
+    Mat get(int id, const Mat& def) const;
+
+    // set int parameter id
+    void set(int id, int i);
+    // set float parameter id
+    void set(int id, float f);
+    // set array parameter id
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net is allowed to call the protected clear/load functions below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // declared only when NCNN_VULKAN is enabled
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public: // local work-group size configuration and pipeline creation
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // create from caller-supplied spv data
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // create from a shader type index plus options
+
+public: // read-only accessors for the Vulkan handles and shader info
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected: // setters mirroring the accessors above, available to subclasses only
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // public data member; presumably the device passed to the constructor - confirm in pipeline.cpp
+
+private: // copy constructor and assignment are declared private, so outside code cannot copy
+    Pipeline(const Pipeline&);
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // private inheritance: Pipeline's members are not part of this class's public interface
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with an explicit target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute gets access to the privately inherited Pipeline members
+
+protected: // creation helpers
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public: // public data members; presumably filled in by create() - confirm in pipeline.cpp
+    int type_to;
+    int rotate_from;
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // declared only when NCNN_VULKAN is enabled
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // overload keyed by caller-supplied spv data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const; // the non-const pointer/reference parameters are presumably outputs - confirm
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // overload keyed by shader type index plus options
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected: // helpers available to subclasses
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private: // copy constructor and assignment are declared private, so outside code cannot copy
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/platform.h
new file mode 100644
index 0000000..89f3243
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/platform.h
@@ -0,0 +1,285 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0
+#define NCNN_THREADS 1
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 1
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_AVX 1
+#define NCNN_XOP 1
+#define NCNN_FMA 1
+#define NCNN_F16C 1
+#define NCNN_AVX2 1
+#define NCNN_AVXVNNI 1
+#define NCNN_AVX512 1
+#define NCNN_AVX512VNNI 1
+#define NCNN_AVX512BF16 1
+#define NCNN_AVX512FP16 1
+#define NCNN_VFPV4 0
+#if __aarch64__
+#define NCNN_ARM82 0
+#define NCNN_ARM82DOT 0
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#endif // __aarch64__
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1
+
+#define NCNN_VERSION_STRING "1.0.20221128"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Windows (non-MinGW) variant, backed by an SRWLOCK used in exclusive mode
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // the destructor releases nothing
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() needs the raw srwlock
+    // NOTE SRWLock is available from windows vista
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Windows (non-MinGW) variant, backed by CONDITION_VARIABLE
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // the destructor releases nothing
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // waits without timeout; the return value is ignored
+    void broadcast() { WakeAllConditionVariable(&condvar); }
+    void signal() { WakeConditionVariable(&condvar); }
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // forward declaration of the trampoline defined as a friend inside Thread
+class NCNN_EXPORT Thread // Windows (non-MinGW) variant, started with _beginthreadex
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // the thread starts immediately; the handle is not checked for failure
+    ~Thread() {} // does not wait for the thread or close the handle; join() does both
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); }
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args; // args is the Thread* passed as `this` to _beginthreadex
+        t->_start(t->_args); // the user function's return value is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Windows (non-MinGW) variant, backed by a TLS index
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the result of TlsAlloc is not checked
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread variant; initialised with default attributes (second argument 0)
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // return value is not checked
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() needs the raw pthread_mutex_t
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread variant; initialised with default attributes (second argument 0)
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // return value is not checked
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // waits without timeout on the caller's Mutex
+    void broadcast() { pthread_cond_broadcast(&cond); }
+    void signal() { pthread_cond_signal(&cond); }
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread variant; the thread is created in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // default attributes; the return value of pthread_create is not checked
+    ~Thread() {} // neither joins nor detaches; callers must call join() themselves
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded (second argument 0)
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread variant, backed by a pthread key
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // no destructor callback is registered (second argument 0); return value is not checked
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // stub used when NCNN_THREADS is 0: every member is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // no-op
+    void unlock() {} // no-op
+};
+
+class NCNN_EXPORT ConditionVariable // stub used when NCNN_THREADS is 0: every member is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately without waiting
+    void broadcast() {} // no-op
+    void signal() {} // no-op
+};
+
+class NCNN_EXPORT Thread // stub used when NCNN_THREADS is 0
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // no thread is created and start is never called
+    ~Thread() {}
+    void join() {} // no-op
+};
+
+class NCNN_EXPORT ThreadLocalStorage // stub used when NCNN_THREADS is 0: stores one plain pointer in the object
+{
+public:
+    ThreadLocalStorage() { data = 0; } // starts out null
+    ~ThreadLocalStorage() {} // the stored pointer is not freed
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scoped lock: locks the Mutex in the constructor and unlocks it in the destructor
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } // single-argument constructor is not marked explicit
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // reference only: the guarded Mutex must outlive the guard
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h> // NOTE(review): the macro below also calls fprintf(stderr, ...), but <stdio.h> is only included in the #else branch of this header
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <stdio.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else // !NCNN_STDIO: NCNN_LOGE expands to nothing
+#define NCNN_LOGE(...)
+#endif // NCNN_STDIO
+
+
+#if NCNN_FORCE_INLINE // pick the compiler-specific always-inline spelling
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__CLANG__) // NOTE(review): clang predefines __clang__ (lowercase) and also __GNUC__, so this branch looks unreachable - confirm
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else // unknown compiler: fall back to plain inline
+    #define NCNN_FORCEINLINE inline
+#endif
+#else // NCNN_FORCE_INLINE disabled: plain inline
+    #define NCNN_FORCEINLINE inline
+#endif // NCNN_FORCE_INLINE
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..55ede15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleocv.h
@@ -0,0 +1,501 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) // generic case: plain conversion to _Tp, no clamping
+{
+    return _Tp(v);
+}
+template<>
+inline uchar saturate_cast<uchar>(int v) // uchar case: clamps v into [0, UCHAR_MAX]
+{
+    return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); // the unsigned compare rejects negatives and values above UCHAR_MAX in one test
+}
+
+template<typename _Tp>
+struct Scalar_ // four-component value; components not given to a constructor are set to 0
+{
+    Scalar_()
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // single component, implicit conversion from _Tp is allowed
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2) // three components, the fourth is 0
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    const _Tp operator[](const int i) const // returns a copy of component i; i is not range-checked
+    {
+        return v[i];
+    }
+
+    _Tp operator[](const int i) // also returns by value, so components cannot be assigned through []
+    {
+        return v[i];
+    }
+
+    _Tp v[4]; // component storage
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2D point with coordinates of type _Tp
+{
+    Point_()
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const // converts to another coordinate type through saturate_cast
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // width/height pair of type _Tp
+{
+    Size_()
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const // converts to another element type through saturate_cast
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as origin (x, y) plus width and height
+{
+    Rect_()
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent taken from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const // converts every field to another type through saturate_cast
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area of the rectangle, computed as width * height
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // in-place intersection of a with b
+{
+    _Tp x1 = std::max(a.x, b.x), y1 = std::max(a.y, b.y); // top-left corner of the overlap
+    a.width = std::min(a.x + a.width, b.x + b.width) - x1;
+    a.height = std::min(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    if (a.width <= 0 || a.height <= 0) // no overlap: collapse to the all-zero rectangle
+        a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // grows a in place to the bounding box of a and b
+{
+    _Tp x1 = std::min(a.x, b.x), y1 = std::min(a.y, b.y); // top-left corner of the bounding box
+    a.width = std::max(a.x + a.width, b.x + b.width) - x1;
+    a.height = std::max(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // intersection as a new rectangle; a and b are left unchanged
+{
+    Rect_<_Tp> c = a;
+    return c &= b;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // bounding box of a and b as a new rectangle
+{
+    Rect_<_Tp> c = a;
+    return c |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4
+
+struct NCNN_EXPORT Mat // byte image buffer of rows x cols pixels with c bytes per pixel, shared through a reference count
+{
+    Mat()
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // allocates an owned buffer; flags is stored as c
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy: shares m's buffer and increments its reference count, no pixel data is copied
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-provided memory; refcount stays null so it is never freed here
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: drops the current buffer and shares m's buffer
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount) // take the new reference before releasing the old one
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fills every pixel with the first c components of s
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // releases the current buffer, then allocates rows * cols * flags bytes
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize); // the counter is stored right after the pixel bytes
+            *refcount = 1;
+        }
+    }
+
+    void release() // drops this reference, frees the buffer when it was the last one, and resets all fields
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a newly allocated buffer
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const // returns the same value as channels()
+    {
+        return c;
+    }
+
+    size_t total() const // buffer size in bytes
+    {
+        return (size_t)cols * rows * c; // widen first: the product was formerly computed in int and could overflow
+    }
+
+    const uchar* ptr(int y) const // start of row y; y is not range-checked
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // NOTE(review): the offset y * cols * c is applied in units of _Tp, not bytes - confirm for _Tp wider than uchar
+    {
+        return (const _Tp*)data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)data + y * cols * c;
+    }
+
+    // roi: copies the pixels inside roi into a new Mat; roi is not clipped against the image bounds
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c); // one row of the region per iteration
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c; // bytes per pixel; also returned by channels() and type()
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T>
+const T& max(const T& a, const T& b) // returns b when a < b, otherwise a (a is returned on ties)
+{
+    return (a < b) ? b : a;
+}
+
+template<typename T>
+const T& min(const T& a, const T& b) // returns b when a > b, otherwise a; uses operator> of T
+{
+    return (a > b) ? b : a;
+}
+
+template<typename T>
+void swap(T& a, T& b) // exchanges a and b through one copy construction and two copy assignments
+{
+    T temp(a);
+    a = b;
+    b = temp;
+}
+
+template<typename T1, typename T2>
+struct pair // two values of possibly different types, exposed as first and second
+{
+    pair()
+        : first(), second() // value-initializes both members
+    {
+    }
+    pair(const T1& t1, const T2& t2)
+        : first(t1), second(t2)
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y) // equal when both members are equal
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y) // lexicographic: first decides, second breaks ties
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) // negation of operator==
+{
+    return !(x == y);
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) // defined through operator< with the arguments swapped
+{
+    return y < x;
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y) // defined as !(y < x)
+{
+    return !(y < x);
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y) // defined as !(x < y)
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // builds a pair whose member types are deduced from the arguments
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T>
+struct node // doubly linked node used by list<T>
+{
+    node* prev_; // previous node, 0 when there is none
+    node* next_; // next node, 0 when there is none
+    T data_;
+
+    node()
+        : prev_(0), next_(0), data_() // unlinked node holding a value-initialized T
+    {
+    }
+    node(const T& t)
+        : prev_(0), next_(0), data_(t) // unlinked node holding a copy of t
+    {
+    }
+};
+
+template<typename T>
+struct iter_list // bidirectional iterator over list<T>; a thin wrapper around a node pointer
+{
+    iter_list()
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list()
+    {
+    }
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    T& operator*() // value stored in the current node; curr_ must not be null
+    {
+        return curr_->data_;
+    }
+    T* operator->()
+    {
+        return &(curr_->data_);
+    }
+
+    bool operator==(const iter_list& i) const // const-qualified so const iterators can be compared too
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i) const // const-qualified for the same reason as operator==
+    {
+        return curr_ != i.curr_;
+    }
+
+    iter_list& operator++() // pre-increment: steps to the next node
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--() // pre-decrement: steps to the previous node
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_; // node the iterator currently refers to
+};
+
+template<typename T>
+struct list // minimal doubly linked list; tail_ is a sentinel node that end() refers to
+{
+    typedef iter_list<T> iterator;
+
+    list()
+    {
+        head_ = new node<T>(); // the sentinel; head_ and tail_ both point to it while the list is empty
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // after clear() head_ is the sentinel again
+    }
+    list(const list& l) // deep copy: every element of l is appended to a fresh list
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+
+    list& operator=(const list& l) // replaces the contents with copies of l's elements
+    {
+        if (this == &l)
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+
+    void clear() // removes all elements; the sentinel node is kept
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+
+    void pop_front() // removes the first element; does nothing on an empty list
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_;
+            delete head_->prev_; // the old first node
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const // iterator to the first element; equals end() when the list is empty
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // iterator to the sentinel node
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t in front of the sentinel
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // first element: becomes the new head, linked to the sentinel
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t);
+            temp->prev_ = tail_->prev_; // splice between the current last element and the sentinel
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+
+    iter_list<T> erase(iter_list<T> pos) // removes the element at pos and returns an iterator to the one after it; end() is returned unchanged
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos;
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // step off the node before unlinking it
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos; // now refers to the node that followed the erased one
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_; // first element, or the sentinel when the list is empty
+    node<T>* tail_; // sentinel node, allocated in the constructor
+    size_t count_; // number of elements, the sentinel is not counted
+};
+
+template<typename T>
+struct greater // comparison functor wrapping operator>
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // comparison functor wrapping operator<
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y);
+    }
+};
+
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp) // orders [first, middle) by comp using elements from [first, last)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i) // one full pass per position in [first, middle)
+    {
+        // bubble sort: each pass walks from the back and moves comp-preferred elements toward the front
+        for (RandomAccessIter j = last - 1; j > first; --j)
+        {
+            if (comp(*j, *(j - 1)))
+            {
+                swap(*j, *(j - 1));
+            }
+        }
+    }
+}
+
+template<typename T>
+struct vector // minimal dynamic array: storage is a raw char array, elements are built with placement new
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T()) // new_size copies of value
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v) // deep copy: default-fills to v.size(), then assigns element by element
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v) // destroys the current elements, then copies v's elements
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0);
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grows with copies of value, or destroys the elements past new_size
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value);
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroys every element and frees the storage
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // storage was allocated as char[] in try_alloc
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const // not range-checked; returns a mutable reference even on a const vector
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const // one past the last element
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // inserts copies of [b, e) in front of pos
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            //the same vector: read from a temporary copy, because try_alloc below may replace the storage
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin(); // keep pos as an index so it survives a reallocation
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); // open the gap by moving the tail bytewise
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b;
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos) // destroys *pos and shifts the following elements down by one; returns pos
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_; // element storage, 0 when nothing is allocated
+    size_t size_; // number of constructed elements
+    size_t capacity_; // number of elements the storage can hold
+    void try_alloc(size_t new_size) // ensures room for new_size elements, reallocating when the test below holds
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // new storage starts zero-filled
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_); // existing elements are relocated bytewise, no copy constructors run
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char> // minimal string stored as a vector<char>
+{
+    string()
+    {
+    }
+    string(const char* str) // copies the characters of str without its terminator
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) memcpy(data_, str, len); // resize(0) allocates nothing, so there is no buffer to copy into for ""
+    }
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : ""; // a string that never allocated has data_ == 0; return "" instead of null
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0; // go through c_str() so strcmp never receives a null pointer
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends the characters of str1
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2) // concatenation: a copy of str1 with str2's characters appended
+{
+    string str(str1);
+    str.insert(str.end(), str2.begin(), str2.end());
+    return str;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..e7a7e8e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds with old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass); // function-pointer type; writes the new handle to *pRenderPass
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo); // function-pointer type
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo); // function-pointer type
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo); // function-pointer type
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95 // compat definitions, compiled only against Vulkan headers older than version 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType; // structure type tag
+    void* pNext;           // extension chain pointer
+    VkBool32 shaderFloat16; // feature flag for 16-bit float support in shaders
+    VkBool32 shaderInt8;    // feature flag for 8-bit integer support in shaders
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97 // compat definitions, compiled only against Vulkan headers older than version 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType; // structure type tag
+    void* pNext;           // extension chain pointer
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS]; // one budget entry per memory heap slot
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];  // one usage entry per memory heap slot
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // compat definitions, compiled only against Vulkan headers older than version 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV // element types used by the AType..DType fields of VkCooperativeMatrixPropertiesNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV, // first value (0)
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,    // last value (10)
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1), // evaluates to 11
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF // largest 32-bit signed value; not a real component type
+} VkComponentTypeNV;
+typedef enum VkScopeNV // note: the values are not contiguous, 4 is not defined here
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,     // first value (1)
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV, // last value (5)
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1), // evaluates to 5 although only four scopes are listed
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF // largest 32-bit signed value; not a real scope
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV // element type of the array filled through PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType; // structure type tag
+    void* pNext;           // extension chain pointer
+    uint32_t MSize; // matrix dimension M
+    uint32_t NSize; // matrix dimension N
+    uint32_t KSize; // matrix dimension K
+    VkComponentTypeNV AType; // component type of matrix A
+    VkComponentTypeNV BType; // component type of matrix B
+    VkComponentTypeNV CType; // component type of matrix C
+    VkComponentTypeNV DType; // component type of matrix D
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType; // structure type tag
+    void* pNext;           // extension chain pointer
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType; // structure type tag
+    void* pNext;           // extension chain pointer
+    VkShaderStageFlags cooperativeMatrixSupportedStages; // bitmask of shader stages
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties); // function-pointer type; count/array output pair
+#endif // VK_HEADER_VERSION < 101
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release": point it at the shared library under the import prefix.
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn ) # recorded so the including file can verify the library exists
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..53b9fae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake: export file that defines the imported target "ncnn".
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH) # matched by the cmake_policy(POP) on every exit path below
+cmake_policy(VERSION 2.8.3...3.23)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets) # every target already exists: nothing to do
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "") # only some targets exist: refuse to continue
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file: four parent steps from <prefix>/lib/cmake/ncnn/ncnn.cmake.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;Vulkan::Vulkan;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration (every ncnn-*.cmake file next to this one).
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..abb2dd6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON) # build-time switches recorded for this package; they drive the dependency lookups below
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN ON)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP) # not REQUIRED: a missing OpenMP package does not abort configuration here
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB) # never entered in this package because NCNN_SHARED_LIB is set to ON above
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "") # empty here, so the includes below would resolve to "/<name>.cmake" if this branch ever ran
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) # defines the imported target "ncnn"
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..2ae00de
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20221128
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// alignment in bytes of all the allocated buffers: 64 with NCNN_AVX512, 32 with NCNN_AVX, otherwise 16
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// some optimized kernels may read slightly past the end of a buffer inside a loop,
+// because it is common to interleave the next iteration's data load with arithmetic instructions;
+// allocating these extra bytes keeps such overreads safe from SEGV_ACCERR failures
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the next multiple of n bytes and returns the aligned pointer
+// ptr Pointer to align; it is returned unchanged when it is already a multiple of n
+// n Alignment size that must be a power of two (defaults to sizeof(_Tp))
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n); // add n-1, then clear the low bits with the mask -n
+}
+
+// Rounds a buffer size up to a multiple of n bytes
+// Returns the smallest number that is greater than or equal to sz and divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n; // add n-1, then clear the low bits with the mask -n
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // NCNN_MALLOC_ALIGN-aligned allocation; returns 0 on failure in the branches that check; release with fastFree()
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // NOTE(review): no NCNN_MALLOC_OVERREAD padding here, unlike the other branches - confirm this is intended
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0; // a non-zero result means failure; report it as a null pointer
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // room for the back-pointer, alignment slack and overread padding
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN); // skip one pointer slot, then align upwards
+    adata[-1] = udata; // keep the raw malloc pointer just below the aligned block so fastFree() can find it
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr) // releases a pointer obtained from fastMalloc(); a null pointer is ignored
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr); // pairs with _aligned_malloc in fastMalloc()
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr); // pairs with posix_memalign in fastMalloc()
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr); // pairs with memalign in fastMalloc()
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1]; // raw malloc pointer that fastMalloc() stored below the aligned block
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// NCNN_XADD(addr, delta): exchange-add for reference counters; adds delta to *addr and yields the value held before the add
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, not atomic
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: used when no compiler branch above matches
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta) // threads disabled: plain, non-atomic read-modify-write
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract allocator interface working on raw void* memory
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // pure virtual: allocate size bytes
+    virtual void fastFree(void* ptr) = 0;      // pure virtual: release a pointer
+};
+
+class PoolAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // overrides Allocator::fastMalloc
+    virtual void fastFree(void* ptr);      // overrides Allocator::fastFree
+
+private:
+    PoolAllocator(const PoolAllocator&);            // declared private: client code cannot copy-construct
+    PoolAllocator& operator=(const PoolAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    PoolAllocatorPrivate* const d; // pointer to the private implementation object; the pointer itself is const
+};
+
+class UnlockedPoolAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator // same public interface as PoolAllocator; presumably without internal locking - confirm in the implementation
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // overrides Allocator::fastMalloc
+    virtual void fastFree(void* ptr);      // overrides Allocator::fastFree
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&);            // declared private: client code cannot copy-construct
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // pointer to the private implementation object; the pointer itself is const
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // plain record describing one buffer allocation handed out by a VkAllocator
+{
+public:
+    VkBuffer buffer; // Vulkan buffer handle
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory; // Vulkan device memory handle
+    void* mapped_ptr; // presumably the host mapping of the memory - TODO confirm when it is valid
+
+    // buffer state, modified by command functions internally (mutable: may change through const references)
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // plain record describing one image allocation handed out by a VkAllocator
+{
+public:
+    VkImage image;         // Vulkan image handle
+    VkImageView imageview; // Vulkan image view handle
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory; // Vulkan device memory handle
+    void* mapped_ptr; // presumably the host mapping of the memory - TODO confirm when it is valid
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally (mutable: may change through const references)
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base of the Vulkan allocators below; does not derive from Allocator
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // pure virtual: buffer allocation of size bytes
+    virtual void fastFree(VkBufferMemory* ptr) = 0;      // pure virtual: buffer release
+    virtual int flush(VkBufferMemory* ptr);      // has a base implementation (not pure); return code meaning is not shown here
+    virtual int invalidate(VkBufferMemory* ptr); // has a base implementation (not pure); return code meaning is not shown here
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // pure virtual: image allocation
+    virtual void fastFree(VkImageMemory* ptr) = 0;                                             // pure virtual: image release
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); // helper available to subclasses
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // default preferred block size: 16M bytes
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer allocation
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image allocation
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&);            // declared private: client code cannot copy-construct
+    VkBlobAllocator& operator=(const VkBlobAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    VkBlobAllocatorPrivate* const d; // pointer to the private implementation object
+};
+
+class VkWeightAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // default preferred block size: 8M bytes
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer allocation
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image allocation
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&);            // declared private: client code cannot copy-construct
+    VkWeightAllocator& operator=(const VkWeightAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    VkWeightAllocatorPrivate* const d; // pointer to the private implementation object
+};
+
+class VkStagingAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // ratio range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer allocation
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image allocation
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&);            // declared private: client code cannot copy-construct
+    VkStagingAllocator& operator=(const VkStagingAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    VkStagingAllocatorPrivate* const d; // pointer to the private implementation object
+};
+
+class VkWeightStagingAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator // declares no clear() override, unlike the other allocators in this header
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer allocation
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image allocation
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&);            // declared private: client code cannot copy-construct
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&); // declared private: client code cannot copy-assign
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // pointer to the private implementation object
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26 // same API-level guard as the <android/hardware_buffer.h> include at the top of this header
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides the VkAllocator buffer allocation
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // overrides the VkAllocator image allocation
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);            // declared private: client code cannot copy-construct
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&); // declared private: client code cannot copy-assign
+
+public:
+    int init(); // separate initialization step returning a status code; NOTE(review): success value not shown here
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb; // presumably the buffer passed to the constructor as _hb - confirm in the implementation
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/benchmark.h
new file mode 100644
index 0000000..3d5c0cd
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/benchmark.h
@@ -0,0 +1,36 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// returns the current timestamp in milliseconds
+NCNN_EXPORT double get_current_time();
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end); // only declared when NCNN_BENCHMARK is enabled
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end); // overload that also receives the layer's input and output blobs
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob // record describing one blob: its name, producing/consuming layer indices and a shape hint
+{
+public:
+    // constructs an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (member only exists when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer which produces this blob as output
+    int producer;
+    // index of the layer which needs this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/c_api.h
new file mode 100644
index 0000000..b7435f8
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/c_api.h
@@ -0,0 +1,327 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis;
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis;
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis;
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis;
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt);
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt);
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata;
+    ncnn_net_custom_layer_factory_t next;
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis;
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory;
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait();
+
+    int reset();
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev;
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d;
+};
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait();
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    VkTransferPrivate* const d;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/cpu.h
new file mode 100644
index 0000000..0f748f3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/cpu.h
@@ -0,0 +1,169 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set;
+#endif
+#if __APPLE__
+    unsigned int policy;
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// binds all threads to the little clusters when powersave is enabled
+// affects HMP CPU architectures such as ARM big.LITTLE
+// only implemented on Android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled (default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// the setter function returns 0 on success
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// base class wrapping a data source for param and model reads
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parses plain-text param data
+    // returns 1 if the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // reads binary param and model data
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // gets a reference to model data
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate;
+class NCNN_EXPORT DataReaderFromStdio : public DataReader
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private:
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d;
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate;
+class NCNN_EXPORT DataReaderFromMemory : public DataReader
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem);
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const;
+
+private:
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate;
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset);
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+
+private:
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d;
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/gpu.h
new file mode 100644
index 0000000..2ef4927
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/gpu.h
@@ -0,0 +1,359 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// gpu instance lifecycle: create returns an int status, destroy returns nothing
+NCNN_EXPORT int create_gpu_instance();
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability flags (extern ints, defined outside this header)
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_external_memory_capabilities function pointer (extern, defined outside this header)
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2 function pointers
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2 function pointers
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface function pointers
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface function pointer (only declared for Android API level 26 and above)
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix function pointer
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// device enumeration: number of gpus and the index of the default one
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // opaque implementation type; only forward-declared in this header
+class NCNN_EXPORT GpuInfo // read-only description of one gpu: every accessor below is const
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device handle
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties of the physical device
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // identification info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // meaning of the value returned by type():
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limits
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family indices, one per queue kind
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    // runtime: queue counts, one per queue kind
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property - NOTE(review): presumably true when compute and transfer share a queue family; confirm
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup size and supported subgroup operation classes
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // known-bug flags ("bug is not feature") - NOTE(review): presumably used to enable workarounds; confirm
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // a bug flag the original author notes is sometimes treated as a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature flags
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature flag
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature flags
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+
+    // device extension capability; these return int rather than bool - NOTE(review): presumably 0 means unsupported; confirm
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_queue_family_foreign() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private:
+    GpuInfo(const GpuInfo&); // copy constructor is private: outside code cannot copy instances
+    GpuInfo& operator=(const GpuInfo&); // copy assignment is private as well
+
+private:
+    friend int create_gpu_instance(); // create_gpu_instance() is granted access to the private members
+    GpuInfoPrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); // returns a const reference; with no argument the default gpu index is used
+
+class VkAllocator; // forward declarations of types only used by pointer or reference below
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // opaque implementation type; only forward-declared in this header
+class NCNN_EXPORT VulkanDevice
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // with no argument the default gpu index is used
+    ~VulkanDevice();
+
+    const GpuInfo& info; // reference member, so it is bound at construction - NOTE(review): presumably to get_gpu_info(device_index); confirm
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const; // NOTE(review): unit of spv_data_size (bytes vs words) is not visible here; confirm
+
+    // overload with fixed workgroup size given as local_size_x/y/z
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helpers for creating a pipeline; each returns an int status and writes the created handle through its last pointer argument
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const; // NOTE(review): return value on "no match" is not visible here; confirm
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // paired with reclaim_queue() below
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // blob allocator on this device; acquire and reclaim form a pair
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const; // staging allocator; paired with reclaim_staging_allocator() below
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer and images, returned by value
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test whether image storage can be allocated for the given shape
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one convert_packing overload per buffer/image combination of src and dst
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 function pointers (public data members)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_create_renderpass2 function pointers
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template function pointers
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2 function pointers
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1 function pointer
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3 function pointer
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor function pointers
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion function pointers
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain function pointers
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer function pointers (only declared for Android API level 26 and above)
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension setup - NOTE(review): presumably fills the function pointer members above; confirm
+    int init_device_extension();
+
+private:
+    VulkanDevice(const VulkanDevice&); // copy constructor is private: outside code cannot copy instances
+    VulkanDevice& operator=(const VulkanDevice&); // copy assignment is private as well
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index()); // with no argument the default gpu index is used - NOTE(review): ownership of the returned pointer is not visible here; confirm
+
+// online spirv compilation; three input forms (string, data + size, shader type index), each taking spirv by non-const reference and returning an int status
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// shader interface info from spirv; a plain aggregate with public fields only
+class NCNN_EXPORT ShaderInfo
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // meaning of each binding_types entry:
+    // 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // fixed capacity of 16 entries; original author's note: 16 is large enough I think ...
+
+    int reserved_0; // reserved_0..3 are unused by anything in this header - NOTE(review): presumably padding for future fields; confirm
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); // shader_info is a non-const reference (output); returns an int status
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer.h
new file mode 100644
index 0000000..d02f65b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer.h
@@ -0,0 +1,214 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#include <math.h>
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer // base class for layers: virtual load / pipeline / forward hooks plus public configuration fields
+{
+public:
+    // default constructor
+    Layer();
+    // virtual destructor, so derived layers can be deleted through a Layer*
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // the support_reserved_* flags are unused by anything in this header - NOTE(review): presumably placeholders for future flags; confirm
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set - NOTE(review): the name suggests a bitmask; bit meanings are not visible here
+    int featmask;
+
+public:
+    // implement inference (multi-blob and single-blob overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference (multi-blob and single-blob overloads)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    std::string type;
+    // layer name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint for the input (bottom) and output (top) blobs
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function pointer types; both take a trailing void* - NOTE(review): presumably the userdata of custom_layer_registry_entry; confirm
+typedef Layer* (*layer_creator_func)(void*); // signature produced by DEFINE_LAYER_CREATOR
+typedef void (*layer_destroyer_func)(Layer*, void*); // signature produced by DEFINE_LAYER_DESTROYER
+
+struct layer_registry_entry // registry record: optional type name plus the factory function
+{
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // like layer_registry_entry, with an added destroyer and userdata pointer
+{
+#if NCNN_STRING
+    // layer type name (only present when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer; // counterpart of creator
+    void* userdata; // NOTE(review): presumably forwarded as the void* argument of creator/destroyer; confirm
+};
+
+#if NCNN_STRING
+// get layer type index from type name (only declared when NCNN_STRING is enabled)
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name (only declared when NCNN_STRING is enabled)
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type index
+NCNN_EXPORT Layer* create_layer(int index);
+
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata: unused; the generated function matches layer_creator_func and returns a new-allocated name*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata: unused; the generated function matches layer_destroyer_func and deletes layer*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the namespace scopes the enumerators, so they are written LayerShaderType::xxx
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list comes from the cmake-generated header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..f11cab9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,370 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+absval = 0,
+absval_pack4 = 1,
+absval_pack8 = 2,
+batchnorm = 3,
+batchnorm_pack4 = 4,
+batchnorm_pack8 = 5,
+concat = 6,
+concat_pack4 = 7,
+concat_pack4to1 = 8,
+concat_pack8 = 9,
+concat_pack8to1 = 10,
+concat_pack8to4 = 11,
+convolution = 12,
+convolution_1x1s1d1 = 13,
+convolution_3x3s1d1_winograd23_transform_input = 14,
+convolution_3x3s1d1_winograd23_transform_output = 15,
+convolution_3x3s1d1_winograd43_transform_input = 16,
+convolution_3x3s1d1_winograd43_transform_output = 17,
+convolution_3x3s1d1_winograd_gemm = 18,
+convolution_gemm = 19,
+convolution_pack1to4 = 20,
+convolution_pack1to4_1x1s1d1 = 21,
+convolution_pack1to4_3x3s1d1_winograd_gemm = 22,
+convolution_pack1to4_gemm = 23,
+convolution_pack1to8 = 24,
+convolution_pack1to8_1x1s1d1 = 25,
+convolution_pack1to8_3x3s1d1_winograd_gemm = 26,
+convolution_pack1to8_gemm = 27,
+convolution_pack4 = 28,
+convolution_pack4_1x1s1d1 = 29,
+convolution_pack4_1x1s1d1_cm_16_8_8 = 30,
+convolution_pack4_3x3s1d1_winograd23_transform_input = 31,
+convolution_pack4_3x3s1d1_winograd23_transform_output = 32,
+convolution_pack4_3x3s1d1_winograd43_transform_input = 33,
+convolution_pack4_3x3s1d1_winograd43_transform_output = 34,
+convolution_pack4_3x3s1d1_winograd_gemm = 35,
+convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8 = 36,
+convolution_pack4_gemm = 37,
+convolution_pack4_gemm_cm_16_8_8 = 38,
+convolution_pack4to1 = 39,
+convolution_pack4to1_1x1s1d1 = 40,
+convolution_pack4to1_3x3s1d1_winograd_gemm = 41,
+convolution_pack4to1_gemm = 42,
+convolution_pack4to8 = 43,
+convolution_pack4to8_1x1s1d1 = 44,
+convolution_pack4to8_3x3s1d1_winograd_gemm = 45,
+convolution_pack4to8_gemm = 46,
+convolution_pack8 = 47,
+convolution_pack8_1x1s1d1 = 48,
+convolution_pack8_3x3s1d1_winograd23_transform_input = 49,
+convolution_pack8_3x3s1d1_winograd23_transform_output = 50,
+convolution_pack8_3x3s1d1_winograd43_transform_input = 51,
+convolution_pack8_3x3s1d1_winograd43_transform_output = 52,
+convolution_pack8_3x3s1d1_winograd_gemm = 53,
+convolution_pack8_gemm = 54,
+convolution_pack8to1 = 55,
+convolution_pack8to1_1x1s1d1 = 56,
+convolution_pack8to1_3x3s1d1_winograd_gemm = 57,
+convolution_pack8to1_gemm = 58,
+convolution_pack8to4 = 59,
+convolution_pack8to4_1x1s1d1 = 60,
+convolution_pack8to4_3x3s1d1_winograd_gemm = 61,
+convolution_pack8to4_gemm = 62,
+crop = 63,
+crop_pack1to4 = 64,
+crop_pack1to8 = 65,
+crop_pack4 = 66,
+crop_pack4to1 = 67,
+crop_pack4to8 = 68,
+crop_pack8 = 69,
+crop_pack8to1 = 70,
+crop_pack8to4 = 71,
+deconvolution = 72,
+deconvolution_col2im = 73,
+deconvolution_gemm = 74,
+deconvolution_pack1to4 = 75,
+deconvolution_pack1to4_gemm = 76,
+deconvolution_pack1to8 = 77,
+deconvolution_pack1to8_gemm = 78,
+deconvolution_pack4 = 79,
+deconvolution_pack4_col2im = 80,
+deconvolution_pack4_gemm = 81,
+deconvolution_pack4_gemm_cm_16_8_8 = 82,
+deconvolution_pack4to1 = 83,
+deconvolution_pack4to1_gemm = 84,
+deconvolution_pack4to8 = 85,
+deconvolution_pack4to8_gemm = 86,
+deconvolution_pack8 = 87,
+deconvolution_pack8_col2im = 88,
+deconvolution_pack8_gemm = 89,
+deconvolution_pack8to1 = 90,
+deconvolution_pack8to1_gemm = 91,
+deconvolution_pack8to4 = 92,
+deconvolution_pack8to4_gemm = 93,
+dropout = 94,
+dropout_pack4 = 95,
+dropout_pack8 = 96,
+eltwise = 97,
+eltwise_pack4 = 98,
+eltwise_pack8 = 99,
+elu = 100,
+elu_pack4 = 101,
+elu_pack8 = 102,
+flatten = 103,
+flatten_pack1to4 = 104,
+flatten_pack1to8 = 105,
+flatten_pack4 = 106,
+flatten_pack4to8 = 107,
+flatten_pack8 = 108,
+innerproduct = 109,
+innerproduct_gemm = 110,
+innerproduct_gemm_wp1to4 = 111,
+innerproduct_gemm_wp1to8 = 112,
+innerproduct_gemm_wp4 = 113,
+innerproduct_gemm_wp4to1 = 114,
+innerproduct_gemm_wp4to8 = 115,
+innerproduct_gemm_wp8 = 116,
+innerproduct_gemm_wp8to1 = 117,
+innerproduct_gemm_wp8to4 = 118,
+innerproduct_pack1to4 = 119,
+innerproduct_pack1to8 = 120,
+innerproduct_pack4 = 121,
+innerproduct_pack4to1 = 122,
+innerproduct_pack4to8 = 123,
+innerproduct_pack8 = 124,
+innerproduct_pack8to1 = 125,
+innerproduct_pack8to4 = 126,
+innerproduct_reduce_sum8 = 127,
+innerproduct_reduce_sum8_pack4 = 128,
+innerproduct_reduce_sum8_pack8 = 129,
+innerproduct_sum8 = 130,
+innerproduct_sum8_pack1to4 = 131,
+innerproduct_sum8_pack1to8 = 132,
+innerproduct_sum8_pack4 = 133,
+innerproduct_sum8_pack4to1 = 134,
+innerproduct_sum8_pack4to8 = 135,
+innerproduct_sum8_pack8 = 136,
+innerproduct_sum8_pack8to1 = 137,
+innerproduct_sum8_pack8to4 = 138,
+lrn_norm = 139,
+lrn_norm_across_channel_pack4 = 140,
+lrn_norm_across_channel_pack8 = 141,
+lrn_norm_within_channel_pack4 = 142,
+lrn_norm_within_channel_pack8 = 143,
+lrn_square_pad = 144,
+lrn_square_pad_across_channel_pack4 = 145,
+lrn_square_pad_across_channel_pack8 = 146,
+lrn_square_pad_within_channel_pack4 = 147,
+lrn_square_pad_within_channel_pack8 = 148,
+pooling = 149,
+pooling_adaptive = 150,
+pooling_adaptive_pack4 = 151,
+pooling_adaptive_pack8 = 152,
+pooling_global = 153,
+pooling_global_pack4 = 154,
+pooling_global_pack8 = 155,
+pooling_pack4 = 156,
+pooling_pack8 = 157,
+prelu = 158,
+prelu_pack4 = 159,
+prelu_pack8 = 160,
+relu = 161,
+relu_pack4 = 162,
+relu_pack8 = 163,
+reshape = 164,
+reshape_pack1to4 = 165,
+reshape_pack1to8 = 166,
+reshape_pack4 = 167,
+reshape_pack4to1 = 168,
+reshape_pack4to8 = 169,
+reshape_pack8 = 170,
+reshape_pack8to1 = 171,
+reshape_pack8to4 = 172,
+scale = 173,
+scale_pack4 = 174,
+scale_pack8 = 175,
+sigmoid = 176,
+sigmoid_pack4 = 177,
+sigmoid_pack8 = 178,
+slice = 179,
+slice_pack1to4 = 180,
+slice_pack1to8 = 181,
+slice_pack4 = 182,
+slice_pack4to8 = 183,
+slice_pack8 = 184,
+softmax_div_sum = 185,
+softmax_div_sum_pack4 = 186,
+softmax_div_sum_pack8 = 187,
+softmax_exp_sub_max = 188,
+softmax_exp_sub_max_pack4 = 189,
+softmax_exp_sub_max_pack8 = 190,
+softmax_reduce_max = 191,
+softmax_reduce_max_pack4 = 192,
+softmax_reduce_max_pack8 = 193,
+softmax_reduce_sum = 194,
+softmax_reduce_sum_pack4 = 195,
+softmax_reduce_sum_pack8 = 196,
+tanh = 197,
+tanh_pack4 = 198,
+tanh_pack8 = 199,
+binaryop = 200,
+binaryop_broadcast = 201,
+binaryop_broadcast_a1_pack4 = 202,
+binaryop_broadcast_a1_pack8 = 203,
+binaryop_broadcast_b1_pack4 = 204,
+binaryop_broadcast_b1_pack8 = 205,
+binaryop_broadcast_pack4 = 206,
+binaryop_broadcast_pack8 = 207,
+binaryop_pack4 = 208,
+binaryop_pack8 = 209,
+unaryop = 210,
+unaryop_pack4 = 211,
+unaryop_pack8 = 212,
+convolutiondepthwise = 213,
+convolutiondepthwise_group = 214,
+convolutiondepthwise_group_pack1to4 = 215,
+convolutiondepthwise_group_pack1to8 = 216,
+convolutiondepthwise_group_pack4 = 217,
+convolutiondepthwise_group_pack4to1 = 218,
+convolutiondepthwise_group_pack4to8 = 219,
+convolutiondepthwise_group_pack8 = 220,
+convolutiondepthwise_group_pack8to1 = 221,
+convolutiondepthwise_group_pack8to4 = 222,
+convolutiondepthwise_pack4 = 223,
+convolutiondepthwise_pack8 = 224,
+padding = 225,
+padding_3d = 226,
+padding_3d_pack4 = 227,
+padding_3d_pack8 = 228,
+padding_pack1to4 = 229,
+padding_pack1to8 = 230,
+padding_pack4 = 231,
+padding_pack4to1 = 232,
+padding_pack4to8 = 233,
+padding_pack8 = 234,
+padding_pack8to1 = 235,
+padding_pack8to4 = 236,
+normalize_coeffs = 237,
+normalize_coeffs_pack4 = 238,
+normalize_coeffs_pack8 = 239,
+normalize_norm = 240,
+normalize_norm_pack4 = 241,
+normalize_norm_pack8 = 242,
+normalize_reduce_sum4_fp16_to_fp32 = 243,
+normalize_reduce_sum4_fp16_to_fp32_pack4 = 244,
+normalize_reduce_sum4_fp16_to_fp32_pack8 = 245,
+normalize_reduce_sum4_fp32 = 246,
+normalize_reduce_sum4_fp32_pack4 = 247,
+normalize_reduce_sum4_fp32_pack8 = 248,
+permute = 249,
+permute_pack1to4 = 250,
+permute_pack1to8 = 251,
+permute_pack4 = 252,
+permute_pack4to1 = 253,
+permute_pack4to8 = 254,
+permute_pack8 = 255,
+permute_pack8to1 = 256,
+permute_pack8to4 = 257,
+priorbox = 258,
+priorbox_mxnet = 259,
+interp = 260,
+interp_bicubic = 261,
+interp_bicubic_coeffs = 262,
+interp_bicubic_pack4 = 263,
+interp_bicubic_pack8 = 264,
+interp_pack4 = 265,
+interp_pack8 = 266,
+deconvolutiondepthwise = 267,
+deconvolutiondepthwise_group = 268,
+deconvolutiondepthwise_group_pack1to4 = 269,
+deconvolutiondepthwise_group_pack1to8 = 270,
+deconvolutiondepthwise_group_pack4 = 271,
+deconvolutiondepthwise_group_pack4to1 = 272,
+deconvolutiondepthwise_group_pack4to8 = 273,
+deconvolutiondepthwise_group_pack8 = 274,
+deconvolutiondepthwise_group_pack8to1 = 275,
+deconvolutiondepthwise_group_pack8to4 = 276,
+deconvolutiondepthwise_pack4 = 277,
+deconvolutiondepthwise_pack8 = 278,
+shufflechannel = 279,
+shufflechannel_pack4 = 280,
+shufflechannel_pack8 = 281,
+instancenorm_coeffs = 282,
+instancenorm_coeffs_pack4 = 283,
+instancenorm_coeffs_pack8 = 284,
+instancenorm_norm = 285,
+instancenorm_norm_pack4 = 286,
+instancenorm_norm_pack8 = 287,
+instancenorm_reduce_mean = 288,
+instancenorm_reduce_mean_pack4 = 289,
+instancenorm_reduce_mean_pack8 = 290,
+instancenorm_reduce_sum4_fp16_to_fp32 = 291,
+instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 292,
+instancenorm_reduce_sum4_fp16_to_fp32_pack8 = 293,
+instancenorm_reduce_sum4_fp32 = 294,
+instancenorm_reduce_sum4_fp32_pack4 = 295,
+instancenorm_reduce_sum4_fp32_pack8 = 296,
+instancenorm_sub_mean_square = 297,
+instancenorm_sub_mean_square_pack4 = 298,
+instancenorm_sub_mean_square_pack8 = 299,
+clip = 300,
+clip_pack4 = 301,
+clip_pack8 = 302,
+reorg = 303,
+reorg_pack1to4 = 304,
+reorg_pack1to8 = 305,
+reorg_pack4 = 306,
+reorg_pack4to8 = 307,
+reorg_pack8 = 308,
+packing = 309,
+packing_fp16_to_fp32 = 310,
+packing_fp32_to_fp16 = 311,
+packing_pack1to4 = 312,
+packing_pack1to4_fp16_to_fp32 = 313,
+packing_pack1to4_fp32_to_fp16 = 314,
+packing_pack1to8 = 315,
+packing_pack1to8_fp16_to_fp32 = 316,
+packing_pack1to8_fp32_to_fp16 = 317,
+packing_pack4 = 318,
+packing_pack4_fp16_to_fp32 = 319,
+packing_pack4_fp32_to_fp16 = 320,
+packing_pack4to1 = 321,
+packing_pack4to1_fp16_to_fp32 = 322,
+packing_pack4to1_fp32_to_fp16 = 323,
+packing_pack4to8 = 324,
+packing_pack4to8_fp16_to_fp32 = 325,
+packing_pack4to8_fp32_to_fp16 = 326,
+packing_pack8 = 327,
+packing_pack8_fp16_to_fp32 = 328,
+packing_pack8_fp32_to_fp16 = 329,
+packing_pack8to1 = 330,
+packing_pack8to1_fp16_to_fp32 = 331,
+packing_pack8to1_fp32_to_fp16 = 332,
+packing_pack8to4 = 333,
+packing_pack8to4_fp16_to_fp32 = 334,
+packing_pack8to4_fp32_to_fp16 = 335,
+cast_fp16_to_fp32 = 336,
+cast_fp16_to_fp32_pack4 = 337,
+cast_fp16_to_fp32_pack8 = 338,
+cast_fp32_to_fp16 = 339,
+cast_fp32_to_fp16_pack4 = 340,
+cast_fp32_to_fp16_pack8 = 341,
+hardsigmoid = 342,
+hardsigmoid_pack4 = 343,
+hardsigmoid_pack8 = 344,
+hardswish = 345,
+hardswish_pack4 = 346,
+hardswish_pack8 = 347,
+pixelshuffle = 348,
+pixelshuffle_pack4 = 349,
+pixelshuffle_pack4to1 = 350,
+pixelshuffle_pack8 = 351,
+pixelshuffle_pack8to1 = 352,
+pixelshuffle_pack8to4 = 353,
+deepcopy = 354,
+deepcopy_pack4 = 355,
+deepcopy_pack8 = 356,
+mish = 357,
+mish_pack4 = 358,
+mish_pack8 = 359,
+swish = 360,
+swish_pack4 = 361,
+swish_pack8 = 362,
+convert_ycbcr = 363,
+vulkan_activation = 364,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType {
+enum LayerType
+{
+#include "layer_type_enum.h" // auto-generated enumerator list, one `Name = id,` entry per layer type
+    CustomBit = (1 << 8), // bit 8 (256); NOTE(review): presumably flags custom layer type indices - confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..581d589
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/layer_type_enum.h
@@ -0,0 +1,103 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the multi-dimensional matrix: vec (w), image (w, h), dim (w, h, c) or cube (w, h, d, c)
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy
+    Mat(const Mat& m);
+    // external vec (wraps caller-provided data)
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all elements to the given value
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, in place
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like the host mat m
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like the vulkan buffer mat m
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like the vulkan image mat im
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16, // PIXEL_X2Y values store Y shifted left by this many bits
+        PIXEL_FORMAT_MASK = 0x0000ffff, // low 16 bits: the X of PIXEL_X2Y
+        PIXEL_CONVERT_MASK = 0xffff0000, // high 16 bits: the Y of PIXEL_X2Y
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip (the method name keeps its historical "substract" spelling)
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between consecutive channels - confirm in the inline definitions
+};
+
+#if NCNN_VULKAN
+
+// the multi-dimensional matrix stored in a vulkan device buffer (VkBufferMemory)
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec (wraps caller-provided buffer memory)
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like the host mat m
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like the vulkan buffer mat m
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like the vulkan image mat im
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between consecutive channels - confirm in the inline definitions
+};
+
+class NCNN_EXPORT VkImageMat // the multi-dimensional matrix stored in a vulkan device image (VkImageMemory)
+{
+public:
+    // empty
+    VkImageMat();
+    // vec
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec (wraps caller-provided image memory)
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like the host mat m
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like the vulkan buffer mat m
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like the vulkan image mat im
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc functions, each group is only declared when its NCNN_PIXEL* feature macro is enabled
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameters for both source and destination
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenience wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source (from) orientation, numbered 1-8 as pictured below; e.g. 6 means rotating from 6 to 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameters for both source and destination
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenience wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset, written to tm
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix of tm, written to tm_inv
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameters, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenience wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float's bit pattern as an integer and shift out its low 16 bits (plain truncation, no rounding)
+    union
+    {
+        float f;
+        unsigned int u;
+    } bits;
+    bits.f = value;
+    return (unsigned short)(bits.u >> 16);
+}
+// convert brain half to float
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // move the 16 stored bits up by 16 and read the resulting word back as a float; the low 16 bits are zero
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = (unsigned int)value << 16; // widen first: a bare `value << 16` is evaluated in signed int after promotion
+    return tmp.f;
+}
+
+// mat process
+enum BorderType // border modes; presumably what the int `type` parameter of copy_make_border* expects (confirm in the implementation)
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233, // same -233 value the warpaffine comments in this file give for a transparent border
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat() // empty mat: every member starts at zero/null
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator) // starts empty, then forwards to create(_w, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator) // starts empty, then forwards to create(_w, _h, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) // starts empty, then forwards to create(_w, _h, _c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator) // starts empty, then forwards to create(_w, _h, _d, _c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) // as the (_w) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) // as the (_w, _h) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // as the (_w, _h, _c) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) // as the (_w, _h, _d, _c) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m) // copies m's fields (same data pointer and refcount pointer); no element data is copied here
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref(); // bumps the shared count when m has one
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator) // 1-D mat over caller memory; refcount(0), so release() will not free _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) // 2-D mat over caller memory; refcount(0), so release() will not free _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // product computed in size_t
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // 3-D mat over caller memory; refcount(0), so release() will not free _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel size in bytes through alignSize(.., 16) (presumably rounds up to 16), back to elements
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator) // 4-D mat over caller memory; refcount(0), so release() will not free _data
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // same as the 3-D form, with d folded into the channel size
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 1-D external form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 2-D external form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 3-D external form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) // 4-D external form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::~Mat() // gives up this mat's reference through release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    // Writes _v into all total() slots, treating the storage as float.
+    const int count = (int)total();
+    float* out = (float*)data;
+    int n = 0;
+#if __ARM_NEON
+    // NEON path: four copies per store while at least four slots remain
+    const float32x4_t splat = vdupq_n_f32(_v);
+    for (; n + 3 < count; n += 4, out += 4)
+    {
+        vst1q_f32(out, splat);
+    }
+#endif // __ARM_NEON
+    for (; n < count; n++, out++)
+    {
+        *out = _v;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    // Writes _v into all total() slots, treating the storage as int.
+    const int count = (int)total();
+    int* out = (int*)data;
+    int n = 0;
+#if __ARM_NEON
+    // NEON path: four copies per store while at least four slots remain
+    const int32x4_t splat = vdupq_n_s32(_v);
+    for (; n + 3 < count; n += 4, out += 4)
+    {
+        vst1q_s32(out, splat);
+    }
+#endif // __ARM_NEON
+    for (; n < count; n++, out++)
+    {
+        *out = _v;
+    }
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    // one 4-float store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        vst1q_f32(out, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    // one 4 x u16 store per slot counted by total()
+    const int count = (int)total();
+    unsigned short* out = (unsigned short*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        vst1_u16(out, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    // one 4-int store per slot counted by total()
+    const int count = (int)total();
+    int* out = (int*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        vst1q_s32(out, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    // each slot takes eight ints: _v0 in the first four, _v1 in the next four
+    const int count = (int)total();
+    int* out = (int*)data;
+    for (int n = 0; n < count; n++, out += 8)
+    {
+        vst1q_s32(out, _v0);
+        vst1q_s32(out + 4, _v1);
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    // one 4 x fp16 store per slot counted by total()
+    const int count = (int)total();
+    __fp16* out = (__fp16*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        vst1_f16(out, _v);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    // one 8 x fp16 store per slot counted by total()
+    const int count = (int)total();
+    __fp16* out = (__fp16*)data;
+    for (int n = 0; n < count; n++, out += 8)
+    {
+        vst1q_f16(out, _v);
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    // one unaligned 16-float store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 16)
+    {
+        _mm512_storeu_ps(out, _v);
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // the int parameter is a dummy: old gcc cannot overload on __m128 vs __m256,
+    // so it is only there to give this overload a different mangled symbol
+    (void)_i;
+    // one unaligned 8-float store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 8)
+    {
+        _mm256_storeu_ps(out, _v);
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    // one unaligned 4-float store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        _mm_storeu_ps(out, _v);
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    // one 16-byte store per slot counted by total(); ptr advances 8 unsigned shorts each time
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++, ptr += 8)
+    {
+        _mm_storeu_si128((__m128i*)ptr, _v); // unaligned store like the other x86 fills: data may wrap caller memory of any alignment
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    // one 4-float MSA store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        __msa_st_w((v4i32)_v, out, 0);
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    // one 4-float LSX store per slot counted by total()
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += 4)
+    {
+        __lsx_vst(_v, out, 0);
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    // lane count = cpu_riscv_vlenb() / 4; one vector store of that many floats per slot
+    const int lanes = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(lanes);
+
+    const int count = (int)total();
+    float* out = (float*)data;
+    for (int n = 0; n < count; n++, out += lanes)
+    {
+        vse32_v_f32m1(out, _v, vl);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    // lane count = cpu_riscv_vlenb() / 2; one vector store of that many u16 values per slot
+    const int lanes = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(lanes);
+
+    const int count = (int)total();
+    unsigned short* out = (unsigned short*)data;
+    for (int n = 0; n < count; n++, out += lanes)
+    {
+        vse16_v_u16m1(out, _v, vl);
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    // lane count = cpu_riscv_vlenb() / 1; one vector store of that many int8 values per slot
+    const int lanes = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(lanes);
+
+    const int count = (int)total();
+    signed char* out = (signed char*)data;
+    for (int n = 0; n < count; n++, out += lanes)
+    {
+        vse8_v_i8m1(out, _v, vl);
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    // lane count = cpu_riscv_vlenb() / 2; one vector store of that many fp16 values per slot
+    const int lanes = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(lanes);
+
+    const int count = (int)total();
+    __fp16* out = (__fp16*)data;
+    for (int n = 0; n < count; n++, out += lanes)
+    {
+        vse16_v_f16m1(out, _v, vl);
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v) // generic form: assigns _v to total() slots of type T, one at a time
+{
+    const int count = (int)total();
+    T* out = (T*)data;
+    for (int n = 0; n < count; n++, out++)
+    {
+        *out = _v;
+    }
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    // self-assignment changes nothing
+    if (this == &m) return *this;
+
+    // take the reference on m's data first, then let go of whatever this mat held
+    if (m.refcount) NCNN_XADD(m.refcount, 1);
+    release();
+
+    // adopt m's shape ...
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+    cstep = m.cstep;
+
+    // ... and its storage description (pointer, shared count, element layout, allocator)
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (!refcount) return; // no counter attached (empty mat or externally owned data)
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release()
+{
+    // drop one reference; a result of 1 from NCNN_XADD(refcount, -1) is treated as "this was the last holder"
+    const bool last_holder = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_holder)
+    {
+        // free through the mat's allocator when it has one, through plain fastFree otherwise
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+
+    // back to the empty state (the allocator member is left untouched)
+    data = 0;
+    refcount = 0;
+    elemsize = 0;
+    elempack = 0;
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    cstep = 0;
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const // true when there is no data pointer or total() is zero
+{
+    return !data || !total();
+}
+
+NCNN_FORCEINLINE size_t Mat::total() const // channel count times the per-channel step
+{
+    return c * cstep;
+}
+
+NCNN_FORCEINLINE int Mat::elembits() const // elemsize * 8 divided by elempack; 0 when elempack is 0
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize * 8) / elempack;
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    // data-less Mat (null pointer) carrying only sizes; the last size argument is multiplied by elempack
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize; // byte offset of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // only the 4-D case overrides the step the constructor chose
+    return view;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize; // byte offset of channel _c
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // only the 4-D case overrides the step the constructor chose
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z) // w x h view starting z * (w * h) elements into data
+{
+    return Mat(w, h, static_cast<unsigned char*>(data) + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const // const overload, same address arithmetic
+{
+    return Mat(w, h, static_cast<unsigned char*>(data) + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y) // pointer w * y elements (elemsize bytes each) into data, viewed as float
+{
+    return reinterpret_cast<float*>(static_cast<unsigned char*>(data) + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const // const overload, same address arithmetic
+{
+    return reinterpret_cast<const float*>(static_cast<unsigned char*>(data) + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y) // same offset, viewed as T
+{
+    return reinterpret_cast<T*>(static_cast<unsigned char*>(data) + (size_t)w * y * elemsize);
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const // const overload of the typed form
+{
+    return reinterpret_cast<const T*>(static_cast<unsigned char*>(data) + (size_t)w * y * elemsize);
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    Mat view(w, h, d, channels, static_cast<unsigned char*>(data) + cstep * _c * elemsize, elemsize, elempack, allocator);
+    view.dims = dims; // the 4-D constructor set dims to 4; restore this mat's own rank
+    return view;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat view(w, h, d, channels, static_cast<unsigned char*>(data) + cstep * _c * elemsize, elemsize, elempack, allocator);
+    view.dims = dims; // the 4-D constructor set dims to 4; restore this mat's own rank
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    Mat view(w, h, depths, static_cast<unsigned char*>(data) + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h; // replace the step the 3-D constructor computed with exactly w * h
+    return view;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat view(w, h, depths, static_cast<unsigned char*>(data) + (size_t)w * h * z * elemsize, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h; // replace the step the 3-D constructor computed with exactly w * h
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows) // w x rows view starting w * y elements into data
+{
+    return Mat(w, rows, static_cast<unsigned char*>(data) + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const // const overload, same address arithmetic
+{
+    return Mat(w, rows, static_cast<unsigned char*>(data) + (size_t)w * y * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n) // 1-D view of n elements starting x elements into data
+{
+    return Mat(n, static_cast<unsigned char*>(data) + x * elemsize, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const // const overload, same address arithmetic
+{
+    return Mat(n, static_cast<unsigned char*>(data) + x * elemsize, elemsize, elempack, allocator);
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*() // hands out the raw data pointer typed as T*
+{
+    return static_cast<T*>(data);
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const // const overload: raw data pointer typed as const T*
+{
+    return static_cast<const T*>(data);
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i) // i-th float counted from the start of data, no bounds check
+{
+    return *(static_cast<float*>(data) + i);
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const // const overload of the same flat float indexing
+{
+    return *(static_cast<const float*>(data) + i);
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat() // empty mat: every member starts at zero/null
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) // starts empty, then forwards to create(_w, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // starts empty, then forwards to create(_w, _h, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // starts empty, then forwards to create(_w, _h, _c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // starts empty, then forwards to create(_w, _h, _d, _c, ...)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // as the (_w) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // as the (_w, _h) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // as the (_w, _h, _c) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // as the (_w, _h, _d, _c) form, with an explicit _elempack passed to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    // cstep is assigned in the body; every other field comes from the initializer list above
+    cstep = m.cstep;
+    addref(); // this copy points at m's buffer, so count the new holder when a refcount exists
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 1-D mat over an existing buffer; refcount(0), so release() will not free it
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 2-D mat over an existing buffer; refcount(0), so release() will not free it
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // multiply in size_t, as the Mat constructors do, so large w * h cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 3-D mat over an existing buffer; refcount(0), so release() will not free it
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // size_t from the first factor on: w * h alone used to be an int product
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 4-D mat over an existing buffer; refcount(0), so release() will not free it
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // size_t from the first factor on: w * h * d used to be an int product
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D existing-buffer form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D existing-buffer form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // multiply in size_t so large w * h cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D existing-buffer form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // size_t from the first factor on
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D existing-buffer form with an explicit _elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // size_t from the first factor on
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat() // gives up this mat's reference through release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    if (this != &m)
+    {
+        // reference m's buffer before releasing the one currently held
+        if (m.refcount) NCNN_XADD(m.refcount, 1);
+        release();
+
+        // storage description
+        data = m.data;
+        refcount = m.refcount;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+        allocator = m.allocator;
+
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    // self-assignment falls straight through to here
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    // no allocator (e.g. a default-constructed VkMat) or a non-mappable one: return an empty Mat
+    if (!allocator || !allocator->mappable)
+        return Mat();
+
+    // wrap mapped_ptr() in a Mat of the same sizes and packing; that Mat is given a null allocator
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    // dims outside 1..4, which includes the released/empty state
+    return Mat();
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    // null when there is no allocator, no buffer (release() clears data but keeps allocator), or no mappable memory
+    if (!allocator || !data || !allocator->mappable)
+        return 0;
+    return (unsigned char*)data->mapped_ptr + data->offset; // the buffer's mapped_ptr advanced by its offset
+}
+
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (!refcount) return; // no counter attached (empty mat or a buffer owned elsewhere)
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release()
+{
+    // Drop one reference; a previous counter value of 1 means this was the last holder.
+    if (refcount)
+    {
+        const bool last_ref = (NCNN_XADD(refcount, -1) == 1);
+        if (last_ref && allocator && data)
+            allocator->fastFree(data);
+    }
+
+    // Reset to the empty state. The allocator pointer is left untouched.
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    return !data || total() == 0; // true when no buffer memory is attached or total() is zero
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return c * cstep; // channel count times the per-channel step
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // bits per packed lane; 0 when elempack is 0 (avoids dividing by zero)
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    // Mat built with a null data pointer; the outermost extent is multiplied by elempack.
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const // buffer handle stored in the attached VkBufferMemory
+{
+    return data->buffer; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const // offset field of the attached VkBufferMemory (the same offset mapped_ptr() adds, in bytes)
+{
+    return data->offset; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const // capacity field of the attached VkBufferMemory
+{
+    return data->capacity; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat() // empty image mat: no storage, no counter, every extent zero
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator) // 1-D constructor without an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _elemsize, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // 2-D constructor without an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _elemsize, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // 3-D constructor without an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _c, _elemsize, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // 4-D constructor without an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D constructor with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _elemsize, _elempack, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D constructor with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D constructor with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D constructor with an explicit elempack
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0) // start from the empty state
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // NOTE(review): create() is defined outside this chunk; presumably allocates through _allocator - confirm
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m) // copy constructor: shares m's image memory rather than duplicating it
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // bump the shared counter (a no-op when m carries no counter)
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 1-D view over caller-supplied image memory, elempack fixed to 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 2-D view over caller-supplied image memory, elempack fixed to 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 3-D view over caller-supplied image memory, elempack fixed to 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 4-D view over caller-supplied image memory, elempack fixed to 1
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D view over caller-supplied image memory with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D view over caller-supplied image memory with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D view over caller-supplied image memory with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D view over caller-supplied image memory with an explicit elempack
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c) // refcount stays null, so release() never frees _data
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    this->release(); // give up this handle's reference; release() frees the image memory only when the counter says it was the last one
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    // Shares m's image memory instead of copying it; self-assignment is a no-op.
+    if (this != &m)
+    {
+        // Take the reference on m's counter first, then drop whatever this handle held.
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        // adopt storage handle and element layout
+        data = m.data;
+        refcount = m.refcount;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+        allocator = m.allocator;
+
+        // adopt shape
+        dims = m.dims;
+        w = m.w; h = m.h; d = m.d; c = m.c;
+    }
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const // Mat with the same shape wrapping mapped_ptr(), or an empty Mat when nothing can be mapped
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // empty handle, allocator not flagged mappable, or memory not currently mapped
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // dims outside 1..4
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const // data->mapped_ptr advanced by data->bind_offset bytes, or 0 when unavailable
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // guard the dereferences below against an empty handle
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset;
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (!refcount) return; // no counter attached (empty or external memory): nothing to count
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    // Drop one reference; a previous counter value of 1 means this was the last holder.
+    if (refcount)
+    {
+        const bool last_ref = (NCNN_XADD(refcount, -1) == 1);
+        if (last_ref && allocator && data)
+            allocator->fastFree(data);
+    }
+
+    // Reset to the empty state. The allocator pointer is left untouched.
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return !data || total() == 0; // true when no image memory is attached or total() is zero
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const // product of the four extents w, h, d and c
+{
+    return (size_t)w * h * d * c; // multiply in size_t: the plain int product could overflow before being converted to the return type
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // bits per packed lane; 0 when elempack is 0 (avoids dividing by zero)
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    // Mat built with a null data pointer; the outermost extent is multiplied by elempack.
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const // image handle stored in the attached VkImageMemory
+{
+    return data->image; // no null check: data must be non-null when this is called
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const // image view handle stored in the attached VkImageMemory
+{
+    return data->imageview; // no null check: data must be non-null when this is called
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/modelbin.h
new file mode 100644
index 0000000..15d2b9c
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin // abstract source of weight blobs: subclasses implement the 1-D load(), the other overloads are virtual with a base implementation
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // values accepted by the `type` argument of every load() overload:
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D blob of w elements (the only pure virtual overload)
+    virtual Mat load(int w, int type) const = 0;
+    // load a 2-D blob (w x h)
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob (w x h x c)
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob (w x h x d x c)
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin // ModelBin constructed from a DataReader
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr); // NOTE(review): dr is taken by reference; presumably it must outlive this object - confirm
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // implements the 1-D load required by ModelBin
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // copy construction is not available to callers (declared private)
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&); // copy assignment is not available to callers (declared private)
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin // ModelBin constructed from an in-memory array of Mat weights
+{
+public:
+    // construct from weight blob array (NOTE(review): no length is passed; presumably the caller supplies enough entries - confirm)
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // implements the 1-D load required by ModelBin
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // copy construction is not available to callers (declared private)
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&); // copy assignment is not available to callers (declared private)
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE /* with NCNN_STATIC_DEFINE set, both visibility macros expand to nothing */
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library (the attribute is the same as when building it) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED: disabled, so NCNN_NO_DEPRECATED is never defined here */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/net.h
new file mode 100644
index 0000000..9407042
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/net.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net // network container: loads structure (param) and weights (model), then hands out Extractor objects
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr); // DataReader-based counterpart of the load_param overloads below
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr); // DataReader-based counterpart of the load_param_bin overloads below
+
+    int load_model(const DataReader& dr); // DataReader-based counterpart of the load_model overloads below
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the file loaders above, which return 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must stay valid for as long as it is used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the file loaders above, which return 0 on success)
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenience: load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenience: load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenience: load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+
+private:
+    Net(const Net&); // copy construction is not available to callers (declared private)
+    Net& operator=(const Net&); // copy assignment is not available to callers (declared private)
+
+private:
+    NetPrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor // per-inference handle: feed inputs with input(), read results with extract(); obtained from Net::create_extractor()
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set VkMat input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get VkMat result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set VkImageMat input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get VkImageMat result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set VkMat input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get VkMat result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set VkImageMat input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get VkImageMat result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // only Net::create_extractor() (and subclasses) can reach the constructor below
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/option.h
new file mode 100644
index 0000000..3fda808
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/option.h
@@ -0,0 +1,153 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // public tuning flags; NOTE(review): this header ships next to a prebuilt shared library, so member order and size presumably must not change - confirm
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // NOTE(review): undocumented in this header; presumably selects pack-8 vulkan shaders - confirm
+
+    // subgroup options
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // reserved slot, no documented meaning
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // NOTE(review): undocumented in this header - confirm semantics against the ncnn sources
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    bool use_reserved_6; // reserved slots, no documented meaning
+    bool use_reserved_7;
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // parameters addressed by integer id; holds int, float or array (Mat) values
+{
+public:
+    // empty
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type of the parameter stored under id (NOTE(review): the type codes are not defined in this header - confirm)
+    int type(int id) const;
+
+    // get int (NOTE(review): def is presumably returned when id was never set - confirm)
+    int get(int id, int def) const;
+    // get float (same def convention)
+    float get(int id, float def) const;
+    // get array (same def convention)
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net may call the protected clear()/load_* members below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // bundles the vulkan objects of one compute pipeline (shader module, layouts, pipeline, descriptor update template)
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // create from caller-supplied SPIR-V words
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // create from a shader selected by index
+
+public:
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module); // setters mirror the public getters above and are reserved for subclasses
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // device passed to the constructor (NOTE(review): presumably not owned - confirm)
+
+private:
+    Pipeline(const Pipeline&); // copy construction is not available to callers (declared private)
+    Pipeline& operator=(const Pipeline&); // copy assignment is not available to callers (declared private)
+
+private:
+    PipelinePrivate* const d; // opaque implementation object; the pointer itself is const
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // Pipeline is a private base, so its interface is hidden from users of this class
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload that also takes a target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute gets access to the private base and the protected helpers
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // NOTE(review): shares its name with the create() argument; presumably stores it - confirm
+    int rotate_from; // NOTE(review): shares its name with the create() argument; presumably stores it - confirm
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate; // opaque implementation type, only used through the pointer d below
+class NCNN_EXPORT PipelineCache // only declarations are visible in this header; behaviour lives in the library
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const; // overload taking a caller-supplied spv_data buffer; results come back through the pointer/reference parameters
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const; // overload taking shader_type_index plus Option instead of spv_data
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    PipelineCache(const PipelineCache&); // copy construction is private, so outside code cannot copy a PipelineCache
+    PipelineCache& operator=(const PipelineCache&); // copy assignment is private for the same reason
+
+private:
+    PipelineCachePrivate* const d; // const pointer to the private implementation object
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/platform.h
new file mode 100644
index 0000000..89f3243
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/platform.h
@@ -0,0 +1,285 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1 // nonzero: NCNN_LOGE (end of this header) expands to real fprintf-based logging
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0 // 0: the body of simpleocv.h is compiled out
+#define NCNN_SIMPLEOMP 0 // 0: the body of simpleomp.h is compiled out
+#define NCNN_SIMPLESTL 0 // 0: the standard <algorithm>/<list>/<vector>/<string> are included instead of simplestl.h
+#define NCNN_THREADS 1 // nonzero: Mutex/ConditionVariable/Thread below wrap the native threading API instead of no-op stubs
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1 // combined with __ANDROID_API__ checks to enable Android-specific code paths
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 1 // nonzero: declarations guarded by #if NCNN_VULKAN are visible
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_AVX 1
+#define NCNN_XOP 1
+#define NCNN_FMA 1
+#define NCNN_F16C 1
+#define NCNN_AVX2 1
+#define NCNN_AVXVNNI 1
+#define NCNN_AVX512 1
+#define NCNN_AVX512VNNI 1
+#define NCNN_AVX512BF16 1
+#define NCNN_AVX512FP16 1
+#define NCNN_VFPV4 0
+#if __aarch64__ // the ARM extension switches below are only defined at all when targeting aarch64
+#define NCNN_ARM82 0
+#define NCNN_ARM82DOT 0
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#endif // __aarch64__
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1 // nonzero: NCNN_FORCEINLINE (end of this header) uses a compiler-specific always-inline spelling
+
+#define NCNN_VERSION_STRING "1.0.20221128"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR // NOTE(review): presumably consumed by <vulkan/vulkan.h> to expose Android-specific declarations - confirm against the Vulkan headers
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // exclusive lock built on a Windows SRWLOCK
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // no teardown call is made for the SRWLOCK
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes srwlock to the OS directly
+    // NOTE: SRWLOCK is available from Windows Vista onwards
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // condition variable built on a Windows CONDITION_VARIABLE
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {}
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // waits without timeout (INFINITE); the BOOL result is not checked
+    void broadcast() { WakeAllConditionVariable(&condvar); }
+    void signal() { WakeConditionVariable(&condvar); }
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // entry point handed to _beginthreadex; defined as a friend inside Thread
+class NCNN_EXPORT Thread
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // the new thread receives `this`, so the object must stay alive until the wrapper has read it
+    ~Thread() {} // neither waits for the thread nor closes the handle
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } // the handle is closed here, after the wait
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args;
+        t->_start(t->_args); // the start routine's void* result is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // one void* slot per thread, backed by a Win32 TLS index
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the result of TlsAlloc is stored without an error check
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // thin wrapper around pthread_mutex_t; pthread return codes are not checked
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // null attribute pointer: default mutex attributes
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes the raw pthread_mutex_t to pthread_cond_wait
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // thin wrapper around pthread_cond_t; pthread return codes are not checked
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // null attribute pointer: default condition attributes
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout and no predicate loop; callers re-check their own condition
+    void broadcast() { pthread_cond_broadcast(&cond); }
+    void signal() { pthread_cond_signal(&cond); }
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // starts a pthread in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // the return code of pthread_create is not checked
+    ~Thread() {} // neither joins nor detaches the thread
+    void join() { pthread_join(t, 0); } // the thread's return value is not retrieved
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // one void* slot per thread, backed by a pthread key
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // second argument 0: no per-thread destructor callback is registered
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // stub used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {}
+    void unlock() {}
+};
+
+class NCNN_EXPORT ConditionVariable // stub used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately without waiting
+    void broadcast() {}
+    void signal() {}
+};
+
+class NCNN_EXPORT Thread // stub used when NCNN_THREADS is 0
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the start routine is ignored and never called
+    ~Thread() {}
+    void join() {}
+};
+
+class NCNN_EXPORT ThreadLocalStorage // stub used when NCNN_THREADS is 0: a single plain pointer held in the object
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scope guard: locks the mutex on construction and unlocks it on destruction
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // held by reference, so the Mutex must outlive the guard
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO // NCNN_LOGE(fmt, ...): printf-style logging to stderr; expands to nothing when NCNN_STDIO is 0
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#include <stdio.h> // fprintf/stderr are used by the macro below and must not depend on what <android/log.h> happens to include
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <stdio.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else
+#define NCNN_LOGE(...)
+#endif
+
+#if NCNN_FORCE_INLINE // NCNN_FORCEINLINE: strongest inline request the compiler offers, plain `inline` otherwise
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang's predefined macro is lowercase; this branch is only reached when __GNUC__ is not also defined
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..55ede15
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleocv.h
@@ -0,0 +1,501 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar; // short aliases in the OpenCV naming style
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum // same numeric values as cv::ImreadModes declared later in this header
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum // same numeric value as cv::IMWRITE_JPEG_QUALITY declared later in this header
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v)
+{
+    return _Tp(v); // generic case: plain conversion from int, no clamping
+}
+template<>
+inline uchar saturate_cast<uchar>(int v)
+{
+    return (uchar)(v < 0 ? 0 : (v > UCHAR_MAX ? UCHAR_MAX : v)); // clamp into [0, UCHAR_MAX]
+}
+
+template<typename _Tp>
+struct Scalar_ // fixed array of four channel values of type _Tp
+{
+    Scalar_()
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // not explicit: a single _Tp converts implicitly; remaining channels are zero
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2) // fourth channel is zero
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    const _Tp operator[](const int i) const // no bounds check on i
+    {
+        return v[i];
+    }
+
+    _Tp operator[](const int i) // returns a copy, not a reference: channels cannot be written through operator[]
+    {
+        return v[i];
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2D point with coordinates of type _Tp
+{
+    Point_()
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const // implicit conversion; goes through saturate_cast, whose parameter is int, so fractional coordinates are dropped on the way
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // width/height pair of type _Tp
+{
+    Size_()
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const // implicit conversion; goes through saturate_cast, whose parameter is int, so fractional extents are dropped on the way
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as an origin (x, y) plus width and height
+{
+    Rect_()
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent taken from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const // implicit conversion; every field goes through saturate_cast, whose parameter is int
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area: width * height, computed in _Tp with no overflow check
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    // Intersection: every extent is derived from the operands' original values
+    // before a is overwritten.
+    const _Tp left = std::max(a.x, b.x);
+    const _Tp top = std::max(a.y, b.y);
+    const _Tp w = std::min(a.x + a.width, b.x + b.width) - left;
+    const _Tp h = std::min(a.y + a.height, b.y + b.height) - top;
+    // an empty or inverted overlap collapses to the all-zero rectangle
+    return a = (w <= 0 || h <= 0) ? Rect_<_Tp>() : Rect_<_Tp>(left, top, w, h);
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    // Bounding box of both rectangles, computed from the unmodified operands.
+    const _Tp left = std::min(a.x, b.x);
+    const _Tp top = std::min(a.y, b.y);
+    const _Tp w = std::max(a.x + a.width, b.x + b.width) - left;
+    const _Tp h = std::max(a.y + a.height, b.y + b.height) - top;
+    return a = Rect_<_Tp>(left, top, w, h);
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a.x, a.y, a.width, a.height); // scratch copy of a; the operands stay untouched
+    return overlap &= b;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> bounds(a.x, a.y, a.width, a.height); // scratch copy of a; the operands stay untouched
+    return bounds |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // these values are stored in Mat::c, which Mat uses as bytes per pixel
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE(review): same value as CV_8UC4, so Mat::type() cannot tell the two apart
+
+struct NCNN_EXPORT Mat // minimal reference-counted byte image: rows x cols pixels, c bytes per pixel
+{
+    Mat()
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags)
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy: shares the pixel buffer with m and bumps the reference count
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-owned memory: refcount stays NULL, so release() never frees it
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: shares m's buffer; m's count is raised before our own buffer is released
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fills every pixel with the first c channels of s
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drops the current buffer, then allocates rows*cols*flags bytes plus the counter
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // the reference counter lives right behind the pixels; round the pixel size up to a multiple of 4 so it is aligned
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize);
+            *refcount = 1;
+        }
+    }
+
+    void release() // frees the buffer when this was the last owner, then resets every field
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a freshly allocated buffer
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const
+    {
+        return c;
+    }
+
+    size_t total() const // size of the pixel data in bytes
+    {
+        return (size_t)cols * rows * c; // widened before multiplying: the plain int product overflows for large images
+    }
+
+    const uchar* ptr(int y) const // start of row y; no bounds check
+    {
+        return data + (ptrdiff_t)y * cols * c; // offset computed in ptrdiff_t to avoid int overflow
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + (ptrdiff_t)y * cols * c; // offset computed in ptrdiff_t to avoid int overflow
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // NOTE(review): the offset is counted in _Tp elements, not bytes
+    {
+        return (const _Tp*)data + (ptrdiff_t)y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y) // NOTE(review): the offset is counted in _Tp elements, not bytes
+    {
+        return (_Tp*)data + (ptrdiff_t)y * cols * c;
+    }
+
+    // roi: returns a deep copy of the region; roi is not validated against rows/cols
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // NULL when data points to user-allocated memory
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c;
+};
+
+enum ImreadModes // same numeric values as the CV_LOAD_IMAGE_* constants at the top of this header
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR); // declaration only; flags defaults to IMREAD_COLOR
+
+enum ImwriteFlags // same numeric value as CV_IMWRITE_JPEG_QUALITY at the top of this header
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>()); // params defaults to an empty vector
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1 // NOTE(review): presumably passed as `thickness` to request a filled shape - confirm against the implementation
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1); // overload taking two corner points
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1); // overload taking a Rect
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0 // the only font face constant declared here
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine); // baseLine is an output pointer parameter
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads(); // this and the following are C-linkage declarations only; no definition is visible in this header
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime(); // NOTE(review): kmp_ prefix - presumably the LLVM OpenMP runtime extension of the same name; confirm
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions (declared only when NCNN_SIMPLESTL is nonzero; otherwise <new> is included instead)
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// sized deallocation functions, declared only for C++14 and later
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T>
+const T& max(const T& a, const T& b) // returns a reference to one of its arguments; needs only operator<
+{
+    return (a < b) ? b : a; // when neither is smaller, a is returned
+}
+
+template<typename T>
+const T& min(const T& a, const T& b) // returns a reference to one of its arguments; needs only operator<, like max above
+{
+    return (b < a) ? b : a; // same result as (a > b) ? b : a, without requiring operator>; a is returned on ties
+}
+
+template<typename T>
+void swap(T& a, T& b) // exchanges a and b using one copy construction and two copy assignments
+{
+    T temp(a);
+    a = b;
+    b = temp;
+}
+
+template<typename T1, typename T2>
+struct pair // two public members, first and second
+{
+    pair()
+        : first(), second() // both members are value-initialized
+    {
+    }
+    pair(const T1& t1, const T2& t2)
+        : first(t1), second(t2)
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y) // equal when both members compare equal
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y) // lexicographic: first decides, second breaks ties; uses only operator< of the members
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) // defined through operator==
+{
+    return !(x == y);
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) // this and the two below are defined through operator<
+{
+    return y < x;
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(y < x);
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // builds a pair by copying both arguments
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T>
+struct node // doubly linked list node holding one T by value
+{
+    node* prev_;
+    node* next_;
+    T data_;
+
+    node()
+        : prev_(0), next_(0), data_() // unlinked node with a value-initialized payload
+    {
+    }
+    node(const T& t)
+        : prev_(0), next_(0), data_(t) // unlinked node holding a copy of t
+    {
+    }
+};
+
+template<typename T>
+struct iter_list // bidirectional iterator: a thin wrapper around a node<T> pointer
+{
+    iter_list()
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list()
+    {
+    }
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    T& operator*() const // const-qualified so a const iterator can be dereferenced; the element itself stays mutable
+    {
+        return curr_->data_;
+    }
+    T* operator->() const
+    {
+        return &(curr_->data_);
+    }
+
+    bool operator==(const iter_list& i) const // const-qualified so const iterators and temporaries can be compared
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i) const
+    {
+        return curr_ != i.curr_;
+    }
+
+    iter_list& operator++() // pre-increment only; no check for a null curr_
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--() // pre-decrement only; no check for a null curr_
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_;
+};
+
+template<typename T>
+struct list // doubly linked list that always keeps one extra sentinel node at tail_
+{
+    typedef iter_list<T> iterator;
+
+    list()
+    {
+        head_ = new node<T>(); // sentinel node; requires T to be default-constructible
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // after clear() head_ is the sentinel again
+    }
+    list(const list& l)
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+
+    list& operator=(const list& l)
+    {
+        if (this == &l)
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+
+    void clear()
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+
+    void pop_front() // no-op on an empty list
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_;
+            delete head_->prev_; // the old first node
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const // returns a mutable iterator even on a const list
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // iterator to the sentinel node
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+
+    void push_back(const T& t)
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // head_ pointed at the sentinel until now, so nothing is lost
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t); // linked in just before the sentinel
+            temp->prev_ = tail_->prev_;
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+
+    iter_list<T> erase(iter_list<T> pos) // returns the iterator following the erased element; erase(end()) does nothing
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos;
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // step back, unlink temp, then step forward onto temp's successor
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos;
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_; // first element, or the sentinel when the list is empty
+    node<T>* tail_; // always the sentinel
+    size_t count_; // number of stored elements, sentinel excluded
+};
+
+template<typename T>
+struct greater // function object forwarding to operator>
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // function object forwarding to operator<
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y);
+    }
+};
+
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // [TODO] heap sort should be used here; for now each pass bubbles the next-smallest element of [i, last) down to i
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // [first, i) is already final, so the pass can stop at i instead of first
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+            {
+                swap(*j, *(j - 1));
+            }
+        }
+    }
+}
+
+template<typename T>
+struct vector // minimal dynamic array: elements live in a raw char buffer and are placement-constructed
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T()) // builds new_size copies of value
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v) // default-fills v.size() slots, then assigns element by element
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v) // self-assignment: nothing to do
+        {
+            return *this;
+        }
+        resize(0); // destroy the current elements first
+        resize(v.size()); // default-fill to the new size, then assign below
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grow with copies of value, or destroy the surplus elements
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value); // copy-construct into raw storage
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroys every element and frees the buffer
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // the buffer was allocated as char[] in try_alloc
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const // no bounds check; returns a mutable reference even through a const vector
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const // one past the last element
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t; try_alloc may move the buffer first
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // inserts the range [b, e) in front of pos
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            // b points into this vector's own storage: read from a copy, because try_alloc below may replace data_
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin(); // remember pos as an offset; the buffer may be reallocated
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); // open a gap by shifting the tail bytes up
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b; // NOTE(review): assignment, not construction, into the gap — fine for trivial T, confirm for others
+            pos++;
+            b++;
+        }
+        delete v; // null unless the aliasing copy was made
+    }
+
+    T* erase(T* pos) // destroys *pos, closes the gap with memmove and returns pos
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_; // raw buffer holding the elements; 0 when nothing is allocated
+    size_t size_; // number of constructed elements
+    size_t capacity_; // number of T slots in data_
+    void try_alloc(size_t new_size) // replaces the buffer with one of 2 * new_size slots when the test below holds
+    {
+        if (new_size * 3 / 2 > capacity_ / 2) // NOTE(review): capacity_ is 2 * new_size after an allocation, so any larger request passes this test again and reallocates — confirm intended
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // the whole new buffer starts zero-filled
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_); // elements are relocated bytewise, not copy-constructed
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+// Character string on top of vector<char>; an empty string (data_ == 0) is read as "" rather than a null pointer.
+struct NCNN_EXPORT string : public vector<char>
+{
+    string() {} // empty string: no storage is allocated
+    string(const char* str) // copies strlen(str) characters; str must not be null
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) // resize(0) can leave data_ null, which memcpy must not receive
+            memcpy(data_, str, len);
+    }
+    const char* c_str() const // never returns null
+    {
+        return data_ ? (const char*)data_ : "";
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0;
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends the characters of str1
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string joined(str1); // start from a copy of the left operand
+    joined += str2;      // operator+= inserts str2's characters at the end
+    return joined;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..e7a7e8e
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,251 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix the build with an old Vulkan SDK
+
+#if VK_HEADER_VERSION < 70 // fallback declarations, compiled only when vulkan.h reports a header version below 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits // single-bit values; combined in VkSubgroupFeatureFlags below
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags; // mask type for the bits above
+typedef struct VkPhysicalDeviceSubgroupProperties // field order and types must match the official Vulkan header
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; // KHR-suffixed alias of the same struct
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR; // KHR-suffixed alias of the same struct
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80 // fallback declarations, compiled only when vulkan.h reports a header version below 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR // three VkBool32 feature switches
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR // element type of VkRenderPassCreateInfo2KHR::pAttachments
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR // pointed to by the attachment arrays of VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR // element type of VkRenderPassCreateInfo2KHR::pSubpasses
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR // element type of VkRenderPassCreateInfo2KHR::pDependencies
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR // argument of PFN_vkCreateRenderPass2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR // argument of the begin/next render-pass entry points below
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR // carries only sType and pNext
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95 // fallback declarations, compiled only when vulkan.h reports a header version below 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR // two VkBool32 feature switches
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97 // fallback declarations, compiled only when vulkan.h reports a header version below 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT // two arrays with one entry per possible memory heap
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // fallback declarations, compiled only when vulkan.h reports a header version below 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV // type of the AType..DType fields of VkCooperativeMatrixPropertiesNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV // note that the value 4 is not assigned to any enumerator here
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV // element type filled by PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV // two VkBool32 feature switches
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release"
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) # adds RELEASE to the configurations of the imported target
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn ) # consumed by the existence check at the end of ncnn.cmake
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" ) # NOTE(review): the repository .gitignore lists *.so, so this file is not tracked — confirm how it is provisioned
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..53b9fae
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH) # policy changes below are undone by the matching POP before every exit from this file
+cmake_policy(VERSION 2.8.3...3.23)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn) # ncnn is the only target in this export set
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets) # everything already imported: clean up and leave early
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "") # only some targets exist already: refuse to continue
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) # directory containing this file
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) # each further call strips one more directory level
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;Vulkan::Vulkan;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration.
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake") # matches the per-configuration files next to this one, e.g. ncnn-release.cmake
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..abb2dd6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON) # the five switches below select which dependencies this config file looks up
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN ON)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP) # not REQUIRED: a missing OpenMP package does not stop configuration here
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB) # skipped in this package because NCNN_SHARED_LIB is ON above
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "")
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) # defines the imported ncnn target
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..2ae00de
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20221128-android-vulkan-shared/x86_64/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20221128
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// we have some optimized kernels that may overread buffer a bit in loop
+// it is common to interleave next-loop data load with arithmetic instructions
+// allocating more bytes keeps us safe from SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds ptr up to the next address that is a multiple of n.
+// n must be a power of two; it defaults to sizeof(_Tp).
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    const size_t bumped = (size_t)ptr + n - 1; // push past any partially used unit
+    return (_Tp*)(bumped & -n);                // then clear the low bits
+}
+
+// Rounds a byte count up to a multiple of n.
+// Returns the smallest value that is >= sz and divisible by n.
+// sz is the byte count to round; n is the alignment and must be a power of two.
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    const size_t padded = sz + n - 1;
+    return padded & -n;
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // NCNN_MALLOC_ALIGN-aligned allocation; release with fastFree; yields 0 on failure
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // NOTE(review): unlike the other branches this one does not add NCNN_MALLOC_OVERREAD — confirm intended
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0; // a non-zero return means failure: report it as a null pointer
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // extra room for the stashed pointer, alignment slack and over-read padding
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
+    adata[-1] = udata; // keep the raw malloc pointer just below the aligned block so fastFree can recover it
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr)
+{
+    // Releases a block obtained from fastMalloc; a null pointer is ignored.
+    if (!ptr)
+        return;
+#if _MSC_VER
+    _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    free(ptr);
+#else
+    unsigned char* udata = ((unsigned char**)ptr)[-1];
+    free(udata);
+#endif
+}
+
+#if NCNN_THREADS
+// exchange-add for reference counters: NCNN_XADD(addr, delta) adds delta to *addr and yields the previous value
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, not atomic
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: no atomic primitive was selected above
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta) // threads disabled: plain read-modify-write
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract allocation interface; subclasses implement the two pure virtuals
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // returns a block of at least size bytes
+    virtual void fastFree(void* ptr) = 0; // hands back a block obtained from fastMalloc
+};
+
+class PoolAllocatorPrivate;
+class NCNN_EXPORT PoolAllocator : public Allocator // Allocator implementation with tunable pooling; state lives behind d
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // implements Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // implements Allocator::fastFree
+
+private:
+    PoolAllocator(const PoolAllocator&); // declared private: instances cannot be copied by outside code
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+class UnlockedPoolAllocatorPrivate;
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator // same public interface as PoolAllocator; presumably the variant without internal locking — implementation not shown
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // implements Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // implements Allocator::fastFree
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&); // declared private: instances cannot be copied by outside code
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // plain record describing one buffer allocation returned by VkAllocator::fastMalloc(size_t)
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally
+    mutable VkAccessFlags access_flags; // mutable so it can change through a const object
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // plain record describing one image allocation returned by VkAllocator::fastMalloc(w, h, c, ...)
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally
+    mutable VkAccessFlags access_flags; // mutable so it can change through a const object
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialize and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base for the Vulkan allocators; the buffer and image fastMalloc/fastFree pairs are pure virtual
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer allocation of size bytes
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image allocation overload
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev; // device passed to the constructor — presumably stored as-is; definition not shown
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); // helpers available to subclasses
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate;
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // default preferred block size: 16M bytes
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&); // copy construction is private: callers cannot copy an allocator
+    VkBlobAllocator& operator=(const VkBlobAllocator&); // copy assignment is private as well
+
+private:
+    VkBlobAllocatorPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class VkWeightAllocatorPrivate;
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // default preferred block size: 8M bytes
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&); // copy construction is private: callers cannot copy an allocator
+    VkWeightAllocator& operator=(const VkWeightAllocator&); // copy assignment is private as well
+
+private:
+    VkWeightAllocatorPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class VkStagingAllocatorPrivate;
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio, valid range 0 ~ 1
+    // default scr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&); // copy construction is private: callers cannot copy an allocator
+    VkStagingAllocator& operator=(const VkStagingAllocator&); // copy assignment is private as well
+
+private:
+    VkStagingAllocatorPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class VkWeightStagingAllocatorPrivate;
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides of the four pure-virtual VkAllocator entry points
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&); // copy construction is private: callers cannot copy an allocator
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&); // copy assignment is private as well
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator // only declared when NCNN_PLATFORM_API is set and __ANDROID_API__ >= 26
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // overrides of the four pure-virtual VkAllocator entry points
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&); // copy construction is private: callers cannot copy an allocator
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&); // copy assignment is private as well
+
+public:
+    int init(); // NOTE(review): presumably must be called before the accessors below; confirm against the implementation
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb;
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get the current timestamp in milliseconds
+NCNN_EXPORT double get_current_time();
+
+// sleep for the given number of milliseconds (defaults to 1000)
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // construct an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (only present when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as its output
+    int producer;
+    // index of the layer that needs this blob as its input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis;
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) ((X) | ((Y) << 16)) /* X in the low bits, Y shifted left by 16; arguments are parenthesized so expression arguments keep their own precedence */
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis;
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis;
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t /* layer handle: an opaque pointer plus a table of function pointers */
+{
+    void* pthis; /* NOTE(review): presumably the wrapped C++ layer object; confirm against c_api.cpp */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* single bottom blob in, single top blob out */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* arrays of blobs; presumably n bottom blobs and n2 top blobs */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* one blob used as both input and output */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* array of blobs used as both input and output */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t /* one custom-layer factory entry; entries chain through next */
+{
+    ncnn_layer_creator_t creator;     /* called with userdata, returns a layer handle */
+    ncnn_layer_destroyer_t destroyer; /* called with a layer handle and userdata */
+    void* userdata;                   /* opaque pointer passed to creator and destroyer */
+    ncnn_net_custom_layer_factory_t next; /* handle of another factory entry; NOTE(review): presumably NULL at the end of the chain, confirm */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis;
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory;
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT    0
+#define NCNN_BORDER_REPLICATE   1
+#define NCNN_BORDER_REFLECT     2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute // records upload/download/clone/pipeline commands for a VulkanDevice and submits them
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt); // record_clone is overloaded for every Mat / VkMat / VkImageMat source-destination pair except Mat to Mat
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // buffer bindings only
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); // image bindings only
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // mixed buffer and image bindings; the three overloads differ in dispatcher type
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait();
+
+    int reset();
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev;
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer // records Mat uploads to VkMat / VkImageMat for a VulkanDevice and submits them
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait();
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    VkTransferPrivate* const d; // pointer to the implementation type, which is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of cpu indices with a platform-specific representation (see the members below)
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public: // storage: which member exists depends on the target platform; other platforms get no data member
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif // _WIN32 && !__MINGW32__
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // cpu_set_t comes from <sched.h>, included above for these platforms
+#endif // __ANDROID__ || __linux__
+#if __APPLE__
+    unsigned int policy;
+#endif // __APPLE__
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper: base class whose virtual methods are overridden by the DataReaderFrom* classes in this header
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (only declared when NCNN_STRING is enabled)
+    // returns 1 if the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data into buf
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get a reference to model data through *buf (NOTE(review): presumably without copying - confirm)
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // only forward-declared here
+class NCNN_EXPORT DataReaderFromStdio : public DataReader
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp); // NOTE(review): whether the reader takes ownership of / closes fp is not visible here
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not overridden by this class
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a reader
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // private implementation pointer
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // only forward-declared here
+class NCNN_EXPORT DataReaderFromMemory : public DataReader
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // takes the pointer by reference; NOTE(review): presumably advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const; // overridden here, unlike in the stdio and Android asset readers
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a reader
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // private implementation pointer
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // only forward-declared here
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset); // AAsset is declared by <android/asset_manager.h>, included above under the same guards
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not overridden by this class
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a reader
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // private implementation pointer
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create the VkInstance and initialize the objects needed for GPU computation:
+// creates the VkInstance object, checks the extensions supported by the Vulkan instance,
+// initializes and creates the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance();
+
+// Get global VkInstance variable
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy the VkInstance object and free the memory of the associated objects
+// Usually called at program exit, e.g. from a destructor
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // only forward-declared here
+class NCNN_EXPORT GpuInfo
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // device type returned by type():
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu, 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family indices and queue counts for compute, graphics and transfer
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // known-bug flags (a bug is not a feature)
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes a bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // extension capability (these return int, not bool; NOTE(review): meaning of nonzero values is not visible here)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a GpuInfo
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // grants create_gpu_instance() access to the private members
+    GpuInfoPrivate* const d; // private implementation pointer
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator; // forward declarations for types used only by pointer or reference below
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate;
+class NCNN_EXPORT VulkanDevice
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index());
+    ~VulkanDevice();
+
+    const GpuInfo& info; // reference member, so it is bound once at construction
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helpers for creating a pipeline; results are written through the trailing pointer argument
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // paired with reclaim_queue()
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocators on this device; each acquire_* is paired with a reclaim_*
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer and dummy images
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operators: one convert_packing overload per buffer/image source and destination combination
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (this and the groups below are public function-pointer members, grouped by extension)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_buffer_device_address
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
+    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
+    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+    // VK_EXT_buffer_device_address
+    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension; NOTE(review): presumably fills the PFN_* members above - confirm in the implementation
+    int init_device_extension();
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a VulkanDevice
+    VulkanDevice(const VulkanDevice&);
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // private implementation pointer
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv; this is the output parameter type of resolve_shader_info()
+class NCNN_EXPORT ShaderInfo
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // binding type codes stored in binding_types:
+    // 0 = null, 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // fixed capacity of 16 entries; upstream note: "16 is large enough I think ..."
+
+    int reserved_0; // reserved_0..reserved_3: no meaning is documented in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer.h
new file mode 100644
index 0000000..f0418a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer.h
@@ -0,0 +1,222 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer
+{
+public:
+    // default constructor
+    Layer();
+    // virtual destructor, so a derived layer can be deleted through a Layer*
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // support_reserved_*: placeholder flags with no meaning documented in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set; NOTE(review): presumably a bitmask of disabled features - confirm
+    int featmask;
+
+public:
+    // implement inference on host Mat blobs (vector-of-blobs and single-blob overloads)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference on host Mat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint for the input (bottom) and output (top) blobs
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function types: a creator takes a void* and returns a Layer*; a destroyer takes the Layer* and a void*
+typedef Layer* (*layer_creator_func)(void*);
+typedef void (*layer_destroyer_func)(Layer*, void*);
+
+struct layer_registry_entry // registry record: creator only, no destroyer or userdata field
+{
+#if NCNN_STRING
+    // layer type name (field exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // registry record with creator, destroyer and userdata
+{
+#if NCNN_STRING
+    // layer type name (field exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the void* handed to creator and destroyer - confirm
+};
+
+struct overwrite_builtin_layer_registry_entry // keyed by type index instead of by name
+{
+    // layer type index
+    int typeindex;
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the void* handed to creator and destroyer - confirm
+};
+
+#if NCNN_STRING
+// get layer type index from type name
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type
+NCNN_EXPORT Layer* create_layer(int index);
+// DEFINE_LAYER_CREATOR(name) defines name_layer_creator(void*), which returns `new name`; its userdata argument is unused
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+// DEFINE_LAYER_DESTROYER(name) defines name_layer_destroyer(Layer*, void*), which deletes the layer; userdata is unused
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType {
+enum LayerShaderType // enumerators are pulled in from the cmake-generated layer_shader_type_enum.h
+{
+#include "layer_shader_type_enum.h"
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..aac8803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,5 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType {
+enum LayerType // built-in enumerators are pulled in from the cmake-generated layer_type_enum.h
+{
+#include "layer_type_enum.h"
+    CustomBit = (1 << 8), // 256, presumably flags custom (non built-in) layer type indices - TODO confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..97153ed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/layer_type_enum.h
@@ -0,0 +1,109 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+CumulativeSum = 98,
+CopyTo = 99,
+Erf = 100,
+Diag = 101,
+CELU = 102,
+Shrink = 103,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the multi-dimension matrix, holds up to four extents (w, h, d, c) and a reference counted data pointer
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec, w elements of elemsize bytes
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image, w x h
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim, w x h x c
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube, w x h x d x c
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec, elempack is the packed count inside one element
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy
+    Mat(const Mat& m);
+    // external vec, built on the caller supplied data pointer
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all, overloads below cover scalar values and the simd vector types of the target architecture
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like, taking the layout from another Mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like, taking the layout from a VkMat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like, taking the layout from a VkImageMat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference, the non-template row() overloads return float pointers
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference, the first argument is the start position and the second is the count
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16,
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data, type is a PixelType value
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data, type is a PixelType value
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+#if NCNN_VULKAN
+
+// the multi-dimension matrix, vulkan version, data lives in a VkBufferMemory device buffer
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec, w elements of elemsize bytes
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image, w x h
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim, w x h x c
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube, w x h x d x c
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec, elempack is the packed count inside one element
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec, built on the caller supplied VkBufferMemory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like, taking the layout from a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like, taking the layout from another VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like, taking the layout from a VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped, access the buffer as a Mat or as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference to the underlying vulkan buffer
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep;
+};
+
+class NCNN_EXPORT VkImageMat // vulkan image version of the matrix, data lives in a VkImageMemory device image
+{
+public:
+    // empty
+    VkImageMat();
+    // vec, w elements of elemsize bytes
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image, w x h
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim, w x h x c
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube, w x h x d x c
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec, elempack is the packed count inside one element
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec, built on the caller supplied VkImageMemory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like, taking the layout from a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like, taking the layout from a VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like, taking the layout from another VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped, access the image memory as a Mat or as a raw pointer
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference to the underlying vulkan image and image view
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value type for vulkan specialization constant, the same storage viewed as int, float or uint32_t
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type // value type for vulkan push constant, the same storage viewed as int or float
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize from srcw x srch to w x h, the cN suffix is presumably the channel count - TODO confirm
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameters, srcstride for src and stride for dst
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the from type (the orientation number in the diagram below), 6 means rotating from 6 to 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate from srcw x srch to w x h
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameters, srcstride for src and stride for dst
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenient wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to bfloat16 (brain half) by keeping the upper 16 bits of the float bit pattern
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float through its raw bit pattern
+    union
+    {
+        float f;
+        unsigned int u;
+    } bits;
+    bits.f = value;
+
+    // 16 : 16 split - the low half is dropped (plain truncation, no rounding step)
+    const unsigned int high_half = bits.u >> 16;
+    return high_half;
+}
+// convert bfloat16 (brain half) to float by placing the 16 bits in the upper half of a
+// float bit pattern; the lower 16 bits of the result pattern are zero
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    // Widen to unsigned before shifting: an unsigned short is otherwise promoted to
+    // signed int, and shifting a set bit 15 into the sign bit is undefined behaviour
+    // under older C++ standards.
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+// Border fill modes.
+// NOTE(review): presumably the values accepted by the `type` argument of
+// copy_make_border()/copy_make_border_3d() declared right below - confirm against the implementation.
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233, // same -233 value the warpaffine comments in this header name for a transparent border
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+// Default constructor: an empty Mat - null data, null reference counter, all sizes zero.
+NCNN_FORCEINLINE Mat::Mat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+// Allocating constructors (1-D to 4-D, elempack not specified).
+// Each one zero-initializes every field exactly like the default constructor and then
+// forwards its arguments unchanged to the create() overload with the same parameter list.
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// Allocating constructors (1-D to 4-D) that additionally take an explicit _elempack;
+// same pattern as above, forwarding to the create() overloads that accept _elempack.
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// Copy constructor: a shallow copy that points at m's buffer and shares its reference
+// counter; the counter is incremented when there is one.
+NCNN_FORCEINLINE Mat::Mat(const Mat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    // same effect as addref(): a null counter means there is nothing to count
+    if (refcount)
+    {
+        NCNN_XADD(refcount, 1);
+    }
+}
+
+// External-data constructors (1-D to 4-D, elempack fixed to 1).
+// They wrap the caller-provided _data pointer; refcount stays 0, and release() only
+// frees a buffer when refcount is non-null, so the wrapped memory is never freed by Mat.
+// cstep is w for 1-D, w*h for 2-D, and for 3-D/4-D the per-channel size rounded up so
+// that each channel spans a multiple of 16 bytes.
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// External-data constructors (1-D to 4-D) with an explicit _elempack; otherwise identical
+// to the four above.
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// Destructor: gives up this Mat's reference to its buffer via release().
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    this->release();
+}
+
+// Fill the buffer with a float value, treating data as total() consecutive floats.
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+
+    int n = 0;
+#if __ARM_NEON
+    // NEON path: broadcast the value once, then write four floats per iteration
+    const float32x4_t _splat = vdupq_n_f32(_v);
+    while (n + 3 < count)
+    {
+        vst1q_f32(dst, _splat);
+        dst += 4;
+        n += 4;
+    }
+#endif // __ARM_NEON
+    // scalar loop: the leftover tail after the NEON path, or everything without NEON
+    while (n < count)
+    {
+        *dst = _v;
+        dst++;
+        n++;
+    }
+}
+
+// Fill the buffer with an int value, treating data as total() consecutive ints.
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    int* dst = (int*)data;
+    const int count = (int)total();
+
+    int n = 0;
+#if __ARM_NEON
+    // NEON path: broadcast the value once, then write four ints per iteration
+    const int32x4_t _splat = vdupq_n_s32(_v);
+    while (n + 3 < count)
+    {
+        vst1q_s32(dst, _splat);
+        dst += 4;
+        n += 4;
+    }
+#endif // __ARM_NEON
+    // scalar loop: the leftover tail after the NEON path, or everything without NEON
+    while (n < count)
+    {
+        *dst = _v;
+        dst++;
+        n++;
+    }
+}
+
+#if __ARM_NEON
+// NEON vector fills. Every overload stores its vector argument(s) total() times and
+// advances the destination by the width of what was stored on each iteration.
+
+// four floats per iteration
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1q_f32(dst, _v);
+    }
+}
+
+// four 16-bit values per iteration
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    unsigned short* dst = (unsigned short*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1_u16(dst, _v);
+    }
+}
+
+// four ints per iteration
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    int* dst = (int*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1q_s32(dst, _v);
+    }
+}
+
+// eight ints per iteration: _v0 goes to the first four, _v1 to the next four
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    int* dst = (int*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        vst1q_s32(dst, _v0);
+        vst1q_s32(dst + 4, _v1);
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// four half-precision values per iteration
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        vst1_f16(dst, _v);
+    }
+}
+
+// eight half-precision values per iteration
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        vst1q_f16(dst, _v);
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+// Store the 16-float AVX-512 vector total() times, advancing 16 floats per iteration
+// (unaligned store).
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 16)
+    {
+        _mm512_storeu_ps(dst, _v);
+    }
+}
+#endif // __AVX512F__
+// Store the 8-float AVX vector total() times, advancing 8 floats per iteration
+// (unaligned store). _i is ignored.
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 8)
+    {
+        _mm256_storeu_ps(dst, _v);
+    }
+}
+#endif // __AVX__
+// Store the 4-float SSE vector total() times, advancing 4 floats per iteration
+// (unaligned store).
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        _mm_storeu_ps(dst, _v);
+    }
+}
+// Store the 128-bit integer vector total() times, advancing eight 16-bit values
+// (16 bytes) per iteration.
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        // Unaligned store, like the float overloads: a Mat may wrap caller-provided
+        // memory, which carries no 16-byte alignment guarantee.
+        _mm_storeu_si128((__m128i*)ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+// Store the 4-float MSA vector total() times, advancing 4 floats per iteration.
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        __msa_st_w((v4i32)_v, dst, 0);
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+// Store the 4-float LSX vector total() times, advancing 4 floats per iteration.
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += 4)
+    {
+        __lsx_vst(_v, dst, 0);
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+// RISC-V vector fills. packn is cpu_riscv_vlenb() divided by the lane size in bytes
+// (presumably the number of lanes in one vector register - confirm against
+// cpu_riscv_vlenb()). Each overload stores _v total() times, advancing packn lanes
+// per iteration.
+
+// 32-bit float lanes
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(packn);
+
+    float* dst = (float*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += packn)
+    {
+        vse32_v_f32m1(dst, _v, vl);
+    }
+}
+
+// 16-bit unsigned lanes
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    unsigned short* dst = (unsigned short*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += packn)
+    {
+        vse16_v_u16m1(dst, _v, vl);
+    }
+}
+
+// 8-bit signed lanes
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    signed char* dst = (signed char*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += packn)
+    {
+        vse8_v_i8m1(dst, _v, vl);
+    }
+}
+#if __riscv_zfh
+// 16-bit half-precision lanes
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    __fp16* dst = (__fp16*)data;
+    const int count = (int)total();
+    for (int n = 0; n < count; n++, dst += packn)
+    {
+        vse16_v_f16m1(dst, _v, vl);
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+// Generic fill: treats data as total() consecutive values of type T and assigns _v to each.
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    T* dst = (T*)data;
+    const int count = (int)total();
+
+    int n = 0;
+    while (n < count)
+    {
+        dst[n] = _v;
+        n++;
+    }
+}
+
+// Copy assignment: shallow copy that shares m's buffer and reference counter.
+// Self-assignment is a no-op.
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    if (this != &m)
+    {
+        // Take the reference on m's buffer before dropping our own, so a buffer that
+        // both sides already share is never freed in between.
+        if (m.refcount)
+        {
+            NCNN_XADD(m.refcount, 1);
+        }
+        release();
+
+        // buffer and element description
+        data = m.data;
+        refcount = m.refcount;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+        allocator = m.allocator;
+
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    return *this;
+}
+
+// Increment the shared reference counter; does nothing when there is no counter.
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// Drop this Mat's reference and reset it to the empty state.
+// The buffer is freed (through the allocator if one is set, otherwise with fastFree)
+// only when a reference counter exists and the decrement reports 1.
+NCNN_FORCEINLINE void Mat::release()
+{
+    // The == 1 test treats the value returned by NCNN_XADD as the count before the
+    // decrement (TODO confirm against the macro definition), i.e. we held the last reference.
+    const bool last_owner = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_owner)
+    {
+        if (allocator)
+        {
+            allocator->fastFree(data);
+        }
+        else
+        {
+            fastFree(data);
+        }
+    }
+
+    // back to empty; note that the allocator pointer is left as it is
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    cstep = 0;
+}
+
+// True when there is no data pointer or total() is zero.
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// Number of elements spanned by the buffer: channel step times channel count.
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    const size_t count = cstep * c;
+    return count;
+}
+
+// Bits per packed lane: elemsize * 8 divided by elempack, or 0 when elempack is 0.
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    if (!elempack)
+        return 0;
+
+    return static_cast<int>(elemsize * 8) / elempack;
+}
+
+// Return a Mat built around a null data pointer that carries only this Mat's shape,
+// with elempack multiplied into the outermost dimension (w for 1-D, h for 2-D,
+// c for 3-D and 4-D). Any other dims value yields a default-constructed Mat.
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// View of channel _c (no copy). The view starts cstep * _c elements into the buffer
+// and has one dimension fewer than this Mat. d is passed in the constructor's channel
+// slot, so for a 4-D Mat the view is w x h x d and its cstep is reset to w * h.
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+    {
+        view.cstep = (size_t)w * h;
+    }
+    return view;
+}
+
+// Const overload of channel(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+    {
+        view.cstep = (size_t)w * h;
+    }
+    return view;
+}
+
+// 2-D view (w x h, no copy) of depth slice z, starting w * h * z elements into data.
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// Const overload of depth(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// Pointer to the start of row y, i.e. w * y elements (of elemsize bytes each) into data,
+// typed as float.
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    unsigned char* base = (unsigned char*)data;
+    return (float*)(base + (size_t)w * y * elemsize);
+}
+
+// Const overload of row().
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    unsigned char* base = (unsigned char*)data;
+    return (const float*)(base + (size_t)w * y * elemsize);
+}
+
+// Same as row(), with the returned pointer typed as T.
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    unsigned char* base = (unsigned char*)data;
+    return (T*)(base + (size_t)w * y * elemsize);
+}
+
+// Const overload of the typed row().
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    unsigned char* base = (unsigned char*)data;
+    return (const T*)(base + (size_t)w * y * elemsize);
+}
+
+// View (no copy) of `channels` consecutive channels starting at channel _c.
+// The view is built with the 4-D constructor and then given this Mat's own dims.
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// Const overload of channel_range(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// View (no copy) of `depths` consecutive depth slices starting at slice z.
+// Built as a 3-D Mat (w x h x depths) whose cstep is overridden to w * h.
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// Const overload of depth_range(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * h * z * elemsize;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// 2-D view (w x rows, no copy) of `rows` consecutive rows starting at row y.
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * y * elemsize;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// Const overload of row_range(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    unsigned char* base = (unsigned char*)data + (size_t)w * y * elemsize;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// 1-D view (no copy) of n elements starting at element x.
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    unsigned char* base = (unsigned char*)data + x * elemsize;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// Const overload of range(); same view construction.
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    unsigned char* base = (unsigned char*)data + x * elemsize;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// Conversion to a typed pointer: the data pointer reinterpreted as T*.
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    T* typed = (T*)data;
+    return typed;
+}
+
+// Const conversion: the data pointer reinterpreted as const T*.
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    const T* typed = (const T*)data;
+    return typed;
+}
+
+// Element access: the i-th float of the buffer, with data read as a flat float array
+// (no bounds check).
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    float* values = (float*)data;
+    return values[i];
+}
+
+// Const overload of operator[].
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    const float* values = (const float*)data;
+    return values[i];
+}
+
+#if NCNN_VULKAN
+
+// Default constructor: an empty VkMat - null data, null reference counter, all sizes zero.
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+// Allocating constructors (1-D to 4-D, elempack not specified).
+// Each one zero-initializes every field exactly like the default constructor and then
+// forwards its arguments unchanged to the create() overload with the same parameter list.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// Allocating constructors (1-D to 4-D) that additionally take an explicit _elempack;
+// same pattern as above, forwarding to the create() overloads that accept _elempack.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// Copy constructor: a shallow copy that points at m's buffer and shares its reference
+// counter; the counter is incremented when there is one.
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref();
+}
+
+// External-buffer constructors (1-D to 4-D, elempack fixed to 1).
+// They wrap the caller-provided VkBufferMemory and leave refcount at 0.
+// cstep is w for 1-D, w*h for 2-D, and for 3-D/4-D the per-channel size rounded up so
+// that each channel spans a multiple of 16 bytes.
+// All size products are computed in size_t, matching the Mat constructors, so that
+// w * h (* d) cannot overflow int before being widened.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// External-buffer constructors (1-D to 4-D) with an explicit _elempack; otherwise
+// identical to the four above.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements, padded to a 16-byte multiple
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat() // destructor: gives up this object's reference through release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m) // copy assignment: rebinds *this to m's buffer (shared, not copied) and returns *this
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference first ...
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // ... then drop the old one, so a buffer shared by both sides is never freed in between
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const // Mat with this object's shape built on mapped_ptr(); empty Mat when no mapping is available
+{
+    if (!allocator || !data || !allocator->mappable) // null guards: release() nulls data, and allocator may be null too
+        return Mat();
+
+    if (dims == 1) // pick the Mat constructor matching the dimensionality
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // dims outside 1..4: nothing to expose
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const // address of this mat inside the mapped buffer memory, or 0 when it cannot be provided
+{
+    if (!allocator || !data || !allocator->mappable) // null guards: release() nulls data, and allocator may be null too
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset; // mapped base plus this mat's byte offset
+}
+
+NCNN_FORCEINLINE void VkMat::addref() // add one reference; does nothing when there is no counter
+{
+    if (refcount) // a null refcount means the buffer is not reference counted by this object
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release() // drop this object's reference and reset it to an empty state
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // NOTE(review): relies on NCNN_XADD yielding the pre-decrement value, i.e. 1 == last holder - confirm
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data); // hand the buffer memory back to the allocator
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0; // note: allocator is the one member that is not reset here
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const // true when no buffer is attached or total() is zero
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const // element count including per-channel padding: channel stride times channel count
+{
+    return cstep * c;
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const // bits per packed lane: elemsize bytes * 8 / elempack; 0 when elempack is 0
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0; // the ternary avoids dividing by a zero elempack
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const // data-less Mat (null pointer) carrying only the dimensions, with elempack folded into the last listed one
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0); // 1-D: w is scaled by elempack
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0); // 2-D: h is scaled by elempack
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0); // 3-D: c is scaled by elempack
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0); // 4-D: c is scaled by elempack
+
+    return Mat(); // dims outside 1..4
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const // Vulkan buffer handle stored in the attached buffer memory
+{
+    return data->buffer; // NOTE(review): data is dereferenced unchecked; callers must not call this on an empty VkMat
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const // offset recorded in the attached buffer memory (same field mapped_ptr() adds to the mapped base)
+{
+    return data->offset; // NOTE(review): data is dereferenced unchecked; callers must not call this on an empty VkMat
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const // capacity recorded in the attached buffer memory
+{
+    return data->capacity; // NOTE(review): data is dereferenced unchecked; callers must not call this on an empty VkMat
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat() // default constructor: every member zeroed, no image attached
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator) // 1-D: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) // 2-D: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) // 3-D: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator) // 4-D: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D with explicit elempack: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D with explicit elempack: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D with explicit elempack: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D with explicit elempack: start zeroed, then delegate to create()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m) // copy constructor: shares m's image and reference counter, no data copy
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // count this object as one more holder of the shared image
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 1-D view over caller-provided image memory, elempack 1; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 2-D view over caller-provided image memory, elempack 1; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 3-D view over caller-provided image memory, elempack 1; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) // 4-D view over caller-provided image memory, elempack 1; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 1-D view over caller-provided image memory, explicit elempack; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 2-D view over caller-provided image memory, explicit elempack; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 3-D view over caller-provided image memory, explicit elempack; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) // 4-D view over caller-provided image memory, explicit elempack; refcount stays null
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat() // destructor: gives up this object's reference through release()
+{
+    release();
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m) // copy assignment: rebinds *this to m's image (shared, not copied) and returns *this
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference first ...
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // ... then drop the old one, so an image shared by both sides is never freed in between
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const // Mat with this object's shape built on mapped_ptr(); empty Mat when no mapping is available
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // null guards first: release() nulls data, and allocator may be null too
+        return Mat();
+
+    if (dims == 1) // pick the Mat constructor matching the dimensionality
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // dims outside 1..4: nothing to expose
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const // address of this image inside the mapped memory, or 0 when it cannot be provided
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // null guards first: release() nulls data, and allocator may be null too
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // mapped base plus this image's bind offset in bytes
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref() // add one reference; does nothing when there is no counter
+{
+    if (refcount) // a null refcount means the image is not reference counted by this object
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release() // drop this object's reference and reset it to an empty state
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // NOTE(review): relies on NCNN_XADD yielding the pre-decrement value, i.e. 1 == last holder - confirm
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data); // hand the image memory back to the allocator
+        }
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0; // note: allocator is the one member that is not reset here
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const // true when no image is attached or total() is zero
+{
+    return data == 0 || total() == 0;
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const // element count: product of all four extents (no stride padding is involved)
+{
+    return (size_t)w * h * d * c; // widen before multiplying so the product cannot overflow int
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const // bits per packed lane: elemsize bytes * 8 / elempack; 0 when elempack is 0
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0; // the ternary avoids dividing by a zero elempack
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const // data-less Mat (null pointer) carrying only the dimensions, with elempack folded into the last listed one
+{
+    if (dims == 1)
+        return Mat(w * elempack, (void*)0); // 1-D: w is scaled by elempack
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0); // 2-D: h is scaled by elempack
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0); // 3-D: c is scaled by elempack
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0); // 4-D: c is scaled by elempack
+
+    return Mat(); // dims outside 1..4
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const // Vulkan image handle stored in the attached image memory
+{
+    return data->image; // NOTE(review): data is dereferenced unchecked; callers must not call this on an empty VkImageMat
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const // Vulkan image view handle stored in the attached image memory
+{
+    return data->imageview; // NOTE(review): data is dereferenced unchecked; callers must not call this on an empty VkImageMat
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin // base class for weight sources: every load() overload returns a Mat of the requested shape
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // values accepted by the "type" argument of every load() overload below:
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D vector of w elements
+    virtual Mat load(int w, int type) const;
+    // load a 2-D image of w x h elements
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob of w x h x c elements
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob of w x h x d x c elements
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin // ModelBin constructed from a DataReader
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr); // explicit: no implicit conversion from DataReader
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is redeclared in this class
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // copy operations are private: instances cannot be copied by client code
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to the forward-declared private implementation type
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin // ModelBin constructed from an array of weight Mats
+{
+public:
+    // construct from weight blob array
+    explicit ModelBinFromMatArray(const Mat* weights); // NOTE(review): no length is passed; the array's extent is a contract with the caller - confirm
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is redeclared in this class
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // copy operations are private: instances cannot be copied by client code
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to the forward-declared private implementation type
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE /* static build: both visibility macros expand to nothing */
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT /* a definition supplied by the includer takes precedence */
+#    ifdef ncnn_EXPORTS
+        /* We are building this library (same attribute as the consumer branch below) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT /* deprecated and exported */
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT /* deprecated and hidden */
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED: disabled, so NCNN_NO_DEPRECATED is never defined by this header */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net // a network: structure is loaded with load_param*, weights with load_model*, inference runs through an Extractor
+{
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    // get the gpu device currently associated with this net
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    // load network structure (plain param format) through a DataReader
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    // load network structure (binary param format) through a DataReader
+    int load_param_bin(const DataReader& dr);
+
+    // load network weight data through a DataReader
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (note: unlike the overloads above, not 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so external memory should be retained when used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (note: unlike the overloads above, not 0 on success)
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenient load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenient load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenient load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    // read-only access to the blob and layer lists
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    // mutable access to the blob and layer lists
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private:
+    Net(const Net&); // copy operations are private: a Net cannot be copied by client code
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d; // pointer to the forward-declared private implementation type
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor // per-inference handle: feed blobs with input(), fetch results with extract(); obtained from Net::create_extractor()
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // only Net::create_extractor() may call the protected constructor below
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the forward-declared private implementation type
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // plain bag of public inference settings; Option() fills in the defaults
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // NOTE(review): undocumented in this header; presumably a gpu shader packing switch - confirm
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // NOTE(review): undocumented; by its name a placeholder slot - confirm
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // NOTE(review): undocumented in this header - confirm semantics before relying on it
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7; // NOTE(review): use_reserved_7..11 are undocumented; by their names placeholder slots - confirm
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // layer parameters addressed by integer id; values are int, float or array (Mat)
+{
+public:
+    // empty
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get the type code stored for id
+    int type(int id) const;
+
+    // get int for id; def is the fallback value (presumably returned when id is unset - confirm)
+    int get(int id, int def) const;
+    // get float for id; def is the fallback value
+    float get(int id, float def) const;
+    // get array for id; def is the fallback value
+    Mat get(int id, const Mat& def) const;
+
+    // set int for id
+    void set(int id, int i);
+    // set float for id
+    void set(int id, float f);
+    // set array for id
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // lets Net reach the protected members below
+
+    void clear();
+
+    int load_param(const DataReader& dr); // load from the plain param format
+    int load_param_bin(const DataReader& dr); // load from the binary param format
+
+private:
+    ParamDictPrivate* const d; // pointer to the forward-declared private implementation type
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // only declared when NCNN_VULKAN is enabled (see the surrounding #if)
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public: // configuration and creation
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); // all three arguments default to 4
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // create from caller-supplied shader data
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // create from a shader index plus Option
+
+public: // read-only accessors, each paired with a protected setter further down
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected: // setters reserved for derived classes
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // public data member holding a VulkanDevice pointer
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a Pipeline
+    Pipeline(const Pipeline&);
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // opaque implementation object; PipelinePrivate is only forward-declared in this header
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // private base: Pipeline's API is hidden from users of this class
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload that also takes a target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute may reach the privately inherited Pipeline members
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to;
+    int rotate_from;
+    bool need_resize; // NOTE(review): presumably set by the create() overload that takes a target size -- confirm in pipeline.cpp
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // only declared when NCNN_VULKAN is enabled (see the surrounding #if)
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // overload keyed by caller-supplied shader data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer and ShaderInfo& parameters from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // overload keyed by shader index plus Option
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // same output parameters as the overload above
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const; // writes the module and its ShaderInfo through the last two parameters
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout, // output parameters from here on
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy a PipelineCache
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // opaque implementation object; PipelineCachePrivate is only forward-declared in this header
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/platform.h
new file mode 100644
index 0000000..8c46058
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/platform.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1 // 1: NCNN_LOGE (defined later in this header) prints; 0 would make it expand to nothing
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0 // 0: the body of simpleocv.h is compiled out
+#define NCNN_SIMPLEOMP 0 // 0: the body of simpleomp.h is compiled out
+#define NCNN_SIMPLESTL 0 // 0: this header includes <algorithm>, <list>, <vector>, <string> instead of simplestl.h
+#define NCNN_SIMPLEMATH 0 // 0: this header includes <math.h> and <fenv.h> instead of simplemath.h
+#define NCNN_THREADS 1 // 1: Mutex / ConditionVariable / Thread / ThreadLocalStorage below wrap real OS primitives
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1 // part of the condition that routes NCNN_LOGE to the Android log
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 0 // 0: every "#if NCNN_VULKAN" section (e.g. Pipeline, PipelineCache) is compiled out
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_GNU_INLINE_ASM 1
+#define NCNN_AVX 0
+#define NCNN_XOP 0
+#define NCNN_FMA 0
+#define NCNN_F16C 0
+#define NCNN_AVX2 0
+#define NCNN_AVXVNNI 0
+#define NCNN_AVX512 0
+#define NCNN_AVX512VNNI 0
+#define NCNN_AVX512BF16 0
+#define NCNN_AVX512FP16 0
+#define NCNN_VFPV4 1
+#define NCNN_ARM82 1
+#define NCNN_ARM82DOT 1
+#define NCNN_ARM82FP16FML 1
+#define NCNN_ARM84BF16 1
+#define NCNN_ARM84I8MM 1
+#define NCNN_ARM86SVE 1
+#define NCNN_ARM86SVE2 1
+#define NCNN_ARM86SVEBF16 1
+#define NCNN_ARM86SVEI8MM 1
+#define NCNN_ARM86SVEF32MM 1
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1 // 1: NCNN_FORCEINLINE (end of this header) uses the compiler's force-inline spelling
+
+#define NCNN_VERSION_STRING "1.0.20231027" // NOTE(review): values in this block presumably must match the prebuilt library this header ships with -- do not edit by hand
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Win32 (non-MinGW) variant: exclusive lock built on an SRWLOCK
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // empty: nothing is released for the SRWLOCK here
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes &srwlock to the OS
+    // NOTE: SRWLock is available from Windows Vista onwards
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Win32 (non-MinGW) variant built on CONDITION_VARIABLE
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // empty: nothing is released for the CONDITION_VARIABLE here
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // no timeout (INFINITE), flags 0; the result is not checked
+    void broadcast() { WakeAllConditionVariable(&condvar); } // wake every waiter
+    void signal() { WakeConditionVariable(&condvar); } // wake a single waiter
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // forward declaration; the definition is the friend function inside Thread below
+class NCNN_EXPORT Thread
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // the new thread runs start_wrapper(this); the _beginthreadex result is not checked
+    ~Thread() {} // does not close the handle; only join() calls CloseHandle
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } // waits without timeout, then closes the handle
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args; // args is the Thread's this pointer passed by the constructor
+        t->_start(t->_args); // the void* result of the user routine is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Win32 (non-MinGW) variant: one TLS slot holding a void*
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the TlsAlloc result is not checked for failure
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key; // slot index returned by TlsAlloc
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread variant (non-Win32 or MinGW builds with NCNN_THREADS enabled)
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // attribute pointer 0; the return code is not checked
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); } // return code is not checked
+    void unlock() { pthread_mutex_unlock(&mutex); } // return code is not checked
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes &mutex to pthread_cond_wait
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread variant built on pthread_cond_t
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // attribute pointer 0; the return code is not checked
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout; the caller supplies the Mutex to wait with
+    void broadcast() { pthread_cond_broadcast(&cond); } // wake every waiter
+    void signal() { pthread_cond_signal(&cond); } // wake a single waiter
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread variant: the thread is started in the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // NOTE(review): the pthread_create result is not checked, so t is unset if creation fails
+    ~Thread() {} // neither joins nor detaches; the owner is expected to call join()
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded (second argument 0)
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread variant: one pthread key holding a void*
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // destructor callback 0: stored values are not cleaned up by the key
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // NCNN_THREADS disabled: same interface, every member is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // no-op
+    void unlock() {} // no-op
+};
+
+class NCNN_EXPORT ConditionVariable // NCNN_THREADS disabled: same interface, every member is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately; nothing is waited on
+    void broadcast() {} // no-op
+    void signal() {} // no-op
+};
+
+class NCNN_EXPORT Thread // NCNN_THREADS disabled: same interface, nothing is ever started
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the start routine is ignored and never called
+    ~Thread() {}
+    void join() {} // returns immediately
+};
+
+class NCNN_EXPORT ThreadLocalStorage // NCNN_THREADS disabled: a plain per-object pointer stands in for the TLS slot
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data; // the stored value; 0 until set() is called
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scope guard: locks the Mutex on construction, unlocks it on destruction
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } // keeps a reference, so the Mutex must outlive the guard
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex;
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO // NCNN_LOGE(fmt, ...): printf-style message to stderr plus a newline; a no-op when NCNN_STDIO is 0
+#include <stdio.h> // both variants below expand to fprintf(stderr, ...), so the include must not depend on the branch
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else // NCNN_STDIO
+#define NCNN_LOGE(...)
+#endif // NCNN_STDIO
+
+
+#if NCNN_FORCE_INLINE // NCNN_FORCEINLINE: the compiler's force-inline spelling, or plain inline as the fallback
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang predefines __clang__ (lowercase); __CLANG__ is not a compiler macro, so the old test never matched
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else // NCNN_FORCE_INLINE is 0
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplemath.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplemath.h
new file mode 100644
index 0000000..fd7fa69
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* discrete functions
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float);
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+NCNN_EXPORT float frac(float);
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float);
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float);
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+NCNN_EXPORT float erf(float);
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+NCNN_EXPORT int msb(unsigned int);
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+NCNN_EXPORT void fesetround(int);
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..54b22d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleocv.h
@@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+// Generic case: plain conversion of the int to _Tp, no clamping.
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) { return _Tp(v); }
+// uchar case: clamp into [0, UCHAR_MAX] before narrowing.
+template<>
+inline uchar saturate_cast<uchar>(int v)
+{
+    if (v < 0) return 0;
+    return (uchar)(v > UCHAR_MAX ? UCHAR_MAX : v);
+}
+
+// Fixed four-component value; components a constructor is not given are set to zero.
+template<typename _Tp>
+struct Scalar_
+{
+    // Every component zero.
+    Scalar_()
+    {
+        for (int k = 0; k < 4; k++)
+            v[k] = 0;
+    }
+    // First component from the argument, the remaining three zero.
+    Scalar_(_Tp _v0)
+    {
+        v[0] = _v0;
+        for (int k = 1; k < 4; k++)
+            v[k] = 0;
+    }
+    // First three components from the arguments, the fourth zero.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
+    {
+        v[0] = _v0; v[1] = _v1;
+        v[2] = _v2; v[3] = 0;
+    }
+    // All four components from the arguments.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0; v[1] = _v1;
+        v[2] = _v2; v[3] = _v3;
+    }
+
+    // Read component i by value; i is not range-checked.
+    const _Tp operator[](const int i) const
+    {
+        return *(v + i);
+    }
+
+    // Non-const overload; it also returns a copy, so it cannot be used to assign.
+    _Tp operator[](const int i)
+    {
+        return *(v + i);
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+// 2D point with coordinates of type _Tp.
+template<typename _Tp>
+struct Point_
+{
+    // Both coordinates zero.
+    Point_() : x(0), y(0) {}
+    // Point at (_x, _y).
+    Point_(_Tp _x, _Tp _y) : x(_x), y(_y) {}
+
+    // Convert to another coordinate type; each coordinate goes through saturate_cast<_Tp2>.
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const
+    {
+        const _Tp2 cx = saturate_cast<_Tp2>(x);
+        const _Tp2 cy = saturate_cast<_Tp2>(y);
+        return Point_<_Tp2>(cx, cy);
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+// Width/height pair of type _Tp.
+template<typename _Tp>
+struct Size_
+{
+    // Both dimensions zero.
+    Size_() : width(0), height(0) {}
+    // Size of _w by _h.
+    Size_(_Tp _w, _Tp _h) : width(_w), height(_h) {}
+
+    // Convert to another element type; each dimension goes through saturate_cast<_Tp2>.
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const
+    {
+        const _Tp2 cw = saturate_cast<_Tp2>(width);
+        const _Tp2 ch = saturate_cast<_Tp2>(height);
+        return Size_<_Tp2>(cw, ch);
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+// Rectangle stored as a corner (x, y) plus width and height, all of type _Tp.
+template<typename _Tp>
+struct Rect_
+{
+    // All four fields zero.
+    Rect_() : x(0), y(0), width(0), height(0) {}
+    // Rectangle from an explicit corner and extent.
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h) : x(_x), y(_y), width(_w), height(_h) {}
+    // Rectangle from a corner point and a size.
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size)
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height) {}
+
+    // Convert to another element type; every field goes through saturate_cast<_Tp2>.
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const
+    {
+        const _Tp2 cx = saturate_cast<_Tp2>(x), cy = saturate_cast<_Tp2>(y);
+        const _Tp2 cw = saturate_cast<_Tp2>(width), ch = saturate_cast<_Tp2>(height);
+        return Rect_<_Tp2>(cx, cy, cw, ch);
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // width * height, returned as _Tp
+    _Tp area() const
+    {
+        const _Tp extent = width * height;
+        return extent;
+    }
+};
+
+// Intersect a with b in place; a becomes the default (all-zero) rect when the overlap has no positive extent.
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    const _Tp left = std::max(a.x, b.x);
+    const _Tp top = std::max(a.y, b.y);
+    a.width = std::min(a.x + a.width, b.x + b.width) - left;
+    a.height = std::min(a.y + a.height, b.y + b.height) - top;
+    a.x = left; a.y = top; // the corner is written only after both extents were computed from the old one
+    if (a.width <= 0 || a.height <= 0) a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp> // grow a in place to the bounding box of a and b
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    const _Tp left = std::min(a.x, b.x);
+    const _Tp top = std::min(a.y, b.y);
+    a.width = std::max(a.x + a.width, b.x + b.width) - left;
+    a.height = std::max(a.y + a.height, b.y + b.height) - top;
+    a.x = left; a.y = top; // the corner is written only after both extents were computed from the old one
+    return a;
+}
+
+template<typename _Tp> // intersection of a and b as a new rect; neither operand is modified
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a);
+    return overlap &= b;
+}
+
+template<typename _Tp> // bounding box of a and b as a new rect; neither operand is modified
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> bounds(a);
+    return bounds |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // these flags are stored directly in Mat::c by the Mat constructors below
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE: same value as CV_8UC4, so the two cannot be told apart from Mat::c / Mat::type()
+
+struct NCNN_EXPORT Mat // minimal reference-counted image buffer; total() = cols * rows * c is used as the byte size throughout
+{
+    Mat()
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags)
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags); // create() begins with release(), then sets rows/cols/c and allocates
+    }
+
+    // copy: shares the pixel buffer and increments the shared refcount (no pixel copy)
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-owned memory: refcount stays 0, so release() never frees it
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: take a reference on m's buffer first, then drop the buffer currently held
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fill: writes s[0..c-1] into every pixel; a Scalar holds 4 values, so c must not exceed 4
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drops the current buffer and allocates a new one with refcount 1
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize); // the counter lives right after the padded pixel bytes
+            *refcount = 1;
+        }
+    }
+
+    void release() // frees the buffer when the last reference is dropped, then resets every field
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a freshly allocated Mat
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const // same value as channels(): the creation flag is stored as-is in c
+    {
+        return c;
+    }
+
+    size_t total() const
+    {
+        return cols * rows * c;
+    }
+
+    const uchar* ptr(int y) const // start of row y; rows are cols * c bytes apart, y is not range-checked
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp> // typed row pointer: the row offset is applied in bytes so it matches ptr(int) for every _Tp
+    const _Tp* ptr(int y) const
+    {
+        return (const _Tp*)(data + y * cols * c);
+    }
+
+    template<typename _Tp> // the previous form scaled the offset by sizeof(_Tp), overshooting the row for _Tp wider than one byte
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)(data + y * cols * c);
+    }
+
+    // roi: returns a deep copy of the region; roi is not checked against rows/cols
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c;
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+NCNN_EXPORT Mat imdecode(const std::vector<uchar>& buf, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+// Greater of a and b according to operator<; a is returned unless a < b.
+template<typename T> const T& max(const T& a, const T& b)
+{
+    return !(a < b) ? a : b;
+}
+
+// Lesser of a and b according to operator>; a is returned unless a > b.
+template<typename T> const T& min(const T& a, const T& b)
+{
+    return !(a > b) ? a : b;
+}
+
+// Exchanges a and b through one copy-constructed temporary and two copy assignments.
+template<typename T> void swap(T& a, T& b)
+{
+    T saved(a);
+    a = b;
+    b = saved;
+}
+
+// Minimal std::pair stand-in: a struct with two public members,
+// first and second, in that declaration order.
+template<typename T1, typename T2>
+struct pair
+{
+    T1 first;
+    T2 second;
+
+    // Value-initialises both members.
+    pair() : first(), second() {}
+
+    // Copy-constructs first from t1 and second from t2.
+    pair(const T1& t1, const T2& t2)
+        : first(t1), second(t2) {}
+};
+
+// Relational operators and factory for pair. == compares both members; < orders by
+// first and falls back to second; the other four operators are derived from those two.
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    if (!(x.first == y.first)) return false;
+    return x.second == y.second;
+}
+// Lexicographic order: first decides unless neither first is less than the other.
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    if (x.first < y.first) return true;
+    if (y.first < x.first) return false;
+    return x.second < y.second;
+}
+// Negation of ==.
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) { return !(x == y); }
+// < with the operands swapped.
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) { return y < x; }
+// Not greater: negation of (y < x).
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y) { return !(y < x); }
+// Not less: negation of (x < y).
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y) { return !(x < y); }
+
+// Builds a pair<T1, T2> from copies of t1 and t2; both types are deduced from the arguments.
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2)
+{
+    typedef pair<T1, T2> result_type;
+    return result_type(t1, t2);
+}
+
+// Doubly linked cell used by list: prev_/next_ links plus the stored value data_.
+// Both constructors leave the links null; list itself does the linking.
+// list also default-constructs one node as its end() sentinel.
+template<typename T>
+struct node
+{
+    node* prev_;
+    node* next_;
+    T data_;
+
+    // Unlinked node whose payload is value-initialised.
+    node() : prev_(0), next_(0), data_() {}
+
+    // Unlinked node whose payload is a copy of t.
+    node(const T& t) : prev_(0), next_(0), data_(t) {}
+};
+
+// Bidirectional iterator for list<T>: a thin wrapper around a node pointer.
+// Nothing is range-checked; stepping or dereferencing an iterator that holds a
+// null node is up to the caller to avoid.
+template<typename T>
+struct iter_list
+{
+    // Node currently referred to; public, and read directly by list::erase.
+    node<T>* curr_;
+
+    // A default-constructed iterator refers to no node.
+    iter_list() : curr_(0) {}
+
+    // Wraps an existing node (not explicit, so a node<T>* converts implicitly).
+    iter_list(node<T>* n) : curr_(n) {}
+
+    // A copy refers to the same node as its source.
+    iter_list(const iter_list& i) : curr_(i.curr_) {}
+
+    ~iter_list() {}
+
+    // Rebinds this iterator to the node i refers to.
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    // Element stored in the current node, and its address.
+    T& operator*()
+    {
+        return curr_->data_;
+    }
+
+    T* operator->()
+    {
+        return &(curr_->data_);
+    }
+
+    // Iterators compare equal exactly when they refer to the same node.
+    bool operator==(const iter_list& i) { return curr_ == i.curr_; }
+    bool operator!=(const iter_list& i) { return !(curr_ == i.curr_); }
+
+    // Pre-increment: moves to next_ of the current node.
+    iter_list& operator++()
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+
+    // Pre-decrement: moves to prev_ of the current node.
+    iter_list& operator--()
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+};
+
+// Minimal doubly linked list. tail_ always refers to a sentinel node that stands for
+// end(); head_ is the first element, or that same sentinel while the list is empty.
+// The sentinel is a default-constructed node, so T must be default-constructible.
+template<typename T>
+struct list
+{
+    typedef iter_list<T> iterator;
+
+    // Empty list: only the sentinel exists and both head_ and tail_ refer to it.
+    list()
+        : head_(new node<T>()), tail_(head_), count_(0)
+    {
+    }
+
+    // Removes every element, then frees the sentinel head_ is left referring to.
+    ~list()
+    {
+        clear();
+        delete head_;
+    }
+
+    // Deep copy: starts out empty and appends a copy of each element of l in order.
+    list(const list& l)
+        : head_(new node<T>()), tail_(head_), count_(0)
+    {
+        append_all(l);
+    }
+
+    // Replaces the contents with copies of l's elements; self-assignment does nothing.
+    list& operator=(const list& l)
+    {
+        if (this != &l)
+        {
+            clear();
+            append_all(l);
+        }
+        return *this;
+    }
+
+    // Removes all elements; the sentinel is kept.
+    void clear()
+    {
+        while (!empty())
+        {
+            pop_front();
+        }
+    }
+
+    // Removes the first element; a no-op on an empty list.
+    void pop_front()
+    {
+        if (empty())
+        {
+            return;
+        }
+        node<T>* doomed = head_;
+        head_ = doomed->next_;
+        delete doomed;
+        head_->prev_ = 0;
+        --count_;
+    }
+
+    size_t size() const { return count_; }
+
+    // Iterator to the first element; equals end() when the list is empty.
+    iter_list<T> begin() const { return iter_list<T>(head_); }
+
+    // Iterator to the sentinel that follows the last element.
+    iter_list<T> end() const { return iter_list<T>(tail_); }
+
+    bool empty() const { return count_ == 0; }
+
+    // Appends a copy of t just before the sentinel.
+    void push_back(const T& t)
+    {
+        node<T>* fresh = new node<T>(t);
+        fresh->next_ = tail_;
+        if (empty())
+        {
+            // first element: it becomes head_, and its prev_ is already null
+            head_ = fresh;
+        }
+        else
+        {
+            fresh->prev_ = tail_->prev_;
+            tail_->prev_->next_ = fresh;
+        }
+        tail_->prev_ = fresh;
+        ++count_;
+    }
+
+    // Deletes the element at pos and returns an iterator to its successor; end() is a no-op.
+    iter_list<T> erase(iter_list<T> pos)
+    {
+        if (pos == end())
+        {
+            return pos;
+        }
+        node<T>* victim = pos.curr_;
+        node<T>* after = victim->next_;
+        if (victim == head_)
+        {
+            after->prev_ = 0;
+            head_ = after;
+        }
+        else
+        {
+            after->prev_ = victim->prev_;
+            victim->prev_->next_ = after;
+        }
+        delete victim;
+        --count_;
+        return iter_list<T>(after);
+    }
+
+protected:
+    node<T>* head_;
+    node<T>* tail_;
+    size_t count_;
+
+    // Appends a copy of every element of src, front to back.
+    void append_all(const list& src)
+    {
+        for (iter_list<T> it = src.begin(); it != src.end(); ++it)
+        {
+            push_back(*it);
+        }
+    }
+};
+
+// Function object for operator>: the call returns true when x > y.
+template<typename T> struct greater
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return x > y;
+    }
+};
+
+// Function object for operator<: the call returns true when x < y.
+template<typename T> struct less
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return x < y;
+    }
+};
+
+// Bubble-sort based partial_sort: afterwards [first, middle) holds the smallest
+// (by comp) elements of [first, last) in order; the order of the rest is unspecified.
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // One pass sinks the smallest remaining element into slot i. Slots before i are
+        // already final, so the pass stops at i instead of rescanning down to first.
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1))) swap(*j, *(j - 1));
+        }
+    }
+}
+
+// Minimal std::vector stand-in: raw char storage, placement-new construction, explicit
+// destruction and memmove relocation; iterators are plain T*. try_alloc() zero-fills and
+// over-allocates, and string::c_str() further down depends on that zeroed slack.
+template<typename T>
+struct vector
+{
+    // Empty vector; no buffer is allocated.
+    vector() : data_(0), size_(0), capacity_(0) {}
+
+    vector(const size_t new_size, const T& value = T())
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+
+    ~vector() { clear(); }
+
+    // Deep copy: default-fills to v.size(), then assigns element by element.
+    vector(const vector& v)
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    // Self-assignment is a no-op; otherwise the old elements are destroyed and v's copied in.
+    vector& operator=(const vector& v)
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0);
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    // Grows by copy-constructing value into the new slots, or shrinks by destroying the tail.
+    // value must not refer to an element of this vector: try_alloc() may free the old buffer.
+    void resize(const size_t new_size, const T& value = T())
+    {
+        try_alloc(new_size);
+        for (size_t i = size_; i < new_size; i++)
+        {
+            new (&data_[i]) T(value);
+        }
+        for (size_t i = new_size; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        size_ = new_size;
+    }
+
+    // Destroys every element and releases the buffer.
+    void clear()
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_;
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const { return data_; }
+    size_t size() const { return size_; }
+    T& operator[](size_t i) const { return data_[i]; }
+    T* begin() const { return &data_[0]; }
+    T* end() const { return &data_[size_]; }
+    bool empty() const { return size_ == 0; }
+
+    // Appends a copy of t.
+    void push_back(const T& t)
+    {
+        if (&t >= begin() && &t < end())
+        {
+            // t is one of our own elements and try_alloc() may free its buffer: copy it first
+            T held(t);
+            try_alloc(size_ + 1);
+            new (&data_[size_]) T(held);
+        }
+        else
+        {
+            try_alloc(size_ + 1);
+            new (&data_[size_]) T(t);
+        }
+        size_++;
+    }
+
+    // Inserts copies of [b, e) in front of pos; a range inside this vector is read from a copy.
+    void insert(T* pos, T* b, T* e)
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            //the same vector
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin();
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T));
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            // The gap holds stale bytes of relocated elements, not live objects, so the
+            // copies are constructed in place; assigning would run T::operator= on garbage.
+            new (pos) T(*b);
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    // Destroys *pos, closes the gap with memmove and returns pos, now the next element.
+    T* erase(T* pos)
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+
+    // Reallocates to 2 * new_size elements when new_size * 3 / 2 exceeds capacity_ / 2.
+    // The new buffer is zero-filled and existing elements are relocated bitwise.
+    void try_alloc(size_t new_size)
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T));
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_);
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+// Minimal std::string stand-in on top of vector<char>. No terminator is stored as an
+// element: c_str() relies on the zero-filled slack vector::try_alloc() leaves after the text.
+struct NCNN_EXPORT string : public vector<char>
+{
+    // Empty string; no buffer is allocated, so data_ stays null.
+    string() {}
+
+    // Copies the characters of the zero-terminated str, without its terminator.
+    string(const char* str)
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) memcpy(data_, str, len); // resize(0) can leave data_ null
+    }
+
+    // Never null: a string that owns no buffer yields "" instead of data_.
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : "";
+    }
+
+    // The comparisons go through c_str() so that empty strings are safe to compare.
+    bool operator==(const string& str2) const { return strcmp(c_str(), str2.c_str()) == 0; }
+    bool operator==(const char* str2) const { return strcmp(c_str(), str2) == 0; }
+    bool operator!=(const char* str2) const { return strcmp(c_str(), str2) != 0; }
+
+    // Appends the characters of str1.
+    string& operator+=(const string& str1)
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+// Concatenation: a copy of str1 with the characters of str2 appended.
+inline string operator+(const string& str1, const string& str2)
+{
+    string joined(str1);
+    return joined += str2;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..0a5ea9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,449 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds against old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70 // vulkan.h predates header version 70: declare the subgroup and maintenance3 items locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR; // KHR-suffixed alias of the struct above
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR; // KHR-suffixed alias of the struct above
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80 // vulkan.h predates header version 80: declare the 8-bit storage and render pass 2 items locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR // refers to the three *2KHR structs declared above
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95 // vulkan.h predates header version 95: declare the float16/int8 feature struct locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000
+#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT            (VkStructureType)1000238001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT  (VkStructureType)1000244000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT               (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT        (VkStructureType)1000244002
+#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                      (VkStructureType)1000247000
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT         (VkBufferCreateFlagBits)0x00000010 // create-flag bit; not the usage-flag value 0x00020000 below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT                  (VkBufferUsageFlagBits)0x00020000
+typedef uint64_t VkDeviceAddress; // also needed by the PFN_vkGetBufferDeviceAddressKHR typedef further down in this header
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 memoryPriority;
+} VkPhysicalDeviceMemoryPriorityFeaturesEXT;
+typedef struct VkMemoryPriorityAllocateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    float priority;
+} VkMemoryPriorityAllocateInfoEXT;
+typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferAddressFeaturesEXT;
+typedef struct VkBufferDeviceAddressInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoEXT;
+typedef struct VkBufferDeviceAddressCreateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceSize deviceAddress;
+} VkBufferDeviceAddressCreateInfoEXT;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo);
+typedef enum VkValidationFeatureEnableEXT
+{
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0,
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+    VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT, // alias of the first enumerator
+    VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT, // alias of the last enumerator
+    VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1),
+    VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+typedef enum VkValidationFeatureDisableEXT
+{
+    VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0,
+    VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1,
+    VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2,
+    VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3,
+    VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4,
+    VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5,
+    VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6,
+    VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT, // alias of the first enumerator
+    VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT, // alias of the last enumerator
+    VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1),
+    VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+typedef struct VkValidationFeaturesEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t enabledValidationFeatureCount;
+    const VkValidationFeatureEnableEXT* pEnabledValidationFeatures;
+    uint32_t disabledValidationFeatureCount;
+    const VkValidationFeatureDisableEXT* pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // vulkan.h predates header version 101: declare the NV cooperative matrix items locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5, // no enumerator with value 4 is declared here
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1), // evaluates to 5 although four scopes are declared
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#if VK_HEADER_VERSION < 121 // vulkan.h predates header version 121: declare the AMD device-coherent memory items locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000
+#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000040
+#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000080 // own bit; must not alias COHERENT (0x40)
+typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 deviceCoherentMemory;
+} VkPhysicalDeviceCoherentMemoryFeaturesAMD;
+#endif // VK_HEADER_VERSION < 121
+
+#if VK_HEADER_VERSION < 129
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR                     (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR      (VkStructureType)1000257002
+#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR    (VkStructureType)1000257003
+#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR      (VkStructureType)1000257004
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR               (VkBufferCreateFlagBits)0x00000010 // create-flag bit; not the usage-flag value 0x00020000 below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR                        (VkBufferUsageFlagBits)0x00020000
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR                            (VkMemoryAllocateFlagBits)0x00000002
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR             (VkMemoryAllocateFlagBits)0x00000004
+typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR // same member list as VkPhysicalDeviceBufferAddressFeaturesEXT in this header
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR;
+typedef struct VkBufferDeviceAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoKHR;
+typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkBufferOpaqueCaptureAddressCreateInfoKHR;
+typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkMemoryOpaqueCaptureAddressAllocateInfoKHR;
+typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceMemory memory;
+} VkDeviceMemoryOpaqueCaptureAddressInfoKHR;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo); // VkDeviceAddress comes from vulkan.h or from the < 97 fallback in this header
+typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo);
+#endif // VK_HEADER_VERSION < 129
+
+#if VK_HEADER_VERSION < 208 // vulkan.h predates header version 208: declare the instance create flag bits locally
+typedef enum VkInstanceCreateFlagBits
+{
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001,
+    VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkInstanceCreateFlagBits;
+#endif // VK_HEADER_VERSION < 208
+
+#if VK_HEADER_VERSION < 255 // vulkan.h predates header version 255: declare the KHR cooperative matrix items locally
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR   (VkStructureType)1000506000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR                 (VkStructureType)1000506001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002
+typedef enum VkComponentTypeKHR // same numeric values as VkComponentTypeNV in this header
+{
+    VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+    VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+    VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+    VK_COMPONENT_TYPE_SINT8_KHR = 3,
+    VK_COMPONENT_TYPE_SINT16_KHR = 4,
+    VK_COMPONENT_TYPE_SINT32_KHR = 5,
+    VK_COMPONENT_TYPE_SINT64_KHR = 6,
+    VK_COMPONENT_TYPE_UINT8_KHR = 7,
+    VK_COMPONENT_TYPE_UINT16_KHR = 8,
+    VK_COMPONENT_TYPE_UINT32_KHR = 9,
+    VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkComponentTypeKHR;
+typedef enum VkScopeKHR // same numeric values as VkScopeNV in this header
+{
+    VK_SCOPE_DEVICE_KHR = 1,
+    VK_SCOPE_WORKGROUP_KHR = 2,
+    VK_SCOPE_SUBGROUP_KHR = 3,
+    VK_SCOPE_QUEUE_FAMILY_KHR = 5,
+    VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkScopeKHR;
+typedef struct VkCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeKHR AType;
+    VkComponentTypeKHR BType;
+    VkComponentTypeKHR CType;
+    VkComponentTypeKHR ResultType;
+    VkBool32 saturatingAccumulation;
+    VkScopeKHR scope;
+} VkCooperativeMatrixPropertiesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties);
+#endif // VK_HEADER_VERSION < 255
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release": register the RELEASE configuration and point it at the shared library under ${_IMPORT_PREFIX}/lib.
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn ) # record the target name for a later existence check
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" ) # record the file that must exist for this target
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..6726e95
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake: defines the imported target "ncnn" and loads its per-configuration import files.
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH) # every exit path below (early return and end of file) issues the matching cmake_policy(POP)
+cmake_policy(VERSION 2.8.3...3.25)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn) # sort each expected target into the "defined" or "not defined" list
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets) # every expected target already exists: clean up and return early
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "") # only some of the expected targets exist: abort with a fatal error
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file: each PATH call below strips one trailing path component (<prefix>/lib/cmake/ncnn/ncnn.cmake -> <prefix>).
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/") # a root prefix becomes empty so that "${_IMPORT_PREFIX}/lib" does not start with "//"
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn (a shared library; its file location comes from the per-configuration files loaded below)
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration (every ncnn-*.cmake file that sits next to this file).
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..d3ac286
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON) # option values hard-coded for this package; they select which dependency lookups run below
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN OFF) # OFF here, so the whole if(NCNN_VULKAN) block below is skipped
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP) # not REQUIRED: a missing OpenMP package does not abort configuration
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED) # provides the Threads::Threads target listed in ncnn's INTERFACE_LINK_LIBRARIES
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB) # glslang targets are only looked up for a non-shared ncnn build
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "") # NOTE(review): empty in this package, so the include() paths below would start at "/"; unreachable with the option values set above
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) # finally define the imported "ncnn" target from the sibling ncnn.cmake
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..4e80236
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/arm64-v8a/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20231027
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// Alignment in bytes requested by fastMalloc() for every buffer: 64 with NCNN_AVX512, 32 with NCNN_AVX, otherwise 16
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// Some optimized kernels may read slightly past the end of a buffer inside their loops,
+// because it is common to interleave the next iteration's data loads with arithmetic instructions.
+// The non-MSVC branches of fastMalloc() allocate this many extra bytes to stay safe from SEGV_ACCERR failures.
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the next address that is a multiple of n bytes (an already aligned pointer is returned unchanged)
+// ptr Pointer to align
+// n Alignment size that must be a power of two; defaults to sizeof(_Tp)
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n); // add n - 1, then clear the low bits with the mask -n
+}
+
+// Rounds a buffer size up to a multiple of the specified number of bytes
+// Returns the smallest number that is greater than or equal to sz and divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n; // add n - 1, then clear the low bits with the mask -n
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // allocates size bytes aligned to NCNN_MALLOC_ALIGN; release the result with fastFree()
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // NOTE(review): unlike the other branches, no NCNN_MALLOC_OVERREAD padding is added here
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD)) // a non-zero result is treated as failure
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // generic fallback: over-allocate room for one pointer slot plus alignment slack
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN); // skip one pointer slot, then round up to the alignment
+    adata[-1] = udata; // store the raw malloc pointer just before the aligned block so fastFree() can release it
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr) // releases memory obtained from fastMalloc(); a null ptr is ignored
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr); // counterpart of _aligned_malloc in fastMalloc()
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1]; // recover the raw malloc pointer that fastMalloc() stored just before the aligned block
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// NCNN_XADD(addr, delta): exchange-add used for atomic operations on reference counters; adds delta to *addr and yields the previous value
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, not atomic
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else // __ATOMIC_ACQ_REL is not defined: the literal 4 below is presumably its numeric value - TODO confirm
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else // non-clang GNU-compatible compilers, plus clang when targeting Android, Emscripten or CUDA
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: no known atomic primitive for this compiler
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS disabled: plain non-atomic exchange-add
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract host-memory allocator interface; PoolAllocator and UnlockedPoolAllocator derive from it
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // pure virtual: subclasses supply the allocation
+    virtual void fastFree(void* ptr) = 0; // pure virtual: subclasses supply the matching release
+};
+
+class PoolAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT PoolAllocator : public Allocator // pooling Allocator; presumably the locked counterpart of UnlockedPoolAllocator - confirm in the implementation
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // size compare ratio, range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // implements Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // implements Allocator::fastFree
+
+private:
+    PoolAllocator(const PoolAllocator&); // declared private: instances cannot be copied from outside the class
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class UnlockedPoolAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator // same public interface as PoolAllocator; presumably omits locking, judging by the name - confirm in the implementation
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // size compare ratio, range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size); // implements Allocator::fastMalloc
+    virtual void fastFree(void* ptr); // implements Allocator::fastFree
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&); // declared private: instances cannot be copied from outside the class
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // record describing one Vulkan buffer allocation; the type returned and released by VkAllocator::fastMalloc(size_t)/fastFree
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally (mutable, so it can change through a const object)
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // record describing one Vulkan image allocation; the type returned and released by the image overloads of VkAllocator::fastMalloc/fastFree
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally (mutable, so it can change through a const object)
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base of the Vulkan allocators declared below; buffer and image allocation are pure virtual
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer overload, must be implemented by subclasses
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image overload, must be implemented by subclasses
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected: // creation helpers available to subclasses only
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator // VkAllocator subclass with a configurable preferred block size (default 16M)
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // implements the buffer overload of VkAllocator
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // implements the image overload of VkAllocator
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&); // declared private: instances cannot be copied from outside the class
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkWeightAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator // VkAllocator subclass with a configurable preferred block size (default 8M)
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // implements the buffer overload of VkAllocator
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // implements the image overload of VkAllocator
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&); // declared private: instances cannot be copied from outside the class
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkStagingAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator // VkAllocator subclass with a tunable size compare ratio
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio, range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size); // implements the buffer overload of VkAllocator
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // implements the image overload of VkAllocator
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&); // declared private: instances cannot be copied from outside the class
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+class VkWeightStagingAllocatorPrivate; // implementation type, only forward-declared in this header
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator // VkAllocator subclass; declares no clear() override and no tuning setters
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // implements the buffer overload of VkAllocator
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // implements the image overload of VkAllocator
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&); // declared private: instances cannot be copied from outside the class
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // pointer to the hidden implementation object
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator // VkAllocator bound to one AHardwareBuffer; only declared when NCNN_PLATFORM_API is set and __ANDROID_API__ >= 26
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size); // implements the buffer overload of VkAllocator
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack); // implements the image overload of VkAllocator
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&); // declared private: instances cannot be copied from outside the class
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init(); // separate initialization step returning an int status; presumably must be called after construction - confirm
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb; // hardware buffer handle; presumably the _hb constructor argument - confirm
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// Returns the current timestamp in milliseconds
+NCNN_EXPORT double get_current_time();
+
+// Sleeps for the given number of milliseconds (default 1000)
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end); // start/end are presumably get_current_time() timestamps - confirm
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end); // overload that also receives the layer's input and output blobs
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob // record for one blob: its name (when NCNN_STRING is enabled), its producer and consumer layer indices, and a shape hint
+{
+public:
+    // constructs an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as its output
+    int producer;
+    // index of the layer that needs this blob as its input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api: ncnn_allocator_t is a pointer to the struct below, which holds a void* pthis plus malloc/free function pointers */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis; /* NOTE(review): presumably points at the underlying C++ allocator object - confirm in the c_api implementation */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* both callbacks receive the allocator handle as their first argument */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api. NCNN_MAT_PIXEL_X2Y combines two values as X | (Y << 16); NOTE: X and Y are not parenthesized in the expansion, so pass simple expressions only */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api: handle type plus a struct of callbacks; the scan member exists only when NCNN_STRING is enabled */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis;
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* NOTE(review): parameters parallel ncnn::DataReader::scan in datareader.h - presumably the same contract, confirm */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* NOTE(review): parameters parallel ncnn::DataReader::read - presumably returns bytes read, confirm */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api: handle type plus a struct of load callbacks taking one, two or three size arguments, each returning an ncnn_mat_t */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis;
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* NOTE(review): the meaning of "type" is not documented in this header - check the ncnn ModelBin documentation */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api: handle type plus a struct of int-returning callbacks (load, pipeline create/destroy, forward and in-place forward variants) */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis;
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* presumably n and n2 are the lengths of bottom_blobs and top_blobs - TODO confirm */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* presumably n is the length of bottom_top_blobs - TODO confirm */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function types: a creator returns an ncnn_layer_t, a destroyer receives one; both also take a userdata pointer */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata;
+    ncnn_net_custom_layer_factory_t next; /* pointer to another factory struct of the same type */
+};
+
+/* net api: ncnn_net_t is a pointer to the struct below */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis;
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* NOTE(review): presumably filled in by the ncnn_net_register_custom_layer_* functions - confirm */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT    0
+#define NCNN_BORDER_REPLICATE   1
+#define NCNN_BORDER_REFLECT     2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate; // forward declaration only; not defined in this header
+class NCNN_EXPORT VkCompute
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt); // record_clone is overloaded for the Mat / VkMat / VkImageMat source-destination pairs listed here
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait();
+
+    int reset();
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev;
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding); // only a VkImageMat overload is declared for barrier_readonly
+
+private:
+    VkComputePrivate* const d; // const pointer to a VkComputePrivate, which is only forward-declared in this header
+};
+
+class VkTransferPrivate; // forward declaration only; not defined in this header
+class NCNN_EXPORT VkTransfer
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true when omitted
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait();
+
+protected:
+    const VulkanDevice* vkdev;
+
+private:
+    VkTransferPrivate* const d; // const pointer to a VkTransferPrivate, which is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public: // the data member is selected per platform by the preprocessor checks below
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask;
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // cpu_set_t comes from <sched.h>, included above for Android/Linux
+#endif
+#if __APPLE__
+    unsigned int policy;
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads to the little clusters if powersave is enabled
+// affects HMP architecture CPUs such as ARM big.LITTLE
+// only implemented on Android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled (default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// the setter function returns 0 on success
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// Flushing denormals is needed on Intel chipsets.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON,  FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// base wrapper for reading param and model data
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (declared only when NCNN_STRING is enabled)
+    // returns 1 if the scan succeeds
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data
+    // returns the number of bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get a reference to model data
+    // returns the number of bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // forward declaration only; not defined in this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader subclass constructed from a FILE*
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read; reference() is not redeclared here
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy this reader
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // forward declaration only; not defined in this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader subclass constructed from a memory pointer
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // NOTE(review): mem is taken by non-const reference, presumably so the reader can advance the caller's pointer - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read
+    virtual size_t reference(size_t size, const void** buf) const; // overrides DataReader::reference
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy this reader
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared private class
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // forward declaration only; not defined in this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader subclass constructed from an AAsset*
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset);
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // overrides DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // overrides DataReader::read; reference() is not redeclared here
+
+private: // copy constructor and copy assignment are private, so outside code cannot copy this reader
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create the global VkInstance and initialize the objects needed for GPU computation.
+// This creates a VkInstance object, checks which extensions the Vulkan instance supports,
+// initializes and creates the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance();
+
+// Get global VkInstance variable
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy VkInstance object and free the memory of the associated object
+// Usually called during teardown, when the main program exits
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // forward declaration of the opaque implementation type held in d
+class NCNN_EXPORT GpuInfo // property accessors for a GPU; every accessor below is const
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const; // NOTE(review): hands out a non-const pointer from a const accessor; buffer length is not stated here - confirm
+
+    // device type code returned by type():
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug flags (a bug is not a feature)
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes a bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // extension capability (int rather than bool - NOTE(review): presumably 0 = unsupported, otherwise a version; confirm)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const; // only declared for Android API level 26 and above
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private:
+    GpuInfo(const GpuInfo&); // copy operations are private: outside code cannot copy an instance
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // create_gpu_instance() is granted access to the private members
+    GpuInfoPrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator; // forward declarations: these types are only named here, not defined
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // opaque implementation type held in d
+class NCNN_EXPORT VulkanDevice
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // device_index defaults to the value of get_default_gpu_index()
+    ~VulkanDevice();
+
+    const GpuInfo& info; // reference member: bound at construction and cannot be rebound afterwards
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // same as above, but with a fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helpers for creating a pipeline; each writes its result through the trailing pointer argument
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // NOTE(review): acquire/reclaim look paired - presumably each acquired queue must be reclaimed; confirm
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one overload per combination of VkMat / VkImageMat source and destination
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (this and the following groups are public function-pointer members, one group per extension)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_buffer_device_address
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
+    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
+    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+    // VK_EXT_buffer_device_address
+    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer (only declared for Android API level 26 and above)
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension setup - NOTE(review): presumably fills in the function pointers above; confirm in the implementation
+    int init_device_extension();
+
+private:
+    VulkanDevice(const VulkanDevice&); // copy operations are private: outside code cannot copy an instance
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the private implementation object
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv
+class NCNN_EXPORT ShaderInfo // plain data holder; resolve_shader_info() takes one by non-const reference
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // binding type codes stored in binding_types:
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // 0 = null; fixed capacity of 16 entries, which upstream assumes is large enough
+
+    int reserved_0; // reserved_0..reserved_3: no meaning documented in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer.h
new file mode 100644
index 0000000..f0418a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer.h
@@ -0,0 +1,222 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer // base class for layers: every hook below is virtual so subclasses can override it
+{
+public:
+    // default constructor
+    Layer();
+    // virtual destructor
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // support_reserved_*: flags with no meaning documented in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set (NOTE(review): presumably a bitmask, going by the name - confirm)
+    int featmask;
+
+public:
+    // implement inference (multi-blob overload, then single-blob overload)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference (the same blob is both input and output)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function
+typedef Layer* (*layer_creator_func)(void*);
+typedef void (*layer_destroyer_func)(Layer*, void*);
+
+struct layer_registry_entry // registry record with an optional name and a creator
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled, so the struct layout depends on that macro)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // registry record with an optional name, creator, destroyer and userdata
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled, so the struct layout depends on that macro)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer; // takes (Layer*, void*) per the layer_destroyer_func typedef
+    void* userdata; // NOTE(review): presumably the void* handed to creator and destroyer - confirm at the call sites
+};
+
+struct overwrite_builtin_layer_registry_entry // registry record keyed by type index; unlike the other entries it has no name member
+{
+    // layer type index
+    int typeindex;
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer; // takes (Layer*, void*) per the layer_destroyer_func typedef
+    void* userdata; // NOTE(review): presumably the void* handed to creator and destroyer - confirm at the call sites
+};
+
+#if NCNN_STRING
+// get layer type from type name
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type
+NCNN_EXPORT Layer* create_layer(int index);
+
+#define DEFINE_LAYER_CREATOR(name) /* emits a factory function returning new name */ \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+
+#define DEFINE_LAYER_DESTROYER(name) /* emits a function that deletes the given layer */ \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the namespace scopes the enumerators, so they are spelled LayerShaderType::<name>
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list comes from this cmake-generated header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..aac8803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,5 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // the namespace scopes the enumerators, so they are spelled LayerType::<name>
+enum LayerType
+{
+#include "layer_type_enum.h" // built-in layer indices come from this cmake-generated header (0..103 in this build)
+    CustomBit = (1 << 8), // 256, above every built-in index - NOTE(review): presumably OR-ed into custom layer indices; confirm
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..97153ed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/layer_type_enum.h
@@ -0,0 +1,109 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+CumulativeSum = 98,
+CopyTo = 99,
+Erf = 100,
+Diag = 101,
+CELU = 102,
+Shrink = 103,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the matrix of up to four dimensions (w, h, d, c), shared through a reference counter
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec (w)
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image (w, h)
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim (w, h, c)
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube (w, h, d, c)
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec, elempack is the packed count inside one element
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy (see clone() for the deep copy)
+    Mat(const Mat& m);
+    // external vec, using caller-provided data
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all elements to the given value, overloads exist for scalars and per-architecture vector types
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif //__loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like another mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like a vulkan buffer mat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like a vulkan image mat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16, // bit offset of the conversion target format
+        PIXEL_FORMAT_MASK = 0x0000ffff, // low 16 bits: source pixel format
+        PIXEL_CONVERT_MASK = 0xffff0000, // high 16 bits: conversion target format
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data, type is a PixelType value
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between channels - confirm against the inline definitions
+};
+
+#if NCNN_VULKAN
+
+// the matrix of up to four dimensions (w, h, d, c), vulkan device buffer version
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec (w)
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec, elempack is the packed count inside one element
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec, using caller-provided buffer memory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a host mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a vulkan buffer mat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a vulkan image mat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between channels - confirm against the inline definitions
+};
+
+class NCNN_EXPORT VkImageMat // the matrix of up to four dimensions (w, h, d, c), vulkan device image version
+{
+public:
+    // empty
+    VkImageMat();
+    // vec (w)
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec, elempack is the packed count inside one element
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec, using caller-provided image memory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a host mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a vulkan buffer mat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a vulkan image mat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants, each union overlays its members on the same storage
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc functions
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize from srcw x srch to w x h, c1/c2/c3/c4 variants
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride(bytes-per-row) parameters for both source and destination
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation from the chart below, e.g. 6 means rotating from orientation 6 to orientation 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate, c1/c2/c3/c4 variants
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride(bytes-per-row) parameters for both source and destination
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset, the result is written to tm
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix, the result is written to tm_inv
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenience wrapper for yuv420sp (nv21/nv12); set thickness to -1 for a filled circle; the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenience wrapper for yuv420sp (nv21/nv12); the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenience wrapper for yuv420sp (nv21/nv12); the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// float -> bfloat16: keep only the upper 16 bits of the float's bit pattern (the lower bits are dropped, no rounding)
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float's storage as an unsigned integer
+    union
+    {
+        float f;
+        unsigned int u;
+    } bits;
+    bits.f = value;
+    return (unsigned short)(bits.u >> 16);
+}
+// bfloat16 -> float: the 16 stored bits become the upper half of the float's bit pattern, the lower half is zero
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // widen before shifting: a bare `value << 16` is evaluated as signed int and overflows for value >= 0x8000
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process: border mode constants (presumably the int `type` argument of the copy_make_border* functions - confirm)
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233, // NOTE(review): the only negative value; its meaning is not visible here - confirm
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+NCNN_FORCEINLINE Mat::Mat() // empty mat: every field starts at zero/null
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+// allocating constructors: each starts from the same all-zero state, then forwards its arguments unchanged to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+// same four shapes, with an explicit _elempack that is also forwarded to create()
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE Mat::Mat(const Mat& m) // shallow copy: copies every field of m (sharing its data), then addref()
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref();
+}
+// external-data constructors: wrap caller-provided _data with refcount(0) and elempack(1); nothing is allocated here
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+// 3-D / 4-D: the per-channel byte size goes through alignSize(..., 16) and is then converted back to an element count
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+// same four shapes, with an explicit _elempack instead of elempack(1)
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+// destructor: gives up this mat's reference through release()
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    release();
+}
+
+NCNN_FORCEINLINE void Mat::fill(float _v) // writes _v to total() consecutive float slots starting at data
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    // NEON builds store 4 floats per iteration first; the scalar loop handles the remainder (or everything)
+    size_t i = 0;
+#if __ARM_NEON
+    float32x4_t _c = vdupq_n_f32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_f32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+NCNN_FORCEINLINE void Mat::fill(int _v) // writes _v to total() consecutive int slots starting at data
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    int* ptr = (int*)data;
+    // NEON builds store 4 ints per iteration first; the scalar loop handles the remainder (or everything)
+    size_t i = 0;
+#if __ARM_NEON
+    int32x4_t _c = vdupq_n_s32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_s32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+#if __ARM_NEON
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v) // stores _v once per element; each element spans 4 floats
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1q_f32(ptr, _v);
+        ptr += 4;
+    }
+}
+// stores _v once per element; each element spans 4 unsigned shorts
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    unsigned short* ptr = (unsigned short*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1_u16(ptr, _v);
+        ptr += 4;
+    }
+}
+// stores _v once per element; each element spans 4 ints
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    int* ptr = (int*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v);
+        ptr += 4;
+    }
+}
+// stores the pair (_v0, _v1) once per element; each element spans 8 ints
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    int* ptr = (int*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1q_s32(ptr, _v0);
+        vst1q_s32(ptr + 4, _v1);
+        ptr += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v) // stores _v once per element; each element spans 4 halves
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    __fp16* ptr = (__fp16*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1_f16(ptr, _v);
+        ptr += 4;
+    }
+}
+// stores _v once per element; each element spans 8 halves
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    __fp16* ptr = (__fp16*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vst1q_f16(ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m512 _v) // stores _v once per element; each element spans 16 floats
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        _mm512_storeu_ps(ptr, _v);
+        ptr += 16;
+    }
+}
+#endif // __AVX512F__
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i) // stores _v once per element; each element spans 8 floats
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        _mm256_storeu_ps(ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __AVX__
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // stores _v once per element; each element spans 4 floats
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        _mm_storeu_ps(ptr, _v);
+        ptr += 4;
+    }
+}
+NCNN_FORCEINLINE void Mat::fill(__m128i _v) // stores _v once per element; each element spans 8 unsigned shorts
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    unsigned short* ptr = (unsigned short*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        _mm_storeu_si128((__m128i*)ptr, _v); // unaligned store like the float overloads: data may be an external buffer
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v) // stores _v once per element; each element spans 4 floats
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        __msa_st_w((v4i32)_v, ptr, 0);
+        ptr += 4;
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+NCNN_FORCEINLINE void Mat::fill(__m128 _v) // stores _v once per element; each element spans 4 floats
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        __lsx_vst(_v, ptr, 0);
+        ptr += 4;
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) // stores _v once per element; each element spans packn floats
+{
+    const int packn = cpu_riscv_vlenb() / 4; // presumably vector register bytes / sizeof(float) - confirm
+    const size_t vl = vsetvl_e32m1(packn);
+
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    float* ptr = (float*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vse32_v_f32m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+// stores _v once per element; each element spans packn unsigned shorts
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    unsigned short* ptr = (unsigned short*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vse16_v_u16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+// stores _v once per element; each element spans packn signed chars
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    signed char* ptr = (signed char*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vse8_v_i8m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#if __riscv_zfh
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) // stores _v once per element; each element spans packn halves
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    __fp16* ptr = (__fp16*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        vse16_v_f16m1(ptr, _v, vl);
+        ptr += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+template<typename T> // generic fill: assigns _v to total() consecutive T slots starting at data
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    const size_t size = total(); // keep size_t: an int cast truncates counts above INT_MAX
+    T* ptr = (T*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        ptr[i] = _v;
+    }
+}
+
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m) // shallow assignment: share m's buffer instead of copying it
+{
+    if (this == &m)
+        return *this;
+    // take the new reference first, so release() below cannot drop the last reference to a buffer both mats share
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+    // give up whatever this mat referenced before
+    release();
+    // adopt m's buffer, counter and element layout
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+    // adopt m's shape
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE void Mat::addref() // adds one to the shared counter; a mat without a counter is left untouched
+{
+    if (!refcount) return;
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void Mat::release() // drop this mat's reference and reset it to the empty state
+{
+    // the buffer is freed only when NCNN_XADD reports 1 (presumably the value before the add - confirm)
+    const bool last_owner = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_owner)
+    {
+        if (!allocator)
+            fastFree(data);
+        else
+            allocator->fastFree(data);
+    }
+
+    // forget the buffer and its counter; allocator is not reset
+    refcount = 0;
+    data = 0;
+
+    // clear the shape
+    cstep = 0;
+    dims = 0;
+    w = h = d = c = 0;
+
+    // clear the element layout
+    elempack = 0;
+    elemsize = 0;
+}
+
+NCNN_FORCEINLINE bool Mat::empty() const // true when there is no buffer or total() is zero
+{
+    return !data || !total();
+}
+// element count: channel count times channel stride
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    return c * cstep;
+}
+// element size in bits divided by elempack; 0 when elempack is 0
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize * 8) / elempack;
+}
+
+NCNN_FORCEINLINE Mat Mat::shape() const // data-less mat of the same dims, with elempack multiplied into the outermost dimension
+{
+    // the (void*)0 argument selects the external-data constructors, so no buffer is attached
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE Mat Mat::channel(int _c) // view of channel _c over this mat's buffer (refcount 0), one dimension lower
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // for a 4-D source the view's cstep becomes the plain w * h plane size
+    return view;
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4) view.cstep = (size_t)w * h; // for a 4-D source the view's cstep becomes the plain w * h plane size
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z) // 2-D w x h view starting z planes (of w * h elements) into this mat's buffer
+{
+    return Mat(w, h, static_cast<unsigned char*>(data) + (size_t)w * h * elemsize * z, elemsize, elempack, allocator);
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    return Mat(w, h, static_cast<unsigned char*>(data) + (size_t)w * h * elemsize * z, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y) // pointer to row y as float: data advanced by y rows of w elements
+{
+    return reinterpret_cast<float*>(static_cast<unsigned char*>(data) + (size_t)w * elemsize * y);
+}
+// const overload of the float row accessor
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    return reinterpret_cast<const float*>(static_cast<unsigned char*>(data) + (size_t)w * elemsize * y);
+}
+// typed row accessor: same address computation, result cast to T*
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    return (T*)(static_cast<unsigned char*>(data) + (size_t)w * elemsize * y);
+}
+// const overload of the typed row accessor
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    return (const T*)(static_cast<unsigned char*>(data) + (size_t)w * elemsize * y);
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels) // view of `channels` channels starting at channel _c
+{
+    Mat view(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    view.dims = dims; // the 4-D constructor sets dims(4); restore this mat's own dimensionality
+    return view;
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat view(w, h, d, channels, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+    view.dims = dims; // the 4-D constructor sets dims(4); restore this mat's own dimensionality
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths) // 3-D view of `depths` w x h planes starting at plane z
+{
+    Mat view(w, h, depths, static_cast<unsigned char*>(data) + (size_t)w * h * elemsize * z, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h; // replace the constructor's 16-byte-aligned stride with the plain plane size
+    return view;
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat view(w, h, depths, static_cast<unsigned char*>(data) + (size_t)w * h * elemsize * z, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h; // replace the constructor's 16-byte-aligned stride with the plain plane size
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows) // 2-D view of `rows` rows of width w starting at row y
+{
+    return Mat(w, rows, static_cast<unsigned char*>(data) + (size_t)w * elemsize * y, elemsize, elempack, allocator);
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    return Mat(w, rows, static_cast<unsigned char*>(data) + (size_t)w * elemsize * y, elemsize, elempack, allocator);
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n) // 1-D view of n elements starting at element x
+{
+    return Mat(n, static_cast<unsigned char*>(data) + elemsize * x, elemsize, elempack, allocator);
+}
+// const overload: same view, returned as a const Mat
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    return Mat(n, static_cast<unsigned char*>(data) + elemsize * x, elemsize, elempack, allocator);
+}
+
+template<typename T> // conversion to a typed pointer: data reinterpreted as T*
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    return static_cast<T*>(data);
+}
+// const overload of the typed-pointer conversion
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    return static_cast<const T*>(data);
+}
+// flat float access: the i-th float counted from data, no bounds check
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    return *(static_cast<float*>(data) + i);
+}
+// const overload of the flat float access
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    return *(static_cast<const float*>(data) + i);
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat() // empty mat: every field starts at zero/null
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+// allocating constructors: each starts from the same all-zero state, then forwards its arguments unchanged to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+// same four shapes, with an explicit _elempack that is also forwarded to create()
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+// shallow copy: copies every field of m (sharing its buffer) and takes a reference via addref()
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref();
+    // cstep is copied in the body rather than in the initializer list
+    cstep = m.cstep;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) // wraps caller-provided _data, refcount(0), elempack(1)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+// 2-D: widen to size_t before multiplying so w * h cannot overflow int (the Mat constructors do the same)
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+// 3-D / 4-D: the per-channel byte size goes through alignSize(..., 16) and is then converted back to an element count
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+// same four shapes, with an explicit _elempack instead of elempack(1)
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat() // destructor: gives up this mat's reference through release()
+{
+    release();
+}
+// shallow assignment: share m's buffer instead of copying it
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    if (this == &m)
+        return *this;
+    // take the new reference first, so release() below cannot drop the last reference to a buffer both mats share
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+    // give up whatever this mat referenced before
+    release();
+    // adopt m's buffer, counter and element layout
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+    // adopt m's shape
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const // Mat view (refcount 0, no allocator) over mapped_ptr(), or an empty Mat
+{
+    if (!allocator || !data || !allocator->mappable) // default-constructed or released mats have nothing to map
+        return Mat();
+
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat();
+}
+// address of this mat inside the mapped buffer (data->mapped_ptr + data->offset), or 0 when nothing can be mapped
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable) // release() clears data but keeps allocator, so check both
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset;
+}
+
+NCNN_FORCEINLINE void VkMat::addref() // adds one to the shared counter; a mat without a counter is left untouched
+{
+    if (!refcount) return;
+    NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release() // drop this mat's reference and reset it to the empty state
+{
+    // the buffer is returned only when NCNN_XADD reports 1 (presumably the value before the add - confirm)
+    const bool last_owner = refcount && NCNN_XADD(refcount, -1) == 1;
+    if (last_owner && allocator && data)
+    {
+        allocator->fastFree(data);
+    }
+
+    // forget the buffer and its counter; allocator is not reset
+    refcount = 0;
+    data = 0;
+
+    // clear the shape
+    cstep = 0;
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    // clear the element layout
+    elempack = 0;
+    elemsize = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const // true when there is no buffer or total() is zero
+{
+    return !data || !total();
+}
+// element count: channel count times channel stride
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return c * cstep;
+}
+// element size in bits divided by elempack; 0 when elempack is 0
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack;
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const // data-less Mat of the same dims, with elempack multiplied into the outermost dimension
+{
+    // the (void*)0 argument selects Mat's external-data constructors, so no buffer is attached
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const // returns data->buffer; data is dereferenced without a null check
+{
+    return data->buffer;
+}
+// returns data->offset; data is dereferenced without a null check
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return data->offset;
+}
+// returns data->capacity; data is dereferenced without a null check
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return data->capacity;
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // same as the (w, h, c) constructor, but the caller also passes an explicit elempack on to create()
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // same as the (w, h, d, c) constructor, but the caller also passes an explicit elempack on to create()
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // shallow copy: share m's image memory and bump the shared counter (addref() does nothing when refcount is null)
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // wraps caller-supplied _data as a 1-D mat with elempack 1; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // wraps caller-supplied _data as a 2-D mat with elempack 1; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // wraps caller-supplied _data as a 3-D mat with elempack 1; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // wraps caller-supplied _data as a 4-D mat with elempack 1; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // wraps caller-supplied _data as a 1-D mat with the given elempack; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // wraps caller-supplied _data as a 2-D mat with the given elempack; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // wraps caller-supplied _data as a 3-D mat with the given elempack; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // wraps caller-supplied _data as a 4-D mat with the given elempack; refcount stays null, so release() never frees _data
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release(); // drop this mat's reference; release() decides whether the image memory is actually freed
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference first, so the release() below cannot drop the last reference to memory that m shares with *this
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // give up whatever *this referenced and zero its fields
+
+    data = m.data; // from here on: shallow copy of m's memory handle, counter, element layout and allocator
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims; // shape is copied verbatim
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // null checks added: a default-constructed or released mat has allocator == 0 and data == 0
+        return Mat();
+    // Otherwise return a Mat of the same rank over mapped_ptr(); the trailing 0 is the Mat allocator argument.
+    if (dims == 1)
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // dims outside 1..4: return an empty Mat
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // null checks added: an empty mat, an unmappable allocator or a null mapping all yield no pointer
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // mapped base address advanced by this image's bind_offset bytes
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (refcount) // mats without a counter (empty, or wrapping external memory) are not counted
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // NOTE(review): presumably NCNN_XADD returns the pre-decrement value, so 1 means "this was the last reference" - confirm in allocator.h
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data); // hand the image memory back to the allocator
+        }
+    }
+
+    data = 0; // reset to the empty state whether or not anything was freed
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0; // note: the allocator member is left untouched by release()
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return !data || !total(); // empty when there is no image memory or the shape holds zero elements
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return (size_t)w * h * d * c; // widen before multiplying: the original int product could overflow (undefined behaviour) before its conversion to size_t
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // bits per packed lane; 0 when elempack is unset to avoid dividing by zero
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    switch (dims)
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+    // cases above: null-data Mat carrying only the shape, outermost axis multiplied by elempack; other dims: empty Mat
+    return Mat();
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image; // data is dereferenced unchecked: only call on a mat whose data is non-null
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview; // data is dereferenced unchecked: only call on a mat whose data is non-null
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin
+{ // base interface for loading blobs as Mat; subclasses below override the 1-D load()
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // element type codes for the "type" argument of load():
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D blob (w)
+    virtual Mat load(int w, int type) const;
+    // load a 2-D blob (w, h)
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob (w, h, c)
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob (w, h, d, c)
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
+{ // ModelBin built on a DataReader
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr); // dr is taken by reference - NOTE(review): presumably it must outlive this object; confirm in modelbin.cpp
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is redeclared here
+
+private: // copy constructor and assignment are private: instances cannot be copied by outside code
+    ModelBinFromDataReader(const ModelBinFromDataReader&);
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
+{ // ModelBin built on a caller-provided array of weight Mats
+public:
+    // construct from weight blob array (passed as a bare pointer; no element count is given)
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is redeclared here
+
+private: // copy constructor and assignment are private: instances cannot be copied by outside code
+    ModelBinFromMatArray(const ModelBinFromMatArray&);
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library: exported symbols get default visibility */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library: the attribute is identical to the building case */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{ // a network: structure comes from load_param*(), weights from load_model(), inference runs through Extractor objects
+public:
+    // empty init
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle, no owner transfer
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so external memory should be retained when used
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenient load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenient load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenient load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor; // Extractor may use the protected lookup and layer-creation members below
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private: // copy constructor and assignment are private: a Net cannot be copied by outside code
+    Net(const Net&);
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{ // feeds input blobs into a Net and extracts result blobs; obtained from Net::create_extractor()
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // lets Net::create_extractor() call the protected constructor below
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option
+{ // plain bundle of public settings; Net holds one as its "opt" member
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8;
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // NOTE(review): presumably an unused placeholder kept for struct layout - confirm
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator;
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7; // NOTE(review): use_reserved_7..11 are presumably unused placeholders kept for struct layout - confirm
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict
+{ // parameters addressed by integer id, each readable as int, float or Mat array
+public:
+    // empty
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type of parameter id
+    int type(int id) const;
+
+    // get int (NOTE(review): def is presumably the fallback returned when id is unset - confirm)
+    int get(int id, int def) const;
+    // get float
+    float get(int id, float def) const;
+    // get array
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net is the only outside class allowed to clear or load a ParamDict
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // pointer to the forward-declared private implementation
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline
+{ // holder for the Vulkan objects of one compute shader (module, layouts, pipeline), tied to a VulkanDevice
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); // local size hints, 4x4x4 when omitted
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // NOTE(review): spv_data is presumably caller-supplied SPIR-V - confirm units of spv_data_size
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // variant selecting the shader by index plus Option
+
+public:
+    VkShaderModule shader_module() const; // read accessors for the handles set through the protected setters below
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module); // setters are reserved for subclasses
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // device passed to the constructor; exposed as a public member
+
+private: // copy constructor and assignment are private: a Pipeline cannot be copied by outside code
+    Pipeline(const Pipeline&);
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // pointer to the forward-declared private implementation
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline
+{ // Pipeline specialisation (private inheritance) set up from a VkAndroidHardwareBufferImageAllocator; compiled only for Android API >= 26
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with an explicit target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute gets access to the privately inherited Pipeline members
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // NOTE(review): type_to / rotate_from presumably mirror the create() arguments - confirm in pipeline.cpp
+    int rotate_from;
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache
+{ // hands out pipeline objects for a VulkanDevice; Option carries a pointer to one as pipeline_cache
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // shader given as raw spv_data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // NOTE(review): the pointer/reference parameters from here on are presumably outputs - confirm
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // shader selected by index plus Option
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev; // device passed to the constructor
+
+private: // copy constructor and assignment are private: a PipelineCache cannot be copied by outside code
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // pointer to the forward-declared private implementation
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/platform.h
new file mode 100644
index 0000000..46dc3b0
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/platform.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0
+#define NCNN_SIMPLEMATH 0
+#define NCNN_THREADS 1
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 0
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_GNU_INLINE_ASM 1
+#define NCNN_AVX 0
+#define NCNN_XOP 0
+#define NCNN_FMA 0
+#define NCNN_F16C 0
+#define NCNN_AVX2 0
+#define NCNN_AVXVNNI 0
+#define NCNN_AVX512 0
+#define NCNN_AVX512VNNI 0
+#define NCNN_AVX512BF16 0
+#define NCNN_AVX512FP16 0
+#define NCNN_VFPV4 1
+#define NCNN_ARM82 0
+#define NCNN_ARM82DOT 0
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1
+
+#define NCNN_VERSION_STRING "1.0.20231027"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Win32 build: exclusive lock wrapping an SRWLOCK.
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // no teardown call is made for srwlock
+    void lock() { AcquireSRWLockExclusive(&srwlock); } // acquires in exclusive mode
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes srwlock to SleepConditionVariableSRW
+    // NOTE SRWLock is available from windows vista
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Win32 build: wraps a CONDITION_VARIABLE that is waited on together with a Mutex's SRWLOCK.
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // no teardown call is made for condvar
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // no timeout (INFINITE), flags 0; the BOOL result is ignored
+    void broadcast() { WakeAllConditionVariable(&condvar); } // wakes all waiters
+    void signal() { WakeConditionVariable(&condvar); } // wakes a single waiter
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // forward declaration; the body is the friend defined inside Thread
+class NCNN_EXPORT Thread // Win32 build: runs a void*(*)(void*) entry point on a thread created with _beginthreadex.
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // _start/_args are stored before the launch because start_wrapper reads them through `this`
+    ~Thread() {} // does not close handle; only join() calls CloseHandle
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } // waits without timeout, then closes the handle
+private:
+    friend unsigned __stdcall start_wrapper(void* args) // adapts the _beginthreadex signature to the stored entry point
+    {
+        Thread* t = (Thread*)args;
+        t->_start(t->_args); // the entry point's void* result is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Win32 build: one TLS slot holding a void* per thread.
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the returned index is not checked for failure
+    ~ThreadLocalStorage() { TlsFree(key); } // frees the slot only; stored pointers are not freed here
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread build: thin wrapper over a pthread_mutex_t created with default attributes.
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // attr 0 selects the defaults; the return code is not checked
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() needs the raw pthread_mutex_t
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread build: thin wrapper over a pthread_cond_t created with default attributes.
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // the return code is not checked
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout; passes the Mutex's underlying pthread_mutex_t
+    void broadcast() { pthread_cond_broadcast(&cond); } // wakes all waiters
+    void signal() { pthread_cond_signal(&cond); } // wakes a single waiter
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread build: starts start(args) on a new thread from the constructor.
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // default attributes; the pthread_create result is not checked
+    ~Thread() {} // neither joins nor detaches the thread
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread build: one pthread key holding a void* per thread.
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // destructor argument is 0, so no per-thread cleanup callback is registered
+    ~ThreadLocalStorage() { pthread_key_delete(key); } // deletes the key only; stored pointers are not freed here
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // NCNN_THREADS disabled: same interface as the threaded Mutex, every operation is a no-op.
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // no-op
+    void unlock() {} // no-op
+};
+
+class NCNN_EXPORT ConditionVariable // NCNN_THREADS disabled: same interface as the threaded version, every operation is a no-op.
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately; nothing is waited on
+    void broadcast() {} // no-op
+    void signal() {} // no-op
+};
+
+class NCNN_EXPORT Thread // NCNN_THREADS disabled: placeholder with the threaded interface.
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the entry point is ignored and never invoked
+    ~Thread() {}
+    void join() {} // no-op
+};
+
+class NCNN_EXPORT ThreadLocalStorage // NCNN_THREADS disabled: a single void* stored in the object itself.
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {} // the stored pointer is not freed
+    void set(void* value) { data = value; }
+    void* get() { return data; } // 0 until set() has been called
+private:
+    void* data;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // Scoped lock: locks the Mutex on construction and unlocks it on destruction.
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // referenced, not owned; it must outlive the guard
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO // NCNN_LOGE(fmt, ...): printf-style diagnostic; expands to nothing when NCNN_STDIO is 0
+#include <stdio.h> // hoisted: both NCNN_LOGE variants below call fprintf, so the header must not rely on callers for it
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h> // Android variant additionally mirrors the message to logcat under tag "ncnn"
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else
+#define NCNN_LOGE(...)
+#endif
+
+
+#if NCNN_FORCE_INLINE // NCNN_FORCEINLINE: the compiler's forced-inline spelling when enabled, plain inline otherwise
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang predefines __clang__ (lowercase); the former __CLANG__ test could never be true
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplemath.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplemath.h
new file mode 100644
index 0000000..fd7fa69
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* discrete functions
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float);
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+NCNN_EXPORT float frac(float);
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float);
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float);
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+NCNN_EXPORT float erf(float);
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+NCNN_EXPORT int msb(unsigned int);
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+NCNN_EXPORT void fesetround(int);
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..54b22d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleocv.h
@@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) // generic case: plain conversion of the int to _Tp, no clamping
+{
+    return _Tp(v);
+}
+template<>
+inline uchar saturate_cast<uchar>(int v) // uchar case: clamps v into [0, UCHAR_MAX] before narrowing
+{
+    return (uchar)(v < 0 ? 0 : (v > UCHAR_MAX ? UCHAR_MAX : v));
+}
+
+template<typename _Tp>
+struct Scalar_ // Four channel values of type _Tp; channels not passed to a constructor are set to 0.
+{
+    Scalar_()
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // single-channel value; not explicit, so a bare _Tp converts implicitly
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    const _Tp operator[](const int i) const // read access by value; i is not range-checked
+    {
+        return v[i];
+    }
+
+    _Tp& operator[](const int i) // returns a reference so that s[i] = x stores into the scalar (previously a copy was returned); i is not range-checked
+    {
+        return v[i];
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2-D point with coordinates of type _Tp; default-constructed at (0, 0).
+{
+    Point_()
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const // implicit conversion to a point of another coordinate type
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y)); // saturate_cast takes an int parameter, so x and y are converted to int on the way
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // Width/height pair of type _Tp; default-constructed as 0 x 0.
+{
+    Size_()
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h) // note the argument order: width first, then height
+        : width(_w), height(_h)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const // implicit conversion to a size of another element type
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height)); // saturate_cast takes an int parameter, so both values are converted to int on the way
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // Rectangle stored as an origin (x, y) plus width and height, all of type _Tp.
+{
+    Rect_()
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const // implicit conversion to another element type; each field goes through saturate_cast, which takes an int parameter
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area
+    _Tp area() const // width * height computed in _Tp arithmetic; no overflow check
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    const _Tp left = std::max(a.x, b.x), top = std::max(a.y, b.y);
+    const Rect_<_Tp> clipped(left, top, std::min(a.x + a.width, b.x + b.width) - left,
+                             std::min(a.y + a.height, b.y + b.height) - top);
+    a = clipped;
+    // Intersection in place; rectangles that do not overlap collapse to the all-zero rectangle.
+    if (a.width <= 0 || a.height <= 0)
+        a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    // Bounding box of a and b, stored back into a; all edges are computed before a is overwritten.
+    const _Tp left = std::min(a.x, b.x), top = std::min(a.y, b.y);
+    const Rect_<_Tp> joined(left, top, std::max(a.x + a.width, b.x + b.width) - left,
+                            std::max(a.y + a.height, b.y + b.height) - top);
+    a = joined;
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a); // intersect on a copy so neither operand is modified
+    return overlap &= b;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> bounds(a); // build the bounding box on a copy so neither operand is modified
+    return bounds |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // type flags: Mat stores the value as c and sizes its buffer as rows * cols * c bytes
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE(review): same value as CV_8UC4, so the two types cannot be told apart from Mat::type()
+
+struct NCNN_EXPORT Mat // Minimal cv::Mat stand-in: rows x cols pixels of c bytes each, in a buffer shared through a reference count.
+{
+    Mat()
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // allocates an owned buffer; flags (presumably a CV_* value) becomes c
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy
+    Mat(const Mat& m) // shallow copy: shares m's buffer and bumps the reference count when there is one
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wraps caller-owned memory: refcount stays 0, so release() never frees it
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign
+    Mat& operator=(const Mat& m) // shallow assignment; shares m's buffer
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount) // take the new reference before dropping ours, in case both already share one buffer
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fills every pixel with the first c values of s; c must not exceed the 4 values a Scalar holds
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drops the current buffer and allocates a new one; contents are left uninitialized
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount)); // the result is not checked for allocation failure
+            refcount = (int*)(((uchar*)data) + totalsize); // the counter lives directly after the pixel data
+            *refcount = 1;
+        }
+    }
+
+    void release() // drops this reference and resets the Mat to empty
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1) // frees when the pre-decrement count was 1 (assumes NCNN_XADD returns the old value — TODO confirm)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a newly allocated Mat; an empty Mat clones to an empty Mat
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const // same value as channels(): the flag passed at construction
+    {
+        return c;
+    }
+
+    size_t total() const // buffer size in bytes
+    {
+        return (size_t)cols * rows * c; // multiply in size_t: avoids signed int overflow, same value whenever the int product fits
+    }
+
+    const uchar* ptr(int y) const // start of row y; y is not range-checked
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // typed pointer to the start of row y
+    {
+        return (const _Tp*)(data + y * cols * c); // offset in bytes first, then cast; scaling it by sizeof(_Tp) skipped rows for non-byte types
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)(data + y * cols * c); // byte offset first, then cast (see the const overload)
+    }
+
+    // roi
+    Mat operator()(const Rect& roi) const // returns a deep copy of the region, not a view; roi is not checked against the image bounds
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c;
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+NCNN_EXPORT Mat imdecode(const std::vector<uchar>& buf, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+template<typename T>
+const T& max(const T& a, const T& b) // larger of a and b by operator<; returns a when neither is less than the other
+{
+    return (a < b) ? b : a;
+}
+
+template<typename T>
+const T& min(const T& a, const T& b) // smaller of a and b; returns a when neither is less than the other
+{
+    return (b < a) ? b : a; // needs only operator<, matching max() and std::min, so types without operator> still work
+}
+
+template<typename T>
+void swap(T& a, T& b) // exchanges a and b using one copy construction and two copy assignments
+{
+    T temp(a);
+    a = b;
+    b = temp;
+}
+
+template<typename T1, typename T2>
+struct pair // Two public values of possibly different types.
+{
+    pair() // value-initializes both members
+        : first(), second()
+    {
+    }
+    pair(const T1& t1, const T2& t2) // copies both arguments
+        : first(t1), second(t2)
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y) // equal when both members compare equal
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y) // lexicographic: first decides, second breaks ties; uses only operator<
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) // negation of operator==
+{
+    return !(x == y);
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) // operator< with the operands swapped
+{
+    return y < x;
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y) // true unless y < x
+{
+    return !(y < x);
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y) // true unless x < y
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // builds a pair holding copies of t1 and t2, with the types deduced from the arguments
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T>
+struct node // Doubly linked list cell used by list<T>.
+{
+    node* prev_; // previous cell, 0 when unlinked
+    node* next_; // next cell, 0 when unlinked
+    T data_;
+
+    node() // unlinked cell with a value-initialized payload
+        : prev_(0), next_(0), data_()
+    {
+    }
+    node(const T& t) // unlinked cell holding a copy of t
+        : prev_(0), next_(0), data_(t)
+    {
+    }
+};
+
+template<typename T>
+struct iter_list // Bidirectional iterator over list<T>: a thin wrapper around a node pointer.
+{
+    iter_list()
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list()
+    {
+    }
+
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    T& operator*() const // const-qualified: dereferencing does not modify the iterator itself
+    {
+        return curr_->data_;
+    }
+    T* operator->() const
+    {
+        return &(curr_->data_);
+    }
+
+    bool operator==(const iter_list& i) const // iterators are equal when they point at the same node
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i) const
+    {
+        return curr_ != i.curr_;
+    }
+
+    iter_list& operator++() // pre-increment only; moves to next_ without checking for the end
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--() // pre-decrement only; moves to prev_ without checking for the start
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_;
+};
+
+template<typename T>
+struct list // Doubly linked list whose tail_ is a permanently allocated sentinel node that end() points at.
+{
+    typedef iter_list<T> iterator;
+
+    list()
+    {
+        head_ = new node<T>(); // the sentinel; while the list is empty head_ and tail_ are the same node
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // after clear() head_ is the sentinel again
+    }
+    list(const list& l) // deep copy: elements are appended one by one
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+
+    list& operator=(const list& l) // replaces the contents with copies of l's elements; self-assignment is a no-op
+    {
+        if (this == &l)
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+
+    void clear() // removes every element; the sentinel is kept
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+
+    void pop_front() // removes the first element; does nothing on an empty list
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_; // advance first, then free the old head through the back link
+            delete head_->prev_;
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const // const, yet the returned iterator gives mutable access to the elements
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // points at the sentinel
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+
+    void push_back(const T& t) // appends a copy of t just before the sentinel
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // first element: becomes head_ and links directly to the sentinel
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t); // splice between the current last element and the sentinel
+            temp->prev_ = tail_->prev_;
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+
+    iter_list<T> erase(iter_list<T> pos) // removes the element at pos and returns an iterator to the one after it; erasing end() is a no-op
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos; // step off the node before it is freed
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // park on the predecessor while the neighbours are relinked
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos; // the predecessor's next_ is now the erased node's successor
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_; // first element, or the sentinel when the list is empty
+    node<T>* tail_; // the sentinel node
+    size_t count_; // number of elements, excluding the sentinel
+};
+
+template<typename T>
+struct greater // Function object wrapping operator>.
+{
+    bool operator()(const T& x, const T& y) const // true when x > y
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // Function object wrapping operator<.
+{
+    bool operator()(const T& x, const T& y) const // true when x < y
+    {
+        return (x < y);
+    }
+};
+
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp) // leaves the (middle - first) first-ordered elements of [first, last), by comp, sorted in [first, middle)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // bubble sort: one pass moves the first-ordered element of [i, last) into *i
+        for (RandomAccessIter j = last - 1; j > i; --j) // stop at i: [first, i) is already final, so comparing below it can never swap
+        {
+            if (comp(*j, *(j - 1)))
+            {
+                swap(*j, *(j - 1));
+            }
+        }
+    }
+}
+
+template<typename T>
+struct vector // minimal std::vector stand-in; elements are relocated bitwise (memmove) whenever storage moves
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T())
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v)
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size()); // default-constructs v.size() elements, then copy-assigns over them below
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0); // destroy the current elements first
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grows with copies of value, shrinks by destroying the tail
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value);
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroys every element and releases the raw buffer
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // storage was allocated as char[] in try_alloc
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const // unchecked access
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t)
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // inserts copies of [b, e) in front of pos
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            // source range lives in this vector: read from a snapshot, since our storage is about to move
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin();
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff; // re-derive pos: try_alloc may have replaced data_
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T));
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            new (pos) T(*b); // slot was vacated by memmove or is raw storage: construct, never assign
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos) // destroys *pos, closes the gap and returns the same address
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+    void try_alloc(size_t new_size) // moves to a fresh buffer of 2 * new_size elements when the test below holds
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // zero fill; string::c_str() depends on it for the trailing NUL
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_);
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char> // char vector read back as a NUL-terminated C string
+{
+    string()
+    {
+    }
+    string(const char* str) // copies strlen(str) bytes; the terminator comes from vector's zero-filled spare capacity
+    {
+        size_t len = strlen(str);
+        resize(len);
+        memcpy(data_, str, len);
+    }
+    const char* c_str() const
+    {
+        return data_ ? (const char*)data_ : ""; // data_ is null for a default-constructed or "" string
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0; // go through c_str() so strcmp never sees a null pointer
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends str1's characters in place
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2) // returns str1 followed by str2; operands untouched
+{
+    string joined(str1);
+    joined += str2;
+    return joined;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..0a5ea9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,449 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds with old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000
+#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT            (VkStructureType)1000238001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT  (VkStructureType)1000244000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT               (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT        (VkStructureType)1000244002
+#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                      (VkStructureType)1000247000
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT         (VkBufferCreateFlagBits)0x00000010 // buffer-create bit; 0x00020000 is the buffer-usage bit below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT                  (VkBufferUsageFlagBits)0x00020000
+typedef uint64_t VkDeviceAddress;
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 memoryPriority;
+} VkPhysicalDeviceMemoryPriorityFeaturesEXT;
+typedef struct VkMemoryPriorityAllocateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    float priority;
+} VkMemoryPriorityAllocateInfoEXT;
+typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferAddressFeaturesEXT;
+typedef struct VkBufferDeviceAddressInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoEXT;
+typedef struct VkBufferDeviceAddressCreateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceSize deviceAddress;
+} VkBufferDeviceAddressCreateInfoEXT;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo);
+typedef enum VkValidationFeatureEnableEXT
+{
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0,
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+    VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1),
+    VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+typedef enum VkValidationFeatureDisableEXT
+{
+    VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0,
+    VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1,
+    VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2,
+    VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3,
+    VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4,
+    VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5,
+    VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6,
+    VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1),
+    VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+typedef struct VkValidationFeaturesEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t enabledValidationFeatureCount;
+    const VkValidationFeatureEnableEXT* pEnabledValidationFeatures;
+    uint32_t disabledValidationFeatureCount;
+    const VkValidationFeatureDisableEXT* pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#if VK_HEADER_VERSION < 121
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000
+#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000040
+#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000080 // distinct from DEVICE_COHERENT (0x40)
+typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 deviceCoherentMemory;
+} VkPhysicalDeviceCoherentMemoryFeaturesAMD;
+#endif // VK_HEADER_VERSION < 121
+
+#if VK_HEADER_VERSION < 129
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR                     (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR      (VkStructureType)1000257002
+#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR    (VkStructureType)1000257003
+#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR      (VkStructureType)1000257004
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR               (VkBufferCreateFlagBits)0x00000010 // buffer-create bit; 0x00020000 is the buffer-usage bit below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR                        (VkBufferUsageFlagBits)0x00020000
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR                            (VkMemoryAllocateFlagBits)0x00000002
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR             (VkMemoryAllocateFlagBits)0x00000004
+typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR;
+typedef struct VkBufferDeviceAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoKHR;
+typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkBufferOpaqueCaptureAddressCreateInfoKHR;
+typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkMemoryOpaqueCaptureAddressAllocateInfoKHR;
+typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceMemory memory;
+} VkDeviceMemoryOpaqueCaptureAddressInfoKHR;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo);
+#endif // VK_HEADER_VERSION < 129
+
+#if VK_HEADER_VERSION < 208
+typedef enum VkInstanceCreateFlagBits
+{
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001,
+    VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkInstanceCreateFlagBits;
+#endif // VK_HEADER_VERSION < 208
+
+#if VK_HEADER_VERSION < 255
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR   (VkStructureType)1000506000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR                 (VkStructureType)1000506001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002
+typedef enum VkComponentTypeKHR
+{
+    VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+    VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+    VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+    VK_COMPONENT_TYPE_SINT8_KHR = 3,
+    VK_COMPONENT_TYPE_SINT16_KHR = 4,
+    VK_COMPONENT_TYPE_SINT32_KHR = 5,
+    VK_COMPONENT_TYPE_SINT64_KHR = 6,
+    VK_COMPONENT_TYPE_UINT8_KHR = 7,
+    VK_COMPONENT_TYPE_UINT16_KHR = 8,
+    VK_COMPONENT_TYPE_UINT32_KHR = 9,
+    VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkComponentTypeKHR;
+typedef enum VkScopeKHR
+{
+    VK_SCOPE_DEVICE_KHR = 1,
+    VK_SCOPE_WORKGROUP_KHR = 2,
+    VK_SCOPE_SUBGROUP_KHR = 3,
+    VK_SCOPE_QUEUE_FAMILY_KHR = 5,
+    VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkScopeKHR;
+typedef struct VkCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeKHR AType;
+    VkComponentTypeKHR BType;
+    VkComponentTypeKHR CType;
+    VkComponentTypeKHR ResultType;
+    VkBool32 saturatingAccumulation;
+    VkScopeKHR scope;
+} VkCooperativeMatrixPropertiesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties);
+#endif // VK_HEADER_VERSION < 255
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release"
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn )
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..6726e95
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.8.3...3.25)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets)
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "")
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration.
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..d3ac286
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON)
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN OFF)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP)
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB)
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "")
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..4e80236
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/armeabi-v7a/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20231027
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// the alignment of all the allocated buffers
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// we have some optimized kernels that may overread buffer a bit in loop
+// it is common to interleave next-loop data load with arithmetic instructions
+// allocating more bytes keeps us safe from SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the nearest address that is a multiple of n bytes
+// ptr Pointer to align; it is returned unchanged when already aligned
+// n Alignment size that must be a power of two (defaults to sizeof(_Tp))
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n);
+}
+
+// Aligns a buffer size to the specified number of bytes
+// The function returns the minimum number that is greater or equal to sz and is divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n;
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size)
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0;
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
+    adata[-1] = udata;
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr)
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1];
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// exchange-add operation for atomic operations on reference counters
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0;
+    virtual void fastFree(void* ptr) = 0;
+};
+
+class PoolAllocatorPrivate;
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&);
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d;
+};
+
+class UnlockedPoolAllocatorPrivate;
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // ratio range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d;
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by the allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified internally by command functions
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by the allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by the allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified internally by command functions
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified internally by command functions
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected:
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate;
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&);
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d;
+};
+
+class VkWeightAllocatorPrivate;
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&);
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d;
+};
+
+class VkStagingAllocatorPrivate;
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // ratio range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&);
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d;
+};
+
+class VkWeightStagingAllocatorPrivate;
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d;
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init();
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb;
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// get the current timestamp in ms
+NCNN_EXPORT double get_current_time();
+
+// sleep for the given number of milliseconds (1000 when omitted)
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob
+{
+public:
+    // constructs an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as output
+    int producer;
+    // index of the layer that needs this blob as input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t
+{
+    void* pthis;
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t
+{
+    void* pthis;
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t
+{
+    void* pthis;
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t
+{
+    void* pthis;
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt);
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt);
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t
+{
+    ncnn_layer_creator_t creator;
+    ncnn_layer_destroyer_t destroyer;
+    void* userdata;
+    ncnn_net_custom_layer_factory_t next;
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t
+{
+    void* pthis;
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory;
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT    0
+#define NCNN_BORDER_REPLICATE   1
+#define NCNN_BORDER_REFLECT     2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT VkCompute // declares record_* operations over Mat/VkMat/VkImageMat plus submit_and_wait() and reset()
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev); // explicit: no implicit conversion from a VulkanDevice pointer
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt); // Mat source, VkMat destination
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt); // Mat source, VkImageMat destination
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt); // VkMat source, Mat destination
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt); // VkImageMat source, Mat destination
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt); // record_clone: eight overloads cover every Mat/VkMat/VkImageMat source-destination pair except Mat -> Mat
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // VkMat-only bindings
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); // VkImageMat-only bindings
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); // mixed bindings, VkMat dispatcher
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher); // mixed bindings, VkImageMat dispatcher
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher); // mixed bindings, Mat dispatcher
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // NOTE(review): int status; presumably 0 on success - confirm in the implementation
+
+    int reset(); // NOTE(review): int status; presumably 0 on success - confirm in the implementation
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results); // results is a non-const reference, i.e. an output argument
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev; // accessible to subclasses; NOTE(review): presumably the constructor argument - confirm
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding); // declared for VkImageMat only; there is no VkMat overload
+
+private:
+    VkComputePrivate* const d; // const pointer to the forward-declared implementation type; cannot be reseated after construction
+};
+
+class VkTransferPrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT VkTransfer // declares record_upload() for VkMat/VkImageMat destinations plus submit_and_wait()
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev); // explicit: no implicit conversion from a VulkanDevice pointer
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true; this parameter has no counterpart in the VkImageMat overload
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait(); // NOTE(review): int status; presumably 0 on success - confirm in the implementation
+
+protected:
+    const VulkanDevice* vkdev; // accessible to subclasses; NOTE(review): presumably the constructor argument - confirm
+
+private:
+    VkTransferPrivate* const d; // const pointer to the forward-declared implementation type; cannot be reseated after construction
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of cpu indices; the data member that stores it is selected per platform below
+{
+public:
+    CpuSet();
+    void enable(int cpu);
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public: // data members are public; a target matching none of the three conditions below gets no data member at all
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask; // Windows (non-MinGW) builds only
+#endif
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // Android/Linux builds only; cpu_set_t comes from <sched.h>
+#endif
+#if __APPLE__
+    unsigned int policy; // Apple builds only
+#endif
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// return 0 if success for setter function
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper: base class whose virtual scan/read/reference are re-declared by the DataReaderFrom* classes below
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (declared only when NCNN_STRING is enabled)
+    // return 1 if scan success
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data into buf
+    // return bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get model data reference through the buf out-pointer
+    // return bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader constructed from a FILE*; available only when NCNN_STDIO is enabled
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp); // NOTE(review): ownership of fp is not visible here - confirm whether the caller must fclose it
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not re-declared, so the DataReader version is inherited
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy this class
+    DataReaderFromStdio(const DataReaderFromStdio&);
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&);
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared implementation type
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader constructed from an in-memory byte pointer
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // pointer taken by non-const reference; NOTE(review): presumably advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const;
+    virtual size_t reference(size_t size, const void** buf) const; // the only DataReaderFrom* class in this header that re-declares reference()
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy this class
+    DataReaderFromMemory(const DataReaderFromMemory&);
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&);
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared implementation type
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader constructed from an AAsset*; needs NCNN_PLATFORM_API and __ANDROID_API__ >= 9
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset); // NOTE(review): ownership of asset is not visible here - confirm whether the caller must close it
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // reference() is not re-declared, so the DataReader version is inherited
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy this class
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared implementation type
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create the VkInstance and initialize the objects that GPU computation needs.
+// This creates a VkInstance object, checks which extensions the Vulkan instance supports,
+// initializes and creates the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance();
+
+// Get global VkInstance variable
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy VkInstance object and free the memory of the associated object
+// Usually called during teardown, when the main program exits
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT GpuInfo // read-only view: every public member function below (apart from ctor/dtor) is a const accessor
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const; // returns a non-const pointer from a const accessor; NOTE(review): presumably meant to be read-only - confirm
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const;
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // extension capability (these return int, unlike the bool feature accessors above; NOTE(review): meaning of the value is not visible here - confirm)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy this class
+    GpuInfo(const GpuInfo&);
+    GpuInfo& operator=(const GpuInfo&);
+
+private:
+    friend int create_gpu_instance(); // grants create_gpu_instance() access to the private members, including d
+    GpuInfoPrivate* const d; // const pointer to the forward-declared implementation type
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator;
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // only forward-declared here; its definition is not part of this header
+class NCNN_EXPORT VulkanDevice // per-device handle: const helper methods plus public extension function-pointer members
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index()); // not explicit, so an int converts implicitly; default index comes from get_default_gpu_index()
+    ~VulkanDevice(); // non-virtual destructor
+
+    const GpuInfo& info; // reference member: bound at construction and cannot be rebound afterwards
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helper for creating pipeline (each takes a pointer argument for the created object and returns an int status)
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // NOTE(review): presumably must be balanced by reclaim_queue() - confirm
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device (acquire_*/reclaim_* are declared in pairs)
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image (returned by value)
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: four overloads cover every VkMat/VkImageMat source-destination pair
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (public function-pointer members; NOTE(review): presumably filled in by init_device_extension() - confirm)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_buffer_device_address
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
+    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
+    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+    // VK_EXT_buffer_device_address
+    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension
+    int init_device_extension();
+
+private: // copy constructor and copy assignment are declared private, so outside code cannot copy this class
+    VulkanDevice(const VulkanDevice&);
+    VulkanDevice& operator=(const VulkanDevice&);
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the forward-declared implementation type
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv
+class NCNN_EXPORT ShaderInfo // all-public int fields, no member functions; passed by reference to resolve_shader_info()
+{
+public:
+    int specialization_count;
+    int binding_count; // NOTE(review): presumably the number of valid entries in binding_types - confirm
+    int push_constant_count;
+
+    // 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // 16 is large enough I think ...
+
+    int reserved_0; // reserved_0..reserved_3: purpose not visible in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer.h
new file mode 100644
index 0000000..f0418a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer.h
@@ -0,0 +1,222 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer
+{
+public:
+    // default constructor
+    Layer();
+    // virtual destructor: layers are deleted through a Layer pointer (see DEFINE_LAYER_DESTROYER)
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific cleanup
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // capability flag: one input and one output blob
+    bool one_blob_only;
+
+    // capability flag: support inplace inference
+    bool support_inplace;
+
+    // capability flag: support vulkan compute
+    bool support_vulkan;
+
+    // capability flag: accept input blob with packed storage
+    bool support_packing;
+
+    // capability flag: accept bf16
+    bool support_bf16_storage;
+
+    // capability flag: accept fp16
+    bool support_fp16_storage;
+
+    // capability flag: accept int8
+    bool support_int8_storage;
+
+    // capability flag: shader image storage
+    bool support_image_storage;
+
+    // capability flag: shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // reserved flag slot, no meaning is documented in this header
+
+    bool support_reserved_0; // support_reserved_0..support_reserved_9: reserved flag slots
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set (NOTE(review): presumably a bitmask - confirm)
+    int featmask;
+
+public:
+    // implement inference on Mat blobs (multi-blob and single-blob forms)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference on Mat blobs: the same blob is both input and output
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs, using the VkCompute command object
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs, using the VkCompute command object
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer (member exists only when NCNN_VULKAN is enabled)
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    std::string type;
+    // layer name (member exists only when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint for the input blobs (bottom_shapes) and output blobs (top_shapes)
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function types; the void* argument is the userdata slot (see DEFINE_LAYER_CREATOR / DEFINE_LAYER_DESTROYER)
+typedef Layer* (*layer_creator_func)(void*); // creates a layer and returns it
+typedef void (*layer_destroyer_func)(Layer*, void*); // disposes of the given layer
+
+struct layer_registry_entry // registry record: optional type name plus the creator function
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry: function that creates the layer
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // registry record for a custom layer: creator, destroyer and userdata
+{
+#if NCNN_STRING
+    // layer type name (member exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry: creator builds the layer, destroyer disposes of it
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the pointer handed to creator/destroyer as their void* argument - confirm
+};
+
+struct overwrite_builtin_layer_registry_entry // registry record keyed by type index instead of name
+{
+    // layer type index (NOTE(review): presumably the builtin layer being replaced - confirm)
+    int typeindex;
+    // layer factory entry: creator builds the layer, destroyer disposes of it
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the pointer handed to creator/destroyer as their void* argument - confirm
+};
+
+#if NCNN_STRING
+// get layer type index from type name (declared only when NCNN_STRING is enabled)
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name (declared only when NCNN_STRING is enabled)
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type index
+NCNN_EXPORT Layer* create_layer(int index);
+// defines name##_layer_creator: matches layer_creator_func, ignores userdata and returns `new name`
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    {                                                       \
+        return new name;                                    \
+    }
+// defines name##_layer_destroyer: matches layer_destroyer_func, ignores userdata and deletes the layer
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    {                                                                     \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // wrapper namespace: enumerators are spelled LayerShaderType::X
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list comes from the cmake-generated header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..aac8803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,5 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // wrapper namespace: enumerators are spelled LayerType::X
+enum LayerType
+{
+#include "layer_type_enum.h" // builtin layer indices come from the cmake-generated header
+    CustomBit = (1 << 8), // value 256 (NOTE(review): presumably a flag marking custom layer type indices - confirm)
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..97153ed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/layer_type_enum.h
@@ -0,0 +1,109 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+CumulativeSum = 98,
+CopyTo = 99,
+Erf = 100,
+Diag = 101,
+CELU = 102,
+Shrink = 103,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the matrix container: vec (w), image (w, h), dim (w, h, c) or cube (w, h, d, c)
+class NCNN_EXPORT Mat
+{
+public:
+    // empty
+    Mat();
+    // vec
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // image
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // dim
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // cube
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // packed vec
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy
+    Mat(const Mat& m);
+    // external vec (wraps caller-provided data)
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external image
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external dim
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // external packed vec
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed image
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed dim
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // external packed cube
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // release
+    ~Mat();
+    // assign
+    Mat& operator=(const Mat& m);
+    // set all elements to the given value; the SIMD overloads below exist only on matching targets
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif //__loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like another Mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like a VkMat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference to a single channel, depth slice or row
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // range reference: first index plus count
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data through an implicit pointer conversion
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16, // conversion codes below: source format in the low 16 bits, target format shifted left by 16
+        PIXEL_FORMAT_MASK = 0x0000ffff,
+        PIXEL_CONVERT_MASK = 0xffff0000,
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w; // extents, named as in the constructors: vec (w), image (w, h), dim (w, h, c), cube (w, h, d, c)
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between consecutive channels - confirm unit against the implementation
+};
+
+#if NCNN_VULKAN
+
+// the matrix container, vulkan version: same vec / image / dim / cube shapes as Mat, with data held as a VkBufferMemory*
+class NCNN_EXPORT VkMat
+{
+public:
+    // empty
+    VkMat();
+    // vec
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkMat(const VkMat& m);
+    // external vec (wraps caller-provided VkBufferMemory)
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkMat();
+    // assign
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like another VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped (NOTE(review): presumably a host-visible view of the buffer - confirm when it is valid)
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference to the underlying VkBuffer, its offset and its capacity
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w; // extents, named as in the constructors: vec (w), image (w, h), dim (w, h, c), cube (w, h, d, c)
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between consecutive channels - confirm unit against the implementation
+};
+
+// Vulkan image-backed mat: holds a VkImageMemory* together with the element
+// layout (elemsize, elempack) and the shape (dims, w, h, d, c).
+// Unlike VkMat it carries no cstep member.
+// NOTE(review): copy, assignment, addref() and release() are only declared here;
+// they presumably share ownership through refcount - confirm in the definitions.
+class NCNN_EXPORT VkImageMat
+{
+public:
+    // empty
+    VkImageMat();
+    // vec (w)
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // image (w, h)
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // packed vec (w) with explicit elempack
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed image (w, h) with explicit elempack
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed dim (w, h, c) with explicit elempack
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // packed cube (w, h, d, c) with explicit elempack
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy
+    VkImageMat(const VkImageMat& m);
+    // external vec: wraps an existing VkImageMemory passed in as data
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // external packed vec
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed image
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed dim
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // external packed cube
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // release
+    ~VkImageMat();
+    // assign
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a host Mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a VkMat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like another VkImageMat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    // NOTE(review): presumably a host-side view / pointer of the image memory - confirm when it is valid
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    // only declared when NCNN_PLATFORM_API is set and __ANDROID_API__ >= 26
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // NULL when data points to user-allocated (externally owned) memory
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    // extent along each axis: width, height, depth, channels
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// types for vulkan specialization constants and push constants
+// vk_specialization_type overlays an int, a float and a uint32_t in one slot
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+// overlays an int and a float in one slot
+// NOTE(review): presumably the push-constant counterpart of vk_specialization_type - confirm
+union vk_constant_type
+{
+    int i;
+    float f;
+};
+#endif // NCNN_VULKAN
+
+// misc function
+#if NCNN_PIXEL
+// convert yuv420sp (nv21) to rgb, the fast approximate version; the result is written to rgb
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp (nv12) to rgb, the fast approximate version; the result is written to rgb
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp (nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize: src is srcw x srch, dst is w x h
+// NOTE(review): the _c1.._c4 suffix presumably is the number of interleaved channels per pixel - confirm
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with stride (bytes-per-row) parameters: srcstride for src, stride for dst
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenience wrapper for yuv420sp (nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the from type, 6 means rotating from 6 to 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate: src is srcw x srch, dst is w x h
+// type is the "from" orientation described in the table above
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with stride (bytes-per-row) parameters: srcstride for src, stride for dst
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenience wrapper for yuv420sp (nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve the affine transform matrix from rotation angle, scale factor and x/y offset; the result is written to tm
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve the affine transform matrix from two sets of points, num_point must be >= 2; the result is written to tm
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix of tm; the result is written to tm_inv
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+// NOTE(review): -233 equals BORDER_TRANSPARENT and is presumably passed as type, with v carrying the border color - confirm
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride (bytes-per-row) parameters, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenience wrapper for yuv420sp (nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// All drawing helpers below modify the caller's pixel buffer (pixels / yuv420sp) in place.
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride (bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenience wrapper for yuv420sp (nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride (bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenience wrapper for yuv420sp (nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line from (x0, y0) to (x1, y1), the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride (bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenience wrapper for yuv420sp (nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size; the result is written through w and h
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride (bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenience wrapper for yuv420sp (nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point, returned as its 16-bit pattern
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point (given as its 16-bit pattern) to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half: keep the upper 16 bits of the float32 bit pattern
+// (plain truncation, the lower 16 bits are dropped without rounding)
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float through its 32-bit pattern
+    union
+    {
+        unsigned int u;
+        float f;
+    } bits;
+    bits.f = value;
+
+    // 16 : 16
+    const unsigned int high = bits.u >> 16;
+    return (unsigned short)high;
+}
+// convert brain half to float: place the 16 bits in the upper half of a
+// float32 bit pattern and leave the lower half zero
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    // widen to unsigned before shifting: a bare `value << 16` promotes value to
+    // signed int, and for value >= 0x8000 the shift runs into the sign bit
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+// border mode identifiers
+// BORDER_TRANSPARENT uses -233, the same value the warpaffine comments name for a transparent border
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    BORDER_TRANSPARENT = -233,
+};
+// Free-function helpers on Mat: each reads src and writes its result into dst.
+// Every helper takes a trailing Option that defaults to a default-constructed Option().
+// NOTE(review): the int type parameter of copy_make_border* presumably takes a BorderType value - confirm
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+// empty mat: null data, null refcount, every size field zero
+NCNN_FORCEINLINE Mat::Mat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+// vec: start from the empty state, then forward to create(_w, ...)
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _elemsize, _allocator);
+}
+
+// image: start from the empty state, then forward to create(_w, _h, ...)
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _elemsize, _allocator);
+}
+
+// dim: start from the empty state, then forward to create(_w, _h, _c, ...)
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// cube: start from the empty state, then forward to create(_w, _h, _d, _c, ...)
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// packed vec: start from the empty state, then forward to the elempack-aware create()
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _elemsize, _elempack, _allocator);
+}
+
+// packed image: start from the empty state, then forward to the elempack-aware create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// packed dim: start from the empty state, then forward to the elempack-aware create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// packed cube: start from the empty state, then forward to the elempack-aware create()
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0),
+      dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    this->create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// copy: take over every field of m (the buffer is shared, not duplicated)
+// and add one reference when m is reference counted
+NCNN_FORCEINLINE Mat::Mat(const Mat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator),
+      dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    // same effect as addref()
+    if (refcount)
+        NCNN_XADD(refcount, 1);
+}
+
+// external vec: wrap caller-supplied memory; refcount stays null, so release()
+// never frees _data
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1)
+{
+    // single channel holding w elements
+    cstep = (size_t)w;
+}
+
+// external image: wrap caller-supplied memory; refcount stays null, so release()
+// never frees _data
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // single channel holding one w*h plane, no padding
+    const size_t plane = (size_t)w * h;
+    cstep = plane;
+}
+
+// external dim: wrap caller-supplied memory; refcount stays null, so release()
+// never frees _data
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements: the byte size of one w*h plane goes through
+    // alignSize(..., 16) (presumably rounding up to a multiple of 16 bytes)
+    const size_t plane_bytes = (size_t)w * h * elemsize;
+    cstep = alignSize(plane_bytes, 16) / elemsize;
+}
+
+// external cube: wrap caller-supplied memory; refcount stays null, so release()
+// never frees _data
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements: the byte size of one w*h*d volume goes through
+    // alignSize(..., 16) (presumably rounding up to a multiple of 16 bytes)
+    const size_t volume_bytes = (size_t)w * h * d * elemsize;
+    cstep = alignSize(volume_bytes, 16) / elemsize;
+}
+
+// external packed vec: like the external vec, with a caller-chosen elempack
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(1), w(_w), h(1), d(1), c(1)
+{
+    // single channel holding w elements
+    cstep = (size_t)w;
+}
+
+// external packed image: like the external image, with a caller-chosen elempack
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // single channel holding one w*h plane, no padding
+    const size_t plane = (size_t)w * h;
+    cstep = plane;
+}
+
+// external packed dim: like the external dim, with a caller-chosen elempack
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // channel stride in elements: the byte size of one w*h plane goes through
+    // alignSize(..., 16) (presumably rounding up to a multiple of 16 bytes)
+    const size_t plane_bytes = (size_t)w * h * elemsize;
+    cstep = alignSize(plane_bytes, 16) / elemsize;
+}
+
+// external packed cube: like the external cube, with a caller-chosen elempack
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator),
+      dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // channel stride in elements: the byte size of one w*h*d volume goes through
+    // alignSize(..., 16) (presumably rounding up to a multiple of 16 bytes)
+    const size_t volume_bytes = (size_t)w * h * d * elemsize;
+    cstep = alignSize(volume_bytes, 16) / elemsize;
+}
+
+// destructor: drops this mat's reference via release(); the buffer itself is
+// only freed there when this was the last reference
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    release();
+}
+
+// set every element to _v, treating the buffer as total() floats
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    // keep the count in size_t: total() returns size_t, and truncating it to int
+    // would skip or shorten the fill once a mat holds more than INT_MAX elements
+    const size_t size = total();
+    float* ptr = (float*)data;
+
+    size_t i = 0;
+#if __ARM_NEON
+    // store four floats per iteration; the scalar loop below handles the rest
+    float32x4_t _c = vdupq_n_f32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_f32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+// set every element to _v, treating the buffer as total() ints
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    // keep the count in size_t: total() returns size_t, and truncating it to int
+    // would skip or shorten the fill once a mat holds more than INT_MAX elements
+    const size_t size = total();
+    int* ptr = (int*)data;
+
+    size_t i = 0;
+#if __ARM_NEON
+    // store four ints per iteration; the scalar loop below handles the rest
+    int32x4_t _c = vdupq_n_s32(_v);
+    for (; i + 3 < size; i += 4)
+    {
+        vst1q_s32(ptr, _c);
+        ptr += 4;
+    }
+#endif // __ARM_NEON
+    for (; i < size; i++)
+    {
+        *ptr++ = _v;
+    }
+}
+
+#if __ARM_NEON
+// fill with a 4-lane float vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1q_f32(dst, _v);
+        // step over the 4 floats just written
+        dst += 4;
+    }
+}
+
+// fill with a 4-lane 16-bit vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    unsigned short* dst = (unsigned short*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1_u16(dst, _v);
+        // step over the 4 values just written
+        dst += 4;
+    }
+}
+
+// fill with a 4-lane int vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    int* dst = (int*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1q_s32(dst, _v);
+        // step over the 4 ints just written
+        dst += 4;
+    }
+}
+
+// fill with a pair of 4-lane int vectors: each element receives _v0 followed
+// by _v1 (8 ints), total() times
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    int* dst = (int*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1q_s32(dst, _v0);
+        vst1q_s32(dst + 4, _v1);
+        // step over the 8 ints just written
+        dst += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// fill with a 4-lane fp16 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1_f16(dst, _v);
+        // step over the 4 halves just written
+        dst += 4;
+    }
+}
+
+// fill with an 8-lane fp16 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vst1q_f16(dst, _v);
+        // step over the 8 halves just written
+        dst += 8;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+// fill with a 16-lane float vector: stores _v (unaligned store) once per
+// element, total() times
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        _mm512_storeu_ps(dst, _v);
+        // step over the 16 floats just written
+        dst += 16;
+    }
+}
+#endif // __AVX512F__
+// fill with an 8-lane float vector: stores _v (unaligned store) once per
+// element, total() times
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // _i carries no value: old gcc cannot overload on __m128 vs __m256, so the
+    // dummy int only gives this overload a different mangled function symbol
+    (void)_i;
+
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        _mm256_storeu_ps(dst, _v);
+        // step over the 8 floats just written
+        dst += 8;
+    }
+}
+#endif // __AVX__
+// fill with a 4-lane float vector: stores _v (unaligned store) once per
+// element, total() times
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        _mm_storeu_ps(dst, _v);
+        // step over the 4 floats just written
+        dst += 4;
+    }
+}
+// fill with a 128-bit integer vector: stores _v once per element (8 16-bit
+// values each), total() times
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        // unaligned store, matching the other SSE/AVX overloads: data may be
+        // caller-supplied memory (external-data constructors) with no 16-byte
+        // alignment guarantee, which the aligned store would require
+        _mm_storeu_si128((__m128i*)ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+// fill with a 4-lane MSA float vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        // the store intrinsic takes the vector as v4i32
+        __msa_st_w((v4i32)_v, dst, 0);
+        // step over the 4 floats just written
+        dst += 4;
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+// fill with a 4-lane LSX float vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        __lsx_vst(_v, dst, 0);
+        // step over the 4 floats just written
+        dst += 4;
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+// fill with an RVV float32 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    // packn = cpu_riscv_vlenb() / 4 floats per stored element
+    // NOTE(review): cpu_riscv_vlenb() presumably is the vector register length in bytes - confirm
+    const int packn = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(packn);
+
+    float* dst = (float*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vse32_v_f32m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+
+// fill with an RVV uint16 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    // packn = cpu_riscv_vlenb() / 2 16-bit values per stored element
+    // NOTE(review): cpu_riscv_vlenb() presumably is the vector register length in bytes - confirm
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    unsigned short* dst = (unsigned short*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vse16_v_u16m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+
+// fill with an RVV int8 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    // packn = cpu_riscv_vlenb() / 1 bytes per stored element
+    // NOTE(review): cpu_riscv_vlenb() presumably is the vector register length in bytes - confirm
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    signed char* dst = (signed char*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vse8_v_i8m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+#if __riscv_zfh
+// fill with an RVV fp16 vector: stores _v once per element, total() times
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    // packn = cpu_riscv_vlenb() / 2 halves per stored element
+    // NOTE(review): cpu_riscv_vlenb() presumably is the vector register length in bytes - confirm
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    __fp16* dst = (__fp16*)data;
+    for (int left = (int)total(); left > 0; left--)
+    {
+        vse16_v_f16m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+// generic fill: set every element to _v, treating the buffer as total() values of type T
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    // keep the count in size_t: total() returns size_t, and truncating it to int
+    // would skip or shorten the fill once a mat holds more than INT_MAX elements
+    // (reachable with byte-sized T on 64-bit targets)
+    const size_t size = total();
+    T* ptr = (T*)data;
+    for (size_t i = 0; i < size; i++)
+    {
+        ptr[i] = _v;
+    }
+}
+
+// assign: make this mat refer to m's buffer (shared, not copied)
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    // self-assignment leaves everything as it is
+    if (this != &m)
+    {
+        // reference m's buffer first, then let go of what this mat held before
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+        release();
+
+        // buffer handle and element layout
+        data = m.data;
+        refcount = m.refcount;
+        elemsize = m.elemsize;
+        elempack = m.elempack;
+        allocator = m.allocator;
+
+        // shape
+        dims = m.dims;
+        w = m.w;
+        h = m.h;
+        d = m.d;
+        c = m.c;
+        cstep = m.cstep;
+    }
+
+    return *this;
+}
+
+// refcount++ (a mat without a reference counter is left alone)
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// refcount--: drop this mat's reference, free the buffer when the decrement
+// reports 1, then reset this mat to the empty state
+NCNN_FORCEINLINE void Mat::release()
+{
+    if (refcount)
+    {
+        // NOTE(review): NCNN_XADD presumably returns the value before the
+        // update, so 1 means this was the last reference - confirm
+        if (NCNN_XADD(refcount, -1) == 1)
+        {
+            // free through the custom allocator when one is attached
+            if (allocator)
+                allocator->fastFree(data);
+            else
+                fastFree(data);
+        }
+    }
+
+    // forget the buffer; the allocator member is left untouched
+    data = 0;
+    refcount = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+    cstep = 0;
+}
+
+// true when there is no buffer or the mat holds zero elements
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// element count including per-channel padding: channel stride times channel count
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    const size_t channels = (size_t)c;
+    return cstep * channels;
+}
+
+// bits per scalar: element size in bits divided by the pack count, 0 when elempack is 0
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    if (elempack == 0)
+        return 0;
+
+    return static_cast<int>(elemsize * 8) / elempack;
+}
+
+// shape only: a data-less mat (null pointer) with the same rank, where the
+// outermost axis is multiplied by elempack; empty mat for any other rank
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        break;
+    }
+
+    return Mat();
+}
+
+// view of channel _c sharing this mat's memory; the view has a null refcount
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    // channel _c starts cstep * _c elements into the buffer
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    // external-data constructor with (w, h, d) as its three sizes
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    // a channel of a 4-D mat gets one w*h plane as its cstep
+    if (dims == 4)
+        view.cstep = (size_t)w * h;
+    return view;
+}
+
+// const overload: view of channel _c sharing this mat's memory; the view has a null refcount
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    // channel _c starts cstep * _c elements into the buffer
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    // external-data constructor with (w, h, d) as its three sizes
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    // a channel of a 4-D mat gets one w*h plane as its cstep
+    if (dims == 4)
+        view.cstep = (size_t)w * h;
+    return view;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    return Mat(w, h, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // w*h plane number z
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    return Mat(w, h, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator); // w*h plane number z
+}
+
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    return (float*)((unsigned char*)data + elemsize * ((size_t)w * y)); // rows are w elements apart
+}
+
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    return (const float*)((unsigned char*)data + elemsize * ((size_t)w * y)); // rows are w elements apart
+}
+
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    return (T*)((unsigned char*)data + elemsize * ((size_t)w * y)); // byte offset uses elemsize, not sizeof(T)
+}
+
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    return (const T*)((unsigned char*)data + elemsize * ((size_t)w * y)); // byte offset uses elemsize, not sizeof(T)
+}
+
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    Mat sub(w, h, d, channels, (unsigned char*)data + elemsize * (cstep * _c), elemsize, elempack, allocator);
+    sub.dims = dims; // the 4-D constructor is used for every rank, so restore the parent's rank
+    return sub;
+}
+
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    Mat sub(w, h, d, channels, (unsigned char*)data + elemsize * (cstep * _c), elemsize, elempack, allocator);
+    sub.dims = dims; // the 4-D constructor is used for every rank, so restore the parent's rank
+    return sub;
+}
+
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    Mat slab(w, h, depths, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator);
+    slab.cstep = (size_t)w * h; // replace the stride chosen by the constructor with exactly one w*h plane
+    return slab;
+}
+
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    Mat slab(w, h, depths, (unsigned char*)data + elemsize * ((size_t)w * h * z), elemsize, elempack, allocator);
+    slab.cstep = (size_t)w * h; // replace the stride chosen by the constructor with exactly one w*h plane
+    return slab;
+}
+
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    return Mat(w, rows, (unsigned char*)data + elemsize * ((size_t)w * y), elemsize, elempack, allocator); // rows [y, y + rows)
+}
+
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    return Mat(w, rows, (unsigned char*)data + elemsize * ((size_t)w * y), elemsize, elempack, allocator); // rows [y, y + rows)
+}
+
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    return Mat(n, (unsigned char*)data + (size_t)x * elemsize, elemsize, elempack, allocator); // n elements starting at x
+}
+
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    return Mat(n, (unsigned char*)data + (size_t)x * elemsize, elemsize, elempack, allocator); // n elements starting at x
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    return static_cast<T*>(data); // reinterpret the raw buffer; no element-size check is made
+}
+
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    return static_cast<const T*>(data); // reinterpret the raw buffer; no element-size check is made
+}
+
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    return *((float*)data + i); // flat float indexing from the start of the buffer, no bounds check
+}
+
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    return *((const float*)data + i); // flat float indexing from the start of the buffer, no bounds check
+}
+
+#if NCNN_VULKAN
+
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{ // empty VkMat: every field is zero and no buffer is attached
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator); // 1-D: fields start zeroed, then the matching create() overload takes over
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator); // 2-D
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // 3-D
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // 4-D
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // 1-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // 2-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // 3-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // 4-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // shallow copy: share m's buffer and bump the shared counter (if any)
+
+    cstep = m.cstep; // cstep is not in the initializer list, so copy it here
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // 1-D wrapper around caller-supplied memory; refcount 0 means release() never frees it
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying so large shapes cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements, bytes rounded via alignSize(.., 16)
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // same as 3-D, with d planes per channel
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{
+    cstep = w; // 1-D wrapper with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    cstep = (size_t)w * h; // widen before multiplying so large shapes cannot overflow int
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize; // channel stride in elements, bytes rounded via alignSize(.., 16)
+}
+
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize; // same as 3-D, with d planes per channel
+}
+
+NCNN_FORCEINLINE VkMat::~VkMat()
+{
+    release(); // drop this object's reference; release() frees the buffer only when the count says so
+}
+
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference first, before release() drops the old one
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // drop the reference to the current buffer and zero every field
+
+    data = m.data; // shallow copy: both VkMats now refer to the same buffer memory
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable) // empty VkMat or non-mappable allocator: return a default Mat
+        return Mat();
+
+    if (dims == 1) // the returned Mat wraps mapped_ptr() with a null allocator, so it does not own the memory
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // unsupported dims value
+}
+
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable) // guard the dereferences below; null means "not mapped"
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->offset; // this VkMat starts data->offset bytes into the mapping
+}
+
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (refcount != 0) // no counter attached means there is nothing to bump
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably XADD yields the old count, so 1 means last owner - confirm
+    {
+        if (allocator && data) // unlike Mat::release there is no allocator-less fallback
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0; // from here on every field is reset, whether or not the buffer was freed
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    return !data || total() == 0; // no buffer attached, or a buffer of zero elements
+}
+
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    return c * cstep; // channel count times the per-channel element stride
+}
+
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // 0 when elempack is unset
+}
+
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    switch (dims) // null-data Mat of the same rank; elempack is folded into the outermost axis
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+
+    return Mat(); // any other dims value (including 0) yields a default Mat
+}
+
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    return data->buffer; // no null check: data must be set before calling
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    return data->offset; // same offset that mapped_ptr() adds to the mapping base
+}
+
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    return data->capacity; // no null check: data must be set before calling
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{ // empty VkImageMat: every field is zero and no image is attached
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator); // 1-D: fields start zeroed, then the matching create() overload takes over
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator); // 2-D
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator); // 3-D
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator); // 4-D
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator); // 1-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator); // 2-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator); // 3-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator); // 4-D with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{
+    addref(); // shallow copy: share m's image and bump the shared counter (if any)
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // 1-D wrapper around caller-supplied image memory; refcount 0 means release() never frees it
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // 2-D wrapper, elempack fixed to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // 3-D wrapper, elempack fixed to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // 4-D wrapper, elempack fixed to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // 1-D wrapper with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // 2-D wrapper with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // 3-D wrapper with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // 4-D wrapper with an explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release(); // drop this object's reference; release() frees the image only when the count says so
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (this == &m) // self-assignment: nothing to do
+        return *this;
+
+    if (m.refcount) // take the new reference first, before release() drops the old one
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // drop the reference to the current image and zero every field
+
+    data = m.data; // shallow copy: both VkImageMats now refer to the same image memory
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // empty, non-mappable or unmapped: default Mat
+        return Mat();
+
+    if (dims == 1) // the returned Mat wraps mapped_ptr() with a null allocator, so it does not own the memory
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // unsupported dims value
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // guard the dereferences; null means "not mapped"
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // the image starts bind_offset bytes into the mapping
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (refcount != 0) // no counter attached means there is nothing to bump
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably XADD yields the old count, so 1 means last owner - confirm
+    {
+        if (allocator && data) // nothing is freed when either pointer is null
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    data = 0; // from here on every field is reset, whether or not the image was freed
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0;
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return !data || total() == 0; // no image attached, or an image of zero elements
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return (size_t)w * h * d * c; // widen first: the product of four ints can overflow before reaching size_t
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack == 0 ? 0 : static_cast<int>(elemsize) * 8 / elempack; // 0 when elempack is unset
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    switch (dims) // null-data Mat of the same rank; elempack is folded into the outermost axis
+    {
+    case 1: return Mat(w * elempack, (void*)0);
+    case 2: return Mat(w, h * elempack, (void*)0);
+    case 3: return Mat(w, h, c * elempack, (void*)0);
+    case 4: return Mat(w, h, d, c * elempack, (void*)0);
+    default: break;
+    }
+
+    return Mat(); // any other dims value (including 0) yields a default Mat
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image; // no null check: data must be set before calling
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview; // no null check: data must be set before calling
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // values accepted by the "type" argument of every load() overload:
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load a 1-D blob of w elements
+    virtual Mat load(int w, int type) const;
+    // load a 2-D blob of w x h elements
+    virtual Mat load(int w, int h, int type) const;
+    // load a 3-D blob of w x h x c elements
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load a 4-D blob of w x h x d x c elements
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr); // dr is taken by reference - presumably it must outlive this object, confirm
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is overridden in this class
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // private copy constructor: copying is not part of the API
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&); // private copy assignment, same reason
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
+{
+public:
+    // construct from an array of weight blobs (Mat objects)
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // only the 1-D overload is overridden in this class
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // private copy constructor: copying is not part of the API
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&); // private copy assignment, same reason
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE /* when this is defined, both markers expand to nothing */
+#  define NCNN_EXPORT
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library (same expansion as when building it) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED /* can be pre-defined by the includer to override the attribute */
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED: this section is compiled out */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // construct an empty net
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle; ownership is not transferred
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr); // plain-text param form; only available when NCNN_STRING is enabled
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must stay valid while the net is in use
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenience: load network structure from an android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenience: load network structure from an android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenience: load network weight data from an android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif // NCNN_STRING
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private:
+    Net(const Net&); // private copy constructor: copying a Net is not part of the API
+    Net& operator=(const Net&); // private copy assignment, same reason
+
+private:
+    NetPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const; // the protected constructor below is reachable only through this friend
+    Extractor(const Net* net, size_t blob_count);
+
+private:
+    ExtractorPrivate* const d; // pointer to the implementation class, which is only forward-declared here
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option // settings holder: one constructor plus public data members
+{
+public:
+    // construct an Option holding the default settings
+    Option();
+
+public:
+    // light mode
+    // intermediate blobs are recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (vulkan builds only)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (vulkan builds only)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (vulkan builds only)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache (vulkan builds only)
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // how long openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms, which keeps the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improves convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improves convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // uses the low-precision int8 path for quantized models
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improves most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improves all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8; // NOTE(review): undocumented in this header; presumably a vulkan shader packing switch - confirm
+
+    // subgroup options
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // NOTE(review): no documented meaning; presumably a spare slot - confirm
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator; // NOTE(review): undocumented in this header - confirm semantics before relying on it
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7; // NOTE(review): use_reserved_7..11 carry no documented meaning; presumably spare slots - confirm
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict // id-indexed parameter store; NCNN_MAX_PARAM_COUNT above allows at most 32 parameters
+{
+public:
+    // construct an empty dictionary
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy-construct from another dictionary
+    ParamDict(const ParamDict&);
+
+    // copy-assign from another dictionary
+    ParamDict& operator=(const ParamDict&);
+
+    // get the type recorded for parameter id
+    int type(int id) const;
+
+    // get parameter id as int (def is presumably returned when id is unset - confirm)
+    int get(int id, int def) const;
+    // get parameter id as float (def: see the int overload)
+    float get(int id, float def) const;
+    // get parameter id as array (def: see the int overload)
+    Mat get(int id, const Mat& def) const;
+
+    // store an int under parameter id
+    void set(int id, int i);
+    // store a float under parameter id
+    void set(int id, float f);
+    // store an array under parameter id
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // gives Net access to the protected loaders below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // implementation state; the type is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline // vulkan compute pipeline wrapper; only declared when NCNN_VULKAN is enabled
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public: // local workgroup size setup and pipeline creation
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // from caller-supplied SPIR-V words
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // from a shader selected by index
+
+public: // read accessors for the vulkan handles and shader info
+    VkShaderModule shader_module() const;
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected: // setters matching the accessors above, reachable from subclasses only
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // device pointer; the pointee is const, the member itself is public
+
+private: // copy operations are declared private, so outside code cannot copy a Pipeline
+    Pipeline(const Pipeline&);
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // implementation state; the type is only forward-declared in this header
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline // private base: Pipeline's members are not exposed to users of this class
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with an explicit target size
+    void destroy();
+
+    friend class VkCompute; // VkCompute may reach the privately inherited Pipeline part
+
+protected: // creation steps; presumably invoked from create() - confirm in the implementation
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to;
+    int rotate_from;
+    bool need_resize; // NOTE(review): presumably set by the create() overload that takes a target size - confirm
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache // only declared when NCNN_VULKAN is enabled
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // shader given as SPIR-V words
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // out-parameters from here on
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // shader selected by index
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // out-parameters from here on
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected: // building blocks available to subclasses
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev;
+
+private: // copy operations are declared private, so outside code cannot copy a PipelineCache
+    PipelineCache(const PipelineCache&);
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // implementation state; the type is only forward-declared in this header
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/platform.h
new file mode 100644
index 0000000..b5f4337
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/platform.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1 // 1: NCNN_LOGE (defined near the end of this header) prints to stderr
+#define NCNN_STRING 1 // 1: the by-name overloads guarded by "#if NCNN_STRING" are declared
+#define NCNN_SIMPLEOCV 0 // 0: simpleocv.h keeps its contents behind "#if NCNN_SIMPLEOCV"
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0 // 0: this header includes the standard <algorithm>/<list>/<vector>/<string>
+#define NCNN_SIMPLEMATH 0 // 0: this header includes <math.h> and <fenv.h>
+#define NCNN_THREADS 1 // 1: Mutex/ConditionVariable/Thread below wrap Win32 or pthread primitives
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1 // 1: together with __ANDROID_API__ >= 8, NCNN_LOGE also writes to the Android log
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 0 // 0: the "#if NCNN_VULKAN" sections of the ncnn headers are skipped
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_GNU_INLINE_ASM 1
+#define NCNN_AVX 1
+#define NCNN_XOP 1
+#define NCNN_FMA 1
+#define NCNN_F16C 1
+#define NCNN_AVX2 1
+#define NCNN_AVXVNNI 1
+#define NCNN_AVX512 1
+#define NCNN_AVX512VNNI 1
+#define NCNN_AVX512BF16 1
+#define NCNN_AVX512FP16 1
+#define NCNN_VFPV4 0
+#define NCNN_ARM82 0
+#define NCNN_ARM82DOT 0
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1 // 1: NCNN_FORCEINLINE asks for forced inlining where the compiler supports it
+
+#define NCNN_VERSION_STRING "1.0.20231027"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Win32 (non-MinGW) variant built on an SRWLOCK
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // nothing to release here
+    void lock() { AcquireSRWLockExclusive(&srwlock); } // always taken in exclusive mode
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // wait() needs direct access to srwlock
+    // NOTE: SRWLock is available from Windows Vista onwards
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Win32 (non-MinGW) variant built on CONDITION_VARIABLE
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // nothing to release here
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // no timeout; result is not checked
+    void broadcast() { WakeAllConditionVariable(&condvar); }
+    void signal() { WakeConditionVariable(&condvar); }
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // trampoline handed to _beginthreadex; defined as a friend inside Thread
+class NCNN_EXPORT Thread // Win32 (non-MinGW) variant; the thread is started by the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); }
+    ~Thread() {} // does not wait for the thread or close the handle; join() does both
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); }
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args; // args is the Thread object passed as "this" above
+        t->_start(t->_args);
+        return 0; // the start routine's void* result is discarded
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Win32 (non-MinGW) variant: one TLS slot holding a void*
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // result of TlsAlloc is not checked
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key;
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread variant
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0 = default mutex attributes
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); } // return code is not checked
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // wait() needs direct access to the pthread mutex
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread variant
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); } // 0 = default condition attributes
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout; return code is not checked
+    void broadcast() { pthread_cond_broadcast(&cond); }
+    void signal() { pthread_cond_signal(&cond); }
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread variant; the thread is started by the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // return code of pthread_create is not checked
+    ~Thread() {} // does not join or detach; call join() explicitly
+    void join() { pthread_join(t, 0); } // the thread's return value is discarded
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread variant: one pthread key holding a void*
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // 0 = no destructor callback for stored values
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // stand-in used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {} // no-op
+    void unlock() {} // no-op
+};
+
+class NCNN_EXPORT ConditionVariable // stand-in used when NCNN_THREADS is 0: every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately
+    void broadcast() {}
+    void signal() {}
+};
+
+class NCNN_EXPORT Thread // stand-in used when NCNN_THREADS is 0
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {} // the start routine is ignored and never called
+    ~Thread() {}
+    void join() {} // no-op
+};
+
+class NCNN_EXPORT ThreadLocalStorage // stand-in used when NCNN_THREADS is 0: a plain member holds the value
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data; // one pointer per object, not per thread
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scoped lock: locks in the constructor, unlocks in the destructor
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // held by reference: the Mutex must outlive the guard; NOTE(review): copying the guard is not disabled
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO // logging on: NCNN_LOGE(fmt, ...) writes one line to stderr, plus the Android log when available
+#include <stdio.h> // fprintf/stderr are used by both NCNN_LOGE variants, so include it for both
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else // logging off: NCNN_LOGE expands to nothing
+#define NCNN_LOGE(...)
+#endif
+
+
+#if NCNN_FORCE_INLINE // pick the strongest inlining request the compiler offers
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang predefines __clang__ (lowercase); normally shadowed by the __GNUC__ branch above
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else
+    #define NCNN_FORCEINLINE inline
+#endif
+#else // forced inlining disabled: fall back to a plain inline hint
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplemath.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplemath.h
new file mode 100644
index 0000000..fd7fa69
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* discrete functions (un-suffixed names such as fabs/floor also use float here)
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float);
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+NCNN_EXPORT float frac(float); // NOTE(review): presumably the fractional part of the argument - confirm
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions (sqrt is declared with float here as well)
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float);
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions (frexp and log use float here as well)
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float);
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions (erf is declared with float here as well)
+* ====================================================
+*/
+NCNN_EXPORT float erf(float);
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+NCNN_EXPORT int msb(unsigned int); // NOTE(review): presumably the index of the most significant set bit - confirm
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+NCNN_EXPORT void fesetround(int); // returns void here
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..54b22d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleocv.h
@@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum // image loading modes; NOTE(review): presumably consumed by an imread-style function outside this view - confirm
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum // image writing parameter ids; only the JPEG quality id is defined
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+template<typename _Tp>
+static inline _Tp saturate_cast(int v) // generic case; note the argument is always an int
+{
+    return _Tp(v); // plain conversion, no clamping
+}
+template<>
+inline uchar saturate_cast<uchar>(int v) // uchar specialization: clamps to [0, UCHAR_MAX]
+{
+    return (uchar)((unsigned)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); // the unsigned compare rejects negatives too
+}
+
+template<typename _Tp>
+struct Scalar_ // fixed four-component value; components not supplied to a constructor are set to 0
+{
+    Scalar_() // all four components zero
+    {
+        v[0] = 0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0) // first component only (not explicit, so it also acts as a conversion)
+    {
+        v[0] = _v0;
+        v[1] = 0;
+        v[2] = 0;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2) // first three components
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = 0;
+    }
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3) // all four components
+    {
+        v[0] = _v0;
+        v[1] = _v1;
+        v[2] = _v2;
+        v[3] = _v3;
+    }
+
+    const _Tp operator[](const int i) const // returns a copy; i is not range-checked
+    {
+        return v[i];
+    }
+
+    _Tp operator[](const int i) // also returns a copy, so "s[i] = x" does not modify v
+    {
+        return v[i];
+    }
+
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2D point with public x/y
+{
+    Point_() // origin
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const // implicit conversion to a Point_ of another element type
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y)); // saturate_cast takes an int argument
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // width/height pair with public members
+{
+    Size_() // zero size
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const // implicit conversion to a Size_ of another element type
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height)); // saturate_cast takes an int argument
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as corner (x, y) plus width/height
+{
+    Rect_() // all-zero rectangle
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // corner point plus size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const // implicit conversion; each field goes through saturate_cast<_Tp2>
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area, computed in _Tp (width * height)
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // in-place intersection of a with b
+{
+    _Tp x1 = std::max(a.x, b.x), y1 = std::max(a.y, b.y); // top-left corner of the overlap
+    a.width = std::min(a.x + a.width, b.x + b.width) - x1; // uses the old a.x, so it must run before a.x is updated
+    a.height = std::min(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    if (a.width <= 0 || a.height <= 0) // no overlap: collapse to the all-zero rectangle
+        a = Rect_<_Tp>();
+    return a;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b) // in-place bounding box of a and b
+{
+    _Tp x1 = std::min(a.x, b.x), y1 = std::min(a.y, b.y); // top-left corner of the bounding box
+    a.width = std::max(a.x + a.width, b.x + b.width) - x1; // uses the old a.x, so it must run before a.x is updated
+    a.height = std::max(a.y + a.height, b.y + b.height) - y1;
+    a.x = x1;
+    a.y = y1;
+    return a; // empty operands are not special-cased: an all-zero rectangle still extends the box to the origin
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // intersection; leaves both operands untouched
+{
+    Rect_<_Tp> c = a; // work on a copy so operator&= can modify it
+    return c &= b;
+}
+
+template<typename _Tp>
+static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b) // bounding box; leaves both operands untouched
+{
+    Rect_<_Tp> c = a; // work on a copy so operator|= can modify it
+    return c |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1 // Mat stores the flag value directly in its c member
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4 // NOTE(review): same value as CV_8UC4, so the two flags cannot be told apart
+
+struct NCNN_EXPORT Mat // minimal cv::Mat stand-in: reference-counted, row-packed byte buffer
+{
+    Mat() // empty matrix: no buffer, all dimensions zero
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+
+    Mat(int _rows, int _cols, int flags) // owning matrix; flags is stored in c by create()
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy constructor: shares m's buffer (no pixel copy) and bumps the shared counter
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount) // a null counter means there is nothing to count (see refcount member)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+
+    Mat(int _rows, int _cols, int flags, void* _data) // wrap caller memory; refcount stays null so release() never frees it
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assignment: share m's buffer and drop this matrix's previous one
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount) // take the new reference before release() drops the old one
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+
+    Mat& operator=(const Scalar& s) // fill: every pixel gets s[0..c-1], one byte written per channel
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j]; // value is narrowed to uchar on store
+                }
+            }
+        }
+
+        return *this;
+    }
+
+    void create(int _rows, int _cols, int flags) // drop the current buffer, then allocate rows*cols*flags bytes
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2; // round up to a multiple of 4
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize); // counter is stored right after the pixel bytes
+            *refcount = 1;
+        }
+    }
+
+    void release() // give up this reference and reset to the empty state
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably XADD yields the pre-decrement value, so 1 means last owner - TODO confirm
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+
+    Mat clone() const // deep copy into a freshly allocated matrix of the same rows/cols/c
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+
+    bool empty() const // true when there is no buffer or the byte count is zero
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const // returns c, the value passed as flags
+    {
+        return c;
+    }
+
+    int type() const // same value as channels(): this class keeps a single field for both
+    {
+        return c;
+    }
+
+    size_t total() const // cols * rows * c, multiplied in int before conversion to size_t
+    {
+        return cols * rows * c;
+    }
+
+    const uchar* ptr(int y) const // start of row y; rows are packed with a stride of cols * c bytes
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y) // mutable variant of the row accessor above
+    {
+        return data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    const _Tp* ptr(int y) const // typed accessor: the offset y * cols * c is counted in _Tp elements, not bytes
+    {
+        return (const _Tp*)data + y * cols * c;
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y) // mutable variant of the typed accessor above
+    {
+        return (_Tp*)data + y * cols * c;
+    }
+
+    // roi: returns a new matrix holding a copy of the rectangle; roi is not bounds-checked here
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y; // source row, advanced in step with the destination row y
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c; // skip roi.x pixels into the source row
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data; // first byte of the pixel buffer, or 0 when empty
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c; // the flags value given at construction; used as the per-pixel byte count in all offsets
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+NCNN_EXPORT Mat imdecode(const std::vector<uchar>& buf, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+// larger of the two arguments, returned by reference; a wins when a is not less than b
+template<typename T> const T& max(const T& a, const T& b)
+{
+    if (a < b) return b;
+    return a;
+}
+
+// smaller of the two arguments, returned by reference; a wins when a is not greater than b
+template<typename T> const T& min(const T& a, const T& b)
+{
+    if (a > b) return b;
+    return a;
+}
+
+// exchange two values through one temporary: a copy-construction plus two assignments
+template<typename T> void swap(T& a, T& b)
+{
+    T saved(a);
+    a = b;
+    b = saved;
+}
+
+// minimal stand-in for std::pair:
+// two public members and two constructors, nothing else
+template<typename T1, typename T2>
+struct pair
+{
+    // declared first-then-second; the constructors initialize them in that order
+    T1 first;
+    T2 second;
+
+    // value-initialize both members
+    pair() : first(), second() {}
+
+    // copy each member from the corresponding argument
+    pair(const T1& t1, const T2& t2) : first(t1), second(t2) {}
+};
+
+template<typename T1, typename T2> // equal when both members compare equal
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    if (!(x.first == y.first)) return false;
+    return x.second == y.second;
+}
+// lexicographic order: first decides, second only breaks ties
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    if (x.first < y.first) return true;
+    if (y.first < x.first) return false;
+    return x.second < y.second;
+}
+template<typename T1, typename T2> bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x == y); // derived from operator==
+}
+template<typename T1, typename T2> bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return y < x; // derived from operator< with the operands exchanged
+}
+template<typename T1, typename T2> bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(y < x); // derived from operator<
+}
+template<typename T1, typename T2> bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y)
+{
+    return !(x < y); // derived from operator<
+}
+
+// build a pair whose member types are deduced from the two arguments
+template<typename T1, typename T2> pair<T1, T2> make_pair(const T1& t1, const T2& t2)
+{
+    typedef pair<T1, T2> result_type;
+    return result_type(t1, t2);
+}
+
+// doubly linked cell used by list<T>:
+// two neighbour links plus the stored value
+template<typename T>
+struct node
+{
+    // member order is unchanged: it fixes both layout and initialization order
+    node* prev_;
+    node* next_;
+    T data_;
+
+    // unlinked cell holding a value-initialized T
+    node() : prev_(0), next_(0), data_() {}
+
+    // unlinked cell holding a copy of t
+    node(const T& t) : prev_(0), next_(0), data_(t) {}
+};
+
+// bidirectional cursor over a chain of node<T>; the destructor is empty, so it never frees the node
+template<typename T>
+struct iter_list
+{
+    node<T>* curr_; // node currently referenced
+
+    // default: refers to no node
+    iter_list() : curr_(0) {}
+    // refer to the given node
+    iter_list(node<T>* n) : curr_(n) {}
+    // copy: refer to the same node as i
+    iter_list(const iter_list& i) : curr_(i.curr_) {}
+    ~iter_list() {}
+
+    // rebind this cursor to the node i refers to
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+
+    // value stored in the current node; curr_ is dereferenced without a null check
+    T& operator*()
+    {
+        return curr_->data_;
+    }
+    // address of the value stored in the current node
+    T* operator->()
+    {
+        T& value = curr_->data_;
+        return &value;
+    }
+
+    // cursors compare equal exactly when they reference the same node
+    bool operator==(const iter_list& i)
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i)
+    {
+        return !(curr_ == i.curr_);
+    }
+
+    // pre-increment: follow the next_ link
+    iter_list& operator++()
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    // pre-decrement: follow the prev_ link
+    iter_list& operator--()
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+};
+
+template<typename T>
+struct list // doubly linked list that always keeps one sentinel node at tail_ (the end() position)
+{
+    typedef iter_list<T> iterator;
+
+    list() // empty list: head_ and tail_ both point at the sentinel
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list() // frees every element, then the sentinel (head_ == tail_ once clear() is done)
+    {
+        clear();
+        delete head_;
+    }
+    list(const list& l) // deep copy: own sentinel, then append a copy of each element of l
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+
+    list& operator=(const list& l) // replace the contents with copies of l's elements
+    {
+        if (this == &l) // self-assignment: clear() below would destroy the source
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+
+    void clear() // remove all elements; the sentinel node is kept
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+
+    void pop_front() // remove the first element; no-op on an empty list
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_; // advance first, then free the old head through prev_
+            delete head_->prev_;
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const // number of stored elements (the sentinel is not counted)
+    {
+        return count_;
+    }
+    iter_list<T> begin() const // first element; equals end() when the list is empty
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const // the sentinel node, one past the last element
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+
+    void push_back(const T& t) // append a copy of t just before the sentinel
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // first element becomes the new head, linked to the sentinel
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t);
+            temp->prev_ = tail_->prev_; // splice between the current last element and the sentinel
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+
+    iter_list<T> erase(iter_list<T> pos) // remove the element at pos; returns the following position (pos itself when pos == end())
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos; // move off the node before it is deleted
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // step to the predecessor while the links are rewired
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos; // predecessor's next_ now skips temp, so this lands on the successor
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_; // first element, or the sentinel when the list is empty
+    node<T>* tail_; // sentinel node; allocated in the constructors, freed in the destructor
+    size_t count_;  // number of stored elements
+};
+
+// comparison functor wrapping operator>,
+// e.g. as the Compare argument of partial_sort
+template<typename T>
+struct greater
+{
+    // true when x compares strictly greater than y
+    bool operator()(const T& x, const T& y) const { return x > y; }
+};
+
+// comparison functor wrapping operator<,
+// e.g. as the Compare argument of partial_sort
+template<typename T>
+struct less
+{
+    // true when x compares strictly less than y
+    bool operator()(const T& x, const T& y) const { return x < y; }
+};
+
+// Reorders [first, last) so that [first, middle) holds its smallest elements in comp order.
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        // bubble the smallest remaining element down to slot i; [first, i) is already
+        // final, so the scan stops at i instead of re-walking the sorted prefix
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+                swap(*j, *(j - 1));
+        }
+    }
+}
+
+template<typename T>
+struct vector // minimal std::vector stand-in: raw char storage plus placement-constructed elements
+{
+    vector() // empty vector, no storage allocated
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T()) // new_size copies of value
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v) // deep copy: default-fill via resize(), then assign element by element
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v) // replace the contents with copies of v's elements
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0); // destroy the current elements before refilling
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T()) // grow with copies of value, or destroy the surplus tail
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value); // placement-construct into the raw storage
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear() // destroy all elements and free the storage
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_; // storage came from new char[] in try_alloc, so it is freed as char[]
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const // pointer to the storage; 0 when nothing has been allocated
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const // unchecked element access; mutable reference even through a const vector
+    {
+        return data_[i];
+    }
+    T* begin() const // raw pointers serve as iterators
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t) // append a copy of t
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e) // insert the range [b, e) before pos
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            //the same vector
+            v = new vector(*this); // snapshot, because try_alloc/memmove below would disturb the source range
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin(); // remember pos as an offset: try_alloc may move the storage
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T)); // open a gap by shifting the tail bitwise
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b; // NOTE(review): assigns into gap slots that were never constructed - presumably fine only for trivial T
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos) // destroy *pos and close the gap; returns pos, which now holds the next element
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;         // raw storage obtained with new char[]
+    size_t size_;     // number of constructed elements
+    size_t capacity_; // number of T slots in the storage
+    void try_alloc(size_t new_size) // reallocate when new_size * 3 / 2 exceeds capacity_ / 2
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); // fresh storage is zero-filled
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_); // elements are relocated bitwise, no constructors run
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+// std::string stand-in built on vector<char>. An empty string owns no buffer (data_ == 0),
+// so every C-string view goes through c_str(), which substitutes "" for the missing buffer.
+struct NCNN_EXPORT string : public vector<char>
+{
+    string() {}
+    string(const char* str) // copy the characters of a NUL-terminated C string
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len > 0) memcpy(data_, str, len); // for "" data_ is still NULL: never memcpy into it
+    }
+    const char* c_str() const // NUL-terminated view; never NULL, even for an empty string
+    {
+        return data_ ? (const char*)data_ : "";
+    }
+    bool operator==(const string& str2) const
+    {
+        return strcmp(c_str(), str2.c_str()) == 0; // c_str() keeps NULL away from strcmp
+    }
+    bool operator==(const char* str2) const
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // append str1's characters at the end
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string joined(str1); // start from a copy of the left operand
+    joined += str2;      // operator+= appends via insert(end(), begin, end)
+    return joined;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..0a5ea9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,449 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds with old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000
+#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT            (VkStructureType)1000238001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT  (VkStructureType)1000244000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT               (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT        (VkStructureType)1000244002
+#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                      (VkStructureType)1000247000
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT         (VkBufferCreateFlagBits)0x00020000
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT                  (VkBufferUsageFlagBits)0x00020000
+typedef uint64_t VkDeviceAddress;
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 memoryPriority;
+} VkPhysicalDeviceMemoryPriorityFeaturesEXT;
+typedef struct VkMemoryPriorityAllocateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    float priority;
+} VkMemoryPriorityAllocateInfoEXT;
+typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferAddressFeaturesEXT;
+typedef struct VkBufferDeviceAddressInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoEXT;
+typedef struct VkBufferDeviceAddressCreateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceSize deviceAddress;
+} VkBufferDeviceAddressCreateInfoEXT;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo);
+typedef enum VkValidationFeatureEnableEXT
+{
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0,
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+    VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1),
+    VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+typedef enum VkValidationFeatureDisableEXT
+{
+    VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0,
+    VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1,
+    VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2,
+    VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3,
+    VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4,
+    VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5,
+    VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6,
+    VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1),
+    VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+typedef struct VkValidationFeaturesEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t enabledValidationFeatureCount;
+    const VkValidationFeatureEnableEXT* pEnabledValidationFeatures;
+    uint32_t disabledValidationFeatureCount;
+    const VkValidationFeatureDisableEXT* pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101 // supply the NV cooperative-matrix declarations when the Vulkan header version is below 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV // element types, numbered 0..10: three float widths, then four signed and four unsigned integer widths
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1), // evaluates to 11
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF // sentinel: largest positive 32-bit signed value
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5, // note: value 4 is not declared in this enum
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1), // evaluates to 5 although only four scopes are declared
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF // sentinel: largest positive 32-bit signed value
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV // element type of the pProperties array taken by PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#if VK_HEADER_VERSION < 121 // supply the AMD device-coherent-memory declarations when the Vulkan header version is below 121
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000
+#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000040
+#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000080 // own bit per the Vulkan headers; must not alias DEVICE_COHERENT (0x40)
+typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 deviceCoherentMemory;
+} VkPhysicalDeviceCoherentMemoryFeaturesAMD;
+#endif // VK_HEADER_VERSION < 121
+
+#if VK_HEADER_VERSION < 129 // supply the KHR buffer-device-address declarations when the Vulkan header version is below 129
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR                     (VkStructureType)1000244001 // NOTE(review): base 1000244 differs from the 1000257 siblings; believed inherited from the EXT variant -- confirm against vulkan_core.h
+#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR      (VkStructureType)1000257002
+#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR    (VkStructureType)1000257003
+#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR      (VkStructureType)1000257004
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR               (VkBufferCreateFlagBits)0x00000010 // create-flag bit per the Vulkan headers; 0x00020000 is the buffer-usage bit below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR                        (VkBufferUsageFlagBits)0x00020000
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR                            (VkMemoryAllocateFlagBits)0x00000002
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR             (VkMemoryAllocateFlagBits)0x00000004
+typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR;
+typedef struct VkBufferDeviceAddressInfoKHR // argument type of both PFN_vkGetBufferDeviceAddressKHR and PFN_vkGetBufferOpaqueCaptureAddressKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoKHR;
+typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkBufferOpaqueCaptureAddressCreateInfoKHR;
+typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkMemoryOpaqueCaptureAddressAllocateInfoKHR;
+typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR // argument type of PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceMemory memory;
+} VkDeviceMemoryOpaqueCaptureAddressInfoKHR;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo);
+#endif // VK_HEADER_VERSION < 129
+
+#if VK_HEADER_VERSION < 208 // supply VkInstanceCreateFlagBits when the Vulkan header version is below 208
+typedef enum VkInstanceCreateFlagBits
+{
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001, // the only flag bit declared by this fallback
+    VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF // sentinel: largest positive 32-bit signed value
+} VkInstanceCreateFlagBits;
+#endif // VK_HEADER_VERSION < 208
+
+#if VK_HEADER_VERSION < 255 // supply the KHR cooperative-matrix declarations when the Vulkan header version is below 255
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR   (VkStructureType)1000506000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR                 (VkStructureType)1000506001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002
+typedef enum VkComponentTypeKHR // element types, numbered 0..10: three float widths, then four signed and four unsigned integer widths
+{
+    VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+    VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+    VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+    VK_COMPONENT_TYPE_SINT8_KHR = 3,
+    VK_COMPONENT_TYPE_SINT16_KHR = 4,
+    VK_COMPONENT_TYPE_SINT32_KHR = 5,
+    VK_COMPONENT_TYPE_SINT64_KHR = 6,
+    VK_COMPONENT_TYPE_UINT8_KHR = 7,
+    VK_COMPONENT_TYPE_UINT16_KHR = 8,
+    VK_COMPONENT_TYPE_UINT32_KHR = 9,
+    VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF // sentinel: largest positive 32-bit signed value
+} VkComponentTypeKHR;
+typedef enum VkScopeKHR
+{
+    VK_SCOPE_DEVICE_KHR = 1,
+    VK_SCOPE_WORKGROUP_KHR = 2,
+    VK_SCOPE_SUBGROUP_KHR = 3,
+    VK_SCOPE_QUEUE_FAMILY_KHR = 5, // note: value 4 is not declared in this enum
+    VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF // sentinel: largest positive 32-bit signed value
+} VkScopeKHR;
+typedef struct VkCooperativeMatrixPropertiesKHR // element type of the pProperties array taken by PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeKHR AType;
+    VkComponentTypeKHR BType;
+    VkComponentTypeKHR CType;
+    VkComponentTypeKHR ResultType;
+    VkBool32 saturatingAccumulation;
+    VkScopeKHR scope;
+} VkCooperativeMatrixPropertiesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties);
+#endif // VK_HEADER_VERSION < 255
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release": points the target at lib/libncnn.so under _IMPORT_PREFIX.
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn ) # NOTE(review): the repository .gitignore excludes *.so, so libncnn.so must be supplied separately
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..6726e95
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake -- defines the IMPORTED shared-library target "ncnn" and loads its per-configuration files.
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.8.3...3.25)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets) # every expected target already exists: clean up and leave early
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "") # only some targets exist: treated as a fatal inconsistency
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file (four PATH steps: this file's directory, then three levels up).
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration (every ncnn-*.cmake file next to this one).
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files registered in _cmake_import_check_files_for_<target> and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..d3ac286
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON) # build-time feature switches recorded for this package; they select the dependency lookups below
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN OFF) # with this OFF the whole if(NCNN_VULKAN) section below is skipped
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP) # not REQUIRED: a missing OpenMP package does not abort here
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB)
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "") # NOTE(review): empty, so the includes below would resolve to "/OSDependentTargets.cmake" etc.; not reached with the switches set above
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) # finally define the imported ncnn target itself
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..4e80236
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,11 @@
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20231027
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/allocator.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/allocator.h
new file mode 100644
index 0000000..3a5ebca
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/allocator.h
@@ -0,0 +1,448 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_ALLOCATOR_H
+#define NCNN_ALLOCATOR_H
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+#include "platform.h"
+
+#include <stdlib.h>
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+#include <android/hardware_buffer.h>
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// byte alignment requested for allocated buffers: 64 when NCNN_AVX512, 32 when NCNN_AVX, otherwise 16
+#if NCNN_AVX512
+#define NCNN_MALLOC_ALIGN 64
+#elif NCNN_AVX
+#define NCNN_MALLOC_ALIGN 32
+#else
+#define NCNN_MALLOC_ALIGN 16
+#endif
+
+// some optimized kernels may read slightly past the end of a buffer inside their loops
+// (it is common to interleave the next iteration's loads with arithmetic instructions);
+// allocating this many extra bytes keeps such over-reads from causing a SEGV_ACCERR failure
+#define NCNN_MALLOC_OVERREAD 64
+
+// Rounds a pointer up to the next multiple of n bytes and returns the aligned pointer
+// ptr Pointer to align
+// n Alignment size that must be a power of two (defaults to sizeof(_Tp))
+template<typename _Tp>
+static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
+{
+    return (_Tp*)(((size_t)ptr + n - 1) & -n); // add n-1, then mask with -n to clear the low bits
+}
+
+// Rounds a buffer size up to the specified number of bytes
+// Returns the smallest value that is greater than or equal to sz and divisible by n
+// sz Buffer size to align
+// n Alignment size that must be a power of two
+static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
+{
+    return (sz + n - 1) & -n; // add n-1, then mask with -n to clear the low bits
+}
+
+static NCNN_FORCEINLINE void* fastMalloc(size_t size) // allocates size bytes aligned to NCNN_MALLOC_ALIGN; release with fastFree
+{
+#if _MSC_VER
+    return _aligned_malloc(size, NCNN_MALLOC_ALIGN); // NOTE(review): unlike the other branches, no NCNN_MALLOC_OVERREAD padding is added here
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+    void* ptr = 0;
+    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
+        ptr = 0; // non-zero return from posix_memalign: report failure as a null pointer
+    return ptr;
+#elif __ANDROID__ && __ANDROID_API__ < 17
+    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
+#else
+    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD); // extra room for one stored pointer plus alignment slack
+    if (!udata)
+        return 0;
+    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN); // skip one pointer slot, then align
+    adata[-1] = udata; // remember the raw malloc pointer just before the aligned block; fastFree reads it back
+    return adata;
+#endif
+}
+
+static NCNN_FORCEINLINE void fastFree(void* ptr) // releases a pointer obtained from fastMalloc; a null ptr is ignored
+{
+    if (ptr)
+    {
+#if _MSC_VER
+        _aligned_free(ptr);
+#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
+        free(ptr);
+#elif __ANDROID__ && __ANDROID_API__ < 17
+        free(ptr);
+#else
+        unsigned char* udata = ((unsigned char**)ptr)[-1]; // raw malloc pointer that fastMalloc stored just before the aligned block
+        free(udata);
+#endif
+    }
+}
+
+#if NCNN_THREADS
+// exchange-add operation for atomic operations on reference counters
+#if defined __riscv && !defined __riscv_atomic
+// riscv target without A extension: plain read-modify-write, returns the value held before the addition
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
+// atomic increment on the linux version of the Intel(tm) compiler
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
+#elif defined __GNUC__
+#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
+#ifdef __ATOMIC_ACQ_REL
+#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
+#endif
+#else
+#if defined __ATOMIC_ACQ_REL && !defined __clang__
+// version for gcc >= 4.7
+#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
+#else
+#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
+#endif
+#endif
+#elif defined _MSC_VER && !defined RC_INVOKED
+#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
+#else
+// thread-unsafe branch: plain read-modify-write, returns the value held before the addition
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif
+#else  // NCNN_THREADS
+static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta) // threads disabled: plain (non-atomic) fetch-and-add
+{
+    int tmp = *addr;
+    *addr += delta;
+    return tmp;
+}
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT Allocator // abstract allocation interface: both member functions are pure virtual
+{
+public:
+    virtual ~Allocator();
+    virtual void* fastMalloc(size_t size) = 0; // implemented by subclasses such as PoolAllocator
+    virtual void fastFree(void* ptr) = 0;
+};
+
+class PoolAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT PoolAllocator : public Allocator
+{
+public:
+    PoolAllocator();
+    ~PoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    PoolAllocator(const PoolAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    PoolAllocator& operator=(const PoolAllocator&);
+
+private:
+    PoolAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+class UnlockedPoolAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT UnlockedPoolAllocator : public Allocator // NOTE(review): name suggests a PoolAllocator variant without locking -- confirm in the implementation
+{
+public:
+    UnlockedPoolAllocator();
+    ~UnlockedPoolAllocator();
+
+    // size compare ratio, valid range 0 ~ 1
+    // default cr = 0
+    void set_size_compare_ratio(float scr);
+
+    // budget drop threshold
+    // default threshold = 10
+    void set_size_drop_threshold(size_t);
+
+    // release all budgets immediately
+    void clear();
+
+    virtual void* fastMalloc(size_t size);
+    virtual void fastFree(void* ptr);
+
+private:
+    UnlockedPoolAllocator(const UnlockedPoolAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
+
+private:
+    UnlockedPoolAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+
+class NCNN_EXPORT VkBufferMemory // plain record (all members public) describing one buffer allocation handed out by a VkAllocator
+{
+public:
+    VkBuffer buffer;
+
+    // the base offset assigned by allocator
+    size_t offset;
+    size_t capacity;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // buffer state, modified by command functions internally (mutable so it can change through const access)
+    mutable VkAccessFlags access_flags;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkImageMemory // plain record (all members public) describing one image allocation handed out by a VkAllocator
+{
+public:
+    VkImage image;
+    VkImageView imageview;
+
+    // underlying info assigned by allocator
+    int width;
+    int height;
+    int depth;
+    VkFormat format;
+
+    VkDeviceMemory memory;
+    void* mapped_ptr;
+
+    // the base offset assigned by allocator
+    size_t bind_offset;
+    size_t bind_capacity;
+
+    // image state, modified by command functions internally (mutable so it can change through const access)
+    mutable VkAccessFlags access_flags;
+    mutable VkImageLayout image_layout;
+    mutable VkPipelineStageFlags stage_flags;
+
+    // in-execution state, modified by command functions internally
+    mutable int command_refcount;
+
+    // initialized and modified by mat
+    int refcount;
+};
+
+class NCNN_EXPORT VkAllocator // abstract base: the buffer and image fastMalloc/fastFree overloads are pure virtual
+{
+public:
+    explicit VkAllocator(const VulkanDevice* _vkdev);
+    virtual ~VkAllocator();
+
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size) = 0; // buffer allocation, paired with fastFree(VkBufferMemory*)
+    virtual void fastFree(VkBufferMemory* ptr) = 0;
+    virtual int flush(VkBufferMemory* ptr);
+    virtual int invalidate(VkBufferMemory* ptr);
+
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0; // image allocation, paired with fastFree(VkImageMemory*)
+    virtual void fastFree(VkImageMemory* ptr) = 0;
+
+public:
+    const VulkanDevice* vkdev;
+    uint32_t buffer_memory_type_index;
+    uint32_t image_memory_type_index;
+    uint32_t reserved_type_index;
+    bool mappable;
+    bool coherent;
+
+protected: // creation helpers available to subclasses only
+    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
+    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
+    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
+
+    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
+    VkImageView create_imageview(VkImage image, VkFormat format);
+};
+
+class VkBlobAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkBlobAllocator : public VkAllocator
+{
+public:
+    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // default preferred block size: 16M bytes
+    virtual ~VkBlobAllocator();
+
+public:
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkBlobAllocator(const VkBlobAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    VkBlobAllocator& operator=(const VkBlobAllocator&);
+
+private:
+    VkBlobAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+class VkWeightAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkWeightAllocator : public VkAllocator
+{
+public:
+    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // default preferred block size: 8M bytes
+    virtual ~VkWeightAllocator();
+
+public:
+    // release all blocks immediately
+    virtual void clear();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightAllocator(const VkWeightAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    VkWeightAllocator& operator=(const VkWeightAllocator&);
+
+private:
+    VkWeightAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+class VkStagingAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkStagingAllocator : public VkAllocator
+{
+public:
+    explicit VkStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkStagingAllocator();
+
+public:
+    // size compare ratio, valid range 0 ~ 1
+    // default cr = 0.75
+    void set_size_compare_ratio(float scr);
+
+    // release all budgets immediately
+    virtual void clear();
+
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkStagingAllocator(const VkStagingAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    VkStagingAllocator& operator=(const VkStagingAllocator&);
+
+private:
+    VkStagingAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+class VkWeightStagingAllocatorPrivate; // forward declaration only; the definition is not part of this header
+class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator // overrides the four allocation entry points but, unlike its siblings here, declares no clear()
+{
+public:
+    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
+    virtual ~VkWeightStagingAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkWeightStagingAllocator(const VkWeightStagingAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
+
+private:
+    VkWeightStagingAllocatorPrivate* const d; // opaque private-data pointer; the pointer itself is const
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26 // same API-level guard as the <android/hardware_buffer.h> include near the top of this header
+class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
+{
+public:
+    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
+    virtual ~VkAndroidHardwareBufferImageAllocator();
+
+public:
+    virtual VkBufferMemory* fastMalloc(size_t size);
+    virtual void fastFree(VkBufferMemory* ptr);
+    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
+    virtual void fastFree(VkImageMemory* ptr);
+
+private:
+    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&); // copy constructor and assignment are private, so outside code cannot copy
+    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
+
+public:
+    int init(); // NOTE(review): presumably must be called after construction and returns a status code -- confirm in the implementation
+
+    int width() const;
+    int height() const;
+    uint64_t external_format() const;
+
+public:
+    AHardwareBuffer* hb;
+    AHardwareBuffer_Desc bufferDesc;
+    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
+    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
+    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_ALLOCATOR_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/benchmark.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/benchmark.h
new file mode 100644
index 0000000..ed42c1a
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/benchmark.h
@@ -0,0 +1,39 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BENCHMARK_H
+#define NCNN_BENCHMARK_H
+
+#include "layer.h"
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+// returns the current timestamp in milliseconds
+NCNN_EXPORT double get_current_time();
+
+// sleeps for the given number of milliseconds (default 1000)
+NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);
+
+#if NCNN_BENCHMARK
+
+NCNN_EXPORT void benchmark(const Layer* layer, double start, double end); // start/end: presumably get_current_time() values -- confirm in the implementation
+NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
+
+#endif // NCNN_BENCHMARK
+
+} // namespace ncnn
+
+#endif // NCNN_BENCHMARK_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/blob.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/blob.h
new file mode 100644
index 0000000..c9f144f
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/blob.h
@@ -0,0 +1,44 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_BLOB_H
+#define NCNN_BLOB_H
+
+#include "mat.h"
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT Blob // per-blob record: optional name, producer/consumer layer indices and a shape hint
+{
+public:
+    // constructs an empty blob
+    Blob();
+
+public:
+#if NCNN_STRING
+    // blob name (member exists only when NCNN_STRING is enabled)
+    std::string name;
+#endif // NCNN_STRING
+    // index of the layer that produces this blob as its output
+    int producer;
+    // index of the layer that needs this blob as its input
+    int consumer;
+    // shape hint
+    Mat shape;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_BLOB_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/c_api.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/c_api.h
new file mode 100644
index 0000000..31d5b6d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/c_api.h
@@ -0,0 +1,347 @@
+/* Tencent is pleased to support the open source community by making ncnn available.
+ *
+ * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+ *
+ * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSD-3-Clause
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ */
+
+#ifndef NCNN_C_API_H
+#define NCNN_C_API_H
+
+#include "platform.h"
+
+#if NCNN_C_API
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT const char* ncnn_version();
+
+/* allocator api */
+typedef struct __ncnn_allocator_t* ncnn_allocator_t;
+struct NCNN_EXPORT __ncnn_allocator_t /* allocator handle body: an opaque pointer plus two function pointers */
+{
+    void* pthis; /* opaque pointer -- NOTE(review): presumably the wrapped C++ allocator object, confirm */
+
+    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size); /* allocation callback: receives the allocator handle and a size, returns a pointer */
+    void (*fast_free)(ncnn_allocator_t allocator, void* ptr); /* release callback: receives the allocator handle and a pointer */
+};
+
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
+NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
+NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
+
+/* option api */
+typedef struct __ncnn_option_t* ncnn_option_t;
+
+NCNN_EXPORT ncnn_option_t ncnn_option_create();
+NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
+
+NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
+
+NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator);
+
+NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
+
+/* mat api */
+typedef struct __ncnn_mat_t* ncnn_mat_t;
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
+
+NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
+
+NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
+
+NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
+NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
+NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
+
+NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
+
+#if NCNN_PIXEL
+
+/* mat pixel api */
+#define NCNN_MAT_PIXEL_RGB       1
+#define NCNN_MAT_PIXEL_BGR       2
+#define NCNN_MAT_PIXEL_GRAY      3
+#define NCNN_MAT_PIXEL_RGBA      4
+#define NCNN_MAT_PIXEL_BGRA      5
+#define NCNN_MAT_PIXEL_X2Y(X, Y) (X | (Y << 16))
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
+NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
+NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
+NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
+
+#endif /* NCNN_PIXEL */
+
+NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
+
+NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
+
+/* blob api */
+typedef struct __ncnn_blob_t* ncnn_blob_t;
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
+NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
+
+NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
+
+/* paramdict api */
+typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
+
+NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
+NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
+
+NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
+
+NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
+NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
+NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
+
+NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
+NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
+NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
+
+/* datareader api */
+typedef struct __ncnn_datareader_t* ncnn_datareader_t;
+struct NCNN_EXPORT __ncnn_datareader_t /* data reader handle body: an opaque pointer plus reader callbacks */
+{
+    void* pthis; /* opaque pointer -- NOTE(review): presumably the wrapped C++ reader object, confirm */
+
+#if NCNN_STRING
+    int (*scan)(ncnn_datareader_t dr, const char* format, void* p); /* formatted-scan callback; this member exists only when NCNN_STRING is enabled */
+#endif /* NCNN_STRING */
+    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size); /* read callback taking a buffer and a size -- NOTE(review): presumably returns bytes read, confirm */
+};
+
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
+#if NCNN_STDIO
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
+#endif /* NCNN_STDIO */
+NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
+NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
+
+/* modelbin api */
+typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
+struct NCNN_EXPORT __ncnn_modelbin_t /* model-bin handle body: an opaque pointer plus load callbacks that return ncnn_mat_t */
+{
+    void* pthis; /* opaque pointer -- NOTE(review): presumably the wrapped C++ model-bin object, confirm */
+
+    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type); /* load with one extent (w); meaning of `type` is not visible here -- TODO confirm */
+    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type); /* load with two extents (w, h) */
+    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type); /* load with three extents (w, h, c) */
+};
+
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
+NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
+NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
+
+/* layer api */
+typedef struct __ncnn_layer_t* ncnn_layer_t;
+struct NCNN_EXPORT __ncnn_layer_t /* layer handle body: an opaque pointer plus callbacks that each take the layer handle first */
+{
+    void* pthis; /* opaque pointer -- NOTE(review): presumably the wrapped C++ layer object, confirm */
+
+    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd); /* receives a paramdict handle */
+    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb); /* receives a modelbin handle */
+
+    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
+
+    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt); /* one input mat, one output mat written through a pointer */
+    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt); /* arrays of input/output mats -- NOTE(review): n and n2 are presumably the two array lengths, confirm */
+
+    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt); /* a single mat argument; no separate output parameter */
+    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt); /* an array of mats; no separate output parameter */
+};
+
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
+#if NCNN_STRING
+NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
+
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
+#endif /* NCNN_STRING */
+
+NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
+
+NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
+NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
+
+NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
+NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
+NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
+
+NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
+
+/* layer factory function */
+typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
+typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
+
+typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
+struct __ncnn_net_custom_layer_factory_t /* one custom-layer factory record: creator, destroyer, user data and a link */
+{
+    ncnn_layer_creator_t creator; /* function pointer that takes userdata and returns an ncnn_layer_t */
+    ncnn_layer_destroyer_t destroyer; /* function pointer that takes an ncnn_layer_t and userdata */
+    void* userdata; /* NOTE(review): presumably handed back to creator/destroyer as their userdata argument -- confirm */
+    ncnn_net_custom_layer_factory_t next; /* pointer to another record of this same struct type -- NOTE(review): presumably a singly linked list, confirm */
+};
+
+/* net api */
+typedef struct __ncnn_net_t* ncnn_net_t;
+struct __ncnn_net_t /* net handle body: an opaque pointer plus a custom-layer factory pointer */
+{
+    void* pthis; /* opaque pointer -- NOTE(review): presumably the wrapped C++ net object, confirm */
+
+    ncnn_net_custom_layer_factory_t custom_layer_factory; /* pointer to a factory record -- NOTE(review): presumably the head of the registered custom-layer list, confirm */
+};
+
+NCNN_EXPORT ncnn_net_t ncnn_net_create();
+NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
+
+NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
+NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+#endif /* NCNN_STRING */
+NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
+NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
+#endif /* NCNN_STDIO */
+
+#if NCNN_STDIO
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
+#endif /* NCNN_STRING */
+#endif /* NCNN_STDIO */
+NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
+NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
+
+NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
+
+NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
+NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
+#if NCNN_STRING
+NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
+NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
+NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);
+
+/* extractor api */
+typedef struct __ncnn_extractor_t* ncnn_extractor_t;
+
+NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
+NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
+
+NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
+
+#if NCNN_STRING
+NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
+#endif /* NCNN_STRING */
+NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
+NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
+
+/* mat process api */
+#define NCNN_BORDER_CONSTANT    0
+#define NCNN_BORDER_REPLICATE   1
+#define NCNN_BORDER_REFLECT     2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NCNN_C_API */
+
+#endif /* NCNN_C_API_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/command.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/command.h
new file mode 100644
index 0000000..337d085
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/command.h
@@ -0,0 +1,136 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_COMMAND_H
+#define NCNN_COMMAND_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+namespace ncnn {
+
+class Pipeline;
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class ImportAndroidHardwareBufferPipeline;
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+class VkComputePrivate;
+class NCNN_EXPORT VkCompute // declares record_* methods over Mat/VkMat/VkImageMat plus submit_and_wait()/reset(); NOTE(review): presumably a Vulkan compute command recorder - semantics live in the library, not in this header
+{
+public:
+    explicit VkCompute(const VulkanDevice* vkdev);
+    virtual ~VkCompute();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_download(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
+
+    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
+    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
+
+#if NCNN_BENCHMARK
+    void record_write_timestamp(uint32_t query); // declared only when NCNN_BENCHMARK is enabled
+#endif // NCNN_BENCHMARK
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
+
+    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    int submit_and_wait(); // NOTE(review): int result, presumably 0 on success - confirm against the ncnn sources
+
+    int reset();
+
+#if NCNN_BENCHMARK
+    int create_query_pool(uint32_t query_count);
+
+    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results); // results are written into the caller-supplied vector reference
+#endif // NCNN_BENCHMARK
+
+protected:
+    const VulkanDevice* vkdev; // NOTE(review): presumably the device pointer given to the constructor - confirm
+
+    void barrier_readwrite(const VkMat& binding);
+    void barrier_readwrite(const VkImageMat& binding);
+    void barrier_readonly(const VkImageMat& binding);
+
+private:
+    VkComputePrivate* const d; // const pointer to the forward-declared private implementation class
+};
+
+class VkTransferPrivate;
+class NCNN_EXPORT VkTransfer // declares two record_upload overloads and submit_and_wait(); semantics live in the library, not in this header
+{
+public:
+    explicit VkTransfer(const VulkanDevice* vkdev);
+    virtual ~VkTransfer();
+
+public:
+    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true); // flatten defaults to true; its effect is not visible here - TODO confirm
+
+    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
+
+    int submit_and_wait(); // NOTE(review): int result, presumably 0 on success - confirm against the ncnn sources
+
+protected:
+    const VulkanDevice* vkdev; // NOTE(review): presumably the device pointer given to the constructor - confirm
+
+private:
+    VkTransferPrivate* const d; // const pointer to the forward-declared private implementation class
+};
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_COMMAND_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/cpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/cpu.h
new file mode 100644
index 0000000..7d6bfce
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/cpu.h
@@ -0,0 +1,178 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_CPU_H
+#define NCNN_CPU_H
+
+#include <stddef.h>
+
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#if defined __ANDROID__ || defined __linux__
+#include <sched.h> // cpu_set_t
+#endif
+
+#include "platform.h"
+
+namespace ncnn {
+
+class NCNN_EXPORT CpuSet // set of CPUs with per-CPU enable/disable/query methods; the storage member differs per platform
+{
+public:
+    CpuSet();
+    void enable(int cpu); // NOTE(review): cpu is presumably a zero-based CPU index - confirm
+    void disable(int cpu);
+    void disable_all();
+    bool is_enabled(int cpu) const;
+    int num_enabled() const;
+
+public:
+#if (defined _WIN32 && !(defined __MINGW32__))
+    ULONG_PTR mask; // storage used on Windows builds that are not MinGW
+#endif // _WIN32 && !__MINGW32__
+#if defined __ANDROID__ || defined __linux__
+    cpu_set_t cpu_set; // storage used on Android/Linux; cpu_set_t comes from <sched.h>
+#endif // __ANDROID__ || __linux__
+#if __APPLE__
+    unsigned int policy; // storage used on Apple platforms
+#endif // __APPLE__
+};
+
+// test optional cpu features
+// edsp = armv7 edsp
+NCNN_EXPORT int cpu_support_arm_edsp();
+// neon = armv7 neon or aarch64 asimd
+NCNN_EXPORT int cpu_support_arm_neon();
+// vfpv4 = armv7 fp16 + fma
+NCNN_EXPORT int cpu_support_arm_vfpv4();
+// asimdhp = aarch64 asimd half precision
+NCNN_EXPORT int cpu_support_arm_asimdhp();
+// cpuid = aarch64 cpuid info
+NCNN_EXPORT int cpu_support_arm_cpuid();
+// asimddp = aarch64 asimd dot product
+NCNN_EXPORT int cpu_support_arm_asimddp();
+// asimdfhm = aarch64 asimd fhm
+NCNN_EXPORT int cpu_support_arm_asimdfhm();
+// bf16 = aarch64 bf16
+NCNN_EXPORT int cpu_support_arm_bf16();
+// i8mm = aarch64 i8mm
+NCNN_EXPORT int cpu_support_arm_i8mm();
+// sve = aarch64 sve
+NCNN_EXPORT int cpu_support_arm_sve();
+// sve2 = aarch64 sve2
+NCNN_EXPORT int cpu_support_arm_sve2();
+// svebf16 = aarch64 svebf16
+NCNN_EXPORT int cpu_support_arm_svebf16();
+// svei8mm = aarch64 svei8mm
+NCNN_EXPORT int cpu_support_arm_svei8mm();
+// svef32mm = aarch64 svef32mm
+NCNN_EXPORT int cpu_support_arm_svef32mm();
+
+// avx = x86 avx
+NCNN_EXPORT int cpu_support_x86_avx();
+// fma = x86 fma
+NCNN_EXPORT int cpu_support_x86_fma();
+// xop = x86 xop
+NCNN_EXPORT int cpu_support_x86_xop();
+// f16c = x86 f16c
+NCNN_EXPORT int cpu_support_x86_f16c();
+// avx2 = x86 avx2 + fma + f16c
+NCNN_EXPORT int cpu_support_x86_avx2();
+// avx_vnni = x86 avx vnni
+NCNN_EXPORT int cpu_support_x86_avx_vnni();
+// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
+NCNN_EXPORT int cpu_support_x86_avx512();
+// avx512_vnni = x86 avx512 vnni
+NCNN_EXPORT int cpu_support_x86_avx512_vnni();
+// avx512_bf16 = x86 avx512 bf16
+NCNN_EXPORT int cpu_support_x86_avx512_bf16();
+// avx512_fp16 = x86 avx512 fp16
+NCNN_EXPORT int cpu_support_x86_avx512_fp16();
+
+// lsx = loongarch lsx
+NCNN_EXPORT int cpu_support_loongarch_lsx();
+// lasx = loongarch lasx
+NCNN_EXPORT int cpu_support_loongarch_lasx();
+
+// msa = mips msa
+NCNN_EXPORT int cpu_support_mips_msa();
+// mmi = loongson mmi
+NCNN_EXPORT int cpu_support_loongson_mmi();
+
+// v = riscv vector
+NCNN_EXPORT int cpu_support_riscv_v();
+// zfh = riscv half-precision float
+NCNN_EXPORT int cpu_support_riscv_zfh();
+// vlenb = riscv vector length in bytes
+NCNN_EXPORT int cpu_riscv_vlenb();
+
+// cpu info
+NCNN_EXPORT int get_cpu_count();
+NCNN_EXPORT int get_little_cpu_count();
+NCNN_EXPORT int get_big_cpu_count();
+
+NCNN_EXPORT int get_physical_cpu_count();
+NCNN_EXPORT int get_physical_little_cpu_count();
+NCNN_EXPORT int get_physical_big_cpu_count();
+
+// cpu l2 varies from 64k to 1M, but l3 can be zero
+NCNN_EXPORT int get_cpu_level2_cache_size();
+NCNN_EXPORT int get_cpu_level3_cache_size();
+
+// bind all threads on little clusters if powersave enabled
+// affects HMP arch cpu like ARM big.LITTLE
+// only implemented on android at the moment
+// switching powersave is expensive and not thread-safe
+// 0 = all cores enabled(default)
+// 1 = only little clusters enabled
+// 2 = only big clusters enabled
+// the setter function returns 0 on success
+NCNN_EXPORT int get_cpu_powersave();
+NCNN_EXPORT int set_cpu_powersave(int powersave);
+
+// convenient wrapper
+NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);
+
+// set explicit thread affinity
+NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);
+
+// runtime thread affinity info
+NCNN_EXPORT int is_current_thread_running_on_a53_a55();
+
+// misc function wrapper for openmp routines
+NCNN_EXPORT int get_omp_num_threads();
+NCNN_EXPORT void set_omp_num_threads(int num_threads);
+
+NCNN_EXPORT int get_omp_dynamic();
+NCNN_EXPORT void set_omp_dynamic(int dynamic);
+
+NCNN_EXPORT int get_omp_thread_num();
+
+NCNN_EXPORT int get_kmp_blocktime();
+NCNN_EXPORT void set_kmp_blocktime(int time_ms);
+
+// need to flush denormals on Intel Chipset.
+// Other architectures such as ARM can be added as needed.
+// 0 = DAZ OFF, FTZ OFF
+// 1 = DAZ ON , FTZ OFF
+// 2 = DAZ OFF, FTZ ON
+// 3 = DAZ ON,  FTZ ON
+NCNN_EXPORT int get_flush_denormals();
+NCNN_EXPORT int set_flush_denormals(int flush_denormals);
+
+} // namespace ncnn
+
+#endif // NCNN_CPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/datareader.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/datareader.h
new file mode 100644
index 0000000..ed2aba3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/datareader.h
@@ -0,0 +1,122 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_DATAREADER_H
+#define NCNN_DATAREADER_H
+
+#include "platform.h"
+#if NCNN_STDIO
+#include <stdio.h>
+#endif
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+// data read wrapper; base class of the DataReaderFrom* classes declared in this header
+class NCNN_EXPORT DataReader
+{
+public:
+    DataReader();
+    virtual ~DataReader();
+
+#if NCNN_STRING
+    // parse plain param text (only declared when NCNN_STRING is enabled)
+    // return 1 if scan success
+    virtual int scan(const char* format, void* p) const;
+#endif // NCNN_STRING
+
+    // read binary param and model data into buf; size is the requested byte count
+    // return bytes read
+    virtual size_t read(void* buf, size_t size) const;
+
+    // get model data reference, handed back through the buf out-parameter
+    // return bytes referenced
+    virtual size_t reference(size_t size, const void** buf) const;
+};
+
+#if NCNN_STDIO
+class DataReaderFromStdioPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromStdio : public DataReader // DataReader constructed from a stdio FILE*
+{
+public:
+    explicit DataReaderFromStdio(FILE* fp);
+    virtual ~DataReaderFromStdio();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // same signature as DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // same signature as DataReader::read; reference() is not redeclared here
+
+private:
+    DataReaderFromStdio(const DataReaderFromStdio&); // copy constructor is private
+    DataReaderFromStdio& operator=(const DataReaderFromStdio&); // copy assignment is private
+
+private:
+    DataReaderFromStdioPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // NCNN_STDIO
+
+class DataReaderFromMemoryPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromMemory : public DataReader // DataReader constructed from a memory pointer
+{
+public:
+    explicit DataReaderFromMemory(const unsigned char*& mem); // NOTE(review): mem is a reference to the caller's pointer; presumably it is advanced as data is consumed - confirm
+    virtual ~DataReaderFromMemory();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // same signature as DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // same signature as DataReader::read
+    virtual size_t reference(size_t size, const void** buf) const; // the only reader in this header that redeclares reference()
+
+private:
+    DataReaderFromMemory(const DataReaderFromMemory&); // copy constructor is private
+    DataReaderFromMemory& operator=(const DataReaderFromMemory&); // copy assignment is private
+
+private:
+    DataReaderFromMemoryPrivate* const d; // const pointer to the forward-declared private class
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+class DataReaderFromAndroidAssetPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader // DataReader constructed from an Android AAsset*
+{
+public:
+    explicit DataReaderFromAndroidAsset(AAsset* asset);
+    virtual ~DataReaderFromAndroidAsset();
+
+#if NCNN_STRING
+    virtual int scan(const char* format, void* p) const; // same signature as DataReader::scan
+#endif // NCNN_STRING
+    virtual size_t read(void* buf, size_t size) const; // same signature as DataReader::read; reference() is not redeclared here
+
+private:
+    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&); // copy constructor is private
+    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&); // copy assignment is private
+
+private:
+    DataReaderFromAndroidAssetPrivate* const d; // const pointer to the forward-declared private class
+};
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+} // namespace ncnn
+
+#endif // NCNN_DATAREADER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/gpu.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/gpu.h
new file mode 100644
index 0000000..1eff228
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/gpu.h
@@ -0,0 +1,392 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_GPU_H
+#define NCNN_GPU_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+
+#include "mat.h"
+
+#include <vulkan/vulkan.h>
+
+#include "vulkan_header_fix.h"
+
+namespace ncnn {
+
+// instance
+
+// Create the VkInstance and initialize some objects needed for GPU computation.
+// This creates a VkInstance object, checks which extensions the Vulkan instance supports,
+// initializes and creates the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
+// iterates over all supported physical devices, etc.
+NCNN_EXPORT int create_gpu_instance();
+
+// Get global VkInstance variable
+// Must be called after create_gpu_instance() and before destroy_gpu_instance()
+NCNN_EXPORT VkInstance get_gpu_instance();
+
+// Destroy the VkInstance object and free the memory of its associated objects
+// Usually called during teardown when the main program exits
+NCNN_EXPORT void destroy_gpu_instance();
+
+// instance extension capability
+extern int support_VK_KHR_external_memory_capabilities;
+extern int support_VK_KHR_get_physical_device_properties2;
+extern int support_VK_KHR_get_surface_capabilities2;
+extern int support_VK_KHR_surface;
+extern int support_VK_EXT_debug_utils;
+extern int support_VK_EXT_validation_features;
+extern int support_VK_EXT_validation_flags;
+#if __ANDROID_API__ >= 26
+extern int support_VK_KHR_android_surface;
+#endif // __ANDROID_API__ >= 26
+
+// VK_KHR_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;
+
+// VK_KHR_external_memory_capabilities
+extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
+
+// VK_KHR_get_physical_device_properties2
+extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
+extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
+extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
+extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
+extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
+extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
+
+// VK_KHR_get_surface_capabilities2
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
+
+// VK_KHR_surface
+extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
+extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
+extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
+
+#if __ANDROID_API__ >= 26
+// VK_KHR_android_surface
+extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
+#endif // __ANDROID_API__ >= 26
+
+// VK_NV_cooperative_matrix
+extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;
+
+// get info
+NCNN_EXPORT int get_gpu_count();
+NCNN_EXPORT int get_default_gpu_index();
+
+class GpuInfoPrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT GpuInfo
+{
+public:
+    explicit GpuInfo();
+    virtual ~GpuInfo();
+
+    // vulkan physical device
+    VkPhysicalDevice physical_device() const;
+
+    // memory properties
+    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;
+
+    // version, identification and pipeline cache uuid info
+    uint32_t api_version() const;
+    uint32_t driver_version() const;
+    uint32_t vendor_id() const;
+    uint32_t device_id() const;
+    const char* device_name() const;
+    uint8_t* pipeline_cache_uuid() const;
+
+    // 0 = discrete gpu
+    // 1 = integrated gpu
+    // 2 = virtual gpu
+    // 3 = cpu
+    int type() const;
+
+    // hardware limit
+    uint32_t max_shared_memory_size() const;
+    uint32_t max_workgroup_count_x() const;
+    uint32_t max_workgroup_count_y() const;
+    uint32_t max_workgroup_count_z() const;
+    uint32_t max_workgroup_invocations() const;
+    uint32_t max_workgroup_size_x() const;
+    uint32_t max_workgroup_size_y() const;
+    uint32_t max_workgroup_size_z() const;
+    size_t memory_map_alignment() const;
+    size_t buffer_offset_alignment() const;
+    size_t non_coherent_atom_size() const;
+    size_t buffer_image_granularity() const;
+    uint32_t max_image_dimension_1d() const;
+    uint32_t max_image_dimension_2d() const;
+    uint32_t max_image_dimension_3d() const;
+    float timestamp_period() const;
+
+    // runtime: queue family indices
+    uint32_t compute_queue_family_index() const;
+    uint32_t graphics_queue_family_index() const;
+    uint32_t transfer_queue_family_index() const;
+
+    uint32_t compute_queue_count() const; // queue counts for the same three queue kinds
+    uint32_t graphics_queue_count() const;
+    uint32_t transfer_queue_count() const;
+
+    // property
+    bool unified_compute_transfer_queue() const;
+
+    // subgroup
+    uint32_t subgroup_size() const;
+    bool support_subgroup_basic() const;
+    bool support_subgroup_vote() const;
+    bool support_subgroup_ballot() const;
+    bool support_subgroup_shuffle() const;
+
+    // bug is not feature
+    bool bug_storage_buffer_no_l1() const;
+    bool bug_corrupted_online_pipeline_cache() const;
+    bool bug_buffer_image_load_zero() const;
+
+    // but sometimes bug is a feature
+    bool bug_implicit_fp16_arithmetic() const;
+
+    // fp16 and int8 feature
+    bool support_fp16_packed() const;
+    bool support_fp16_storage() const;
+    bool support_fp16_arithmetic() const;
+    bool support_int8_packed() const;
+    bool support_int8_storage() const;
+    bool support_int8_arithmetic() const;
+
+    // ycbcr conversion feature
+    bool support_ycbcr_conversion() const;
+
+    // cooperative matrix feature
+    bool support_cooperative_matrix() const;
+    bool support_cooperative_matrix_16_8_8() const;
+    bool support_cooperative_matrix_16_8_16() const;
+    bool support_cooperative_matrix_16_16_16() const;
+
+    // extension capability (returned as int, not bool; the meaning of the value is not stated in this header)
+    int support_VK_KHR_8bit_storage() const;
+    int support_VK_KHR_16bit_storage() const;
+    int support_VK_KHR_bind_memory2() const;
+    int support_VK_KHR_buffer_device_address() const;
+    int support_VK_KHR_create_renderpass2() const;
+    int support_VK_KHR_cooperative_matrix() const;
+    int support_VK_KHR_dedicated_allocation() const;
+    int support_VK_KHR_descriptor_update_template() const;
+    int support_VK_KHR_external_memory() const;
+    int support_VK_KHR_get_memory_requirements2() const;
+    int support_VK_KHR_maintenance1() const;
+    int support_VK_KHR_maintenance2() const;
+    int support_VK_KHR_maintenance3() const;
+    int support_VK_KHR_multiview() const;
+    int support_VK_KHR_portability_subset() const;
+    int support_VK_KHR_push_descriptor() const;
+    int support_VK_KHR_sampler_ycbcr_conversion() const;
+    int support_VK_KHR_shader_float16_int8() const;
+    int support_VK_KHR_shader_float_controls() const;
+    int support_VK_KHR_storage_buffer_storage_class() const;
+    int support_VK_KHR_swapchain() const;
+    int support_VK_EXT_buffer_device_address() const;
+    int support_VK_EXT_descriptor_indexing() const;
+    int support_VK_EXT_memory_budget() const;
+    int support_VK_EXT_memory_priority() const;
+    int support_VK_EXT_queue_family_foreign() const;
+    int support_VK_AMD_device_coherent_memory() const;
+#if __ANDROID_API__ >= 26
+    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
+#endif // __ANDROID_API__ >= 26
+    int support_VK_NV_cooperative_matrix() const;
+
+private:
+    GpuInfo(const GpuInfo&); // copy constructor is private
+    GpuInfo& operator=(const GpuInfo&); // copy assignment is private
+
+private:
+    friend int create_gpu_instance(); // create_gpu_instance() is granted access to the private members
+    GpuInfoPrivate* const d; // const pointer to the forward-declared private class
+};
+
+NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
+
+class VkAllocator;
+class VkCompute;
+class Option;
+class PipelineCache;
+class VulkanDevicePrivate; // forward declaration only; defined outside this header
+class NCNN_EXPORT VulkanDevice
+{
+public:
+    VulkanDevice(int device_index = get_default_gpu_index());
+    ~VulkanDevice();
+
+    const GpuInfo& info; // NOTE(review): presumably the GpuInfo for device_index - confirm against the constructor
+
+    VkDevice vkdevice() const;
+
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;
+
+    // with fixed workgroup size
+    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;
+
+    // helper for creating pipeline; each result is written through the trailing pointer argument
+    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
+    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
+    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
+    bool is_mappable(uint32_t memory_type_index) const;
+    bool is_coherent(uint32_t memory_type_index) const;
+
+    VkQueue acquire_queue(uint32_t queue_family_index) const; // counterpart: reclaim_queue()
+    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;
+
+    // allocator on this device; each acquire_* below has a matching reclaim_*
+    VkAllocator* acquire_blob_allocator() const;
+    void reclaim_blob_allocator(VkAllocator* allocator) const;
+
+    VkAllocator* acquire_staging_allocator() const;
+    void reclaim_staging_allocator(VkAllocator* allocator) const;
+
+    // immutable sampler for texelfetch
+    const VkSampler* immutable_texelfetch_sampler() const;
+
+    // dummy buffer image
+    VkMat get_dummy_buffer() const;
+    VkImageMat get_dummy_image() const;
+    VkImageMat get_dummy_image_readonly() const;
+
+    // pipeline cache on this device
+    const PipelineCache* get_pipeline_cache() const;
+
+    // test image allocation
+    bool shape_support_image_storage(const Mat& shape) const;
+
+    // current gpu heap memory budget in MB
+    uint32_t get_heap_budget() const;
+
+    // utility operator: one convert_packing overload per VkMat/VkImageMat source and destination combination
+    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
+
+    // VK_KHR_bind_memory2 (this and the groups below are public function-pointer members, one group per extension)
+    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
+    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;
+
+    // VK_KHR_buffer_device_address
+    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
+    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
+    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;
+
+    // VK_KHR_create_renderpass2
+    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
+    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
+    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
+    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;
+
+    // VK_KHR_descriptor_update_template
+    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
+    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
+    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;
+
+    // VK_KHR_get_memory_requirements2
+    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
+    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
+    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;
+
+    // VK_KHR_maintenance1
+    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;
+
+    // VK_KHR_maintenance3
+    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;
+
+    // VK_KHR_push_descriptor
+    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
+    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;
+
+    // VK_KHR_sampler_ycbcr_conversion
+    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
+    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;
+
+    // VK_KHR_swapchain
+    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
+    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
+    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
+    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
+    PFN_vkQueuePresentKHR vkQueuePresentKHR;
+
+    // VK_EXT_buffer_device_address
+    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;
+
+#if __ANDROID_API__ >= 26
+    // VK_ANDROID_external_memory_android_hardware_buffer
+    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
+    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
+#endif // __ANDROID_API__ >= 26
+
+protected:
+    // device extension; NOTE(review): presumably fills the function-pointer members above - confirm
+    int init_device_extension();
+
+private:
+    VulkanDevice(const VulkanDevice&); // copy constructor is private
+    VulkanDevice& operator=(const VulkanDevice&); // copy assignment is private
+
+private:
+    VulkanDevicePrivate* const d; // const pointer to the forward-declared private class
+};
+
+NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());
+
+// online spirv compilation
+NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
+NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
+
+// info from spirv; passed by non-const reference to resolve_shader_info() declared below
+class NCNN_EXPORT ShaderInfo
+{
+public:
+    int specialization_count;
+    int binding_count;
+    int push_constant_count;
+
+    // 0 = null
+    // 1 = storage buffer
+    // 2 = storage image
+    // 3 = combined image sampler
+    int binding_types[16]; // fixed capacity of 16 entries; 16 is large enough I think ...
+
+    int reserved_0; // reserved_0..reserved_3: no meaning is assigned in this header
+    int reserved_1;
+    int reserved_2;
+    int reserved_3;
+};
+
+NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
+
+} // namespace ncnn
+
+#endif // NCNN_VULKAN
+
+#endif // NCNN_GPU_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer.h
new file mode 100644
index 0000000..f0418a9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer.h
@@ -0,0 +1,222 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_H
+#define NCNN_LAYER_H
+
+#include "mat.h"
+#include "modelbin.h"
+#include "option.h"
+#include "paramdict.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include "command.h"
+#include "pipeline.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+class NCNN_EXPORT Layer
+{
+public:
+    // default constructor (takes no arguments)
+    Layer();
+    // virtual destructor
+    virtual ~Layer();
+
+    // load layer specific parameter from parsed dict
+    // return 0 if success
+    virtual int load_param(const ParamDict& pd);
+
+    // load layer specific weight data from model binary
+    // return 0 if success
+    virtual int load_model(const ModelBin& mb);
+
+    // layer implementation specific setup
+    // return 0 if success
+    virtual int create_pipeline(const Option& opt);
+
+    // layer implementation specific clean
+    // return 0 if success
+    virtual int destroy_pipeline(const Option& opt);
+
+public:
+    // one input and one output blob
+    bool one_blob_only;
+
+    // support inplace inference
+    bool support_inplace;
+
+    // support vulkan compute
+    bool support_vulkan;
+
+    // accept input blob with packed storage
+    bool support_packing;
+
+    // accept bf16
+    bool support_bf16_storage;
+
+    // accept fp16
+    bool support_fp16_storage;
+
+    // accept int8
+    bool support_int8_storage;
+
+    // shader image storage
+    bool support_image_storage;
+
+    // shader tensor storage
+    bool support_tensor_storage;
+
+    bool support_reserved_00; // this and support_reserved_0..9 below: no meaning is documented in this header
+
+    bool support_reserved_0;
+    bool support_reserved_1;
+    bool support_reserved_2;
+    bool support_reserved_3;
+    bool support_reserved_4;
+    bool support_reserved_5;
+    bool support_reserved_6;
+    bool support_reserved_7;
+    bool support_reserved_8;
+    bool support_reserved_9;
+
+    // feature disabled set (an int; the meaning of its bits is not defined in this header)
+    int featmask;
+
+public:
+    // implement inference (first overload: vectors of blobs; second: a single bottom and top blob)
+    // return 0 if success
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+    // implement inplace inference (the same blob argument is both input and output)
+    // return 0 if success
+    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
+    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+#if NCNN_VULKAN
+public:
+    // upload weight blob from host to device
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+public:
+    // implement inference on VkMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+    // implement inplace inference on VkImageMat blobs
+    // return 0 if success
+    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // assigned immediately after creating this layer
+    const VulkanDevice* vkdev;
+#endif // NCNN_VULKAN
+
+public:
+    // custom user data
+    void* userdata;
+    // layer type index
+    int typeindex;
+#if NCNN_STRING
+    // layer type name
+    std::string type;
+    // layer name
+    std::string name;
+#endif // NCNN_STRING
+    // blob index which this layer needs as input
+    std::vector<int> bottoms;
+    // blob index which this layer produces as output
+    std::vector<int> tops;
+    // shape hint (one vector for the bottom blobs, one for the top blobs)
+    std::vector<Mat> bottom_shapes;
+    std::vector<Mat> top_shapes;
+};
+
+// layer factory function types (the void* parameter is named userdata in the DEFINE_LAYER_* macros below)
+typedef Layer* (*layer_creator_func)(void*); // returns a Layer*
+typedef void (*layer_destroyer_func)(Layer*, void*); // receives a Layer* and returns nothing
+
+struct layer_registry_entry // registry record holding only a creator (no destroyer or userdata field)
+{
+#if NCNN_STRING
+    // layer type name (field exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+};
+
+struct custom_layer_registry_entry // like layer_registry_entry, plus a destroyer and a userdata pointer
+{
+#if NCNN_STRING
+    // layer type name (field exists only when NCNN_STRING is enabled)
+    const char* name;
+#endif // NCNN_STRING
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the value passed as the void* argument of creator/destroyer - confirm
+};
+
+struct overwrite_builtin_layer_registry_entry // keyed by type index rather than by name
+{
+    // layer type index
+    int typeindex;
+    // layer factory entry
+    layer_creator_func creator;
+    layer_destroyer_func destroyer;
+    void* userdata; // NOTE(review): presumably the value passed as the void* argument of creator/destroyer - confirm
+};
+
+#if NCNN_STRING
+// get layer type from type name
+NCNN_EXPORT int layer_to_index(const char* type);
+// create layer from type name
+NCNN_EXPORT Layer* create_layer(const char* type);
+#endif // NCNN_STRING
+// create layer from layer type
+NCNN_EXPORT Layer* create_layer(int index);
+
+#define DEFINE_LAYER_CREATOR(name)                          \
+    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
+    { /* expands to a factory returning a new name */       \
+        return new name;                                    \
+    }
+
+#define DEFINE_LAYER_DESTROYER(name)                                      \
+    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
+    { /* expands to a function that deletes the given layer */            \
+        delete layer;                                                     \
+    }
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type.h
new file mode 100644
index 0000000..c143e7d
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type.h
@@ -0,0 +1,29 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_SHADER_TYPE_H
+#define NCNN_LAYER_SHADER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerShaderType { // the namespace scopes the enumerators of the same-named enum
+enum LayerShaderType
+{
+#include "layer_shader_type_enum.h" // enumerator list is textually included from the cmake-generated header
+};
+} // namespace LayerShaderType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_SHADER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type_enum.h
new file mode 100644
index 0000000..aac8803
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_shader_type_enum.h
@@ -0,0 +1,5 @@
+// Layer Shader Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type.h
new file mode 100644
index 0000000..511c714
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type.h
@@ -0,0 +1,30 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_LAYER_TYPE_H
+#define NCNN_LAYER_TYPE_H
+
+namespace ncnn {
+
+namespace LayerType { // the namespace scopes the enumerators of the same-named enum
+enum LayerType
+{
+#include "layer_type_enum.h" // enumerator list is textually included from the cmake-generated header
+    CustomBit = (1 << 8), // 256, above every value listed in layer_type_enum.h (0..103)
+};
+} // namespace LayerType
+
+} // namespace ncnn
+
+#endif // NCNN_LAYER_TYPE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type_enum.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type_enum.h
new file mode 100644
index 0000000..97153ed
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/layer_type_enum.h
@@ -0,0 +1,109 @@
+// Layer Type Enum header
+//
+// This file is auto-generated by cmake, don't edit it.
+
+AbsVal = 0,
+ArgMax = 1,
+BatchNorm = 2,
+Bias = 3,
+BNLL = 4,
+Concat = 5,
+Convolution = 6,
+Crop = 7,
+Deconvolution = 8,
+Dropout = 9,
+Eltwise = 10,
+ELU = 11,
+Embed = 12,
+Exp = 13,
+Flatten = 14,
+InnerProduct = 15,
+Input = 16,
+Log = 17,
+LRN = 18,
+MemoryData = 19,
+MVN = 20,
+Pooling = 21,
+Power = 22,
+PReLU = 23,
+Proposal = 24,
+Reduction = 25,
+ReLU = 26,
+Reshape = 27,
+ROIPooling = 28,
+Scale = 29,
+Sigmoid = 30,
+Slice = 31,
+Softmax = 32,
+Split = 33,
+SPP = 34,
+TanH = 35,
+Threshold = 36,
+Tile = 37,
+RNN = 38,
+LSTM = 39,
+BinaryOp = 40,
+UnaryOp = 41,
+ConvolutionDepthWise = 42,
+Padding = 43,
+Squeeze = 44,
+ExpandDims = 45,
+Normalize = 46,
+Permute = 47,
+PriorBox = 48,
+DetectionOutput = 49,
+Interp = 50,
+DeconvolutionDepthWise = 51,
+ShuffleChannel = 52,
+InstanceNorm = 53,
+Clip = 54,
+Reorg = 55,
+YoloDetectionOutput = 56,
+Quantize = 57,
+Dequantize = 58,
+Yolov3DetectionOutput = 59,
+PSROIPooling = 60,
+ROIAlign = 61,
+Packing = 62,
+Requantize = 63,
+Cast = 64,
+HardSigmoid = 65,
+SELU = 66,
+HardSwish = 67,
+Noop = 68,
+PixelShuffle = 69,
+DeepCopy = 70,
+Mish = 71,
+StatisticsPooling = 72,
+Swish = 73,
+Gemm = 74,
+GroupNorm = 75,
+LayerNorm = 76,
+Softplus = 77,
+GRU = 78,
+MultiHeadAttention = 79,
+GELU = 80,
+Convolution1D = 81,
+Pooling1D = 82,
+ConvolutionDepthWise1D = 83,
+Convolution3D = 84,
+ConvolutionDepthWise3D = 85,
+Pooling3D = 86,
+MatMul = 87,
+Deconvolution1D = 88,
+DeconvolutionDepthWise1D = 89,
+Deconvolution3D = 90,
+DeconvolutionDepthWise3D = 91,
+Einsum = 92,
+DeformableConv2D = 93,
+GLU = 94,
+Fold = 95,
+Unfold = 96,
+GridSample = 97,
+CumulativeSum = 98,
+CopyTo = 99,
+Erf = 100,
+Diag = 101,
+CELU = 102,
+Shrink = 103,
+
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/mat.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/mat.h
new file mode 100644
index 0000000..c6f59ef
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/mat.h
@@ -0,0 +1,1843 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MAT_H
+#define NCNN_MAT_H
+
+#include <stdlib.h>
+#include <string.h>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif
+#endif
+#if __mips_msa
+#include <msa.h>
+#endif
+#if __loongarch_sx
+#include <lsxintrin.h>
+#endif
+#if __riscv_vector
+#include <riscv_vector.h>
+#include "cpu.h" // cpu_riscv_vlenb()
+#endif
+
+#include "allocator.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#if NCNN_PIXEL
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/bitmap.h>
+#include <jni.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkMat;
+class VkImageMat;
+#endif // NCNN_VULKAN
+
+// the multi-dimensional matrix, up to four dimensions (w, h, d, c)
+class NCNN_EXPORT Mat
+{
+public:
+    // construct an empty mat
+    Mat();
+    // construct a vec (w)
+    Mat(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct an image (w, h)
+    Mat(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a dim (w, h, c)
+    Mat(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a cube (w, h, d, c)
+    Mat(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a packed vec, elempack is the packed count inside element
+    Mat(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed image
+    Mat(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed dim
+    Mat(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed cube
+    Mat(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // copy constructor
+    Mat(const Mat& m);
+    // construct a vec over external data
+    Mat(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct an image over external data
+    Mat(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a dim over external data
+    Mat(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a cube over external data
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0);
+    // construct a packed vec over external data
+    Mat(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed image over external data
+    Mat(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed dim over external data
+    Mat(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // construct a packed cube over external data
+    Mat(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // destructor (release)
+    ~Mat();
+    // assignment operator
+    Mat& operator=(const Mat& m);
+    // set all elements to the given value
+    void fill(float v);
+    void fill(int v);
+#if __ARM_NEON
+    void fill(float32x4_t _v);
+    void fill(uint16x4_t _v);
+    void fill(int32x4_t _v);
+    void fill(int32x4_t _v0, int32x4_t _v1);
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    void fill(float16x4_t _v);
+    void fill(float16x8_t _v);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+    void fill(__m512 _v);
+#endif // __AVX512F__
+    void fill(__m256 _v, int i = 0);
+#endif // __AVX__
+    void fill(__m128 _v);
+    void fill(__m128i _v);
+#endif // __SSE2__
+#if __mips_msa
+    void fill(v4f32 _v);
+#endif // __mips_msa
+#if __loongarch_sx
+    void fill(__m128 _v);
+#endif // __loongarch_sx
+#if __riscv_vector
+    void fill(vfloat32m1_t _v);
+    void fill(vuint16m1_t _v);
+    void fill(vint8m1_t _v);
+#if __riscv_zfh
+    void fill(vfloat16m1_t _v);
+#endif // __riscv_zfh
+#endif // __riscv_vector
+    template<typename T>
+    void fill(T v);
+    // deep copy into a new mat
+    Mat clone(Allocator* allocator = 0) const;
+    // deep copy from other mat, inplace
+    void clone_from(const ncnn::Mat& mat, Allocator* allocator = 0);
+    // reshape to vec
+    Mat reshape(int w, Allocator* allocator = 0) const;
+    // reshape to image
+    Mat reshape(int w, int h, Allocator* allocator = 0) const;
+    // reshape to dim
+    Mat reshape(int w, int h, int c, Allocator* allocator = 0) const;
+    // reshape to cube
+    Mat reshape(int w, int h, int d, int c, Allocator* allocator = 0) const;
+    // allocate vec
+    void create(int w, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate image
+    void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize = 4u, Allocator* allocator = 0);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, Allocator* allocator = 0);
+    // allocate like another mat
+    void create_like(const Mat& m, Allocator* allocator = 0);
+#if NCNN_VULKAN
+    // allocate like a vulkan buffer mat
+    void create_like(const VkMat& m, Allocator* allocator = 0);
+    // allocate like a vulkan image mat
+    void create_like(const VkImageMat& im, Allocator* allocator = 0);
+#endif // NCNN_VULKAN
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // data reference to one channel, depth slice or row
+    Mat channel(int c);
+    const Mat channel(int c) const;
+    Mat depth(int z);
+    const Mat depth(int z) const;
+    float* row(int y);
+    const float* row(int y) const;
+    template<typename T>
+    T* row(int y);
+    template<typename T>
+    const T* row(int y) const;
+
+    // data reference to a sub-range along one dimension
+    Mat channel_range(int c, int channels);
+    const Mat channel_range(int c, int channels) const;
+    Mat depth_range(int z, int depths);
+    const Mat depth_range(int z, int depths) const;
+    Mat row_range(int y, int rows);
+    const Mat row_range(int y, int rows) const;
+    Mat range(int x, int n);
+    const Mat range(int x, int n) const;
+
+    // access raw data as a typed pointer
+    template<typename T>
+    operator T*();
+    template<typename T>
+    operator const T*() const;
+
+    // convenient access float vec element
+    float& operator[](size_t i);
+    const float& operator[](size_t i) const;
+
+#if NCNN_PIXEL
+    enum PixelType
+    {
+        PIXEL_CONVERT_SHIFT = 16, // the conversion target format is shifted left by this many bits
+        PIXEL_FORMAT_MASK = 0x0000ffff, // low 16 bits: source pixel format
+        PIXEL_CONVERT_MASK = 0xffff0000, // high 16 bits: conversion target format
+
+        PIXEL_RGB = 1,
+        PIXEL_BGR = 2,
+        PIXEL_GRAY = 3,
+        PIXEL_RGBA = 4,
+        PIXEL_BGRA = 5,
+
+        PIXEL_RGB2BGR = PIXEL_RGB | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2GRAY = PIXEL_RGB | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2RGBA = PIXEL_RGB | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGB2BGRA = PIXEL_RGB | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGR2RGB = PIXEL_BGR | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2GRAY = PIXEL_BGR | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2RGBA = PIXEL_BGR | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGR2BGRA = PIXEL_BGR | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_GRAY2RGB = PIXEL_GRAY | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGR = PIXEL_GRAY | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2RGBA = PIXEL_GRAY | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+        PIXEL_GRAY2BGRA = PIXEL_GRAY | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_RGBA2RGB = PIXEL_RGBA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGR = PIXEL_RGBA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2GRAY = PIXEL_RGBA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_RGBA2BGRA = PIXEL_RGBA | (PIXEL_BGRA << PIXEL_CONVERT_SHIFT),
+
+        PIXEL_BGRA2RGB = PIXEL_BGRA | (PIXEL_RGB << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2BGR = PIXEL_BGRA | (PIXEL_BGR << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2GRAY = PIXEL_BGRA | (PIXEL_GRAY << PIXEL_CONVERT_SHIFT),
+        PIXEL_BGRA2RGBA = PIXEL_BGRA | (PIXEL_RGBA << PIXEL_CONVERT_SHIFT),
+    };
+    // convenient construct from pixel data, type is a PixelType value
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, Allocator* allocator = 0);
+    // convenient construct from pixel data with stride(bytes-per-row) parameter
+    static Mat from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from pixel data roi and resize to specific size with stride(bytes-per-row) parameter
+    static Mat from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+
+    // convenient export to pixel data
+    void to_pixels(unsigned char* pixels, int type) const;
+    // convenient export to pixel data with stride(bytes-per-row) parameter
+    void to_pixels(unsigned char* pixels, int type, int stride) const;
+    // convenient export to pixel data and resize to specific size
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height) const;
+    // convenient export to pixel data and resize to specific size with stride(bytes-per-row) parameter
+    void to_pixels_resize(unsigned char* pixels, int type, int target_width, int target_height, int target_stride) const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+    // convenient construct from android Bitmap
+    static Mat from_android_bitmap(JNIEnv* env, jobject bitmap, int type_to, Allocator* allocator = 0);
+    // convenient construct from android Bitmap and resize to specific size
+    static Mat from_android_bitmap_resize(JNIEnv* env, jobject bitmap, int type_to, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi
+    static Mat from_android_bitmap_roi(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, Allocator* allocator = 0);
+    // convenient construct from android Bitmap roi and resize to specific size
+    static Mat from_android_bitmap_roi_resize(JNIEnv* env, jobject bitmap, int type_to, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator = 0);
+    // convenient export to android Bitmap and resize to the android Bitmap size
+    void to_android_bitmap(JNIEnv* env, jobject bitmap, int type_from) const;
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+#endif // NCNN_PIXEL
+
+    // subtract channel-wise mean values, then multiply by normalize values, pass 0 to skip
+    void substract_mean_normalize(const float* mean_vals, const float* norm_vals);
+
+    // convenient construct from half precision floating point data
+    static Mat from_float16(const unsigned short* data, int size);
+
+    // pointer to the data
+    void* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    Allocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between channels - confirm against the implementation
+};
+
+#if NCNN_VULKAN
+
+// the multi-dimensional matrix (up to w, h, d, c), vulkan device buffer version
+class NCNN_EXPORT VkMat
+{
+public:
+    // construct an empty mat
+    VkMat();
+    // construct a vec (w)
+    VkMat(int w, size_t elemsize, VkAllocator* allocator);
+    // construct an image (w, h)
+    VkMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // construct a dim (w, h, c)
+    VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // construct a cube (w, h, d, c)
+    VkMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // construct a packed vec, elempack is the packed count inside element
+    VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed image
+    VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed dim
+    VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed cube
+    VkMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy constructor
+    VkMat(const VkMat& m);
+    // construct a vec over external buffer memory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct an image over external buffer memory
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a dim over external buffer memory
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a cube over external buffer memory
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a packed vec over external buffer memory
+    VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed image over external buffer memory
+    VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed dim over external buffer memory
+    VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed cube over external buffer memory
+    VkMat(int w, int h, int d, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // destructor (release)
+    ~VkMat();
+    // assignment operator
+    VkMat& operator=(const VkMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a cpu mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like another vulkan buffer mat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like a vulkan image mat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference to the underlying vulkan buffer
+    VkBuffer buffer() const;
+    size_t buffer_offset() const;
+    size_t buffer_capacity() const;
+
+    // device buffer
+    VkBufferMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+
+    size_t cstep; // NOTE(review): presumably the step between channels - confirm against the implementation
+};
+
+class NCNN_EXPORT VkImageMat // the multi-dimensional matrix (up to w, h, d, c), vulkan device image version
+{
+public:
+    // construct an empty mat
+    VkImageMat();
+    // construct a vec (w)
+    VkImageMat(int w, size_t elemsize, VkAllocator* allocator);
+    // construct an image (w, h)
+    VkImageMat(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // construct a dim (w, h, c)
+    VkImageMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // construct a cube (w, h, d, c)
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // construct a packed vec, elempack is the packed count inside element
+    VkImageMat(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed image
+    VkImageMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed dim
+    VkImageMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed cube
+    VkImageMat(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // copy constructor
+    VkImageMat(const VkImageMat& m);
+    // construct a vec over external image memory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct an image over external image memory
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a dim over external image memory
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a cube over external image memory
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator);
+    // construct a packed vec over external image memory
+    VkImageMat(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed image over external image memory
+    VkImageMat(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed dim over external image memory
+    VkImageMat(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // construct a packed cube over external image memory
+    VkImageMat(int w, int h, int d, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator);
+    // destructor (release)
+    ~VkImageMat();
+    // assignment operator
+    VkImageMat& operator=(const VkImageMat& m);
+    // allocate vec
+    void create(int w, size_t elemsize, VkAllocator* allocator);
+    // allocate image
+    void create(int w, int h, size_t elemsize, VkAllocator* allocator);
+    // allocate dim
+    void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate cube
+    void create(int w, int h, int d, int c, size_t elemsize, VkAllocator* allocator);
+    // allocate packed vec
+    void create(int w, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed image
+    void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed dim
+    void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate packed cube
+    void create(int w, int h, int d, int c, size_t elemsize, int elempack, VkAllocator* allocator);
+    // allocate like a cpu mat
+    void create_like(const Mat& m, VkAllocator* allocator);
+    // allocate like a vulkan buffer mat
+    void create_like(const VkMat& m, VkAllocator* allocator);
+    // allocate like another vulkan image mat
+    void create_like(const VkImageMat& im, VkAllocator* allocator);
+
+    // mapped
+    Mat mapped() const;
+    void* mapped_ptr() const;
+
+    // refcount++
+    void addref();
+    // refcount--
+    void release();
+
+    bool empty() const;
+    size_t total() const;
+
+    // bits per element
+    int elembits() const;
+
+    // shape only
+    Mat shape() const;
+
+    // low-level reference to the underlying vulkan image and image view
+    VkImage image() const;
+    VkImageView imageview() const;
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+    // convenient construct from android hardware buffer
+    static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator);
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+    // device image
+    VkImageMemory* data;
+
+    // pointer to the reference counter
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    // element size in bytes
+    // 4 = float32/int32
+    // 2 = float16
+    // 1 = int8/uint8
+    // 0 = empty
+    size_t elemsize;
+
+    // packed count inside element
+    // c/1-d-h-w-1  c/1-h-w-1  h/1-w-1  w/1-1  scalar
+    // c/4-d-h-w-4  c/4-h-w-4  h/4-w-4  w/4-4  sse/neon
+    // c/8-d-h-w-8  c/8-h-w-8  h/8-w-8  w/8-8  avx/fp16
+    int elempack;
+
+    // the allocator
+    VkAllocator* allocator;
+
+    // the dimension rank
+    int dims;
+
+    int w;
+    int h;
+    int d;
+    int c;
+};
+
+// value types for vulkan specialization constants and push constants, one storage viewed as different scalar types
+union vk_specialization_type
+{
+    int i;
+    float f;
+    uint32_t u32;
+};
+union vk_constant_type // NOTE(review): presumably the push constant value type - confirm against its users
+{
+    int i; // value viewed as int
+    float f; // value viewed as float
+};
+#endif // NCNN_VULKAN
+
+// misc pixel helper functions, declared only when the matching NCNN_PIXEL* option is enabled
+#if NCNN_PIXEL
+// convert yuv420sp(nv21) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv12) to rgb, the fast approximate version
+NCNN_EXPORT void yuv420sp2rgb_nv12(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// convert yuv420sp(nv21) to rgb with half resize, the faster approximate version
+NCNN_EXPORT void yuv420sp2rgb_half(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb);
+// image pixel bilinear resize from srcw x srch to w x h, the _cN suffix names the overload per pixel layout
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+// image pixel bilinear resize with source and destination stride(bytes-per-row) parameters
+NCNN_EXPORT void resize_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+NCNN_EXPORT void resize_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride);
+// image pixel bilinear resize, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void resize_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h);
+#endif // NCNN_PIXEL
+#if NCNN_PIXEL_ROTATE
+// type is the source orientation (1-8, see the figure below), e.g. 6 means rotating from 6 to 1
+//
+//     1        2       3      4         5            6           7          8
+//
+//   888888  888888      88  88      8888888888  88                  88  8888888888
+//   88          88      88  88      88  88      88  88          88  88      88  88
+//   8888      8888    8888  8888    88          8888888888  8888888888          88
+//   88          88      88  88
+//   88          88  888888  888888
+//
+// ref http://sylvana.net/jpegcrop/exif_orientation.html
+// image pixel kanna rotate
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+// image pixel kanna rotate with source and destination stride(bytes-per-row) parameters
+NCNN_EXPORT void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+NCNN_EXPORT void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type);
+// image pixel kanna rotate, convenient wrapper for yuv420sp(nv21/nv12)
+NCNN_EXPORT void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type);
+#endif // NCNN_PIXEL_ROTATE
+#if NCNN_PIXEL_AFFINE
+// resolve affine transform matrix from rotation angle, scale factor and x y offset, the result is written to tm
+NCNN_EXPORT void get_rotation_matrix(float angle, float scale, float dx, float dy, float* tm);
+// resolve affine transform matrix from two sets of points, num_point must be >= 2
+NCNN_EXPORT void get_affine_transform(const float* points_from, const float* points_to, int num_point, float* tm);
+// resolve the inverse affine transform matrix, the result is written to tm_inv
+NCNN_EXPORT void invert_affine_transform(const float* tm, float* tm_inv);
+// image pixel bilinear warpaffine inverse transform, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine inverse transform with stride(bytes-per-row) parameter, set -233 for transparent border color, the color RGBA is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+NCNN_EXPORT void warpaffine_bilinear_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, const float* tm, int type = 0, unsigned int v = 0);
+// image pixel bilinear warpaffine, convenient wrapper for yuv420sp(nv21/nv12), set -233 for transparent border color, the color YUV_ is little-endian encoded
+NCNN_EXPORT void warpaffine_bilinear_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, const float* tm, int type = 0, unsigned int v = 0);
+#endif // NCNN_PIXEL_AFFINE
+#if NCNN_PIXEL_DRAWING
+// draw rectangle, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle with stride(bytes-per-row) parameter, set thickness -1 for filled rectangle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_rectangle_c1(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c2(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c3(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+NCNN_EXPORT void draw_rectangle_c4(unsigned char* pixels, int w, int h, int stride, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw rectangle, convenient wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled rectangle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_rectangle_yuv420sp(unsigned char* yuv420sp, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
+// draw circle, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle with stride(bytes-per-row) parameter, set thickness -1 for filled circle, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_circle_c1(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c2(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c3(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+NCNN_EXPORT void draw_circle_c4(unsigned char* pixels, int w, int h, int stride, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw circle, convenience wrapper for yuv420sp(nv21/nv12), set thickness -1 for filled circle, the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_circle_yuv420sp(unsigned char* yuv420sp, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
+// draw line, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_line_c1(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c2(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c3(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+NCNN_EXPORT void draw_line_c4(unsigned char* pixels, int w, int h, int stride, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// draw line, convenience wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_line_yuv420sp(unsigned char* yuv420sp, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
+// resolve text bounding box size
+NCNN_EXPORT void get_text_drawing_size(const char* text, int fontpixelsize, int* w, int* h);
+// draw ascii printables and newline, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline with stride(bytes-per-row) parameter, the color RGBA is little-endian encoded
+NCNN_EXPORT void draw_text_c1(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c2(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c3(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+NCNN_EXPORT void draw_text_c4(unsigned char* pixels, int w, int h, int stride, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+// draw ascii printables and newline, convenience wrapper for yuv420sp(nv21/nv12), the color YUV_ is little-endian encoded
+NCNN_EXPORT void draw_text_yuv420sp(unsigned char* yuv420sp, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
+#endif // NCNN_PIXEL_DRAWING
+
+// type conversion
+// convert float to half precision floating point
+NCNN_EXPORT unsigned short float32_to_float16(float value);
+// convert half precision floating point to float
+NCNN_EXPORT float float16_to_float32(unsigned short value);
+// convert float to brain half
+// Keeps the upper 16 bits of the float32 bit pattern and drops the lower 16 (plain truncation).
+NCNN_EXPORT NCNN_FORCEINLINE unsigned short float32_to_bfloat16(float value)
+{
+    // view the float through a 32-bit integer, split 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } bits;
+    bits.f = value;
+    const unsigned int upper = bits.u >> 16;
+    return (unsigned short)upper;
+}
+// convert brain half to float
+// The 16 input bits become the upper half of the float32 bit pattern; the lower 16 bits are zero.
+NCNN_EXPORT NCNN_FORCEINLINE float bfloat16_to_float32(unsigned short value)
+{
+    // 16 : 16
+    union
+    {
+        unsigned int u;
+        float f;
+    } tmp;
+    // widen to unsigned first: a bare `value << 16` is promoted to signed int and
+    // overflows for inputs >= 0x8000 (any negative bfloat16)
+    tmp.u = (unsigned int)value << 16;
+    return tmp.f;
+}
+
+// mat process
+// Border handling modes; presumably the values accepted by the `type` argument of
+// copy_make_border / copy_make_border_3d declared below (TODO confirm against the implementation).
+enum BorderType
+{
+    BORDER_CONSTANT = 0,
+    BORDER_REPLICATE = 1,
+    BORDER_REFLECT = 2,
+    // negative value, outside the 0..2 sequence used by the other modes
+    BORDER_TRANSPARENT = -233,
+};
+NCNN_EXPORT void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_make_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, const Option& opt = Option());
+NCNN_EXPORT void copy_cut_border_3d(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int front, int behind, const Option& opt = Option());
+NCNN_EXPORT void resize_nearest(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bilinear(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void resize_bicubic(const Mat& src, Mat& dst, int w, int h, const Option& opt = Option());
+NCNN_EXPORT void convert_packing(const Mat& src, Mat& dst, int elempack, const Option& opt = Option());
+NCNN_EXPORT void flatten(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_float16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_int8_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_float32_to_bfloat16(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void cast_bfloat16_to_float32(const Mat& src, Mat& dst, const Option& opt = Option());
+NCNN_EXPORT void quantize_to_int8(const Mat& src, Mat& dst, const Mat& scale_data, const Option& opt = Option());
+NCNN_EXPORT void dequantize_from_int32(const Mat& src, Mat& dst, const Mat& scale_data, const Mat& bias_data, const Option& opt = Option());
+NCNN_EXPORT void requantize_from_int32_to_int8(const Mat& src, Mat& dst, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt = Option());
+
+// Default constructor: an empty Mat with null data, null refcount and all sizes/dimensions zero.
+NCNN_FORCEINLINE Mat::Mat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+// Start from the empty state, then forward (_w, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _d, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _c, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _d, _c, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// Copy constructor: copies every field of m (sharing its data pointer and refcount)
+// and then increments the shared reference count via addref().
+NCNN_FORCEINLINE Mat::Mat(const Mat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref();
+}
+
+// Wrap existing memory _data as a 1-D Mat of width _w (elempack 1).
+// refcount stays null, so release() never frees _data; cstep equals the width.
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// Wrap existing memory _data as a 2-D Mat of _w x _h (elempack 1).
+// refcount stays null, so release() never frees _data; cstep is w * h computed in size_t.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), cstep((size_t)_w * _h)
+{
+}
+
+// Wrap existing memory _data as a 3-D Mat of _w x _h x _c (elempack 1).
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h plane passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// Wrap existing memory _data as a 4-D Mat of _w x _h x _d x _c (elempack 1).
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h*d volume passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+// Wrap existing memory _data as a 1-D Mat of width _w with the given _elempack.
+// refcount stays null, so release() never frees _data; cstep equals the width.
+NCNN_FORCEINLINE Mat::Mat(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// Wrap existing memory _data as a 2-D Mat of _w x _h with the given _elempack.
+// refcount stays null, so release() never frees _data; cstep is w * h computed in size_t.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1), cstep((size_t)_w * _h)
+{
+}
+
+// Wrap existing memory _data as a 3-D Mat of _w x _h x _c with the given _elempack.
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h plane passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c), cstep(alignSize((size_t)_w * _h * _elemsize, 16) / _elemsize)
+{
+}
+
+// Wrap existing memory _data as a 4-D Mat of _w x _h x _d x _c with the given _elempack.
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h*d volume passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE Mat::Mat(int _w, int _h, int _d, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c), cstep(alignSize((size_t)_w * _h * _d * _elemsize, 16) / _elemsize)
+{
+}
+
+// Destructor: drops this Mat's reference (and resets its fields) via release().
+NCNN_FORCEINLINE Mat::~Mat()
+{
+    release();
+}
+
+// Write _v into the first total() float slots of the buffer.
+NCNN_FORCEINLINE void Mat::fill(float _v)
+{
+    const int count = (int)total();
+    float* dst = (float*)data;
+
+    int idx = 0;
+#if __ARM_NEON
+    // NEON path: broadcast _v once and store four floats per iteration
+    const float32x4_t _splat = vdupq_n_f32(_v);
+    while (idx + 3 < count)
+    {
+        vst1q_f32(dst + idx, _splat);
+        idx += 4;
+    }
+#endif // __ARM_NEON
+    // scalar path covers whatever the vector loop left (or everything without NEON)
+    while (idx < count)
+    {
+        dst[idx] = _v;
+        idx++;
+    }
+}
+
+// Write _v into the first total() int slots of the buffer.
+NCNN_FORCEINLINE void Mat::fill(int _v)
+{
+    const int count = (int)total();
+    int* dst = (int*)data;
+
+    int idx = 0;
+#if __ARM_NEON
+    // NEON path: broadcast _v once and store four ints per iteration
+    const int32x4_t _splat = vdupq_n_s32(_v);
+    while (idx + 3 < count)
+    {
+        vst1q_s32(dst + idx, _splat);
+        idx += 4;
+    }
+#endif // __ARM_NEON
+    // scalar path covers whatever the vector loop left (or everything without NEON)
+    while (idx < count)
+    {
+        dst[idx] = _v;
+        idx++;
+    }
+}
+
+#if __ARM_NEON
+// Store the 4-float vector _v total() times, advancing 4 floats per store.
+NCNN_FORCEINLINE void Mat::fill(float32x4_t _v)
+{
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_f32(dst, _v);
+        dst += 4;
+    }
+}
+
+// Store the 4 x u16 vector _v total() times, advancing 4 unsigned shorts per store.
+NCNN_FORCEINLINE void Mat::fill(uint16x4_t _v)
+{
+    unsigned short* dst = (unsigned short*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1_u16(dst, _v);
+        dst += 4;
+    }
+}
+
+// Store the 4 x s32 vector _v total() times, advancing 4 ints per store.
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v)
+{
+    int* dst = (int*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_s32(dst, _v);
+        dst += 4;
+    }
+}
+
+// Store the pair (_v0, _v1) back to back total() times, i.e. 8 ints per element.
+NCNN_FORCEINLINE void Mat::fill(int32x4_t _v0, int32x4_t _v1)
+{
+    int* dst = (int*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        int* hi = dst + 4;
+        vst1q_s32(dst, _v0);
+        vst1q_s32(hi, _v1);
+        dst += 8;
+    }
+}
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// Store the 4 x fp16 vector _v total() times, advancing 4 halves per store.
+NCNN_FORCEINLINE void Mat::fill(float16x4_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1_f16(dst, _v);
+        dst += 4;
+    }
+}
+
+// Store the 8 x fp16 vector _v total() times, advancing 8 halves per store.
+NCNN_FORCEINLINE void Mat::fill(float16x8_t _v)
+{
+    __fp16* dst = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vst1q_f16(dst, _v);
+        dst += 8;
+    }
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // __ARM_NEON
+
+#if __SSE2__
+#if __AVX__
+#if __AVX512F__
+// Store the 16-float vector _v total() times with unaligned stores, advancing 16 floats per store.
+NCNN_FORCEINLINE void Mat::fill(__m512 _v)
+{
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm512_storeu_ps(dst, _v);
+        dst += 16;
+    }
+}
+#endif // __AVX512F__
+// Store the 8-float vector _v total() times with unaligned stores, advancing 8 floats per store.
+// _i is never read; it exists only to give this overload a distinct mangled symbol.
+NCNN_FORCEINLINE void Mat::fill(__m256 _v, int _i)
+{
+    // old gcc cannot overload __m128 and __m256 type
+    // add a dummy int parameter for different mangled function symbol
+    (void)_i;
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm256_storeu_ps(dst, _v);
+        dst += 8;
+    }
+}
+#endif // __AVX__
+// Store the 4-float vector _v total() times with unaligned stores, advancing 4 floats per store.
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        _mm_storeu_ps(dst, _v);
+        dst += 4;
+    }
+}
+// Store the 128-bit integer vector _v total() times, 8 unsigned shorts (16 bytes) per store.
+NCNN_FORCEINLINE void Mat::fill(__m128i _v)
+{
+    int size = (int)total();
+    unsigned short* ptr = (unsigned short*)data;
+    for (int i = 0; i < size; i++)
+    {
+        // unaligned store, like the sibling __m128/__m256/__m512 overloads: a Mat that wraps
+        // external memory is not guaranteed to be 16-byte aligned
+        _mm_storeu_si128((__m128i*)ptr, _v);
+        ptr += 8;
+    }
+}
+#endif // __SSE2__
+
+#if __mips_msa
+// MIPS MSA: store the 4-float vector _v total() times, advancing 4 floats per store.
+NCNN_FORCEINLINE void Mat::fill(v4f32 _v)
+{
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        __msa_st_w((v4i32)_v, dst, 0);
+        dst += 4;
+    }
+}
+#endif // __mips_msa
+
+#if __loongarch_sx
+// LoongArch LSX: store the 128-bit vector _v total() times, advancing 4 floats per store.
+NCNN_FORCEINLINE void Mat::fill(__m128 _v)
+{
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        __lsx_vst(_v, dst, 0);
+        dst += 4;
+    }
+}
+#endif // __loongarch_sx
+#if __riscv_vector
+// RISC-V vector: store _v total() times, advancing packn floats per store,
+// where packn = cpu_riscv_vlenb() / 4 (vector length in bytes over 4-byte floats).
+NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 4;
+    const size_t vl = vsetvl_e32m1(packn);
+
+    float* dst = (float*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse32_v_f32m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+
+// RISC-V vector: store _v total() times, advancing packn unsigned shorts per store,
+// where packn = cpu_riscv_vlenb() / 2.
+NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    unsigned short* dst = (unsigned short*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse16_v_u16m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+
+// RISC-V vector: store _v total() times, advancing packn signed chars per store,
+// where packn = cpu_riscv_vlenb() / 1.
+NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 1;
+    const size_t vl = vsetvl_e8m1(packn);
+
+    signed char* dst = (signed char*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse8_v_i8m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+#if __riscv_zfh
+// RISC-V vector with zfh: store _v total() times, advancing packn halves per store,
+// where packn = cpu_riscv_vlenb() / 2.
+NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
+{
+    const int packn = cpu_riscv_vlenb() / 2;
+    const size_t vl = vsetvl_e16m1(packn);
+
+    __fp16* dst = (__fp16*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        vse16_v_f16m1(dst, _v, vl);
+        dst += packn;
+    }
+}
+#endif // __riscv_zfh
+#endif // __riscv_vector
+
+// Generic fill: treat the buffer as an array of T and assign _v to the first total() entries.
+template<typename T>
+NCNN_FORCEINLINE void Mat::fill(T _v)
+{
+    T* dst = (T*)data;
+    for (int remain = (int)total(); remain > 0; remain--)
+    {
+        *dst++ = _v;
+    }
+}
+
+// Copy assignment: make this Mat share m's data and refcount and copy its shape fields.
+// Returns *this.
+NCNN_FORCEINLINE Mat& Mat::operator=(const Mat& m)
+{
+    // self-assignment: nothing to do
+    if (this == &m)
+        return *this;
+
+    // take the new reference before dropping the old one
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+
+    // drop whatever this Mat held; release() also zeroes every field
+    release();
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+// Increment the shared reference count; a Mat without a refcount is left untouched.
+NCNN_FORCEINLINE void Mat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// Drop this Mat's reference and reset it to the empty state.
+// A Mat without a refcount never frees its data pointer.
+NCNN_FORCEINLINE void Mat::release()
+{
+    // NCNN_XADD presumably returns the counter value before the add, so == 1 would mean
+    // this was the last reference - TODO confirm against the macro definition
+    if (refcount && NCNN_XADD(refcount, -1) == 1)
+    {
+        // free through the custom allocator when one is set, otherwise the global fastFree
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+
+    // reset every field except allocator, which is left as-is
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+// True when there is no data pointer or total() is zero.
+NCNN_FORCEINLINE bool Mat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// Element count including channel padding: cstep multiplied by the channel count c.
+NCNN_FORCEINLINE size_t Mat::total() const
+{
+    const size_t channels = c;
+    return cstep * channels;
+}
+
+// Bits per packed lane: elemsize * 8 / elempack, or 0 when elempack is 0.
+NCNN_FORCEINLINE int Mat::elembits() const
+{
+    if (!elempack)
+        return 0;
+
+    return static_cast<int>(elemsize * 8) / elempack;
+}
+
+// Return a Mat with a null data pointer that carries only this Mat's shape,
+// with elempack multiplied into w (1-D), h (2-D) or c (3-D / 4-D).
+// Any other dims value yields a default-constructed Mat.
+NCNN_FORCEINLINE Mat Mat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// Return a view onto channel _c that points into this Mat's buffer (no reference is taken).
+// The view has dims - 1 dimensions; for a 4-D source its cstep becomes w * h.
+NCNN_FORCEINLINE Mat Mat::channel(int _c)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+    {
+        view.cstep = (size_t)w * h;
+    }
+    return view;
+}
+
+// Const overload: view onto channel _c that points into this Mat's buffer (no reference is taken).
+// The view has dims - 1 dimensions; for a 4-D source its cstep becomes w * h.
+NCNN_FORCEINLINE const Mat Mat::channel(int _c) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, base, elemsize, elempack, allocator);
+    view.dims = dims - 1;
+    if (dims == 4)
+    {
+        view.cstep = (size_t)w * h;
+    }
+    return view;
+}
+
+// Return a 2-D w x h view starting z planes (w * h elements each) past the data pointer.
+NCNN_FORCEINLINE Mat Mat::depth(int z)
+{
+    const size_t offset = (size_t)w * h * z * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// Const overload: 2-D w x h view starting z planes (w * h elements each) past the data pointer.
+NCNN_FORCEINLINE const Mat Mat::depth(int z) const
+{
+    const size_t offset = (size_t)w * h * z * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(w, h, base, elemsize, elempack, allocator);
+}
+
+// Pointer to the start of row y, viewed as float: data + w * y * elemsize bytes.
+NCNN_FORCEINLINE float* Mat::row(int y)
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data;
+    return (float*)(base + offset);
+}
+
+// Const overload: pointer to the start of row y, viewed as const float.
+NCNN_FORCEINLINE const float* Mat::row(int y) const
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data;
+    return (const float*)(base + offset);
+}
+
+// Typed variant: pointer to the start of row y, viewed as T. The byte offset still uses elemsize.
+template<typename T>
+NCNN_FORCEINLINE T* Mat::row(int y)
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data;
+    return (T*)(base + offset);
+}
+
+// Typed const variant: pointer to the start of row y, viewed as const T. The byte offset still uses elemsize.
+template<typename T>
+NCNN_FORCEINLINE const T* Mat::row(int y) const
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data;
+    return (const T*)(base + offset);
+}
+
+// Return a view of `channels` channels starting at channel _c, pointing into this Mat's buffer.
+// The view is built with the 4-D external-data constructor and then given this Mat's dims.
+NCNN_FORCEINLINE Mat Mat::channel_range(int _c, int channels)
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// Const overload: view of `channels` channels starting at channel _c, pointing into this Mat's buffer.
+// The view is built with the 4-D external-data constructor and then given this Mat's dims.
+NCNN_FORCEINLINE const Mat Mat::channel_range(int _c, int channels) const
+{
+    unsigned char* base = (unsigned char*)data + cstep * _c * elemsize;
+
+    Mat view(w, h, d, channels, base, elemsize, elempack, allocator);
+    view.dims = dims;
+    return view;
+}
+
+// Return a w x h x depths view starting z planes past the data pointer; its cstep is forced to w * h.
+NCNN_FORCEINLINE Mat Mat::depth_range(int z, int depths)
+{
+    const size_t offset = (size_t)w * h * z * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// Const overload: w x h x depths view starting z planes past the data pointer; its cstep is forced to w * h.
+NCNN_FORCEINLINE const Mat Mat::depth_range(int z, int depths) const
+{
+    const size_t offset = (size_t)w * h * z * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+
+    Mat view(w, h, depths, base, elemsize, elempack, allocator);
+    view.cstep = (size_t)w * h;
+    return view;
+}
+
+// Return a 2-D w x rows view that starts at row y of this Mat's buffer.
+NCNN_FORCEINLINE Mat Mat::row_range(int y, int rows)
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// Const overload: 2-D w x rows view that starts at row y of this Mat's buffer.
+NCNN_FORCEINLINE const Mat Mat::row_range(int y, int rows) const
+{
+    const size_t offset = (size_t)w * y * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(w, rows, base, elemsize, elempack, allocator);
+}
+
+// Return a 1-D view of n elements starting x elements past the data pointer.
+NCNN_FORCEINLINE Mat Mat::range(int x, int n)
+{
+    const size_t offset = x * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// Const overload: 1-D view of n elements starting x elements past the data pointer.
+NCNN_FORCEINLINE const Mat Mat::range(int x, int n) const
+{
+    const size_t offset = x * elemsize;
+    unsigned char* base = (unsigned char*)data + offset;
+    return Mat(n, base, elemsize, elempack, allocator);
+}
+
+// Conversion to a typed pointer: the raw data pointer reinterpreted as T*.
+template<typename T>
+NCNN_FORCEINLINE Mat::operator T*()
+{
+    T* typed = (T*)data;
+    return typed;
+}
+
+// Conversion to a typed const pointer: the raw data pointer reinterpreted as const T*.
+template<typename T>
+NCNN_FORCEINLINE Mat::operator const T*() const
+{
+    const T* typed = (const T*)data;
+    return typed;
+}
+
+// Flat element access: the i-th float counted from the data pointer (no bounds check).
+NCNN_FORCEINLINE float& Mat::operator[](size_t i)
+{
+    float* values = (float*)data;
+    return values[i];
+}
+
+// Const flat element access: the i-th float counted from the data pointer (no bounds check).
+NCNN_FORCEINLINE const float& Mat::operator[](size_t i) const
+{
+    const float* values = (const float*)data;
+    return values[i];
+}
+
+#if NCNN_VULKAN
+
+// Default constructor: an empty VkMat with null data, null refcount and all sizes/dimensions zero.
+NCNN_FORCEINLINE VkMat::VkMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+}
+
+// Start from the empty state, then forward (_w, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _d, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _c, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _d, _c, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0), cstep(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+// Copy constructor: copies every field of m (sharing its data pointer and refcount)
+// and then increments the shared reference count via addref().
+NCNN_FORCEINLINE VkMat::VkMat(const VkMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c), cstep(m.cstep)
+{
+    addref();
+}
+
+// Wrap existing buffer memory _data as a 1-D VkMat of width _w (elempack 1).
+// refcount stays null, so release() never frees _data; cstep equals the width.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// Wrap existing buffer memory _data as a 2-D VkMat of _w x _h (elempack 1).
+// refcount stays null, so release() never frees _data.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // widen before multiplying so w * h is not evaluated (and possibly overflowed) in int
+    cstep = (size_t)w * h;
+}
+
+// Wrap existing buffer memory _data as a 3-D VkMat of _w x _h x _c (elempack 1).
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h plane passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // widen before multiplying so w * h is not evaluated (and possibly overflowed) in int
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+// Wrap existing buffer memory _data as a 4-D VkMat of _w x _h x _d x _c (elempack 1).
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h*d volume passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // widen before multiplying so w * h * d is not evaluated (and possibly overflowed) in int
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// Wrap existing buffer memory _data as a 1-D VkMat of width _w with the given _elempack.
+// refcount stays null, so release() never frees _data; cstep equals the width.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1), cstep(_w)
+{
+}
+
+// Wrap existing buffer memory _data as a 2-D VkMat of _w x _h with the given _elempack.
+// refcount stays null, so release() never frees _data.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{
+    // widen before multiplying so w * h is not evaluated (and possibly overflowed) in int
+    cstep = (size_t)w * h;
+}
+
+// Wrap existing buffer memory _data as a 3-D VkMat of _w x _h x _c with the given _elempack.
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h plane passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{
+    // widen before multiplying so w * h is not evaluated (and possibly overflowed) in int
+    cstep = alignSize((size_t)w * h * elemsize, 16) / elemsize;
+}
+
+// Wrap existing buffer memory _data as a 4-D VkMat of _w x _h x _d x _c with the given _elempack.
+// refcount stays null, so release() never frees _data.
+// cstep is the byte size of one w*h*d volume passed through alignSize(..., 16), converted back to elements.
+NCNN_FORCEINLINE VkMat::VkMat(int _w, int _h, int _d, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{
+    // widen before multiplying so w * h * d is not evaluated (and possibly overflowed) in int
+    cstep = alignSize((size_t)w * h * d * elemsize, 16) / elemsize;
+}
+
+// Destructor: drops this VkMat's reference (and resets its fields) via release().
+NCNN_FORCEINLINE VkMat::~VkMat()
+{
+    release();
+}
+
+// Copy assignment: make this VkMat share m's data and refcount and copy its shape fields.
+// Returns *this.
+NCNN_FORCEINLINE VkMat& VkMat::operator=(const VkMat& m)
+{
+    // self-assignment: nothing to do
+    if (this == &m)
+        return *this;
+
+    // take the new reference before dropping the old one
+    if (m.refcount)
+        NCNN_XADD(m.refcount, 1);
+
+    // drop whatever this VkMat held; release() also zeroes every field
+    release();
+
+    data = m.data;
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    cstep = m.cstep;
+
+    return *this;
+}
+
+// Return a host-side Mat (null allocator) over mapped_ptr() with this VkMat's shape.
+// Yields a default-constructed Mat when the allocator is not mappable or dims is outside 1..4.
+// The allocator pointer is dereferenced without a null check.
+NCNN_FORCEINLINE Mat VkMat::mapped() const
+{
+    if (allocator->mappable)
+    {
+        switch (dims)
+        {
+        case 1:
+            return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+        case 2:
+            return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+        case 3:
+            return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+        case 4:
+            return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+        default:
+            break;
+        }
+    }
+
+    return Mat();
+}
+
+// Host address of this VkMat's region: data->mapped_ptr plus data->offset bytes,
+// or null when the allocator is not mappable. Neither allocator nor data is null-checked.
+NCNN_FORCEINLINE void* VkMat::mapped_ptr() const
+{
+    if (allocator->mappable)
+    {
+        unsigned char* base = (unsigned char*)data->mapped_ptr;
+        return base + data->offset;
+    }
+
+    return 0;
+}
+
+// Increment the shared reference count; a VkMat without a refcount is left untouched.
+NCNN_FORCEINLINE void VkMat::addref()
+{
+    if (!refcount)
+        return;
+
+    NCNN_XADD(refcount, 1);
+}
+
+// Drop this VkMat's reference and reset it to the empty state.
+// A VkMat without a refcount never frees its data pointer.
+NCNN_FORCEINLINE void VkMat::release()
+{
+    // NCNN_XADD presumably returns the counter value before the add, so == 1 would mean
+    // this was the last reference - TODO confirm against the macro definition
+    if (refcount && NCNN_XADD(refcount, -1) == 1)
+    {
+        // only freed through the allocator; without one (or without data) nothing is freed
+        if (allocator && data)
+        {
+            allocator->fastFree(data);
+        }
+    }
+
+    // reset every field except allocator, which is left as-is
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+// True when there is no data pointer or total() is zero.
+NCNN_FORCEINLINE bool VkMat::empty() const
+{
+    if (data == 0)
+        return true;
+
+    return total() == 0;
+}
+
+// Element count including channel padding: cstep multiplied by the channel count c.
+NCNN_FORCEINLINE size_t VkMat::total() const
+{
+    const size_t channels = c;
+    return cstep * channels;
+}
+
+// Bits per packed lane: elemsize * 8 / elempack, or 0 when elempack is 0.
+NCNN_FORCEINLINE int VkMat::elembits() const
+{
+    if (!elempack)
+        return 0;
+
+    return static_cast<int>(elemsize) * 8 / elempack;
+}
+
+// Return a Mat with a null data pointer that carries only this VkMat's shape,
+// with elempack multiplied into w (1-D), h (2-D) or c (3-D / 4-D).
+// Any other dims value yields a default-constructed Mat.
+NCNN_FORCEINLINE Mat VkMat::shape() const
+{
+    switch (dims)
+    {
+    case 1:
+        return Mat(w * elempack, (void*)0);
+    case 2:
+        return Mat(w, h * elempack, (void*)0);
+    case 3:
+        return Mat(w, h, c * elempack, (void*)0);
+    case 4:
+        return Mat(w, h, d, c * elempack, (void*)0);
+    default:
+        return Mat();
+    }
+}
+
+// The VkBuffer handle stored in the backing VkBufferMemory (data is not null-checked).
+NCNN_FORCEINLINE VkBuffer VkMat::buffer() const
+{
+    const VkBufferMemory* mem = data;
+    return mem->buffer;
+}
+
+// The offset field of the backing VkBufferMemory (data is not null-checked).
+NCNN_FORCEINLINE size_t VkMat::buffer_offset() const
+{
+    const VkBufferMemory* mem = data;
+    return mem->offset;
+}
+
+// The capacity field of the backing VkBufferMemory (data is not null-checked).
+NCNN_FORCEINLINE size_t VkMat::buffer_capacity() const
+{
+    const VkBufferMemory* mem = data;
+    return mem->capacity;
+}
+
+// Default constructor: an empty VkImageMat with null data, null refcount and all sizes/dimensions zero.
+NCNN_FORCEINLINE VkImageMat::VkImageMat()
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+}
+
+// Start from the empty state, then forward (_w, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _allocator);
+}
+
+// Start from the empty state, then forward (_w, _h, _d, _c, _elemsize, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _elemsize, _elempack, _allocator);
+}
+
+// Packed variant: start from the empty state, then forward (_w, _h, _c, _elemsize, _elempack, _allocator) to create().
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), d(0), c(0)
+{
+    create(_w, _h, _d, _c, _elemsize, _elempack, _allocator);
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(const VkImageMat& m)
+    : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), d(m.d), c(m.c)
+{ // shallow copy: the new object points at the same image memory and the same refcount as m
+    addref(); // take one reference on the shared counter (does nothing when m carries no refcount)
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // wraps caller-supplied image memory: refcount stays 0, so release() never passes _data to fastFree()
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // dims = 2 wrapper, elempack fixed to 1, unused extents set to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // dims = 3 wrapper, elempack fixed to 1, d set to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // dims = 4 wrapper, elempack fixed to 1
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), d(1), c(1)
+{ // same wrappers again with a caller-chosen elempack; refcount is still 0 (memory is not freed by this object)
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), d(1), c(1)
+{ // dims = 2 wrapper with explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), d(1), c(_c)
+{ // dims = 3 wrapper with explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::VkImageMat(int _w, int _h, int _d, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator)
+    : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(4), w(_w), h(_h), d(_d), c(_c)
+{ // dims = 4 wrapper with explicit elempack
+}
+
+NCNN_FORCEINLINE VkImageMat::~VkImageMat()
+{
+    release(); // drop this object's reference; release() decides whether the image memory is freed
+}
+
+NCNN_FORCEINLINE VkImageMat& VkImageMat::operator=(const VkImageMat& m)
+{
+    if (this == &m) // self-assignment: leave everything as is
+        return *this;
+
+    if (m.refcount) // reference the source first, before our own reference is dropped below
+        NCNN_XADD(m.refcount, 1);
+
+    release(); // give up whatever *this held; this also zeroes the fields
+
+    data = m.data; // shallow copy: both objects now point at the same image memory
+    refcount = m.refcount;
+    elemsize = m.elemsize;
+    elempack = m.elempack;
+    allocator = m.allocator;
+
+    dims = m.dims;
+    w = m.w;
+    h = m.h;
+    d = m.d;
+    c = m.c;
+
+    return *this;
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::mapped() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // empty mat or memory that is not host-mapped
+        return Mat();
+
+    if (dims == 1) // one Mat constructor per rank below, each given mapped_ptr() as its data pointer
+        return Mat(w, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 2)
+        return Mat(w, h, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 3)
+        return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0);
+
+    if (dims == 4)
+        return Mat(w, h, d, c, mapped_ptr(), elemsize, elempack, 0);
+
+    return Mat(); // dims outside 1..4: return an empty Mat
+}
+
+NCNN_FORCEINLINE void* VkImageMat::mapped_ptr() const
+{
+    if (!allocator || !data || !allocator->mappable || !data->mapped_ptr) // empty mat or memory that is not host-mapped
+        return 0;
+
+    return (unsigned char*)data->mapped_ptr + data->bind_offset; // bind_offset is applied in bytes (pointer is cast to unsigned char*)
+}
+
+NCNN_FORCEINLINE void VkImageMat::addref()
+{
+    if (refcount) // mats without a refcount (empty, or wrapping external memory) are not counted
+        NCNN_XADD(refcount, 1);
+}
+
+NCNN_FORCEINLINE void VkImageMat::release()
+{
+    if (refcount && NCNN_XADD(refcount, -1) == 1) // presumably NCNN_XADD yields the pre-decrement count, i.e. this was the last reference - confirm
+    {
+        if (allocator && data)
+        {
+            allocator->fastFree(data); // return the image memory through the stored allocator
+        }
+    }
+
+    data = 0; // from here on: reset to the empty state whether or not anything was freed
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    d = 0;
+    c = 0;
+
+    refcount = 0; // note: the allocator member is left untouched by release()
+}
+
+NCNN_FORCEINLINE bool VkImageMat::empty() const
+{
+    return data == 0 || total() == 0; // empty when no image memory is attached or any extent is zero
+}
+
+NCNN_FORCEINLINE size_t VkImageMat::total() const
+{
+    return (size_t)w * h * d * c; // widen before multiplying: the product of four ints can overflow int
+}
+
+NCNN_FORCEINLINE int VkImageMat::elembits() const
+{
+    return elempack ? static_cast<int>(elemsize) * 8 / elempack : 0; // elemsize in bits divided by elempack; 0 when elempack is 0
+}
+
+NCNN_FORCEINLINE Mat VkImageMat::shape() const
+{
+    if (dims == 1) // every branch passes a null data pointer; elempack is multiplied into the last used extent
+        return Mat(w * elempack, (void*)0);
+    if (dims == 2)
+        return Mat(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Mat(w, h, c * elempack, (void*)0);
+    if (dims == 4)
+        return Mat(w, h, d, c * elempack, (void*)0);
+
+    return Mat(); // dims outside 1..4: return an empty Mat
+}
+
+NCNN_FORCEINLINE VkImage VkImageMat::image() const
+{
+    return data->image; // NOTE(review): data is dereferenced unchecked, so this must not be called on an empty mat
+}
+
+NCNN_FORCEINLINE VkImageView VkImageMat::imageview() const
+{
+    return data->imageview; // NOTE(review): data is dereferenced unchecked, so this must not be called on an empty mat
+}
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_MAT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/modelbin.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/modelbin.h
new file mode 100644
index 0000000..aada5f6
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/modelbin.h
@@ -0,0 +1,80 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_MODELBIN_H
+#define NCNN_MODELBIN_H
+
+#include "mat.h"
+
+namespace ncnn {
+
+class DataReader;
+class NCNN_EXPORT ModelBin
+{
+public:
+    ModelBin();
+    virtual ~ModelBin();
+    // element type codes (presumably the values accepted by the 'type' argument of load() - confirm)
+    // 0 = auto
+    // 1 = float32
+    // 2 = float16
+    // 3 = int8
+    // load vec: 1-D, w elements
+    virtual Mat load(int w, int type) const;
+    // load image: 2-D, w x h
+    virtual Mat load(int w, int h, int type) const;
+    // load dim: 3-D, w x h x c
+    virtual Mat load(int w, int h, int c, int type) const;
+    // load cube: 4-D, w x h x d x c
+    virtual Mat load(int w, int h, int d, int c, int type) const;
+};
+
+class ModelBinFromDataReaderPrivate;
+class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
+{
+public:
+    explicit ModelBinFromDataReader(const DataReader& dr);
+    virtual ~ModelBinFromDataReader();
+
+    virtual Mat load(int w, int type) const; // only the 1-D loader of ModelBin is redeclared here
+
+private:
+    ModelBinFromDataReader(const ModelBinFromDataReader&); // copy operations are private: not copyable by callers
+    ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
+
+private:
+    ModelBinFromDataReaderPrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+class ModelBinFromMatArrayPrivate;
+class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
+{
+public:
+    // construct from an array of weight blobs
+    explicit ModelBinFromMatArray(const Mat* weights);
+    virtual ~ModelBinFromMatArray();
+
+    virtual Mat load(int w, int type) const; // only the 1-D loader of ModelBin is redeclared here
+
+private:
+    ModelBinFromMatArray(const ModelBinFromMatArray&); // copy operations are private: not copyable by callers
+    ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
+
+private:
+    ModelBinFromMatArrayPrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_MODELBIN_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/ncnn_export.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/ncnn_export.h
new file mode 100644
index 0000000..e2f5fde
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/ncnn_export.h
@@ -0,0 +1,42 @@
+
+#ifndef NCNN_EXPORT_H
+#define NCNN_EXPORT_H
+
+#ifdef NCNN_STATIC_DEFINE
+#  define NCNN_EXPORT /* with NCNN_STATIC_DEFINE set, both visibility markers expand to nothing */
+#  define NCNN_NO_EXPORT
+#else
+#  ifndef NCNN_EXPORT
+#    ifdef ncnn_EXPORTS
+        /* We are building this library */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    else
+        /* We are using this library (the attribute is the same as in the building case) */
+#      define NCNN_EXPORT __attribute__((visibility("default")))
+#    endif
+#  endif
+
+#  ifndef NCNN_NO_EXPORT
+#    define NCNN_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
+
+#ifndef NCNN_DEPRECATED
+#  define NCNN_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#ifndef NCNN_DEPRECATED_EXPORT
+#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED /* exported and deprecated */
+#endif
+
+#ifndef NCNN_DEPRECATED_NO_EXPORT
+#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED /* hidden and deprecated */
+#endif
+
+#if 0 /* DEFINE_NO_DEPRECATED */
+#  ifndef NCNN_NO_DEPRECATED
+#    define NCNN_NO_DEPRECATED /* never defined here: the enclosing #if 0 disables this block */
+#  endif
+#endif
+
+#endif /* NCNN_EXPORT_H */
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/net.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/net.h
new file mode 100644
index 0000000..98e3ec3
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/net.h
@@ -0,0 +1,274 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_NET_H
+#define NCNN_NET_H
+
+#include "blob.h"
+#include "layer.h"
+#include "mat.h"
+#include "option.h"
+#include "platform.h"
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#include <android/asset_manager.h>
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkCompute;
+#endif // NCNN_VULKAN
+class DataReader;
+class Extractor;
+class NetPrivate;
+class NCNN_EXPORT Net
+{
+public:
+    // construct an empty net
+    Net();
+    // clear and destroy
+    virtual ~Net();
+
+public:
+    // option can be changed before loading
+    Option opt;
+
+#if NCNN_VULKAN
+    // set gpu device by index
+    void set_vulkan_device(int device_index);
+
+    // set gpu device by device handle; ownership is not transferred
+    void set_vulkan_device(const VulkanDevice* vkdev);
+
+    const VulkanDevice* vulkan_device() const;
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type name
+    // return 0 if success
+    int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+    virtual int custom_layer_to_index(const char* type);
+#endif // NCNN_STRING
+    // register custom layer or overwrite built-in layer by layer type index
+    // return 0 if success
+    int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
+
+#if NCNN_STRING
+    int load_param(const DataReader& dr);
+#endif // NCNN_STRING
+
+    int load_param_bin(const DataReader& dr);
+
+    int load_model(const DataReader& dr);
+
+#if NCNN_STDIO
+#if NCNN_STRING
+    // load network structure from plain param file
+    // return 0 if success
+    int load_param(FILE* fp);
+    int load_param(const char* protopath);
+    int load_param_mem(const char* mem);
+#endif // NCNN_STRING
+    // load network structure from binary param file
+    // return 0 if success
+    int load_param_bin(FILE* fp);
+    int load_param_bin(const char* protopath);
+
+    // load network weight data from model file
+    // return 0 if success
+    int load_model(FILE* fp);
+    int load_model(const char* modelpath);
+#endif // NCNN_STDIO
+
+    // load network structure from external memory
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed (unlike the loaders above, which return 0 on success)
+    int load_param(const unsigned char* mem);
+
+    // reference network weight data from external memory
+    // weight data is not copied but referenced
+    // so the external memory must stay valid while the net uses it
+    // memory pointer must be 32-bit aligned
+    // return bytes consumed
+    int load_model(const unsigned char* mem);
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 9
+#if NCNN_STRING
+    // convenience: load network structure from android asset plain param file
+    int load_param(AAsset* asset);
+    int load_param(AAssetManager* mgr, const char* assetpath);
+#endif // NCNN_STRING
+    // convenience: load network structure from android asset binary param file
+    int load_param_bin(AAsset* asset);
+    int load_param_bin(AAssetManager* mgr, const char* assetpath);
+
+    // convenience: load network weight data from android asset model file
+    int load_model(AAsset* asset);
+    int load_model(AAssetManager* mgr, const char* assetpath);
+#endif // __ANDROID_API__ >= 9
+#endif // NCNN_PLATFORM_API
+
+    // unload network structure and weight data
+    void clear();
+
+    // construct an Extractor from network
+    Extractor create_extractor() const;
+
+    // get input/output indexes/names
+    const std::vector<int>& input_indexes() const;
+    const std::vector<int>& output_indexes() const;
+#if NCNN_STRING
+    const std::vector<const char*>& input_names() const;
+    const std::vector<const char*>& output_names() const;
+#endif
+
+    const std::vector<Blob>& blobs() const;
+    const std::vector<Layer*>& layers() const;
+
+    std::vector<Blob>& mutable_blobs();
+    std::vector<Layer*>& mutable_layers();
+
+protected:
+    friend class Extractor;
+#if NCNN_STRING
+    int find_blob_index_by_name(const char* name) const;
+    int find_layer_index_by_name(const char* name) const;
+    virtual Layer* create_custom_layer(const char* type);
+    virtual Layer* create_overwrite_builtin_layer(const char* type);
+#endif // NCNN_STRING
+    virtual Layer* create_custom_layer(int index);
+    virtual Layer* create_overwrite_builtin_layer(int typeindex);
+
+private:
+    Net(const Net&); // copy operations are private: a Net cannot be copied by callers
+    Net& operator=(const Net&);
+
+private:
+    NetPrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+class ExtractorPrivate;
+class NCNN_EXPORT Extractor
+{
+public:
+    virtual ~Extractor();
+
+    // copy
+    Extractor(const Extractor&);
+
+    // assign
+    Extractor& operator=(const Extractor&);
+
+    // clear blob mats and allocators
+    void clear();
+
+    // enable light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    void set_light_mode(bool enable);
+
+    // set thread count for this extractor
+    // this will overwrite the global setting
+    // default count is system dependent
+    void set_num_threads(int num_threads);
+
+    // set blob memory allocator
+    void set_blob_allocator(Allocator* allocator);
+
+    // set workspace memory allocator
+    void set_workspace_allocator(Allocator* allocator);
+
+#if NCNN_VULKAN
+    void set_vulkan_compute(bool enable);
+
+    void set_blob_vkallocator(VkAllocator* allocator);
+
+    void set_workspace_vkallocator(VkAllocator* allocator);
+
+    void set_staging_vkallocator(VkAllocator* allocator);
+#endif // NCNN_VULKAN
+
+#if NCNN_STRING
+    // set input by blob name
+    // return 0 if success
+    int input(const char* blob_name, const Mat& in);
+
+    // get result by blob name
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(const char* blob_name, Mat& feat, int type = 0);
+#endif // NCNN_STRING
+
+    // set input by blob index
+    // return 0 if success
+    int input(int blob_index, const Mat& in);
+
+    // get result by blob index
+    // return 0 if success
+    // type = 0, default
+    // type = 1, do not convert fp16/bf16 and/or packing
+    int extract(int blob_index, Mat& feat, int type = 0);
+
+#if NCNN_VULKAN
+#if NCNN_STRING
+    // set input by blob name (VkMat)
+    // return 0 if success
+    int input(const char* blob_name, const VkMat& in);
+
+    // get result by blob name (VkMat)
+    // return 0 if success
+    int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob name (VkImageMat)
+    // return 0 if success
+    int input(const char* blob_name, const VkImageMat& in);
+
+    // get result by blob name (VkImageMat)
+    // return 0 if success
+    int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_STRING
+
+    // set input by blob index (VkMat)
+    // return 0 if success
+    int input(int blob_index, const VkMat& in);
+
+    // get result by blob index (VkMat)
+    // return 0 if success
+    int extract(int blob_index, VkMat& feat, VkCompute& cmd);
+
+    // set input by blob index (VkImageMat)
+    // return 0 if success
+    int input(int blob_index, const VkImageMat& in);
+
+    // get result by blob index (VkImageMat)
+    // return 0 if success
+    int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
+#endif // NCNN_VULKAN
+
+protected:
+    friend Extractor Net::create_extractor() const;
+    Extractor(const Net* net, size_t blob_count); // protected: reachable through the friend Net::create_extractor()
+
+private:
+    ExtractorPrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_NET_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/option.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/option.h
new file mode 100644
index 0000000..7d0cc60
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/option.h
@@ -0,0 +1,156 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_OPTION_H
+#define NCNN_OPTION_H
+
+#include "platform.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class VkAllocator;
+class PipelineCache;
+#endif // NCNN_VULKAN
+
+class Allocator;
+class NCNN_EXPORT Option
+{
+public:
+    // default option
+    Option();
+
+public:
+    // light mode
+    // intermediate blob will be recycled when enabled
+    // enabled by default
+    bool lightmode;
+
+    // thread count
+    // default value is the one returned by get_cpu_count()
+    int num_threads;
+
+    // blob memory allocator
+    Allocator* blob_allocator;
+
+    // workspace memory allocator
+    Allocator* workspace_allocator;
+
+#if NCNN_VULKAN
+    // blob memory allocator (VkAllocator)
+    VkAllocator* blob_vkallocator;
+
+    // workspace memory allocator (VkAllocator)
+    VkAllocator* workspace_vkallocator;
+
+    // staging memory allocator (VkAllocator)
+    VkAllocator* staging_vkallocator;
+
+    // pipeline cache
+    PipelineCache* pipeline_cache;
+#endif // NCNN_VULKAN
+
+    // the time openmp threads busy-wait for more work before going to sleep
+    // default value is 20ms to keep the cores enabled
+    // without too much extra power consumption afterwards
+    int openmp_blocktime;
+
+    // enable winograd convolution optimization
+    // improve convolution 3x3 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_winograd_convolution;
+
+    // enable sgemm convolution optimization
+    // improve convolution 1x1 stride1 performance, may consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_sgemm_convolution;
+
+    // enable quantized int8 inference
+    // use low-precision int8 path for quantized model
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_int8_inference;
+
+    // enable vulkan compute
+    bool use_vulkan_compute;
+
+    // enable bf16 data type for storage
+    // improve most operator performance on all arm devices, may consume more memory
+    bool use_bf16_storage;
+
+    // enable options for gpu inference
+    bool use_fp16_packed;
+    bool use_fp16_storage;
+    bool use_fp16_arithmetic;
+    bool use_int8_packed;
+    bool use_int8_storage;
+    bool use_int8_arithmetic;
+
+    // enable simd-friendly packed memory layout
+    // improve all operator performance on all arm devices, will consume more memory
+    // changes should be applied before loading network structure and weight
+    // enabled by default
+    bool use_packing_layout;
+
+    bool use_shader_pack8;
+
+    // subgroup option
+    bool use_subgroup_basic;
+    bool use_subgroup_vote;
+    bool use_subgroup_ballot;
+    bool use_subgroup_shuffle;
+
+    // turn on for adreno
+    bool use_image_storage;
+    bool use_tensor_storage;
+
+    bool use_reserved_0; // NOTE(review): presumably an unused placeholder slot, going by the name - confirm
+
+    // enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
+    // default value is 3
+    // 0 = DAZ OFF, FTZ OFF
+    // 1 = DAZ ON , FTZ OFF
+    // 2 = DAZ OFF, FTZ ON
+    // 3 = DAZ ON,  FTZ ON
+    int flush_denormals;
+
+    bool use_local_pool_allocator;
+
+    // enable local memory optimization for gpu inference
+    bool use_shader_local_memory;
+
+    // enable cooperative matrix optimization for gpu inference
+    bool use_cooperative_matrix;
+
+    // more fine-grained control of winograd convolution
+    bool use_winograd23_convolution;
+    bool use_winograd43_convolution;
+    bool use_winograd63_convolution;
+
+    // this option is turned on for A53/A55 automatically
+    // but you can force this on/off if you wish
+    bool use_a53_a55_optimized_kernel;
+
+    bool use_reserved_7; // NOTE(review): use_reserved_7..11 are presumably unused placeholder slots - confirm
+    bool use_reserved_8;
+    bool use_reserved_9;
+    bool use_reserved_10;
+    bool use_reserved_11;
+};
+
+} // namespace ncnn
+
+#endif // NCNN_OPTION_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/paramdict.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/paramdict.h
new file mode 100644
index 0000000..c2ef160
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/paramdict.h
@@ -0,0 +1,73 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PARAMDICT_H
+#define NCNN_PARAMDICT_H
+
+#include "mat.h"
+
+// at most 32 parameters
+#define NCNN_MAX_PARAM_COUNT 32
+
+namespace ncnn {
+
+class DataReader;
+class Net;
+class ParamDictPrivate;
+class NCNN_EXPORT ParamDict
+{
+public:
+    // construct an empty dict
+    ParamDict();
+
+    virtual ~ParamDict();
+
+    // copy
+    ParamDict(const ParamDict&);
+
+    // assign
+    ParamDict& operator=(const ParamDict&);
+
+    // get type of the parameter stored under id
+    int type(int id) const;
+
+    // get int (def is a caller-supplied default; presumably returned when id is unset - confirm)
+    int get(int id, int def) const;
+    // get float (same def convention as the int overload)
+    float get(int id, float def) const;
+    // get array (same def convention as the int overload)
+    Mat get(int id, const Mat& def) const;
+
+    // set int
+    void set(int id, int i);
+    // set float
+    void set(int id, float f);
+    // set array
+    void set(int id, const Mat& v);
+
+protected:
+    friend class Net; // Net may call the protected clear()/load_* members below
+
+    void clear();
+
+    int load_param(const DataReader& dr);
+    int load_param_bin(const DataReader& dr);
+
+private:
+    ParamDictPrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+} // namespace ncnn
+
+#endif // NCNN_PARAMDICT_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipeline.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipeline.h
new file mode 100644
index 0000000..c284a14
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipeline.h
@@ -0,0 +1,113 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINE_H
+#define NCNN_PIPELINE_H
+
+#include "mat.h"
+#include "platform.h"
+#if NCNN_VULKAN
+#include "gpu.h"
+
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+class Option;
+class PipelinePrivate;
+class NCNN_EXPORT Pipeline
+{
+public:
+    explicit Pipeline(const VulkanDevice* vkdev);
+    virtual ~Pipeline();
+
+public:
+    void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); // each extent defaults to 4 when omitted
+    void set_optimal_local_size_xyz(const Mat& local_size_xyz);
+    void set_local_size_xyz(int w, int h, int c);
+
+    int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations); // from caller-supplied spv data
+
+    int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations); // from a shader type index plus options
+
+public:
+    VkShaderModule shader_module() const; // read accessors; the matching setters are protected below
+    VkDescriptorSetLayout descriptorset_layout() const;
+    VkPipelineLayout pipeline_layout() const;
+    VkPipeline pipeline() const;
+    VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
+
+    const ShaderInfo& shader_info() const;
+
+    uint32_t local_size_x() const;
+    uint32_t local_size_y() const;
+    uint32_t local_size_z() const;
+
+protected:
+    void set_shader_module(VkShaderModule shader_module);
+    void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
+    void set_pipeline_layout(VkPipelineLayout pipeline_layout);
+    void set_pipeline(VkPipeline pipeline);
+    void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
+
+    void set_shader_info(const ShaderInfo& shader_info);
+
+public:
+    const VulkanDevice* vkdev; // presumably the device passed to the constructor - confirm in the implementation
+
+private:
+    Pipeline(const Pipeline&); // copy operations are private: a Pipeline cannot be copied by callers
+    Pipeline& operator=(const Pipeline&);
+
+private:
+    PipelinePrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+#if NCNN_PLATFORM_API
+#if __ANDROID_API__ >= 26
+class VkCompute;
+class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline
+{
+public:
+    explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
+    virtual ~ImportAndroidHardwareBufferPipeline();
+
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
+    int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); // overload with an explicit target size
+    void destroy();
+
+    friend class VkCompute; // Pipeline is a private base, so VkCompute needs friendship to reach it
+
+protected:
+    int create_shader_module(const Option& opt);
+    int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
+    int create_descriptorset_layout();
+
+public:
+    int type_to; // NOTE(review): presumably mirrors the type_to / rotate_from arguments of create() - confirm
+    int rotate_from;
+    bool need_resize;
+
+    VkSampler sampler;
+};
+#endif // __ANDROID_API__ >= 26
+#endif // NCNN_PLATFORM_API
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipelinecache.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipelinecache.h
new file mode 100644
index 0000000..bb6b8fb
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/pipelinecache.h
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PIPELINECACHE_H
+#define NCNN_PIPELINECACHE_H
+
+#include "platform.h"
+
+#if NCNN_VULKAN
+#include <vulkan/vulkan.h>
+#endif // NCNN_VULKAN
+
+#include "mat.h"
+#include "gpu.h"
+
+namespace ncnn {
+
+#if NCNN_VULKAN
+
+class VulkanDevice;
+class PipelineCachePrivate;
+class NCNN_EXPORT PipelineCache
+{
+public:
+    explicit PipelineCache(const VulkanDevice* _vkdev);
+
+    virtual ~PipelineCache();
+
+    void clear();
+
+    int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, // keyed by caller-supplied spv data
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module, // the pointer / non-const reference parameters from here on are outputs
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+    int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, // keyed by shader type index plus options
+                     uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                     VkShaderModule* shader_module,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
+                     ShaderInfo& shader_info) const;
+
+protected:
+    int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
+                             VkShaderModule* _shader_module, ShaderInfo& si) const;
+
+    int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
+                     VkDescriptorSetLayout* descriptorset_layout,
+                     VkPipelineLayout* pipeline_layout,
+                     VkPipeline* pipeline,
+                     VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+
+protected:
+    const VulkanDevice* vkdev; // presumably the device passed to the constructor - confirm in the implementation
+
+private:
+    PipelineCache(const PipelineCache&); // copy operations are private: a PipelineCache cannot be copied by callers
+    PipelineCache& operator=(const PipelineCache&);
+
+private:
+    PipelineCachePrivate* const d; // pointer to an implementation type that is only forward-declared in this header
+};
+
+#endif // NCNN_VULKAN
+
+} // namespace ncnn
+
+#endif // NCNN_PIPELINECACHE_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/platform.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/platform.h
new file mode 100644
index 0000000..b5f4337
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/platform.h
@@ -0,0 +1,293 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_PLATFORM_H
+#define NCNN_PLATFORM_H
+
+#define NCNN_STDIO 1
+#define NCNN_STRING 1
+#define NCNN_SIMPLEOCV 0
+#define NCNN_SIMPLEOMP 0
+#define NCNN_SIMPLESTL 0
+#define NCNN_SIMPLEMATH 0
+#define NCNN_THREADS 1
+#define NCNN_BENCHMARK 0
+#define NCNN_C_API 1
+#define NCNN_PLATFORM_API 1
+#define NCNN_PIXEL 1
+#define NCNN_PIXEL_ROTATE 1
+#define NCNN_PIXEL_AFFINE 1
+#define NCNN_PIXEL_DRAWING 1
+#define NCNN_VULKAN 0
+#define NCNN_SYSTEM_GLSLANG 0
+#define NCNN_RUNTIME_CPU 1
+#define NCNN_GNU_INLINE_ASM 1
+#define NCNN_AVX 1
+#define NCNN_XOP 1
+#define NCNN_FMA 1
+#define NCNN_F16C 1
+#define NCNN_AVX2 1
+#define NCNN_AVXVNNI 1
+#define NCNN_AVX512 1
+#define NCNN_AVX512VNNI 1
+#define NCNN_AVX512BF16 1
+#define NCNN_AVX512FP16 1
+#define NCNN_VFPV4 0
+#define NCNN_ARM82 0
+#define NCNN_ARM82DOT 0
+#define NCNN_ARM82FP16FML 0
+#define NCNN_ARM84BF16 0
+#define NCNN_ARM84I8MM 0
+#define NCNN_ARM86SVE 0
+#define NCNN_ARM86SVE2 0
+#define NCNN_ARM86SVEBF16 0
+#define NCNN_ARM86SVEI8MM 0
+#define NCNN_ARM86SVEF32MM 0
+#define NCNN_MSA 0
+#define NCNN_LSX 0
+#define NCNN_MMI 0
+#define NCNN_RVV 0
+#define NCNN_INT8 1
+#define NCNN_BF16 1
+#define NCNN_FORCE_INLINE 1
+
+#define NCNN_VERSION_STRING "1.0.20231027"
+
+#include "ncnn_export.h"
+
+#ifdef __cplusplus
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+#else
+#include <pthread.h>
+#endif
+#endif // NCNN_THREADS
+
+#if __ANDROID_API__ >= 26
+#define VK_USE_PLATFORM_ANDROID_KHR
+#endif // __ANDROID_API__ >= 26
+
+namespace ncnn {
+
+#if NCNN_THREADS
+#if (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // Win32 (non-MinGW) mutex backed by an SRWLOCK that is always taken exclusively
+{
+public:
+    Mutex() { InitializeSRWLock(&srwlock); }
+    ~Mutex() {} // no cleanup call is made for the SRWLOCK
+    void lock() { AcquireSRWLockExclusive(&srwlock); }
+    void unlock() { ReleaseSRWLockExclusive(&srwlock); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() passes srwlock to the OS directly
+    // NOTE: SRWLock is available from Windows Vista onwards
+    SRWLOCK srwlock;
+};
+
+class NCNN_EXPORT ConditionVariable // Win32 (non-MinGW) condition variable used together with the SRWLOCK-based Mutex
+{
+public:
+    ConditionVariable() { InitializeConditionVariable(&condvar); }
+    ~ConditionVariable() {} // no cleanup call is made for the CONDITION_VARIABLE
+    void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); } // no timeout; flags 0 = lock held exclusively; result ignored
+    void broadcast() { WakeAllConditionVariable(&condvar); } // wake every waiter
+    void signal() { WakeConditionVariable(&condvar); } // wake a single waiter
+private:
+    CONDITION_VARIABLE condvar;
+};
+
+static unsigned __stdcall start_wrapper(void* args); // forward declaration of the entry point handed to _beginthreadex
+class NCNN_EXPORT Thread // Win32 (non-MinGW) thread that runs a pthread-style start routine
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); } // the _beginthreadex result is not checked
+    ~Thread() {} // does not close the handle; only join() does
+    void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); } // waits without timeout, then closes the handle
+private:
+    friend unsigned __stdcall start_wrapper(void* args)
+    {
+        Thread* t = (Thread*)args; // args is the `this` pointer passed by the constructor
+        t->_start(t->_args); // the routine's void* result is discarded
+        return 0;
+    }
+    HANDLE handle;
+    void* (*_start)(void*);
+    void* _args;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // Win32 (non-MinGW) per-thread void* slot built on the Tls* API
+{
+public:
+    ThreadLocalStorage() { key = TlsAlloc(); } // the result is not checked for failure
+    ~ThreadLocalStorage() { TlsFree(key); }
+    void set(void* value) { TlsSetValue(key, (LPVOID)value); }
+    void* get() { return (void*)TlsGetValue(key); }
+private:
+    DWORD key; // slot index returned by TlsAlloc
+};
+#else // (defined _WIN32 && !(defined __MINGW32__))
+class NCNN_EXPORT Mutex // pthread-backed mutex created with default attributes
+{
+public:
+    Mutex() { pthread_mutex_init(&mutex, 0); } // 0 = default attributes; return code ignored
+    ~Mutex() { pthread_mutex_destroy(&mutex); }
+    void lock() { pthread_mutex_lock(&mutex); }
+    void unlock() { pthread_mutex_unlock(&mutex); }
+private:
+    friend class ConditionVariable; // ConditionVariable::wait() hands the raw pthread_mutex_t to pthread_cond_wait
+    pthread_mutex_t mutex;
+};
+
+class NCNN_EXPORT ConditionVariable // pthread-backed condition variable created with default attributes
+{
+public:
+    ConditionVariable() { pthread_cond_init(&cond, 0); }
+    ~ConditionVariable() { pthread_cond_destroy(&cond); }
+    void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } // no timeout; POSIX requires the caller to hold mutex
+    void broadcast() { pthread_cond_broadcast(&cond); } // wake every waiter
+    void signal() { pthread_cond_signal(&cond); } // wake at least one waiter
+private:
+    pthread_cond_t cond;
+};
+
+class NCNN_EXPORT Thread // pthread-backed thread; the start routine is launched from the constructor
+{
+public:
+    Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } // default attributes; return code ignored
+    ~Thread() {} // neither joins nor detaches the thread
+    void join() { pthread_join(t, 0); } // the routine's return value is discarded
+private:
+    pthread_t t;
+};
+
+class NCNN_EXPORT ThreadLocalStorage // pthread-backed per-thread void* slot
+{
+public:
+    ThreadLocalStorage() { pthread_key_create(&key, 0); } // 0 = no destructor callback for stored values
+    ~ThreadLocalStorage() { pthread_key_delete(key); }
+    void set(void* value) { pthread_setspecific(key, value); }
+    void* get() { return pthread_getspecific(key); }
+private:
+    pthread_key_t key;
+};
+#endif // (defined _WIN32 && !(defined __MINGW32__))
+#else // NCNN_THREADS
+class NCNN_EXPORT Mutex // NCNN_THREADS == 0 stand-in: every operation is a no-op
+{
+public:
+    Mutex() {}
+    ~Mutex() {}
+    void lock() {}
+    void unlock() {}
+};
+
+class NCNN_EXPORT ConditionVariable // NCNN_THREADS == 0 stand-in: every operation is a no-op
+{
+public:
+    ConditionVariable() {}
+    ~ConditionVariable() {}
+    void wait(Mutex& /*mutex*/) {} // returns immediately
+    void broadcast() {}
+    void signal() {}
+};
+
+class NCNN_EXPORT Thread // NCNN_THREADS == 0 stand-in: the start routine is never invoked
+{
+public:
+    Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {}
+    ~Thread() {}
+    void join() {}
+};
+
+class NCNN_EXPORT ThreadLocalStorage // NCNN_THREADS == 0 stand-in: one plain pointer held in the object itself
+{
+public:
+    ThreadLocalStorage() { data = 0; }
+    ~ThreadLocalStorage() {}
+    void set(void* value) { data = value; }
+    void* get() { return data; }
+private:
+    void* data;
+};
+#endif // NCNN_THREADS
+
+class NCNN_EXPORT MutexLockGuard // scope guard: locks the given Mutex on construction and unlocks it on destruction
+{
+public:
+    MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); }
+    ~MutexLockGuard() { mutex.unlock(); }
+private:
+    Mutex& mutex; // held by reference, so the Mutex must outlive the guard
+};
+
+} // namespace ncnn
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
+#include <list>
+#include <vector>
+#include <string>
+#endif
+
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
+#endif // __cplusplus
+
+#if NCNN_STDIO
+#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <android/log.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
+    __android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
+#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#include <stdio.h>
+#define NCNN_LOGE(...) do { \
+    fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
+#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
+#else
+#define NCNN_LOGE(...)
+#endif
+
+
+#if NCNN_FORCE_INLINE // NCNN_FORCEINLINE: the strongest inlining hint the compiler offers, plain `inline` otherwise
+#ifdef _MSC_VER
+    #define NCNN_FORCEINLINE __forceinline
+#elif defined(__GNUC__) // GCC, and clang whenever it also defines __GNUC__
+    #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+#elif defined(__clang__) // clang without the GNU macros; its predefined macro is the lower-case __clang__
+    #if __has_attribute(__always_inline__)
+        #define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
+    #else
+        #define NCNN_FORCEINLINE inline
+    #endif
+#else // unknown compiler: fall back to a plain hint
+    #define NCNN_FORCEINLINE inline
+#endif
+#else // NCNN_FORCE_INLINE disabled
+    #define NCNN_FORCEINLINE inline
+#endif
+
+#endif // NCNN_PLATFORM_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplemath.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplemath.h
new file mode 100644
index 0000000..fd7fa69
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* discrete functions
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float);
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+NCNN_EXPORT float frac(float);
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float);
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float);
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+NCNN_EXPORT float erf(float);
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+NCNN_EXPORT int msb(unsigned int);
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+NCNN_EXPORT void fesetround(int);
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleocv.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleocv.h
new file mode 100644
index 0000000..54b22d9
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleocv.h
@@ -0,0 +1,503 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOCV_H
+#define NCNN_SIMPLEOCV_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOCV
+
+#include <limits.h>
+#include <string.h>
+#include "allocator.h"
+#include "mat.h"
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma push_macro("min")
+#pragma push_macro("max")
+#undef min
+#undef max
+#endif
+
+#ifndef NCNN_XADD
+using ncnn::NCNN_XADD;
+#endif
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+enum
+{
+    CV_LOAD_IMAGE_UNCHANGED = -1,
+    CV_LOAD_IMAGE_GRAYSCALE = 0,
+    CV_LOAD_IMAGE_COLOR = 1,
+};
+
+enum
+{
+    CV_IMWRITE_JPEG_QUALITY = 1
+};
+
+// minimal opencv style data structure implementation
+namespace cv {
+
+// Generic fallback: plain conversion of v to _Tp with no range clamping.
+template<typename _Tp> static inline _Tp saturate_cast(int v)
+{
+    return _Tp(v);
+}
+// uchar specialisation: clamps v into [0, UCHAR_MAX] before narrowing.
+template<> inline uchar saturate_cast<uchar>(int v)
+{
+    return (uchar)(v < 0 ? 0 : (v > UCHAR_MAX ? UCHAR_MAX : v));
+}
+
+// Fixed four-component value; components not supplied to a constructor are set to 0.
+template<typename _Tp> struct Scalar_
+{
+    // All four components zero.
+    Scalar_()
+    {
+        for (int k = 0; k < 4; k++) v[k] = 0;
+    }
+
+    // Only the first component is given.
+    Scalar_(_Tp _v0)
+    {
+        v[0] = _v0;
+        for (int k = 1; k < 4; k++) v[k] = 0;
+    }
+
+    // First three components given; the fourth is zero.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
+    {
+        v[0] = _v0; v[1] = _v1; v[2] = _v2;
+        v[3] = 0;
+    }
+
+    // All four components given.
+    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
+    {
+        v[0] = _v0; v[1] = _v1; v[2] = _v2; v[3] = _v3;
+    }
+
+    // Reads component i (valid range 0..3; no bounds check is performed).
+    const _Tp operator[](const int i) const
+    {
+        return *(v + i);
+    }
+
+    // Non-const overload; it also returns a copy, so the component cannot be assigned through it.
+    _Tp operator[](const int i)
+    {
+        return *(v + i);
+    }
+
+    // Component storage.
+    _Tp v[4];
+};
+
+typedef Scalar_<uchar> Scalar;
+
+template<typename _Tp>
+struct Point_ // 2-D point with coordinates of type _Tp; default-constructed at (0, 0)
+{
+    Point_()
+        : x(0), y(0)
+    {
+    }
+    Point_(_Tp _x, _Tp _y)
+        : x(_x), y(_y)
+    {
+    }
+    // Converts to another coordinate type; each coordinate passes through saturate_cast, whose parameter is an int.
+    template<typename _Tp2>
+    operator Point_<_Tp2>() const
+    {
+        return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
+    }
+
+    _Tp x;
+    _Tp y;
+};
+
+typedef Point_<int> Point;
+typedef Point_<float> Point2f;
+
+template<typename _Tp>
+struct Size_ // width/height pair with components of type _Tp; default-constructed as 0 x 0
+{
+    Size_()
+        : width(0), height(0)
+    {
+    }
+    Size_(_Tp _w, _Tp _h)
+        : width(_w), height(_h)
+    {
+    }
+    // Converts to another component type; each component passes through saturate_cast, whose parameter is an int.
+    template<typename _Tp2>
+    operator Size_<_Tp2>() const
+    {
+        return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp width;
+    _Tp height;
+};
+
+typedef Size_<int> Size;
+typedef Size_<float> Size2f;
+
+template<typename _Tp>
+struct Rect_ // rectangle stored as an origin (x, y) plus width and height, all of type _Tp
+{
+    Rect_()
+        : x(0), y(0), width(0), height(0)
+    {
+    }
+    Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
+        : x(_x), y(_y), width(_w), height(_h)
+    {
+    }
+    Rect_(Point_<_Tp> _p, Size_<_Tp> _size) // origin taken from _p, extent from _size
+        : x(_p.x), y(_p.y), width(_size.width), height(_size.height)
+    {
+    }
+    // Converts to another component type; each field passes through saturate_cast, whose parameter is an int.
+    template<typename _Tp2>
+    operator Rect_<_Tp2>() const
+    {
+        return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
+    }
+
+    _Tp x;
+    _Tp y;
+    _Tp width;
+    _Tp height;
+
+    // area: width * height, computed in _Tp with no overflow check
+    _Tp area() const
+    {
+        return width * height;
+    }
+};
+
+// In-place intersection: a becomes the overlap of a and b, or an all-zero rect when the overlap is empty.
+template<typename _Tp> static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    const _Tp left = std::max(a.x, b.x);
+    const _Tp top = std::max(a.y, b.y);
+    const _Tp right = std::min(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::min(a.y + a.height, b.y + b.height);
+    a = Rect_<_Tp>(left, top, right - left, bottom - top);
+    if (a.width <= 0 || a.height <= 0)
+        a = Rect_<_Tp>();
+    return a;
+}
+
+// In-place union: a becomes the bounding box of a and b (empty rects are not special-cased).
+template<typename _Tp> static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    const _Tp left = std::min(a.x, b.x);
+    const _Tp top = std::min(a.y, b.y);
+    const _Tp right = std::max(a.x + a.width, b.x + b.width);
+    const _Tp bottom = std::max(a.y + a.height, b.y + b.height);
+    a = Rect_<_Tp>(left, top, right - left, bottom - top);
+    return a;
+}
+
+// Intersection of a and b returned as a new rect; neither operand is modified.
+template<typename _Tp> static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> overlap(a);
+    return overlap &= b;
+}
+
+// Bounding box of a and b returned as a new rect; neither operand is modified.
+template<typename _Tp> static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
+{
+    Rect_<_Tp> bounds(a);
+    return bounds |= b;
+}
+
+typedef Rect_<int> Rect;
+typedef Rect_<float> Rect2f;
+
+#define CV_8UC1  1
+#define CV_8UC3  3
+#define CV_8UC4  4
+#define CV_32FC1 4
+
+struct NCNN_EXPORT Mat // minimal reference-counted image buffer: rows x cols pixels of c bytes each, rows stored back to back
+{
+    Mat() // empty matrix: no buffer, all dimensions zero
+        : data(0), refcount(0), rows(0), cols(0), c(0)
+    {
+    }
+    // Allocates an owned, uninitialised buffer via create(); flags is stored as the per-pixel byte count c.
+    Mat(int _rows, int _cols, int flags)
+        : data(0), refcount(0)
+    {
+        create(_rows, _cols, flags);
+    }
+
+    // copy: shares m's buffer (no pixel copy) and increments the shared refcount when there is one
+    Mat(const Mat& m)
+        : data(m.data), refcount(m.refcount)
+    {
+        if (refcount)
+            NCNN_XADD(refcount, 1);
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+    }
+    // Wraps caller-owned memory: refcount stays null, so release() never frees _data.
+    Mat(int _rows, int _cols, int flags, void* _data)
+        : data((unsigned char*)_data), refcount(0)
+    {
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+    }
+
+    ~Mat()
+    {
+        release();
+    }
+
+    // assign: shares m's buffer; m's refcount is incremented before the old buffer is released
+    Mat& operator=(const Mat& m)
+    {
+        if (this == &m)
+            return *this;
+
+        if (m.refcount)
+            NCNN_XADD(m.refcount, 1);
+
+        release();
+
+        data = m.data;
+        refcount = m.refcount;
+
+        rows = m.rows;
+        cols = m.cols;
+        c = m.c;
+
+        return *this;
+    }
+    // Fills every pixel with the first c components of s; Scalar holds four, so c must not exceed 4.
+    Mat& operator=(const Scalar& s)
+    {
+        if (total() > 0)
+        {
+            uchar* p = data;
+            for (int i = 0; i < cols * rows; i++)
+            {
+                for (int j = 0; j < c; j++)
+                {
+                    *p++ = s[j];
+                }
+            }
+        }
+
+        return *this;
+    }
+    // Releases the current buffer, then allocates total() bytes plus a trailing int reference counter.
+    void create(int _rows, int _cols, int flags)
+    {
+        release();
+
+        rows = _rows;
+        cols = _cols;
+        c = flags;
+
+        if (total() > 0)
+        {
+            // refcount address must be aligned, so we expand totalsize here
+            size_t totalsize = (total() + 3) >> 2 << 2;
+            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
+            refcount = (int*)(((uchar*)data) + totalsize);
+            *refcount = 1;
+        }
+    }
+    // Drops this reference (freeing the buffer when NCNN_XADD reports the count was 1) and resets to the empty state.
+    void release()
+    {
+        if (refcount && NCNN_XADD(refcount, -1) == 1)
+            ncnn::fastFree(data);
+
+        data = 0;
+
+        rows = 0;
+        cols = 0;
+        c = 0;
+
+        refcount = 0;
+    }
+    // Deep copy into a freshly allocated buffer; an empty Mat yields an empty Mat.
+    Mat clone() const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(rows, cols, c);
+
+        if (total() > 0)
+        {
+            memcpy(m.data, data, total());
+        }
+
+        return m;
+    }
+    // True when there is no buffer or the byte count is zero.
+    bool empty() const
+    {
+        return data == 0 || total() == 0;
+    }
+
+    int channels() const
+    {
+        return c;
+    }
+
+    int type() const
+    {
+        return c;
+    }
+    // Buffer size in bytes; the product is formed in size_t so large images do not overflow int arithmetic.
+    size_t total() const
+    {
+        return (size_t)cols * rows * c;
+    }
+    // Pointer to the first byte of row y (no bounds check).
+    const uchar* ptr(int y) const
+    {
+        return data + y * cols * c;
+    }
+
+    uchar* ptr(int y)
+    {
+        return data + y * cols * c;
+    }
+    // Typed pointer to row y; the byte offset is applied before the cast so the stride is right for multi-byte _Tp.
+    template<typename _Tp>
+    const _Tp* ptr(int y) const
+    {
+        return (const _Tp*)(data + y * cols * c);
+    }
+
+    template<typename _Tp>
+    _Tp* ptr(int y)
+    {
+        return (_Tp*)(data + y * cols * c);
+    }
+
+    // roi: copies the roi rectangle into a new owned Mat; roi is not validated against the image bounds
+    Mat operator()(const Rect& roi) const
+    {
+        if (empty())
+            return Mat();
+
+        Mat m(roi.height, roi.width, c);
+
+        int sy = roi.y;
+        for (int y = 0; y < roi.height; y++)
+        {
+            const uchar* sptr = ptr(sy) + roi.x * c;
+            uchar* dptr = m.ptr(y);
+            memcpy(dptr, sptr, roi.width * c);
+            sy++;
+        }
+
+        return m;
+    }
+
+    uchar* data;
+
+    // pointer to the reference counter;
+    // when points to user-allocated data, the pointer is NULL
+    int* refcount;
+
+    int rows;
+    int cols;
+
+    int c; // bytes per pixel (the value passed as `flags`)
+};
+
+enum ImreadModes
+{
+    IMREAD_UNCHANGED = -1,
+    IMREAD_GRAYSCALE = 0,
+    IMREAD_COLOR = 1
+};
+
+NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
+
+NCNN_EXPORT Mat imdecode(const std::vector<uchar>& buf, int flags = IMREAD_COLOR);
+
+enum ImwriteFlags
+{
+    IMWRITE_JPEG_QUALITY = 1
+};
+
+NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
+
+NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
+
+NCNN_EXPORT int waitKey(int delay = 0);
+
+#if NCNN_PIXEL
+NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
+#endif // NCNN_PIXEL
+
+#if NCNN_PIXEL_DRAWING
+
+enum
+{
+    FILLED = -1
+};
+
+NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
+
+NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
+
+enum
+{
+    FONT_HERSHEY_SIMPLEX = 0
+};
+
+NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
+
+NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
+
+#endif // NCNN_PIXEL_DRAWING
+
+} // namespace cv
+
+#if defined(_MSC_VER) || defined(__GNUC__)
+#pragma pop_macro("min")
+#pragma pop_macro("max")
+#endif
+
+#endif // NCNN_SIMPLEOCV
+
+#endif // NCNN_SIMPLEOCV_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleomp.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleomp.h
new file mode 100644
index 0000000..13e2452
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simpleomp.h
@@ -0,0 +1,53 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEOMP_H
+#define NCNN_SIMPLEOMP_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEOMP
+
+#include <stdint.h>
+
+// This minimal openmp runtime implementation only supports the llvm openmp abi
+// and only supports #pragma omp parallel for num_threads(X)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NCNN_EXPORT int omp_get_max_threads();
+
+NCNN_EXPORT void omp_set_num_threads(int num_threads);
+
+NCNN_EXPORT int omp_get_dynamic();
+
+NCNN_EXPORT void omp_set_dynamic(int dynamic);
+
+NCNN_EXPORT int omp_get_num_threads();
+
+NCNN_EXPORT int omp_get_thread_num();
+
+NCNN_EXPORT int kmp_get_blocktime();
+
+NCNN_EXPORT void kmp_set_blocktime(int blocktime);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NCNN_SIMPLEOMP
+
+#endif // NCNN_SIMPLEOMP_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplestl.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplestl.h
new file mode 100644
index 0000000..00ff468
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/simplestl.h
@@ -0,0 +1,565 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLESTL_H
+#define NCNN_SIMPLESTL_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#if !NCNN_SIMPLESTL
+
+#include <new>
+
+#else
+
+// allocation functions
+NCNN_EXPORT void* operator new(size_t size);
+NCNN_EXPORT void* operator new[](size_t size);
+// placement allocation functions
+NCNN_EXPORT void* operator new(size_t size, void* ptr);
+NCNN_EXPORT void* operator new[](size_t size, void* ptr);
+// deallocation functions
+NCNN_EXPORT void operator delete(void* ptr);
+NCNN_EXPORT void operator delete[](void* ptr);
+// deallocation functions since c++14
+#if __cplusplus >= 201402L
+NCNN_EXPORT void operator delete(void* ptr, size_t sz);
+NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
+#endif
+// placement deallocation functions
+NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
+NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
+
+#endif
+
+// minimal stl data structure implementation
+namespace std {
+
+// Returns b when a < b, otherwise a (so a is returned when the two compare equal).
+template<typename T> const T& max(const T& a, const T& b)
+{
+    if (a < b) return b; else return a;
+}
+
+// Returns b when a > b, otherwise a (so a is returned when the two compare equal).
+template<typename T> const T& min(const T& a, const T& b)
+{
+    if (a > b) return b; else return a;
+}
+
+// Exchanges a and b using one copy construction and two copy assignments.
+template<typename T> void swap(T& a, T& b)
+{
+    T saved(a);
+    a = b;
+    b = saved;
+}
+
+template<typename T1, typename T2>
+struct pair // minimal pair type with two public members, first and second
+{
+    pair()
+        : first(), second() // value-initialises both members
+    {
+    }
+    pair(const T1& t1, const T2& t2)
+        : first(t1), second(t2) // copies both arguments
+    {
+    }
+
+    T1 first;
+    T2 second;
+};
+
+template<typename T1, typename T2>
+bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y) // equal when both members compare equal
+{
+    return (x.first == y.first && x.second == y.second);
+}
+template<typename T1, typename T2>
+bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y) // lexicographic: first decides, second breaks ties
+{
+    return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template<typename T1, typename T2>
+bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y) // negation of ==
+{
+    return !(x == y);
+}
+template<typename T1, typename T2>
+bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y) // < with the operands swapped
+{
+    return y < x;
+}
+template<typename T1, typename T2>
+bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y) // not (y < x)
+{
+    return !(y < x);
+}
+template<typename T1, typename T2>
+bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y) // not (x < y)
+{
+    return !(x < y);
+}
+
+template<typename T1, typename T2>
+pair<T1, T2> make_pair(const T1& t1, const T2& t2) // builds a pair holding copies of t1 and t2; member types are deduced from the arguments
+{
+    return pair<T1, T2>(t1, t2);
+}
+
+template<typename T>
+struct node // doubly linked list node holding one T by value
+{
+    node* prev_;
+    node* next_;
+    T data_;
+
+    node()
+        : prev_(0), next_(0), data_() // unlinked node with a value-initialised payload
+    {
+    }
+    node(const T& t)
+        : prev_(0), next_(0), data_(t) // unlinked node holding a copy of t
+    {
+    }
+};
+
+template<typename T>
+struct iter_list // bidirectional iterator over list<T>: a thin wrapper around a node<T> pointer
+{
+    iter_list()
+        : curr_(0)
+    {
+    }
+    iter_list(node<T>* n)
+        : curr_(n)
+    {
+    }
+    iter_list(const iter_list& i)
+        : curr_(i.curr_)
+    {
+    }
+    ~iter_list()
+    {
+    }
+    // Rebinds this iterator to the node i refers to.
+    iter_list& operator=(const iter_list& i)
+    {
+        curr_ = i.curr_;
+        return *this;
+    }
+    // Element access. Const-qualified so a const iterator can be dereferenced; the element itself stays mutable.
+    T& operator*() const
+    {
+        return curr_->data_;
+    }
+    T* operator->() const
+    {
+        return &(curr_->data_);
+    }
+    // Iterators are equal when they refer to the same node. Const-qualified so const iterators can be compared.
+    bool operator==(const iter_list& i) const
+    {
+        return curr_ == i.curr_;
+    }
+    bool operator!=(const iter_list& i) const
+    {
+        return curr_ != i.curr_;
+    }
+    // Pre-increment / pre-decrement follow next_ / prev_; there is no check for a null link.
+    iter_list& operator++()
+    {
+        curr_ = curr_->next_;
+        return *this;
+    }
+    iter_list& operator--()
+    {
+        curr_ = curr_->prev_;
+        return *this;
+    }
+
+    node<T>* curr_; // node currently referred to (null for a default-constructed iterator)
+};
+
+template<typename T>
+struct list // minimal doubly linked list; tail_ is a sentinel node that end() refers to
+{
+    typedef iter_list<T> iterator;
+    // Allocates the sentinel with node<T>() (so T needs a default constructor); an empty list has head_ == tail_.
+    list()
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+    }
+    ~list()
+    {
+        clear();
+        delete head_; // once the elements are gone head_ is the sentinel again
+    }
+    list(const list& l) // deep copy: fresh sentinel, then every element of l appended in order
+    {
+        head_ = new node<T>();
+        tail_ = head_;
+        count_ = 0;
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+    }
+    // Copy assignment: drops the current elements and appends copies of l's; self-assignment is a no-op.
+    list& operator=(const list& l)
+    {
+        if (this == &l)
+        {
+            return *this;
+        }
+        clear();
+
+        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
+        {
+            push_back(*i);
+        }
+        return *this;
+    }
+    // Removes every element; the sentinel node is kept.
+    void clear()
+    {
+        while (count_ > 0)
+        {
+            pop_front();
+        }
+    }
+    // Removes the first element; does nothing on an empty list.
+    void pop_front()
+    {
+        if (count_ > 0)
+        {
+            head_ = head_->next_;
+            delete head_->prev_; // the old first node
+            head_->prev_ = 0;
+            --count_;
+        }
+    }
+
+    size_t size() const
+    {
+        return count_;
+    }
+    iter_list<T> begin() const
+    {
+        return iter_list<T>(head_);
+    }
+    iter_list<T> end() const
+    {
+        return iter_list<T>(tail_);
+    }
+    bool empty() const
+    {
+        return count_ == 0;
+    }
+    // Appends a copy of t immediately before the sentinel.
+    void push_back(const T& t)
+    {
+        if (count_ == 0)
+        {
+            head_ = new node<T>(t); // first element becomes the head; the sentinel stays as tail_
+            head_->prev_ = 0;
+            head_->next_ = tail_;
+            tail_->prev_ = head_;
+            count_ = 1;
+        }
+        else
+        {
+            node<T>* temp = new node<T>(t);
+            temp->prev_ = tail_->prev_; // splice between the current last element and the sentinel
+            temp->next_ = tail_;
+            tail_->prev_->next_ = temp;
+            tail_->prev_ = temp;
+            ++count_;
+        }
+    }
+    // Unlinks and deletes the element at pos, returning an iterator to the element after it; erase(end()) is a no-op.
+    iter_list<T> erase(iter_list<T> pos)
+    {
+        if (pos != end())
+        {
+            node<T>* temp = pos.curr_;
+            if (temp == head_)
+            {
+                ++pos;
+                temp->next_->prev_ = 0;
+                head_ = temp->next_;
+            }
+            else
+            {
+                --pos; // step back so pos survives the unlink, then advance onto the successor
+                temp->next_->prev_ = temp->prev_;
+                temp->prev_->next_ = temp->next_;
+                ++pos;
+            }
+            delete temp;
+            --count_;
+        }
+        return pos;
+    }
+
+protected:
+    node<T>* head_; // first element, or the sentinel when the list is empty
+    node<T>* tail_; // sentinel node, one past the last element
+    size_t count_; // number of elements, sentinel excluded
+};
+
+template<typename T>
+struct greater // comparison function object: true when x > y
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x > y);
+    }
+};
+
+template<typename T>
+struct less // comparison function object: true when x < y
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return (x < y);
+    }
+};
+
+// Rearranges [first, last) so that [first, middle) holds its smallest elements in ascending comp order.
+// Each outer pass bubbles the smallest remaining element down to position i; positions before i are
+// already final, so the inner pass stops at i instead of rescanning the sorted prefix.
+template<typename RandomAccessIter, typename Compare>
+void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
+{
+    // [TODO] heap sort should be used here, but we simply use bubble sort now
+    for (RandomAccessIter i = first; i < middle; ++i)
+    {
+        for (RandomAccessIter j = last - 1; j > i; --j)
+        {
+            if (comp(*j, *(j - 1)))
+                swap(*j, *(j - 1));
+        }
+    }
+}
+
+template<typename T>
+struct vector
+{
+    vector()
+        : data_(0), size_(0), capacity_(0)
+    {
+    }
+    vector(const size_t new_size, const T& value = T())
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(new_size, value);
+    }
+    ~vector()
+    {
+        clear();
+    }
+    vector(const vector& v)
+        : data_(0), size_(0), capacity_(0)
+    {
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+    }
+
+    vector& operator=(const vector& v)
+    {
+        if (this == &v)
+        {
+            return *this;
+        }
+        resize(0);
+        resize(v.size());
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i] = v.data_[i];
+        }
+        return *this;
+    }
+
+    void resize(const size_t new_size, const T& value = T())
+    {
+        try_alloc(new_size);
+        if (new_size > size_)
+        {
+            for (size_t i = size_; i < new_size; i++)
+            {
+                new (&data_[i]) T(value);
+            }
+        }
+        else if (new_size < size_)
+        {
+            for (size_t i = new_size; i < size_; i++)
+            {
+                data_[i].~T();
+            }
+        }
+        size_ = new_size;
+    }
+
+    void clear()
+    {
+        for (size_t i = 0; i < size_; i++)
+        {
+            data_[i].~T();
+        }
+        delete[](char*) data_;
+        data_ = 0;
+        size_ = 0;
+        capacity_ = 0;
+    }
+
+    T* data() const
+    {
+        return data_;
+    }
+    size_t size() const
+    {
+        return size_;
+    }
+    T& operator[](size_t i) const
+    {
+        return data_[i];
+    }
+    T* begin() const
+    {
+        return &data_[0];
+    }
+    T* end() const
+    {
+        return &data_[size_];
+    }
+    bool empty() const
+    {
+        return size_ == 0;
+    }
+
+    void push_back(const T& t)
+    {
+        try_alloc(size_ + 1);
+        new (&data_[size_]) T(t);
+        size_++;
+    }
+
+    void insert(T* pos, T* b, T* e)
+    {
+        vector* v = 0;
+        if (b >= begin() && b < end())
+        {
+            //the same vector
+            v = new vector(*this);
+            b = v->begin() + (b - begin());
+            e = v->begin() + (e - begin());
+        }
+        size_t diff = pos - begin();
+        try_alloc(size_ + (e - b));
+        pos = begin() + diff;
+        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T));
+        size_t len = e - b;
+        size_ += len;
+        for (size_t i = 0; i < len; i++)
+        {
+            *pos = *b;
+            pos++;
+            b++;
+        }
+        delete v;
+    }
+
+    T* erase(T* pos)
+    {
+        pos->~T();
+        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
+        size_--;
+        return pos;
+    }
+
+protected:
+    T* data_;
+    size_t size_;
+    size_t capacity_;
+    void try_alloc(size_t new_size)
+    {
+        if (new_size * 3 / 2 > capacity_ / 2)
+        {
+            capacity_ = new_size * 2;
+            T* new_data = (T*)new char[capacity_ * sizeof(T)];
+            memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T));
+            if (data_)
+            {
+                memmove(new_data, data_, sizeof(T) * size_);
+                delete[](char*) data_;
+            }
+            data_ = new_data;
+        }
+    }
+};
+
+struct NCNN_EXPORT string : public vector<char> // minimal std::string stand-in stored in a vector<char>
+{
+    string() // empty string; data_ stays null until something is stored
+    {
+    }
+    string(const char* str) // copies the strlen(str) characters of str
+    {
+        size_t len = strlen(str);
+        resize(len);
+        if (len) memcpy(data_, str, len); // data_ is still null when len == 0
+    }
+    const char* c_str() const // never returns null: an unallocated string reads as ""
+    {
+        return data_ ? (const char*)data_ : "";
+    }
+    bool operator==(const string& str2) const // compares via c_str() so operands with null data_ are safe
+    {
+        return strcmp(c_str(), str2.c_str()) == 0;
+    }
+    bool operator==(const char* str2) const // str2 must be a valid C string
+    {
+        return strcmp(c_str(), str2) == 0;
+    }
+    bool operator!=(const char* str2) const // str2 must be a valid C string
+    {
+        return strcmp(c_str(), str2) != 0;
+    }
+    string& operator+=(const string& str1) // appends the characters of str1
+    {
+        insert(end(), str1.begin(), str1.end());
+        return *this;
+    }
+};
+
+inline string operator+(const string& str1, const string& str2)
+{
+    string str(str1);
+    str.insert(str.end(), str2.begin(), str2.end());
+    return str;
+}
+
+} // namespace std
+
+#endif // NCNN_SIMPLESTL_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/vulkan_header_fix.h b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/vulkan_header_fix.h
new file mode 100644
index 0000000..0a5ea9b
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/include/ncnn/vulkan_header_fix.h
@@ -0,0 +1,449 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_VULKAN_HEADER_FIX_H
+#define NCNN_VULKAN_HEADER_FIX_H
+
+#include <vulkan/vulkan.h>
+
+// This header contains new structure and function declarations to fix builds with old Vulkan SDKs
+
+#if VK_HEADER_VERSION < 70
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
+typedef enum VkSubgroupFeatureFlagBits
+{
+    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
+    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
+    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
+    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
+    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
+    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
+    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
+    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
+    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
+    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSubgroupFeatureFlagBits;
+typedef VkFlags VkSubgroupFeatureFlags;
+typedef struct VkPhysicalDeviceSubgroupProperties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t subgroupSize;
+    VkShaderStageFlags supportedStages;
+    VkSubgroupFeatureFlags supportedOperations;
+    VkBool32 quadOperationsInAllStages;
+} VkPhysicalDeviceSubgroupProperties;
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
+#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT            (VkStructureType)1000168001
+typedef struct VkPhysicalDeviceMaintenance3Properties
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t maxPerSetDescriptors;
+    VkDeviceSize maxMemoryAllocationSize;
+} VkPhysicalDeviceMaintenance3Properties;
+typedef struct VkDescriptorSetLayoutSupport
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 supported;
+} VkDescriptorSetLayoutSupport;
+typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
+typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
+typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
+#endif // VK_HEADER_VERSION < 70
+
+#if VK_HEADER_VERSION < 80
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
+typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 storageBuffer8BitAccess;
+    VkBool32 uniformAndStorageBuffer8BitAccess;
+    VkBool32 storagePushConstant8;
+} VkPhysicalDevice8BitStorageFeaturesKHR;
+#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR  (VkStructureType)1000109000
+#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR    (VkStructureType)1000109001
+#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR     (VkStructureType)1000109002
+#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR      (VkStructureType)1000109003
+#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
+#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR        (VkStructureType)1000109005
+#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR          (VkStructureType)1000109006
+typedef struct VkAttachmentDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkAttachmentDescriptionFlags flags;
+    VkFormat format;
+    VkSampleCountFlagBits samples;
+    VkAttachmentLoadOp loadOp;
+    VkAttachmentStoreOp storeOp;
+    VkAttachmentLoadOp stencilLoadOp;
+    VkAttachmentStoreOp stencilStoreOp;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
+} VkAttachmentDescription2KHR;
+typedef struct VkAttachmentReference2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t attachment;
+    VkImageLayout layout;
+    VkImageAspectFlags aspectMask;
+} VkAttachmentReference2KHR;
+typedef struct VkSubpassDescription2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassDescriptionFlags flags;
+    VkPipelineBindPoint pipelineBindPoint;
+    uint32_t viewMask;
+    uint32_t inputAttachmentCount;
+    const VkAttachmentReference2KHR* pInputAttachments;
+    uint32_t colorAttachmentCount;
+    const VkAttachmentReference2KHR* pColorAttachments;
+    const VkAttachmentReference2KHR* pResolveAttachments;
+    const VkAttachmentReference2KHR* pDepthStencilAttachment;
+    uint32_t preserveAttachmentCount;
+    const uint32_t* pPreserveAttachments;
+} VkSubpassDescription2KHR;
+typedef struct VkSubpassDependency2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t srcSubpass;
+    uint32_t dstSubpass;
+    VkPipelineStageFlags srcStageMask;
+    VkPipelineStageFlags dstStageMask;
+    VkAccessFlags srcAccessMask;
+    VkAccessFlags dstAccessMask;
+    VkDependencyFlags dependencyFlags;
+    int32_t viewOffset;
+} VkSubpassDependency2KHR;
+typedef struct VkRenderPassCreateInfo2KHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkRenderPassCreateFlags flags;
+    uint32_t attachmentCount;
+    const VkAttachmentDescription2KHR* pAttachments;
+    uint32_t subpassCount;
+    const VkSubpassDescription2KHR* pSubpasses;
+    uint32_t dependencyCount;
+    const VkSubpassDependency2KHR* pDependencies;
+    uint32_t correlatedViewMaskCount;
+    const uint32_t* pCorrelatedViewMasks;
+} VkRenderPassCreateInfo2KHR;
+typedef struct VkSubpassBeginInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkSubpassContents contents;
+} VkSubpassBeginInfoKHR;
+
+typedef struct VkSubpassEndInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+} VkSubpassEndInfoKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
+typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
+#endif // VK_HEADER_VERSION < 80
+
+#if VK_HEADER_VERSION < 95
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
+typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 shaderFloat16;
+    VkBool32 shaderInt8;
+} VkPhysicalDeviceFloat16Int8FeaturesKHR;
+#endif // VK_HEADER_VERSION < 95
+
+#if VK_HEADER_VERSION < 97
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT (VkStructureType)1000238000
+#define VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT            (VkStructureType)1000238001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT  (VkStructureType)1000244000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT               (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_CREATE_INFO_EXT        (VkStructureType)1000244002
+#define VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT                      (VkStructureType)1000247000
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_EXT         (VkBufferCreateFlagBits)0x00000010 // create-flag bit; 0x00020000 is the usage bit below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT                  (VkBufferUsageFlagBits)0x00020000
+typedef uint64_t VkDeviceAddress;
+typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
+    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
+} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
+typedef struct VkPhysicalDeviceMemoryPriorityFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 memoryPriority;
+} VkPhysicalDeviceMemoryPriorityFeaturesEXT;
+typedef struct VkMemoryPriorityAllocateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    float priority;
+} VkMemoryPriorityAllocateInfoEXT;
+typedef struct VkPhysicalDeviceBufferAddressFeaturesEXT
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferAddressFeaturesEXT;
+typedef struct VkBufferDeviceAddressInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoEXT;
+typedef struct VkBufferDeviceAddressCreateInfoEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceSize deviceAddress;
+} VkBufferDeviceAddressCreateInfoEXT;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressEXT)(VkDevice device, const VkBufferDeviceAddressInfoEXT* pInfo);
+typedef enum VkValidationFeatureEnableEXT
+{
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT = 0,
+    VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT = 1,
+    VK_VALIDATION_FEATURE_ENABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT,
+    VK_VALIDATION_FEATURE_ENABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT + 1),
+    VK_VALIDATION_FEATURE_ENABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureEnableEXT;
+typedef enum VkValidationFeatureDisableEXT
+{
+    VK_VALIDATION_FEATURE_DISABLE_ALL_EXT = 0,
+    VK_VALIDATION_FEATURE_DISABLE_SHADERS_EXT = 1,
+    VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT = 2,
+    VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT = 3,
+    VK_VALIDATION_FEATURE_DISABLE_OBJECT_LIFETIMES_EXT = 4,
+    VK_VALIDATION_FEATURE_DISABLE_CORE_CHECKS_EXT = 5,
+    VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT = 6,
+    VK_VALIDATION_FEATURE_DISABLE_BEGIN_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_ALL_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_END_RANGE_EXT = VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT,
+    VK_VALIDATION_FEATURE_DISABLE_RANGE_SIZE_EXT = (VK_VALIDATION_FEATURE_DISABLE_UNIQUE_HANDLES_EXT - VK_VALIDATION_FEATURE_DISABLE_ALL_EXT + 1),
+    VK_VALIDATION_FEATURE_DISABLE_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkValidationFeatureDisableEXT;
+typedef struct VkValidationFeaturesEXT
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint32_t enabledValidationFeatureCount;
+    const VkValidationFeatureEnableEXT* pEnabledValidationFeatures;
+    uint32_t disabledValidationFeatureCount;
+    const VkValidationFeatureDisableEXT* pDisabledValidationFeatures;
+} VkValidationFeaturesEXT;
+#endif // VK_HEADER_VERSION < 97
+
+#if VK_HEADER_VERSION < 101
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV   (VkStructureType)1000249000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV                 (VkStructureType)1000249001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
+typedef enum VkComponentTypeNV
+{
+    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
+    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
+    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
+    VK_COMPONENT_TYPE_SINT8_NV = 3,
+    VK_COMPONENT_TYPE_SINT16_NV = 4,
+    VK_COMPONENT_TYPE_SINT32_NV = 5,
+    VK_COMPONENT_TYPE_SINT64_NV = 6,
+    VK_COMPONENT_TYPE_UINT8_NV = 7,
+    VK_COMPONENT_TYPE_UINT16_NV = 8,
+    VK_COMPONENT_TYPE_UINT32_NV = 9,
+    VK_COMPONENT_TYPE_UINT64_NV = 10,
+    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
+    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
+    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
+    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkComponentTypeNV;
+typedef enum VkScopeNV
+{
+    VK_SCOPE_DEVICE_NV = 1,
+    VK_SCOPE_WORKGROUP_NV = 2,
+    VK_SCOPE_SUBGROUP_NV = 3,
+    VK_SCOPE_QUEUE_FAMILY_NV = 5,
+    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
+    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
+    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
+    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
+} VkScopeNV;
+typedef struct VkCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeNV AType;
+    VkComponentTypeNV BType;
+    VkComponentTypeNV CType;
+    VkComponentTypeNV DType;
+    VkScopeNV scope;
+} VkCooperativeMatrixPropertiesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
+#endif // VK_HEADER_VERSION < 101
+
+#if VK_HEADER_VERSION < 121 // fallback declarations for VK_AMD_device_coherent_memory on older headers
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COHERENT_MEMORY_FEATURES_AMD (VkStructureType)1000229000
+#define VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000040
+#define VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD                     (VkMemoryPropertyFlagBits)0x00000080 // own bit; must not alias DEVICE_COHERENT (0x40)
+typedef struct VkPhysicalDeviceCoherentMemoryFeaturesAMD
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 deviceCoherentMemory;
+} VkPhysicalDeviceCoherentMemoryFeaturesAMD;
+#endif // VK_HEADER_VERSION < 121
+
+#if VK_HEADER_VERSION < 129
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR (VkStructureType)1000257000
+#define VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR                     (VkStructureType)1000244001
+#define VK_STRUCTURE_TYPE_BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO_KHR      (VkStructureType)1000257002
+#define VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR    (VkStructureType)1000257003
+#define VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR      (VkStructureType)1000257004
+#define VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR               (VkBufferCreateFlagBits)0x00000010 // create-flag bit; 0x00020000 is the usage bit below
+#define VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR                        (VkBufferUsageFlagBits)0x00020000
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR                            (VkMemoryAllocateFlagBits)0x00000002
+#define VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR             (VkMemoryAllocateFlagBits)0x00000004
+typedef struct VkPhysicalDeviceBufferDeviceAddressFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 bufferDeviceAddress;
+    VkBool32 bufferDeviceAddressCaptureReplay;
+    VkBool32 bufferDeviceAddressMultiDevice;
+} VkPhysicalDeviceBufferDeviceAddressFeaturesKHR;
+typedef struct VkBufferDeviceAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkBuffer buffer;
+} VkBufferDeviceAddressInfoKHR;
+typedef struct VkBufferOpaqueCaptureAddressCreateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkBufferOpaqueCaptureAddressCreateInfoKHR;
+typedef struct VkMemoryOpaqueCaptureAddressAllocateInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    uint64_t opaqueCaptureAddress;
+} VkMemoryOpaqueCaptureAddressAllocateInfoKHR;
+typedef struct VkDeviceMemoryOpaqueCaptureAddressInfoKHR
+{
+    VkStructureType sType;
+    const void* pNext;
+    VkDeviceMemory memory;
+} VkDeviceMemoryOpaqueCaptureAddressInfoKHR;
+typedef VkDeviceAddress(VKAPI_PTR* PFN_vkGetBufferDeviceAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetBufferOpaqueCaptureAddressKHR)(VkDevice device, const VkBufferDeviceAddressInfoKHR* pInfo);
+typedef uint64_t(VKAPI_PTR* PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)(VkDevice device, const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo);
+#endif // VK_HEADER_VERSION < 129
+
+#if VK_HEADER_VERSION < 208
+typedef enum VkInstanceCreateFlagBits
+{
+    VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR = 0x00000001,
+    VK_INSTANCE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkInstanceCreateFlagBits;
+#endif // VK_HEADER_VERSION < 208
+
+#if VK_HEADER_VERSION < 255
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR   (VkStructureType)1000506000
+#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR                 (VkStructureType)1000506001
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002
+typedef enum VkComponentTypeKHR
+{
+    VK_COMPONENT_TYPE_FLOAT16_KHR = 0,
+    VK_COMPONENT_TYPE_FLOAT32_KHR = 1,
+    VK_COMPONENT_TYPE_FLOAT64_KHR = 2,
+    VK_COMPONENT_TYPE_SINT8_KHR = 3,
+    VK_COMPONENT_TYPE_SINT16_KHR = 4,
+    VK_COMPONENT_TYPE_SINT32_KHR = 5,
+    VK_COMPONENT_TYPE_SINT64_KHR = 6,
+    VK_COMPONENT_TYPE_UINT8_KHR = 7,
+    VK_COMPONENT_TYPE_UINT16_KHR = 8,
+    VK_COMPONENT_TYPE_UINT32_KHR = 9,
+    VK_COMPONENT_TYPE_UINT64_KHR = 10,
+    VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkComponentTypeKHR;
+typedef enum VkScopeKHR
+{
+    VK_SCOPE_DEVICE_KHR = 1,
+    VK_SCOPE_WORKGROUP_KHR = 2,
+    VK_SCOPE_SUBGROUP_KHR = 3,
+    VK_SCOPE_QUEUE_FAMILY_KHR = 5,
+    VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF
+} VkScopeKHR;
+typedef struct VkCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    uint32_t MSize;
+    uint32_t NSize;
+    uint32_t KSize;
+    VkComponentTypeKHR AType;
+    VkComponentTypeKHR BType;
+    VkComponentTypeKHR CType;
+    VkComponentTypeKHR ResultType;
+    VkBool32 saturatingAccumulation;
+    VkScopeKHR scope;
+} VkCooperativeMatrixPropertiesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkBool32 cooperativeMatrix;
+    VkBool32 cooperativeMatrixRobustBufferAccess;
+} VkPhysicalDeviceCooperativeMatrixFeaturesKHR;
+typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR
+{
+    VkStructureType sType;
+    void* pNext;
+    VkShaderStageFlags cooperativeMatrixSupportedStages;
+} VkPhysicalDeviceCooperativeMatrixPropertiesKHR;
+typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties);
+#endif // VK_HEADER_VERSION < 255
+
+#endif // NCNN_VULKAN_HEADER_FIX_H
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake
new file mode 100644
index 0000000..1fb8660
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn-release.cmake
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ncnn" for configuration "Release"
+set_property(TARGET ncnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ncnn PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libncnn.so"
+  IMPORTED_SONAME_RELEASE "libncnn.so"
+  )
+
+list(APPEND _cmake_import_check_targets ncnn )
+list(APPEND _cmake_import_check_files_for_ncnn "${_IMPORT_PREFIX}/lib/libncnn.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn.cmake
new file mode 100644
index 0000000..6726e95
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnn.cmake
@@ -0,0 +1,109 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
+   message(FATAL_ERROR "CMake >= 2.8.0 required")
+endif()
+if(CMAKE_VERSION VERSION_LESS "2.8.3")
+   message(FATAL_ERROR "CMake >= 2.8.3 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.8.3...3.25)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_cmake_targets_defined "")
+set(_cmake_targets_not_defined "")
+set(_cmake_expected_targets "")
+foreach(_cmake_expected_target IN ITEMS ncnn)
+  list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
+  if(TARGET "${_cmake_expected_target}")
+    list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
+  else()
+    list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
+  endif()
+endforeach()
+unset(_cmake_expected_target)
+if(_cmake_targets_defined STREQUAL _cmake_expected_targets)
+  unset(_cmake_targets_defined)
+  unset(_cmake_targets_not_defined)
+  unset(_cmake_expected_targets)
+  unset(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT _cmake_targets_defined STREQUAL "")
+  string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
+  string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
+endif()
+unset(_cmake_targets_defined)
+unset(_cmake_targets_not_defined)
+unset(_cmake_expected_targets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ncnn
+add_library(ncnn SHARED IMPORTED)
+
+set_target_properties(ncnn PROPERTIES
+  INTERFACE_COMPILE_OPTIONS "-fno-rtti;-fno-exceptions"
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include/ncnn"
+  INTERFACE_LINK_LIBRARIES "-fopenmp;-static-openmp;-Wl,-wrap,__kmp_affinity_determine_capable;Threads::Threads;android;jnigraphics;log"
+  INTERFACE_POSITION_INDEPENDENT_CODE "ON"
+)
+
+if(CMAKE_VERSION VERSION_LESS 2.8.12)
+  message(FATAL_ERROR "This file relies on consumers using CMake 2.8.12 or greater.")
+endif()
+
+# Load information for each installed configuration.
+file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/ncnn-*.cmake")
+foreach(_cmake_config_file IN LISTS _cmake_config_files)
+  include("${_cmake_config_file}")
+endforeach()
+unset(_cmake_config_file)
+unset(_cmake_config_files)
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(_cmake_target IN LISTS _cmake_import_check_targets)
+  foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
+    if(NOT EXISTS "${_cmake_file}")
+      message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
+   \"${_cmake_file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_cmake_file)
+  unset("_cmake_import_check_files_for_${_cmake_target}")
+endforeach()
+unset(_cmake_target)
+unset(_cmake_import_check_targets)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake
new file mode 100644
index 0000000..d3ac286
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/cmake/ncnn/ncnnConfig.cmake
@@ -0,0 +1,42 @@
+set(NCNN_OPENMP ON)
+set(NCNN_THREADS ON)
+set(NCNN_VULKAN OFF)
+set(NCNN_SHARED_LIB ON)
+set(NCNN_SYSTEM_GLSLANG OFF)
+
+if(NCNN_OPENMP)
+    find_package(OpenMP)
+endif()
+
+if(NCNN_THREADS)
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+    find_package(Threads REQUIRED)
+endif()
+
+if(NCNN_VULKAN)
+    find_package(Vulkan REQUIRED)
+
+    if(NOT NCNN_SHARED_LIB)
+        if(NCNN_SYSTEM_GLSLANG)
+            find_package(glslang QUIET)
+            if(NOT glslang_FOUND)
+                set(GLSLANG_TARGET_DIR "")
+                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
+                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                    # hlsl support can be optional
+                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
+                endif()
+                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
+                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
+            endif()
+        else()
+            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../lib/cmake/glslang")
+            find_package(glslang QUIET)
+        endif()
+
+    endif()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake)
diff --git a/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/pkgconfig/ncnn.pc b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/pkgconfig/ncnn.pc
new file mode 100644
index 0000000..4e80236
--- /dev/null
+++ b/duix-sdk/src/main/cpp/third/ncnn-20231027-android-shared/x86_64/lib/pkgconfig/ncnn.pc
@@ -0,0 +1,13 @@
+# pkg-config description of the bundled ncnn library.
+# All paths are derived from the directory that contains this file.
+prefix=${pcfiledir}/../..
+librarydir=${prefix}/lib
+includedir=${prefix}/include
+
+Name: ncnn
+Description: high-performance neural network inference framework optimized for the mobile platform
+Version: 1.0.20231027
+URL: https://github.com/Tencent/ncnn
+Libs: -L"${librarydir}" -lncnn
+Cflags: -I"${includedir}"
+
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/DuixNcnn.java b/duix-sdk/src/main/java/ai/guiji/duix/DuixNcnn.java
new file mode 100644
index 0000000..c54ff31
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/DuixNcnn.java
@@ -0,0 +1,49 @@
+package ai.guiji.duix;
+
+/**
+ * JNI bindings to the native {@code gjduix} library, which the static initializer at the
+ * bottom of this class loads.
+ *
+ * <p>Every method is {@code native}: argument meaning, return codes and side effects are
+ * defined by the native implementation, which is not visible from this file. The group
+ * comments below are guesses based on method names only -- confirm against the native code.
+ */
+public class DuixNcnn
+{
+    // NOTE(review): presumably per-task allocation/release and one-time engine/model setup.
+    public native int alloc(int taskid,int mincalc,int width,int height);
+    public native int free(int taskid);
+    public native int initPcmex(int maxsize,int minoff,int minblock,int maxblock,int rgb);
+    public native int initWenet(String fnwenet);
+    public native int initMunet(String fnparam,String fnbin,String fnmask);
+    public native int initMunetex(String fnparam,String fnbin,String fnmask, int kind);
+
+    // NOTE(review): presumably session lifecycle, frame counters and PCM input.
+    public native long newsession();
+    public native int finsession(long sessid);
+    public native int consession(long sessid);
+    public native int allcnt(long sessid);
+    public native int readycnt(long sessid);
+    public native int pushpcm(long sessid,byte[] arrbuf,int size, int kind);
+
+    // NOTE(review): presumably produces frame `index` from picture/mask files into arrimg/arrmsk.
+    public native int filerst(long sessid,String picfn,String mskfn,
+        int[] arrbox,String fgpic,int index, byte[] arrimg,byte[] arrmsk,int imgsize);
+
+    // NOTE(review): presumably the in-memory variant of filerst.
+    public native int bufrst(long sessid, int[] arrbox,int index, byte[] arrimg,int imgsize);
+
+    // NOTE(review): presumably loads a picture and its mask from files into arrpic/arrmsk.
+    public native int fileload(String picfn,String mskfn,int width,int height,
+         byte[] arrpic,byte[] arrmsk,int imgsize);
+
+    // NOTE(review): purpose of the gpg/md5 helpers cannot be determined from this file.
+    public native int startgpg(String picfn,String gpgfn);
+    public native int stopgpg();
+    public native int processmd5(int kind,String infn,String outfn);
+
+    static {
+             // Resolves the platform library name for "gjduix"; UnsatisfiedLinkError if missing.
+             System.loadLibrary("gjduix");
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Callback.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Callback.java
new file mode 100644
index 0000000..274eb14
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Callback.java
@@ -0,0 +1,17 @@
+package ai.guiji.duix.sdk.client;
+
+/**
+ * Listener through which the SDK reports events to the host application.
+ */
+public interface Callback {
+
+    /**
+     * Receives one event.
+     *
+     * @param event event identifier; {@link DUIX} passes the {@code CALLBACK_EVENT_*} constants of {@link Constant}
+     * @param msg   text accompanying the event; {@link DUIX} passes an empty string for motion events
+     * @param info  optional payload; {@link DUIX} passes {@code null} except for the init-ready event
+     */
+    void onEvent(String event, String msg, Object info);
+
+}
\ No newline at end of file
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Constant.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Constant.java
new file mode 100644
index 0000000..c783c05
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/Constant.java
@@ -0,0 +1,24 @@
+package ai.guiji.duix.sdk.client;
+
+/**
+ * SDK-wide constants: version information, callback event identifiers and the default download URL.
+ */
+public class Constant {
+
+    public static final int VERSION_CODE = BuildConfig.VERSION_CODE;
+    public static final String VERSION_NAME = BuildConfig.VERSION_NAME;
+
+
+    // Event identifiers that DUIX passes as the first argument of Callback.onEvent
+    public static final String CALLBACK_EVENT_INIT_READY = "init.ready";
+    public static final String CALLBACK_EVENT_INIT_ERROR = "init.error";
+    public static final String CALLBACK_EVENT_AUDIO_PLAY_START = "play.start";
+    public static final String CALLBACK_EVENT_AUDIO_PLAY_END = "play.end";
+    public static final String CALLBACK_EVENT_AUDIO_PLAY_ERROR = "play.error";
+    public static final String CALLBACK_EVENT_MOTION_START = "motion.start";
+    public static final String CALLBACK_EVENT_MOTION_END = "motion.end";
+
+
+    public static final String BASE_DOWNLOAD_URL = "https://github.com/GuijiAI/duix.ai/releases/download/v1.0.0/gj_dh_res.zip"; // base configuration package
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/DUIX.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/DUIX.java
new file mode 100644
index 0000000..3f16e6f
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/DUIX.java
@@ -0,0 +1,286 @@
+package ai.guiji.duix.sdk.client;
+
+import android.content.Context;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Arrays;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import ai.guiji.duix.sdk.client.loader.ModelInfo;
+import ai.guiji.duix.sdk.client.render.RenderSink;
+import ai.guiji.duix.sdk.client.thread.RenderThread;
+
+/**
+ * SDK entry point: locates an already-downloaded model on external storage, starts a
+ * {@link RenderThread} for it and forwards audio and motion commands to that thread.
+ *
+ * <p>Outcomes are reported through {@link Callback#onEvent} using the
+ * {@code CALLBACK_EVENT_*} identifiers declared in {@link Constant}.
+ */
+public class DUIX {
+
+    /** Number of leading bytes {@link #playAudio(String)} skips as the WAV header. */
+    private static final int WAV_HEADER_SIZE = 44;
+
+    private final Context mContext;
+    private final Callback mCallback;
+    private final String modelName;
+    private final RenderSink renderSink;
+    private ExecutorService commonExecutor = Executors.newSingleThreadExecutor();
+    private RenderThread mRenderThread;
+
+    // Set from the RenderCallback, which may run on a different thread than the API caller,
+    // so the flag is volatile to make the update visible.
+    private volatile boolean isReady;
+    private float mVolume = 1.0F;
+    private RenderThread.Reporter reporter;
+
+    /**
+     * @param context   used to resolve the external files directory and passed to the render thread
+     * @param modelName model directory name, or the http(s) URL the model was downloaded from
+     * @param sink      receiver handed to the render thread
+     * @param callback  event listener; may be {@code null}
+     */
+    public DUIX(Context context, String modelName, RenderSink sink, Callback callback) {
+        this.mContext = context;
+        this.mCallback = callback;
+        this.modelName = modelName;
+        this.renderSink = sink;
+    }
+
+    /**
+     * Verifies that the base configuration and the model are installed, then (re)starts the
+     * render thread. Failures are reported as {@link Constant#CALLBACK_EVENT_INIT_ERROR}.
+     */
+    public void init() {
+        File duixDir = mContext.getExternalFilesDir("duix");
+        if (duixDir == null) {
+            // getExternalFilesDir returns null while shared storage is unavailable; without this
+            // check the paths below would silently become "null/model/...".
+            notifyEvent(Constant.CALLBACK_EVENT_INIT_ERROR, "external files dir is not available", null);
+            return;
+        }
+        File modelRoot = new File(duixDir, "model");
+        // A same-named directory under model/tmp is the tag written after a successful unzip.
+        File tagRoot = new File(modelRoot, "tmp");
+
+        File baseConfigDir = new File(modelRoot, "gj_dh_res");
+        File baseConfigTag = new File(tagRoot, "gj_dh_res");
+        if (!baseConfigDir.exists() || !baseConfigTag.exists()) {
+            notifyEvent(Constant.CALLBACK_EVENT_INIT_ERROR, "[gj_dh_res] does not exist", null);
+            return;
+        }
+
+        String dirName = resolveModelDirName(modelName);
+        File modelDir = new File(modelRoot, dirName);
+        File modelTag = new File(tagRoot, dirName);
+        // An empty name would resolve to the model root itself, which always exists here.
+        if (dirName.isEmpty() || !modelDir.exists() || !modelTag.exists()) {
+            notifyEvent(Constant.CALLBACK_EVENT_INIT_ERROR, "[" + dirName + "] does not exist", null);
+            return;
+        }
+
+        // Not ready again until the new render thread reports a successful init.
+        isReady = false;
+        if (mRenderThread != null) {
+            mRenderThread.stopPreview();
+            mRenderThread = null;
+        }
+        mRenderThread = new RenderThread(mContext, modelDir, renderSink, mVolume, new RenderThread.RenderCallback() {
+
+            @Override
+            public void onInitResult(int code, int subCode, String message, ModelInfo modelInfo) {
+                if (code == 0) {
+                    isReady = true;
+                    notifyEvent(Constant.CALLBACK_EVENT_INIT_READY, "init ok", modelInfo);
+                } else {
+                    notifyEvent(Constant.CALLBACK_EVENT_INIT_ERROR, code + ", " + subCode + ", " + message, null);
+                }
+            }
+
+            @Override
+            public void onPlayStart() {
+                notifyEvent(Constant.CALLBACK_EVENT_AUDIO_PLAY_START, "play start", null);
+            }
+
+            @Override
+            public void onPlayEnd() {
+                notifyEvent(Constant.CALLBACK_EVENT_AUDIO_PLAY_END, "play end", null);
+            }
+
+            @Override
+            public void onPlayError(int code, String msg) {
+                notifyEvent(Constant.CALLBACK_EVENT_AUDIO_PLAY_ERROR, "audio play error code: " + code + " msg: " + msg, null);
+            }
+
+            @Override
+            public void onMotionPlayStart(String name) {
+                notifyEvent(Constant.CALLBACK_EVENT_MOTION_START, "", null);
+            }
+
+            @Override
+            public void onMotionPlayComplete(String name) {
+                notifyEvent(Constant.CALLBACK_EVENT_MOTION_END, "", null);
+            }
+        }, reporter);
+        mRenderThread.setName("DUIXRender-Thread");
+        mRenderThread.start();
+    }
+
+    /** Returns {@code true} once the render thread has reported a successful init. */
+    public boolean isReady() {
+        return isReady;
+    }
+
+    /** Sets the playback volume; values outside {@code [0, 1]} are ignored. */
+    public void setVolume(float volume){
+        if (volume >= 0.0F && volume <= 1.0F){
+            mVolume = volume;
+            if (mRenderThread != null){
+                mRenderThread.setVolume(volume);
+            }
+        }
+    }
+
+    /** Forwards {@code startPush} to the render thread, if one exists. */
+    public void startPush(){
+        if (mRenderThread != null){
+            mRenderThread.startPush();
+        }
+    }
+
+    /**
+     * Hands a copy of {@code buffer} to the render thread, so the caller may reuse its array.
+     * A {@code null} buffer is ignored.
+     */
+    public void pushPcm(byte[] buffer){
+        if (buffer != null && mRenderThread != null){
+            mRenderThread.pushAudio(buffer.clone());
+        }
+    }
+
+    /** Forwards {@code stopPush} to the render thread, if one exists. */
+    public void stopPush(){
+        if (mRenderThread != null){
+            mRenderThread.stopPush();
+        }
+    }
+
+
+    /**
+     * Plays a local WAV file by pushing its payload as one PCM stream.
+     * The header is assumed to be exactly 44 bytes and the audio 16 kHz, mono, 16-bit.
+     *
+     * @param wavPath path of the WAV file; ignored when {@code null}, missing or not longer than the header
+     * @throws RuntimeException wrapping the I/O error if the file cannot be read
+     */
+    public void playAudio(String wavPath) {
+        if (wavPath == null) {
+            return;
+        }
+        File wavFile = new File(wavPath);
+        if (!isReady || mRenderThread == null || !wavFile.exists() || wavFile.length() <= WAV_HEADER_SIZE) {
+            return;
+        }
+        byte[] data = new byte[(int) wavFile.length()];
+        int filled = 0;
+        try (FileInputStream inputStream = new FileInputStream(wavFile)) {
+            // A single read() may return fewer bytes than requested, so loop until the buffer
+            // is full or the stream ends.
+            while (filled < data.length) {
+                int count = inputStream.read(data, filled, data.length - filled);
+                if (count < 0) {
+                    break;
+                }
+                filled += count;
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+        if (filled <= WAV_HEADER_SIZE) {
+            return;
+        }
+        byte[] slice = Arrays.copyOfRange(data, WAV_HEADER_SIZE, filled);
+        startPush();
+        pushPcm(slice);
+        stopPush();
+    }
+
+    /**
+     * Stops audio playback.
+     *
+     * @return {@code true} if the request was forwarded, {@code false} if the SDK is not ready
+     */
+    public boolean stopAudio() {
+        if (isReady && mRenderThread != null) {
+            mRenderThread.stopPlayAudio();
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+
+    /**
+     * Requests playback of the named motion segment.
+     */
+    public void startMotion(String name, boolean now) {
+        if (mRenderThread != null) {
+            mRenderThread.requireMotion(name, now);
+        }
+    }
+
+    /**
+     * Requests playback of a randomly chosen motion segment.
+     */
+    public void startRandomMotion(boolean now) {
+        if (mRenderThread != null) {
+            mRenderThread.requireRandomMotion(now);
+        }
+    }
+
+    /** Marks the instance as not ready, shuts down the executor and stops the render thread. */
+    public void release() {
+        isReady = false;
+        if (commonExecutor != null) {
+            commonExecutor.shutdown();
+            commonExecutor = null;
+        }
+        if (mRenderThread != null) {
+            mRenderThread.stopPreview();
+        }
+    }
+
+    /** Stores the reporter for future render threads and applies it to the current one. */
+    public void setReporter(RenderThread.Reporter reporter){
+        this.reporter = reporter;
+        if (mRenderThread != null) {
+            mRenderThread.setReporter(reporter);
+        }
+    }
+
+    /** Delivers an event to the callback, if one was supplied. */
+    private void notifyEvent(String event, String msg, Object info) {
+        if (mCallback != null) {
+            mCallback.onEvent(event, msg, info);
+        }
+    }
+
+    /**
+     * Maps the configured model name to its directory name: for an http(s) URL this is the
+     * last path segment with ".zip" removed, otherwise the name itself.
+     *
+     * @return the directory name, or an empty string when none can be derived
+     */
+    private static String resolveModelDirName(String modelName) {
+        if (modelName == null) {
+            return "";
+        }
+        if (modelName.startsWith("https://") || modelName.startsWith("http://")) {
+            return modelName.substring(modelName.lastIndexOf("/") + 1).replace(".zip", "");
+        }
+        return modelName;
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/VirtualModelUtil.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/VirtualModelUtil.java
new file mode 100644
index 0000000..20c4e8d
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/VirtualModelUtil.java
@@ -0,0 +1,174 @@
+package ai.guiji.duix.sdk.client;
+
+import android.content.Context;
+import android.text.TextUtils;
+
+import java.io.File;
+
+import ai.guiji.duix.sdk.client.net.DownloadZipService;
+
+
+/**
+ * Static helpers that check whether the base configuration or a model is installed under the
+ * app's external {@code duix/model} directory and that download them there.
+ */
+public class VirtualModelUtil {
+
+    private static final String BASE_CONFIG_DIR_NAME = "gj_dh_res";
+
+    /** Returns {@code true} if the base configuration directory and its tag both exist. */
+    public static boolean checkBaseConfig(Context context){
+        return isInstalled(context, BASE_CONFIG_DIR_NAME);
+    }
+
+    /**
+     * Returns {@code true} if the model directory and its tag both exist.
+     *
+     * @param name model directory name, or the http(s) URL the model was downloaded from
+     */
+    public static boolean checkModel(Context context, String name){
+        return isInstalled(context, resolveDirName(name));
+    }
+
+    /** Downloads the base configuration from {@link Constant#BASE_DOWNLOAD_URL}. */
+    public static void baseConfigDownload(Context context, ModelDownloadCallback callback){
+        baseConfigDownload(context, Constant.BASE_DOWNLOAD_URL, callback);
+    }
+
+    /**
+     * Downloads the base configuration archive from {@code url} and unzips it.
+     */
+    public static void baseConfigDownload(Context context, String url, ModelDownloadCallback callback){
+        File modelRoot = modelRoot(context);
+        if (modelRoot == null) {
+            reportFailure(callback, url, -1002, "External files dir is not available");
+            return;
+        }
+        File baseDir = new File(modelRoot, BASE_CONFIG_DIR_NAME);
+        DownloadZipService.downloadAndUnzip(context, url, baseDir, adapt(url, callback), true);
+    }
+
+    /**
+     * Downloads a model archive and unzips it into a directory named after the archive.
+     *
+     * @param modelUrl http(s) URL of the model zip; anything else fails with code -1003
+     */
+    public static void modelDownload(Context context, String modelUrl, ModelDownloadCallback callback){
+        if (!isHttpUrl(modelUrl)) {
+            reportFailure(callback, modelUrl, -1003, "Illegal download url[" + modelUrl + "]");
+            return;
+        }
+        String dirName = resolveDirName(modelUrl);
+        if (TextUtils.isEmpty(dirName)) {
+            reportFailure(callback, modelUrl, -1004, "Illegal model url[" + modelUrl + "]");
+            return;
+        }
+        File modelRoot = modelRoot(context);
+        if (modelRoot == null) {
+            reportFailure(callback, modelUrl, -1002, "External files dir is not available");
+            return;
+        }
+        File modelDir = new File(modelRoot, dirName);
+        DownloadZipService.downloadAndUnzip(context, modelUrl, modelDir, adapt(modelUrl, callback), true);
+    }
+
+    /**
+     * Returns the {@code model} directory under the external files dir, or {@code null} when
+     * {@link Context#getExternalFilesDir} yields no directory (shared storage unavailable).
+     */
+    private static File modelRoot(Context context) {
+        File duixDir = context.getExternalFilesDir("duix");
+        return duixDir == null ? null : new File(duixDir, "model");
+    }
+
+    /** Checks for {@code model/<dirName>} together with its tag {@code model/tmp/<dirName>}. */
+    private static boolean isInstalled(Context context, String dirName) {
+        if (TextUtils.isEmpty(dirName)) {
+            return false;
+        }
+        File modelRoot = modelRoot(context);
+        if (modelRoot == null) {
+            return false;
+        }
+        File modelDir = new File(modelRoot, dirName);
+        File modelTag = new File(new File(modelRoot, "tmp"), dirName);
+        return modelDir.exists() && modelTag.exists();
+    }
+
+    private static boolean isHttpUrl(String value) {
+        return value != null && (value.startsWith("https://") || value.startsWith("http://"));
+    }
+
+    /**
+     * Maps a model name or URL to its directory name: for an http(s) URL the last path segment
+     * with ".zip" removed, otherwise the name itself. Returns "" when nothing can be derived.
+     */
+    private static String resolveDirName(String nameOrUrl) {
+        if (TextUtils.isEmpty(nameOrUrl)) {
+            return "";
+        }
+        if (!isHttpUrl(nameOrUrl)) {
+            return nameOrUrl;
+        }
+        return nameOrUrl.substring(nameOrUrl.lastIndexOf("/") + 1).replace(".zip", "");
+    }
+
+    private static void reportFailure(ModelDownloadCallback callback, String url, int code, String msg) {
+        if (callback != null) {
+            callback.onDownloadFail(url, code, msg);
+        }
+    }
+
+    /** Wraps {@code callback} so that every service event is forwarded together with {@code url}. */
+    private static DownloadZipService.Callback adapt(String url, ModelDownloadCallback callback) {
+        return new DownloadZipService.Callback() {
+            @Override
+            public void onDownloadProgress(long current, long total) {
+                if (callback != null) {
+                    callback.onDownloadProgress(url, current, total);
+                }
+            }
+
+            @Override
+            public void onUnzipProgress(long current, long total) {
+                if (callback != null) {
+                    callback.onUnzipProgress(url, current, total);
+                }
+            }
+
+            @Override
+            public void onComplete(File dirFile) {
+                if (callback != null) {
+                    callback.onDownloadComplete(url, dirFile);
+                }
+            }
+
+            @Override
+            public void onError(int code, String msg) {
+                if (callback != null) {
+                    callback.onDownloadFail(url, code, msg);
+                }
+            }
+        };
+    }
+
+    /** Receives progress and the final outcome of a download started by this class. */
+    public interface ModelDownloadCallback {
+
+        void onDownloadProgress(String url, long current, long total);
+
+        void onUnzipProgress(String url, long current, long total);
+
+        void onDownloadComplete(String url, File dir);
+
+        /**
+         * -1000    Compressed file download failed
+         * -1001    An exception occurred while decompressing the file
+         * -1002    Target folder not found
+         * -1003    Illegal download url
+         * -1004    Illegal model url
+         * -1005    Service not initialized
+         */
+        void onDownloadFail(String url, int code, String msg);
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/audio/AudioPlayer.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/audio/AudioPlayer.java
new file mode 100644
index 0000000..43a4cf1
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/audio/AudioPlayer.java
@@ -0,0 +1,212 @@
+package ai.guiji.duix.sdk.client.audio;
+
+import android.media.AudioFormat;
+import android.media.AudioManager;
+import android.media.AudioTrack;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import ai.guiji.duix.sdk.client.bean.AudioFrame;
+import ai.guiji.duix.sdk.client.util.Logger;
+
+
+/**
+ * Streams 16 kHz mono 16-bit PCM to an {@link AudioTrack}.
+ *
+ * <p>Pushed data is re-chunked into frames of {@code bufferSize} bytes and queued; a dedicated
+ * thread started by {@link #startPlay()} writes the frames to the track and reports the end of
+ * the stream once it dequeues the marker added by {@link #pushDone()}.
+ */
+public class AudioPlayer {
+
+    /** How long the playback thread waits for a frame before re-checking its stop flag. */
+    private static final long QUEUE_POLL_TIMEOUT_MS = 20L;
+    /** Divisor turning elapsed playback milliseconds into the index of {@link #getPlayIndex()}. */
+    private static final int INDEX_INTERVAL_MS = 40;
+
+    private final AudioTrack audioTrack;
+    private PlaybackThread playbackThread;
+
+    private final int sampleRate = 16000;
+    private final int channelConfig = AudioFormat.CHANNEL_OUT_MONO;
+    private final int audioFormat = AudioFormat.ENCODING_PCM_16BIT;
+    // 1280 bytes = 640 16-bit mono samples = 40 ms at 16 kHz.
+    private final int bufferSize = 1280;
+
+    // Frames waiting to be written to the track; blocking so the consumer can sleep when empty.
+    private final BlockingQueue<AudioFrame> mPlayQueue = new LinkedBlockingQueue<>();
+
+    private final AudioPlayerCallback callback;
+
+    // Staging area that collects pushed bytes until one full frame is available.
+    private final ByteBuffer waitNextBuffer;
+
+    /**
+     * @param callback receives playback start / end / error notifications
+     * @param volume   initial track volume; only applied when it differs from 1.0
+     */
+    public AudioPlayer(AudioPlayerCallback callback, float volume){
+        this.callback = callback;
+        audioTrack = new AudioTrack(AudioManager.STREAM_MUSIC,
+                sampleRate,
+                channelConfig,
+                audioFormat,
+                bufferSize,
+                AudioTrack.MODE_STREAM);
+        if (volume != 1.0F){
+            audioTrack.setVolume(volume);
+        }
+        int minBufferSize = AudioTrack.getMinBufferSize(sampleRate, channelConfig, audioFormat);
+        Logger.d("AudioPlayer init bufferSize: " + bufferSize + " minBufferSize: " + minBufferSize);
+        waitNextBuffer = ByteBuffer.allocate(bufferSize);
+    }
+
+    public void setVolume(float volume){
+        audioTrack.setVolume(volume);
+    }
+
+    /**
+     * Stops any previous playback, starts the track and launches the playback thread.
+     * A failure to start the track is reported through {@code onPlayError}; the thread is
+     * launched regardless.
+     */
+    public void startPlay(){
+        stop();
+        try {
+            audioTrack.play();
+            callback.onPlayStart();
+        } catch (Exception e){
+            callback.onPlayError(-1000, e.getMessage());
+        }
+        playbackThread = new PlaybackThread();
+        playbackThread.start();
+    }
+
+    /** Discards queued frames and any partially staged frame before a new stream is pushed. */
+    public void pushStart(){
+        mPlayQueue.clear();
+        waitNextBuffer.clear();
+    }
+
+    /** Appends PCM bytes, queueing a frame each time the staging buffer fills up. */
+    public void pushData(ByteBuffer data){
+        while (data.hasRemaining()){
+            int count = Math.min(waitNextBuffer.remaining(), data.remaining());
+            byte[] chunk = new byte[count];
+            data.get(chunk);
+            waitNextBuffer.put(chunk);
+            if (!waitNextBuffer.hasRemaining()){
+                enqueueStaged(waitNextBuffer.capacity());
+            }
+        }
+    }
+
+    /** Queues the partially filled frame, if any, followed by the end-of-stream marker. */
+    public void pushDone(){
+        int pending = waitNextBuffer.position();
+        if (pending > 0){
+            enqueueStaged(pending);
+        }
+        mPlayQueue.add(new AudioFrame(true));
+    }
+
+    /**
+     * Stops the playback thread, waits for it to finish and stops the track.
+     */
+    public void stop() {
+        PlaybackThread thread = playbackThread;
+        if (thread != null) {
+            thread.stopPlay();
+            // Joining from the playback thread itself (e.g. from a callback) would never return.
+            if (thread != Thread.currentThread()) {
+                try {
+                    thread.join();
+                } catch (InterruptedException e) {
+                    // Keep the interrupt visible to the caller instead of swallowing it.
+                    Thread.currentThread().interrupt();
+                }
+            }
+            playbackThread = null;
+        }
+        stopTrackQuietly();
+    }
+
+    /** Stops playback, releases the track and clears the frame queue. */
+    public void release(){
+        stop();
+        audioTrack.release();
+        mPlayQueue.clear();
+    }
+
+    /**
+     * Returns the playback position as a count of 40 ms intervals, derived from the track's
+     * playback head position.
+     */
+    public int getPlayIndex(){
+        // The head position is a wrapping 32-bit counter that must be read as unsigned.
+        long framesPlayed = audioTrack.getPlaybackHeadPosition() & 0xFFFFFFFFL;
+        int durationInMillis = (int)((framesPlayed * 1000L) / audioTrack.getSampleRate());
+        return durationInMillis / INDEX_INTERVAL_MS;
+    }
+
+    /** Copies the staging buffer into a new frame with {@code validBytes} valid bytes and rewinds it. */
+    private void enqueueStaged(int validBytes) {
+        byte[] frame = new byte[waitNextBuffer.capacity()];
+        waitNextBuffer.position(0);
+        waitNextBuffer.get(frame);
+        waitNextBuffer.position(0);
+        mPlayQueue.add(new AudioFrame(frame, validBytes));
+    }
+
+    /** Stops the track, logging instead of propagating an illegal-state failure. */
+    private void stopTrackQuietly() {
+        try {
+            audioTrack.stop();
+        } catch (IllegalStateException e) {
+            Logger.e("AudioPlayer stop failed: " + e);
+        }
+    }
+
+    /** Drains the frame queue into the track until stopped or the end marker is reached. */
+    private class PlaybackThread extends Thread {
+
+        private volatile boolean isPlaying = true;
+
+        public void stopPlay(){
+            isPlaying = false;
+        }
+
+        @Override
+        public void run() {
+            try {
+                while (isPlaying) {
+                    // Timed wait instead of spinning on an empty queue.
+                    AudioFrame top = mPlayQueue.poll(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);
+                    if (top == null) {
+                        continue;
+                    }
+                    if (top.completeEmptyFrame) {
+                        callback.onPlayEnd();
+                        stopPlay();
+                        break;
+                    }
+                    audioTrack.write(top.buffer, 0, top.size, AudioTrack.WRITE_BLOCKING);
+                }
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+            stopTrackQuietly();
+            Logger.i("AudioPlayer play finish");
+        }
+    }
+
+    public interface AudioPlayerCallback{
+        void onPlayStart();
+        void onPlayEnd();
+        void onPlayError(int code, String msg);
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/AudioFrame.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/AudioFrame.java
new file mode 100644
index 0000000..b60f374
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/AudioFrame.java
@@ -0,0 +1,32 @@
+package ai.guiji.duix.sdk.client.bean;
+
+/**
+ * Queue entry used by the audio player: either a chunk of PCM bytes or an end-of-stream marker.
+ */
+public class AudioFrame {
+
+    /** {@code true} for a marker frame that carries no audio data. */
+    public boolean completeEmptyFrame;
+
+    /** Backing byte array of a data frame; {@code null} for a marker frame. */
+    public byte[] buffer;
+
+    /** Number of valid bytes in {@link #buffer}. */
+    public int size;
+
+    /** Creates a data frame. */
+    public AudioFrame(byte[] buffer, int size){
+        this.size = size;
+        this.buffer = buffer;
+    }
+
+    /** Creates a frame carrying only the {@code completeEmptyFrame} flag. */
+    public AudioFrame(boolean completeEmptyFrame){
+        this.completeEmptyFrame = completeEmptyFrame;
+    }
+
+    @Override
+    public String toString() {
+        return "AudioFrame{completeEmptyFrame=" + completeEmptyFrame + "}";
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/ImageFrame.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/ImageFrame.java
new file mode 100644
index 0000000..d814832
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/bean/ImageFrame.java
@@ -0,0 +1,27 @@
+package ai.guiji.duix.sdk.client.bean;
+
+
+import java.nio.ByteBuffer;
+
+/**
+ * Plain holder for one video frame: an image buffer, a mask buffer and the frame dimensions.
+ */
+public class ImageFrame {
+
+    public int width;
+    public int height;
+
+    public ByteBuffer rawBuffer;
+    public ByteBuffer maskBuffer;
+
+    /**
+     * Stores the given references and dimensions as-is; the buffers are not copied.
+     */
+    public ImageFrame(ByteBuffer rawBuffer, ByteBuffer maskBuffer, int width, int height) {
+        this.width = width;
+        this.height = height;
+        this.rawBuffer = rawBuffer;
+        this.maskBuffer = maskBuffer;
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/DownloadZipService.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/DownloadZipService.java
new file mode 100644
index 0000000..ed0ee50
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/DownloadZipService.java
@@ -0,0 +1,134 @@
+package ai.guiji.duix.sdk.client.net;
+
+import android.content.Context;
+
+import java.io.File;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+import ai.guiji.duix.sdk.client.util.Logger;
+import ai.guiji.duix.sdk.client.util.MD5Util;
+import ai.guiji.duix.sdk.client.util.ZipUtil;
+
+
+/**
+ * Downloads a zip archive into the external cache directory and unpacks it next to the
+ * requested target directory, reporting progress and the outcome through {@link Callback}.
+ */
+public class DownloadZipService {
+
+    /** Progress and result listener; invoked on the worker thread. */
+    public interface Callback {
+        void onDownloadProgress(long current, long total);
+
+        void onUnzipProgress(long current, long total);
+
+        void onComplete(File dirFile);
+
+        void onError(int code, String msg);
+    }
+
+    /**
+     * Downloads {@code url} (unless the archive is already cached) and unzips it into the parent
+     * of {@code targetDirFile} on a background thread.
+     *
+     * <p>On success an empty tag directory {@code tmp/<target name>} is created beside the target;
+     * the install checks elsewhere in the SDK look for that tag.
+     *
+     * @param context       used to resolve the external cache directory
+     * @param url           archive location; its MD5 is used as the cache file name
+     * @param targetDirFile directory the archive is expected to create when unpacked
+     * @param callback      receives progress, completion or an error code (-1000 download,
+     *                      -1001 unzip, -1002 target directory missing after unzip)
+     * @param deleteZip     whether to delete the cached archive after a successful unzip
+     */
+    public static void downloadAndUnzip(Context context, String url, File targetDirFile, Callback callback, boolean deleteZip) {
+        ExecutorService executor = Executors.newSingleThreadExecutor();
+        executor.execute(() -> runDownloadAndUnzip(context, url, targetDirFile, callback, deleteZip));
+        // Already-submitted work still runs; this only lets the worker thread exit afterwards
+        // instead of leaking one idle thread per call.
+        executor.shutdown();
+    }
+
+    /** Performs the download / unzip sequence synchronously on the calling thread. */
+    private static void runDownloadAndUnzip(Context context, String url, File targetDirFile, Callback callback, boolean deleteZip) {
+        File cacheDir = context.getExternalCacheDir();
+        if (cacheDir == null) {
+            // Shared storage is currently unavailable, so there is nowhere to download to.
+            callback.onError(-1000, "external cache dir is not available");
+            return;
+        }
+        if (!cacheDir.exists()) {
+            cacheDir.mkdirs();
+        }
+        File zipFile = new File(cacheDir, MD5Util.string2MD5(url));
+        if (!zipFile.exists()) {
+            Logger.d("zip not found, try download.");
+            boolean downloaded = new FileDownloader(url, zipFile.getAbsolutePath(), callback::onDownloadProgress).download();
+            Logger.d("download file done.");
+            if (!downloaded) {
+                callback.onError(-1000, "zip file download error");
+                return;
+            }
+        } else {
+            Logger.d( "found cache zip file.");
+        }
+
+        Logger.d("try unzip file.");
+        File targetParentDir = targetDirFile.getParentFile();
+        if (!targetParentDir.exists()) {
+            targetParentDir.mkdirs();
+        }
+        File tagDir = new File(targetParentDir, "tmp/" + targetDirFile.getName());
+        // Remove the tag of a previous install first so that a failed unzip is not reported
+        // as an installed model.
+        if (tagDir.exists() && !tagDir.delete()) {
+            Logger.e("delete old tag dir fail");
+        }
+        if (targetDirFile.exists()) {
+            Logger.d("delete old files.");
+            deleteContents(targetDirFile);
+        }
+
+        if (!ZipUtil.unzip(zipFile.getAbsolutePath(), targetParentDir.getAbsolutePath(), callback::onUnzipProgress)) {
+            callback.onError(-1001, "unzip file error!");
+            zipFile.delete();
+            return;
+        }
+        Logger.d("unzip file complete.");
+        if (!targetDirFile.exists()) {
+            callback.onError(-1002,"unzip dir not found!");
+            return;
+        }
+        // mkdirs() also returns false when the directory already exists, so check first.
+        if (!tagDir.exists() && !tagDir.mkdirs()) {
+            Logger.e("make tmp dir fail");
+        }
+        if (deleteZip && zipFile.exists()) {
+            zipFile.delete();
+        }
+        callback.onComplete(targetDirFile);
+    }
+
+    /**
+     * Recursively deletes everything inside {@code dir}, leaving {@code dir} itself in place.
+     *
+     * @return {@code true} if every entry was deleted
+     */
+    public static boolean deleteContents(File dir) {
+        File[] files = dir.listFiles();
+        boolean success = true;
+        if (files != null) {
+            for (File file : files) {
+                if (file.isDirectory()) {
+                    success &= deleteContents(file);
+                }
+                if (!file.delete()) {
+                    success = false;
+                }
+            }
+        }
+        return success;
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/FileDownloader.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/FileDownloader.java
new file mode 100644
index 0000000..4f7d8e9
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/net/FileDownloader.java
@@ -0,0 +1,94 @@
+package ai.guiji.duix.sdk.client.net;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+
+import ai.guiji.duix.sdk.client.util.Logger;
+
+
+/**
+ * Downloads one HTTP(S) resource to a local file, writing to {@code <path>.tmp} first and
+ * renaming it to {@code path} only after the transfer has completed.
+ */
+public class FileDownloader {
+
+    private static final int TIMEOUT_MS = 15000;
+    private static final int BUFFER_SIZE = 8192;
+
+    private final String url;
+    private final String path;
+
+    private final Callback callback;
+
+    /**
+     * @param url      address to download from
+     * @param path     destination file path
+     * @param callback optional progress listener; may be {@code null}
+     */
+    public FileDownloader(String url, String path, Callback callback){
+        this.url = url;
+        this.path = path;
+        this.callback = callback;
+    }
+
+    /**
+     * Performs the download on the calling thread.
+     *
+     * @return {@code true} if the response was HTTP 200 and the file was stored at {@code path};
+     *         {@code false} on any failure, in which case the temporary file is removed
+     */
+    public boolean download(){
+        File tmpFile = new File(path + ".tmp");
+        HttpURLConnection conn = null;
+        boolean success = false;
+        try {
+            conn = (HttpURLConnection) new URL(url).openConnection();
+            conn.setConnectTimeout(TIMEOUT_MS);
+            conn.setReadTimeout(TIMEOUT_MS);
+            conn.connect();
+            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
+                return false;
+            }
+            long contentLength = conn.getContentLengthLong();
+            File parent = tmpFile.getParentFile();
+            if (parent != null && !parent.exists() && !parent.mkdirs()) {
+                return false;
+            }
+            long downloadLength = 0;
+            byte[] data = new byte[BUFFER_SIZE];
+            // try-with-resources closes both streams even when the transfer fails midway.
+            try (InputStream is = conn.getInputStream();
+                 FileOutputStream fileOutputStream = new FileOutputStream(tmpFile)) {
+                int len;
+                while ((len = is.read(data)) != -1) {
+                    fileOutputStream.write(data, 0, len);
+                    downloadLength += len;
+                    if (callback != null){
+                        callback.onProgress(downloadLength , contentLength);
+                    }
+                }
+            }
+            success = tmpFile.renameTo(new File(path));
+        } catch (Exception e){
+            Logger.d("download error:" + e);
+        } finally {
+            if (conn != null) {
+                conn.disconnect();
+            }
+            // Do not leave a partial download behind.
+            if (!success && tmpFile.exists()) {
+                tmpFile.delete();
+            }
+        }
+        return success;
+    }
+
+    /** Receives the number of bytes downloaded so far and the reported content length. */
+    public interface Callback {
+        void onProgress(long current, long total);
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXRenderer.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXRenderer.java
new file mode 100644
index 0000000..82b97b9
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXRenderer.java
@@ -0,0 +1,110 @@
+package ai.guiji.duix.sdk.client.render;
+
+import android.content.Context;
+import android.opengl.GLES20;
+import android.util.Log;
+
+import javax.microedition.khronos.egl.EGLConfig;
+import javax.microedition.khronos.opengles.GL10;
+
+import ai.guiji.duix.sdk.client.bean.ImageFrame;
+import ai.guiji.duix.sdk.client.util.OpenGLUtil;
+
+/**
+ * {@link DUIXTextureView.Renderer} that draws the most recently delivered {@link ImageFrame}.
+ * <p>
+ * Frames arrive through {@link #onVideoFrame(ImageFrame)}; all GL calls are issued from the
+ * renderer callbacks ({@code onSurfaceCreated}, {@code onSurfaceChanged}, {@code onDrawFrame},
+ * {@code onSurfaceDestroyed}).
+ */
+public final class DUIXRenderer implements DUIXTextureView.Renderer, RenderSink {
+
+    private static final String TAG = "DUIXRenderer";
+
+    /** Scale type that selects {@code OpenGLUtil.changeMvpMatrixCrop}; this is the default. */
+    public static final int SCALE_TYPE_CROP = 0;
+    /** Scale type that selects {@code OpenGLUtil.changeMvpMatrixInside}. */
+    public static final int SCALE_TYPE_INSIDE = 1;
+
+    DUIXTextureView glTextureView;
+
+    private int mViewWidth = 0;
+    private int mViewHeight = 0;
+    private int mVideoWidth = 0;
+    private int mVideoHeight = 0;
+
+    protected float[] mMvpMatrix = new float[16];           // scaling matrix handed to the drawer
+
+    // Volatile because the frame may be published by a thread other than the one that runs
+    // onDrawFrame (the view documents that its renderer is called on a separate thread).
+    private volatile ImageFrame pendingFrame;
+
+    private ImageDrawer mImageDrawer;
+
+    private int scaleType = SCALE_TYPE_CROP;
+
+    /**
+     * @param context       not used by the renderer; kept so the constructor signature is unchanged
+     *                      (no reference is retained, so the renderer cannot leak it)
+     * @param glTextureView view whose {@code requestRender()} is invoked for every new frame
+     */
+    public DUIXRenderer(Context context, DUIXTextureView glTextureView) {
+        this.glTextureView = glTextureView;
+    }
+
+    /**
+     * Stores the frame for the next draw, recomputes the scale matrix when the frame size
+     * changed, and asks the view to render.
+     *
+     * @param imageFrame the frame to show; {@code null} is ignored and the previous frame is kept
+     */
+    @Override
+    public void onVideoFrame(ImageFrame imageFrame) {
+        if (imageFrame == null) {
+            // Previously this dereferenced the null frame and crashed the caller's thread.
+            return;
+        }
+        pendingFrame = imageFrame;
+        if (mVideoWidth != imageFrame.width || mVideoHeight != imageFrame.height) {
+            mVideoWidth = imageFrame.width;
+            mVideoHeight = imageFrame.height;
+            tryChangeScale();
+        }
+        glTextureView.requestRender();
+    }
+
+    /**
+     * Selects how frames are fitted into the view.
+     *
+     * @param scaleType {@link #SCALE_TYPE_CROP} or {@link #SCALE_TYPE_INSIDE}; any value other than
+     *                  {@link #SCALE_TYPE_CROP} is treated as {@link #SCALE_TYPE_INSIDE}
+     */
+    public void setScaleType(int scaleType) {
+        this.scaleType = scaleType;
+        // Recompute immediately; otherwise the new type would only take effect after the next
+        // view or frame size change.
+        tryChangeScale();
+    }
+
+    @Override
+    public void onSurfaceCreated(GL10 gl, EGLConfig config) {
+        Log.d(TAG, "onSurfaceCreated");
+        mImageDrawer = new ImageDrawer();
+    }
+
+    @Override
+    public void onSurfaceChanged(GL10 gl, int width, int height) {
+        Log.d(TAG, "onSurfaceChanged size: " + " width: " + width + " height: " + height);
+        GLES20.glViewport(0, 0, width, height);
+        mViewWidth = width;
+        mViewHeight = height;
+        tryChangeScale();
+    }
+
+    /** Recomputes {@link #mMvpMatrix} once both the view size and the frame size are known. */
+    private void tryChangeScale() {
+        if (mViewWidth <= 0 || mViewHeight <= 0 || mVideoWidth <= 0 || mVideoHeight <= 0) {
+            return;
+        }
+        if (scaleType == SCALE_TYPE_CROP) {
+            mMvpMatrix = OpenGLUtil.changeMvpMatrixCrop(mViewWidth, mViewHeight, mVideoWidth, mVideoHeight);
+        } else {
+            mMvpMatrix = OpenGLUtil.changeMvpMatrixInside(mViewWidth, mViewHeight, mVideoWidth, mVideoHeight);
+        }
+    }
+
+    @Override
+    public void onDrawFrame(GL10 gl) {
+        // The clear colour is GL state that glClear consumes, so it is set before clearing.
+        GLES20.glClearColor(0, 0, 0, 0);
+        GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT | GLES20.GL_DEPTH_BUFFER_BIT);
+        // Blending has to be enabled, otherwise transparent frames are drawn incorrectly.
+        GLES20.glEnable(GLES20.GL_BLEND);
+        // Separate blend functions for RGB and alpha: RGB is blended with the source alpha as usual,
+        // while the resulting alpha is taken from the source alpha only. (A plain GL_ONE/GL_ZERO
+        // blend left a white fringe on semi-transparent pixels.)
+        GLES20.glBlendFuncSeparate(GLES20.GL_SRC_ALPHA, GLES20.GL_ONE_MINUS_SRC_ALPHA, GLES20.GL_ONE, GLES20.GL_ZERO);
+
+        // Read the field once so the null check and the draw call see the same frame.
+        ImageFrame frame = pendingFrame;
+        if (frame != null && mImageDrawer != null) {
+            mImageDrawer.draw(frame, mMvpMatrix);
+        }
+    }
+
+    @Override
+    public void onSurfaceDestroyed(GL10 gl) {
+        Log.d(TAG, "onSurfaceDestroyed");
+        if (mImageDrawer != null) {
+            mImageDrawer.release();
+            // Drop the reference so a released drawer is never used for drawing again;
+            // onSurfaceCreated installs a fresh one.
+            mImageDrawer = null;
+        }
+    }
+
+    /** Currently a no-op; the drawer is released in {@link #onSurfaceDestroyed(GL10)}. */
+    public void release() {
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXTextureView.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXTextureView.java
new file mode 100644
index 0000000..dae8213
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/DUIXTextureView.java
@@ -0,0 +1,1832 @@
+package ai.guiji.duix.sdk.client.render;
+
+import android.content.Context;
+import android.graphics.SurfaceTexture;
+import android.opengl.GLDebugHelper;
+import android.util.AttributeSet;
+import android.util.Log;
+import android.view.TextureView;
+import android.view.View;
+
+import java.io.Writer;
+import java.lang.ref.WeakReference;
+import java.util.ArrayList;
+
+import javax.microedition.khronos.egl.EGL10;
+import javax.microedition.khronos.egl.EGL11;
+import javax.microedition.khronos.egl.EGLConfig;
+import javax.microedition.khronos.egl.EGLContext;
+import javax.microedition.khronos.egl.EGLDisplay;
+import javax.microedition.khronos.egl.EGLSurface;
+import javax.microedition.khronos.opengles.GL;
+import javax.microedition.khronos.opengles.GL10;
+
+import ai.guiji.duix.sdk.client.BuildConfig;
+
+
+public class DUIXTextureView
+        extends TextureView
+        implements TextureView.SurfaceTextureListener,
+        View.OnLayoutChangeListener {
+
+    private final static boolean DEBUG = false;
+
+    private final static String TAG = "DUIXTextureView";
+    private final static boolean LOG_ATTACH_DETACH = DEBUG;
+    private final static boolean LOG_THREADS = DEBUG;
+    private final static boolean LOG_PAUSE_RESUME = DEBUG;
+    private final static boolean LOG_SURFACE = DEBUG;
+    private final static boolean LOG_RENDERER = DEBUG;
+    private final static boolean LOG_RENDERER_DRAW_FRAME = DEBUG;
+    private final static boolean LOG_EGL = DEBUG;
+    /**
+     * The renderer only renders
+     * when the surface is created, or when {@link #requestRender} is called.
+     *
+     * @see #getRenderMode()
+     * @see #setRenderMode(int)
+     * @see #requestRender()
+     */
+    public final static int RENDERMODE_WHEN_DIRTY = 0;
+    /**
+     * The renderer is called
+     * continuously to re-render the scene.
+     *
+     * @see #getRenderMode()
+     * @see #setRenderMode(int)
+     */
+    public final static int RENDERMODE_CONTINUOUSLY = 1;
+
+    /**
+     * Check glError() after every GL call and throw an exception if glError indicates
+     * that an error has occurred. This can be used to help track down which OpenGL ES call
+     * is causing an error.
+     *
+     * @see #getDebugFlags
+     * @see #setDebugFlags
+     */
+    public final static int DEBUG_CHECK_GL_ERROR = 1;
+
+    /**
+     * Log GL calls to the system log at "verbose" level with tag "DUIXTextureView".
+     *
+     * @see #getDebugFlags
+     * @see #setDebugFlags
+     */
+    public final static int DEBUG_LOG_GL_CALLS = 2;
+
+    /**
+     * Standard View constructor. In order to render something, you
+     * must call {@link #setRenderer} to register a renderer.
+     */
+    public DUIXTextureView(Context context) {
+        super(context);
+        // Register the SurfaceTexture callbacks as part of construction.
+        this.init();
+    }
+
+    /**
+     * Standard View constructor. In order to render something, you
+     * must call {@link #setRenderer} to register a renderer.
+     */
+    public DUIXTextureView(Context context, AttributeSet attrs) {
+        super(context, attrs);
+        // Register the SurfaceTexture callbacks as part of construction.
+        this.init();
+    }
+
+    @Override
+    protected void finalize() throws Throwable {
+        try {
+            // A view that was never attached to a window never goes through
+            // onDetachedFromWindow(), so its GL thread may still be running here.
+            final boolean hasThread = (mGLThread != null);
+            if (hasThread) {
+                mGLThread.requestExitAndWait();
+            }
+        } finally {
+            super.finalize();
+        }
+    }
+
+    private void init() {
+        // The view is its own SurfaceTextureListener; the callbacks forward to the GL thread.
+        this.setSurfaceTextureListener(this);
+    }
+
+//    /**
+//     * 设置监听器，上报 DUIXTextureView 内部的错误
+//     */
+//    private IMonitor monitor;
+//    public void setMonitor(IMonitor monitor) {
+//        this.monitor = monitor;
+//    }
+//
+//    private void sendMonitor(boolean status, String info) {
+//        if (monitor != null) {
+//            monitor.monitor(status, "unknown", 0, 0, info);
+//        }
+//    }
+
+    /**
+     * Set the glWrapper. If the glWrapper is not null, its
+     * {@link GLWrapper#wrap(GL)} method is called
+     * whenever a surface is created. A GLWrapper can be used to wrap
+     * the GL object that's passed to the renderer. Wrapping a GL
+     * object enables examining and modifying the behavior of the
+     * GL calls made by the renderer.
+     * <p>
+     * Wrapping is typically used for debugging purposes.
+     * <p>
+     * The default value is null.
+     * @param glWrapper the new GLWrapper
+     */
+    public void setGLWrapper(GLWrapper glWrapper) {
+        this.mGLWrapper = glWrapper;
+    }
+
+    /**
+     * Set the debug flags to a new value. The value is
+     * constructed by OR-together zero or more
+     * of the DEBUG_CHECK_* constants. The debug flags take effect
+     * whenever a surface is created. The default value is zero.
+     * @param debugFlags the new debug flags
+     * @see #DEBUG_CHECK_GL_ERROR
+     * @see #DEBUG_LOG_GL_CALLS
+     */
+    public void setDebugFlags(int debugFlags) {
+        this.mDebugFlags = debugFlags;
+    }
+
+    /**
+     * Get the current value of the debug flags.
+     * @return the current value of the debug flags.
+     */
+    public int getDebugFlags() {
+        return this.mDebugFlags;
+    }
+
+    /**
+     * Control whether the EGL context is preserved when the DUIXTextureView is paused and
+     * resumed.
+     * <p>
+     * If set to true, then the EGL context may be preserved when the DUIXTextureView is paused.
+     * Whether the EGL context is actually preserved or not depends upon whether the
+     * Android device that the program is running on can support an arbitrary number of EGL
+     * contexts or not. Devices that can only support a limited number of EGL contexts must
+     * release the  EGL context in order to allow multiple applications to share the GPU.
+     * <p>
+     * If set to false, the EGL context will be released when the DUIXTextureView is paused,
+     * and recreated when the DUIXTextureView is resumed.
+     * <p>
+     *
+     * The default is false.
+     *
+     * @param preserveOnPause preserve the EGL context when paused
+     */
+    public void setPreserveEGLContextOnPause(boolean preserveOnPause) {
+        this.mPreserveEGLContextOnPause = preserveOnPause;
+    }
+
+    /**
+     * @return true if the EGL context will be preserved when paused
+     */
+    public boolean getPreserveEGLContextOnPause() {
+        return this.mPreserveEGLContextOnPause;
+    }
+
+    /**
+     * Set the renderer associated with this view. Also starts the thread that
+     * will call the renderer, which in turn causes the rendering to start.
+     * <p>This method should be called once and only once in the life-cycle of
+     * a DUIXTextureView.
+     * <p>The following DUIXTextureView methods can only be called <em>before</em>
+     * setRenderer is called:
+     * <ul>
+     * <li>{@link #setEGLConfigChooser(boolean)}
+     * <li>{@link #setEGLConfigChooser(EGLConfigChooser)}
+     * <li>{@link #setEGLConfigChooser(int, int, int, int, int, int)}
+     * </ul>
+     * <p>
+     * The following DUIXTextureView methods can only be called <em>after</em>
+     * setRenderer is called:
+     * <ul>
+     * <li>{@link #getRenderMode()}
+     * <li>{@link #onPause()}
+     * <li>{@link #onResume()}
+     * <li>{@link #queueEvent(Runnable)}
+     * <li>{@link #requestRender()}
+     * <li>{@link #setRenderMode(int)}
+     * </ul>
+     *
+     * @param renderer the renderer to use to perform OpenGL drawing.
+     */
+    public void setRenderer(Renderer renderer) {
+        checkRenderThreadState();
+
+        // Fall back to the default EGL helpers for anything the client did not install.
+        if (null == mEGLConfigChooser) {
+            mEGLConfigChooser = new SimpleEGLConfigChooser(true);
+        }
+        if (null == mEGLContextFactory) {
+            mEGLContextFactory = new DefaultContextFactory();
+        }
+        if (null == mEGLWindowSurfaceFactory) {
+            mEGLWindowSurfaceFactory = new DefaultWindowSurfaceFactory();
+        }
+
+        mRenderer = renderer;
+
+        // Create and start the thread that will invoke the renderer.
+        final GLThread thread = new GLThread(mThisWeakRef);
+        mGLThread = thread;
+        thread.start();
+    }
+
+    /**
+     * Install a custom EGLContextFactory.
+     * <p>If this method is
+     * called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>
+     * If this method is not called, then by default
+     * a context will be created with no shared context and
+     * with a null attribute list.
+     */
+    public void setEGLContextFactory(EGLContextFactory factory) {
+        this.checkRenderThreadState();
+        this.mEGLContextFactory = factory;
+    }
+
+    /**
+     * Install a custom EGLWindowSurfaceFactory.
+     * <p>If this method is
+     * called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>
+     * If this method is not called, then by default
+     * a window surface will be created with a null attribute list.
+     */
+    public void setEGLWindowSurfaceFactory(EGLWindowSurfaceFactory factory) {
+        this.checkRenderThreadState();
+        this.mEGLWindowSurfaceFactory = factory;
+    }
+
+    /**
+     * Install a custom EGLConfigChooser.
+     * <p>If this method is
+     * called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>
+     * If no setEGLConfigChooser method is called, then by default the
+     * view will choose an EGLConfig that is compatible with the current
+     * android.view.Surface, with a depth buffer depth of
+     * at least 16 bits.
+     * @param configChooser
+     */
+    public void setEGLConfigChooser(EGLConfigChooser configChooser) {
+        this.checkRenderThreadState();
+        this.mEGLConfigChooser = configChooser;
+    }
+
+    /**
+     * Install a config chooser which will choose a config
+     * as close to 16-bit RGB as possible, with or without an optional depth
+     * buffer as close to 16-bits as possible.
+     * <p>If this method is
+     * called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>
+     * If no setEGLConfigChooser method is called, then by default the
+     * view will choose an RGB_888 surface with a depth buffer depth of
+     * at least 16 bits.
+     *
+     * @param needDepth
+     */
+    public void setEGLConfigChooser(boolean needDepth) {
+        final EGLConfigChooser chooser = new SimpleEGLConfigChooser(needDepth);
+        setEGLConfigChooser(chooser);
+    }
+
+    /**
+     * Install a config chooser which will choose a config
+     * with at least the specified depthSize and stencilSize,
+     * and exactly the specified redSize, greenSize, blueSize and alphaSize.
+     * <p>If this method is
+     * called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>
+     * If no setEGLConfigChooser method is called, then by default the
+     * view will choose an RGB_888 surface with a depth buffer depth of
+     * at least 16 bits.
+     *
+     */
+    public void setEGLConfigChooser(int redSize, int greenSize, int blueSize,
+                                    int alphaSize, int depthSize, int stencilSize) {
+        final EGLConfigChooser chooser = new ComponentSizeChooser(
+                redSize, greenSize, blueSize, alphaSize, depthSize, stencilSize);
+        setEGLConfigChooser(chooser);
+    }
+
+    /**
+     * Inform the default EGLContextFactory and default EGLConfigChooser
+     * which EGLContext client version to pick.
+     * <p>Use this method to create an OpenGL ES 2.0-compatible context.
+     * Example:
+     * <pre class="prettyprint">
+     *     public MyView(Context context) {
+     *         super(context);
+     *         setEGLContextClientVersion(2); // Pick an OpenGL ES 2.0 context.
+     *         setRenderer(new MyRenderer());
+     *     }
+     * </pre>
+     * <p>Note: Activities which require OpenGL ES 2.0 should indicate this by
+     * setting &lt;uses-feature android:glEsVersion="0x00020000" /&gt; in the activity's
+     * AndroidManifest.xml file.
+     * <p>If this method is called, it must be called before {@link #setRenderer(Renderer)}
+     * is called.
+     * <p>This method only affects the behavior of the default EGLContextFactory and the
+     * default EGLConfigChooser. If
+     * {@link #setEGLContextFactory(EGLContextFactory)} has been called, then the supplied
+     * EGLContextFactory is responsible for creating an OpenGL ES 2.0-compatible context.
+     * If
+     * {@link #setEGLConfigChooser(EGLConfigChooser)} has been called, then the supplied
+     * EGLConfigChooser is responsible for choosing an OpenGL ES 2.0-compatible config.
+     * @param version The EGLContext client version to choose. Use 2 for OpenGL ES 2.0
+     */
+    public void setEGLContextClientVersion(int version) {
+        this.checkRenderThreadState();
+        this.mEGLContextClientVersion = version;
+    }
+
+    /**
+     * Set the rendering mode. When renderMode is
+     * RENDERMODE_CONTINUOUSLY, the renderer is called
+     * repeatedly to re-render the scene. When renderMode
+     * is RENDERMODE_WHEN_DIRTY, the renderer only renders when the surface
+     * is created, or when {@link #requestRender} is called. Defaults to RENDERMODE_CONTINUOUSLY.
+     * <p>
+     * Using RENDERMODE_WHEN_DIRTY can improve battery life and overall system performance
+     * by allowing the GPU and CPU to idle when the view does not need to be updated.
+     * <p>
+     * This method can only be called after {@link #setRenderer(Renderer)}
+     *
+     * @param renderMode one of the RENDERMODE_X constants
+     * @see #RENDERMODE_CONTINUOUSLY
+     * @see #RENDERMODE_WHEN_DIRTY
+     */
+    public void setRenderMode(int renderMode) {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        this.mGLThread.setRenderMode(renderMode);
+    }
+
+    /**
+     * Get the current rendering mode. May be called
+     * from any thread. Must not be called before a renderer has been set.
+     * @return the current rendering mode.
+     * @see #RENDERMODE_CONTINUOUSLY
+     * @see #RENDERMODE_WHEN_DIRTY
+     */
+    public int getRenderMode() {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        final int mode = mGLThread.getRenderMode();
+        return mode;
+    }
+
+    /**
+     * Request that the renderer render a frame.
+     * This method is typically used when the render mode has been set to
+     * {@link #RENDERMODE_WHEN_DIRTY}, so that frames are only rendered on demand.
+     * May be called
+     * from any thread. Must not be called before a renderer has been set.
+     */
+    public void requestRender() {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        this.mGLThread.requestRender();
+    }
+
+    /**
+     * Notifies the GL thread that the surface has been created. Called from
+     * {@link #onSurfaceTextureAvailable}; not normally called or subclassed by clients of DUIXTextureView.
+     */
+    public void surfaceCreated(SurfaceTexture texture) {
+        // The texture argument is not used; only the notification is forwarded.
+        this.mGLThread.surfaceCreated();
+    }
+
+    /**
+     * Notifies the GL thread that the surface is being destroyed. Called from
+     * {@link #onSurfaceTextureDestroyed}; not normally called or subclassed by clients of DUIXTextureView.
+     */
+    public void surfaceDestroyed(SurfaceTexture texture) {
+        // Surface will be destroyed when we return
+        this.mGLThread.surfaceDestroyed();
+    }
+
+    /**
+     * Forwards a new surface size to the GL thread. Called from the SurfaceTexture callbacks and
+     * from {@link #onLayoutChange}; not normally called or subclassed by clients of DUIXTextureView.
+     */
+    public void surfaceChanged(SurfaceTexture texture, int format, int w, int h) {
+        // Only the new size is forwarded; texture and format are not used.
+        this.mGLThread.onWindowResize(w, h);
+    }
+
+    /**
+     * Inform the view that the activity is paused. The owner of this view must
+     * call this method when the activity is paused. Calling this method will
+     * pause the rendering thread.
+     * Must not be called before a renderer has been set.
+     */
+    public void onPause() {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        this.mGLThread.onPause();
+    }
+
+    /**
+     * Inform the view that the activity is resumed. The owner of this view must
+     * call this method when the activity is resumed. Calling this method will
+     * recreate the OpenGL display and resume the rendering
+     * thread.
+     * Must not be called before a renderer has been set.
+     */
+    public void onResume() {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        this.mGLThread.onResume();
+    }
+
+    /**
+     * Queue a runnable to be run on the GL rendering thread. This can be used
+     * to communicate with the Renderer on the rendering thread.
+     * Must not be called before a renderer has been set.
+     * @param r the runnable to be run on the GL rendering thread.
+     */
+    public void queueEvent(Runnable r) {
+        // Delegates to the GL thread, which is assigned in setRenderer().
+        this.mGLThread.queueEvent(r);
+    }
+
+    /**
+     * This method is used as part of the View class and is not normally
+     * called or subclassed by clients of DUIXTextureView.
+     */
+    @Override
+    protected void onAttachedToWindow() {
+        super.onAttachedToWindow();
+        if (LOG_ATTACH_DETACH) {
+            Log.d(TAG, "onAttachedToWindow reattach =" + mDetached);
+        }
+        if (mDetached && mRenderer != null) {
+            // Re-attach: the old thread was stopped in onDetachedFromWindow(), so start a new
+            // one and carry the previous render mode over to it.
+            final int previousMode =
+                    (mGLThread == null) ? RENDERMODE_CONTINUOUSLY : mGLThread.getRenderMode();
+            final GLThread replacement = new GLThread(mThisWeakRef);
+            mGLThread = replacement;
+            if (previousMode != RENDERMODE_CONTINUOUSLY) {
+                replacement.setRenderMode(previousMode);
+            }
+            replacement.start();
+        }
+        mDetached = false;
+    }
+
+    /**
+     * This method is used as part of the View class and is not normally
+     * called or subclassed by clients of DUIXTextureView.
+     * Must not be called before a renderer has been set.
+     */
+    @Override
+    protected void onDetachedFromWindow() {
+        if (LOG_ATTACH_DETACH) {
+            Log.d(TAG, "onDetachedFromWindow");
+        }
+        // Stop the render thread, if there is one, before the view leaves the window.
+        // onAttachedToWindow() starts a replacement when mDetached is set.
+        final boolean hasThread = (mGLThread != null);
+        if (hasThread) {
+            mGLThread.requestExitAndWait();
+        }
+        mDetached = true;
+        super.onDetachedFromWindow();
+    }
+
+    public void onLayoutChange(View v, int left, int top, int right, int bottom,
+                               int oldLeft, int oldTop, int oldRight, int oldBottom) {
+        // Report the new layout bounds as the surface size.
+        final int width = right - left;
+        final int height = bottom - top;
+        surfaceChanged(getSurfaceTexture(), 0, width, height);
+    }
+
+    public void onSurfaceTextureAvailable(SurfaceTexture surface, int width, int height) {
+        // Report creation first, then the initial size.
+        this.surfaceCreated(surface);
+        this.surfaceChanged(surface, 0, width, height);
+    }
+
+    public void onSurfaceTextureSizeChanged(SurfaceTexture surface, int width, int height) {
+        this.surfaceChanged(surface, 0, width, height);
+    }
+
+    public boolean onSurfaceTextureDestroyed(SurfaceTexture surface) {
+        // Tell the GL thread first; the method then always reports true to the TextureView.
+        this.surfaceDestroyed(surface);
+        return true;
+    }
+
+    public void onSurfaceTextureUpdated(SurfaceTexture surface) {
+        // No work is done on texture updates; the render request below is disabled.
+//        requestRender();
+    }
+
+    // ----------------------------------------------------------------------
+
+    /**
+     * An interface used to wrap a GL interface.
+     * <p>Typically
+     * used for implementing debugging and tracing on top of the default
+     * GL interface. You would typically use this by creating your own class
+     * that implemented all the GL methods by delegating to another GL instance.
+     * Then you could add your own behavior before or after calling the
+     * delegate. All the GLWrapper would do was instantiate and return the
+     * wrapper GL instance:
+     * <pre class="prettyprint">
+     * class MyGLWrapper implements GLWrapper {
+     *     GL wrap(GL gl) {
+     *         return new MyGLImplementation(gl);
+     *     }
+     *     static class MyGLImplementation implements GL,GL10,GL11,... {
+     *         ...
+     *     }
+     * }
+     * </pre>
+     * @see #setGLWrapper(GLWrapper)
+     */
+    public interface GLWrapper {
+        /**
+         * Wraps a GL interface in another GL interface.
+         * @param gl the GL interface that is to be wrapped.
+         * @return either the input argument or another GL object that wraps the input argument.
+         */
+        GL wrap(GL gl);
+    }
+
+    /**
+     * A generic renderer interface.
+     * <p>
+     * The renderer is responsible for making OpenGL calls to render a frame.
+     * <p>
+     * DUIXTextureView clients typically create their own classes that implement
+     * this interface, and then call {@link DUIXTextureView#setRenderer} to
+     * register the renderer with the DUIXTextureView.
+     * <p>
+     *
+     * <div class="special reference">
+     * <h3>Developer Guides</h3>
+     * <p>For more information about how to use OpenGL, read the
+     * <a href="{@docRoot}guide/topics/graphics/opengl.html">OpenGL</a> developer guide.</p>
+     * </div>
+     *
+     * <h3>Threading</h3>
+     * The renderer will be called on a separate thread, so that rendering
+     * performance is decoupled from the UI thread. Clients typically need to
+     * communicate with the renderer from the UI thread, because that's where
+     * input events are received. Clients can communicate using any of the
+     * standard Java techniques for cross-thread communication, or they can
+     * use the {@link DUIXTextureView#queueEvent(Runnable)} convenience method.
+     * <p>
+     * <h3>EGL Context Lost</h3>
+     * There are situations where the EGL rendering context will be lost. This
+     * typically happens when device wakes up after going to sleep. When
+     * the EGL context is lost, all OpenGL resources (such as textures) that are
+     * associated with that context will be automatically deleted. In order to
+     * keep rendering correctly, a renderer must recreate any lost resources
+     * that it still needs. The {@link #onSurfaceCreated(GL10, EGLConfig)} method
+     * is a convenient place to do this.
+     *
+     *
+     * @see #setRenderer(Renderer)
+     */
+    public interface Renderer {
+        /**
+         * Called when the surface is created or recreated.
+         * <p>
+         * Called when the rendering thread
+         * starts and whenever the EGL context is lost. The EGL context will typically
+         * be lost when the Android device awakes after going to sleep.
+         * <p>
+         * Since this method is called at the beginning of rendering, as well as
+         * every time the EGL context is lost, this method is a convenient place to put
+         * code to create resources that need to be created when the rendering
+         * starts, and that need to be recreated when the EGL context is lost.
+         * Textures are an example of a resource that you might want to create
+         * here.
+         * <p>
+         * Note that when the EGL context is lost, all OpenGL resources associated
+         * with that context will be automatically deleted. You do not need to call
+         * the corresponding "glDelete" methods such as glDeleteTextures to
+         * manually delete these lost resources.
+         * <p>
+         * @param gl the GL interface. Use <code>instanceof</code> to
+         * test if the interface supports GL11 or higher interfaces.
+         * @param config the EGLConfig of the created surface. Can be used
+         * to create matching pbuffers.
+         */
+        void onSurfaceCreated(GL10 gl, EGLConfig config);
+
+        /**
+         * Called when the surface changed size.
+         * <p>
+         * Called after the surface is created and whenever
+         * the OpenGL ES surface size changes.
+         * <p>
+         * Typically you will set your viewport here. If your camera
+         * is fixed then you could also set your projection matrix here:
+         * <pre class="prettyprint">
+         * void onSurfaceChanged(GL10 gl, int width, int height) {
+         *     gl.glViewport(0, 0, width, height);
+         *     // for a fixed camera, set the projection too
+         *     float ratio = (float) width / height;
+         *     gl.glMatrixMode(GL10.GL_PROJECTION);
+         *     gl.glLoadIdentity();
+         *     gl.glFrustumf(-ratio, ratio, -1, 1, 1, 10);
+         * }
+         * </pre>
+         * @param gl the GL interface. Use <code>instanceof</code> to
+         * test if the interface supports GL11 or higher interfaces.
+         * @param width the new width of the surface
+         * @param height the new height of the surface
+         */
+        void onSurfaceChanged(GL10 gl, int width, int height);
+
+        /**
+         * Called to draw the current frame.
+         * <p>
+         * This method is responsible for drawing the current frame.
+         * <p>
+         * The implementation of this method typically looks like this:
+         * <pre class="prettyprint">
+         * void onDrawFrame(GL10 gl) {
+         *     gl.glClear(GL10.GL_COLOR_BUFFER_BIT | GL10.GL_DEPTH_BUFFER_BIT);
+         *     //... other gl calls to render the scene ...
+         * }
+         * </pre>
+         * @param gl the GL interface. Use <code>instanceof</code> to
+         * test if the interface supports GL11 or higher interfaces.
+         */
+        void onDrawFrame(GL10 gl);
+
+        /**
+         * Called when the surface is destroyed, so that the renderer can release what it
+         * created in {@link #onSurfaceCreated(GL10, EGLConfig)}.
+         * <p>
+         * NOTE(review): the call site is in the GL thread code, which is outside this excerpt;
+         * confirm there whether the EGL context is still current when this is invoked.
+         * @param gl the GL interface.
+         */
+        void onSurfaceDestroyed(GL10 gl);
+    }
+
+    /**
+     * An interface for customizing the eglCreateContext and eglDestroyContext calls.
+     * <p>
+     * This interface must be implemented by clients wishing to call
+     * {@link DUIXTextureView#setEGLContextFactory(EGLContextFactory)}
+     */
+    public interface EGLContextFactory {
+        /**
+         * Creates the EGL context for the given display and config.
+         * @param egl the EGL10 interface to create the context with.
+         * @param display the display the context is created for.
+         * @param eglConfig the config the context is created with.
+         * @return the context produced by the factory.
+         */
+        EGLContext createContext(EGL10 egl, EGLDisplay display, EGLConfig eglConfig);
+
+        /**
+         * Destroys a context.
+         * @param egl the EGL10 interface to destroy the context with.
+         * @param display the display the context belongs to.
+         * @param context the context to destroy.
+         */
+        void destroyContext(EGL10 egl, EGLDisplay display, EGLContext context);
+    }
+
+    /**
+     * Default {@link EGLContextFactory}: creates an unshared context and requests the client
+     * version configured on the enclosing view when that version is non-zero.
+     */
+    private class DefaultContextFactory implements EGLContextFactory {
+        // Attribute key for the requested client version. A true constant, so it is declared
+        // static final instead of being a mutable per-instance field.
+        private static final int EGL_CONTEXT_CLIENT_VERSION = 0x3098;
+
+        /**
+         * Creates a context without a share context. The attribute list is only passed when a
+         * client version has been configured; otherwise {@code null} is passed.
+         */
+        public EGLContext createContext(EGL10 egl, EGLDisplay display, EGLConfig config) {
+            int[] attrib_list = {EGL_CONTEXT_CLIENT_VERSION, mEGLContextClientVersion,
+                    EGL10.EGL_NONE };
+
+            return egl.eglCreateContext(display, config, EGL10.EGL_NO_CONTEXT,
+                    mEGLContextClientVersion != 0 ? attrib_list : null);
+        }
+
+        /**
+         * Destroys the context; on failure logs the display and context and then reports the
+         * EGL error through {@code EglHelper.throwEglException}.
+         */
+        public void destroyContext(EGL10 egl, EGLDisplay display,
+                                   EGLContext context) {
+            if (!egl.eglDestroyContext(display, context)) {
+                Log.e("DefaultContextFactory", "display:" + display + " context: " + context);
+                if (LOG_THREADS) {
+                    Log.i("DefaultContextFactory", "tid=" + Thread.currentThread().getId());
+                }
+                // Name the EGL call correctly (the message used to read "eglDestroyContex").
+                EglHelper.throwEglException("eglDestroyContext", egl.eglGetError());
+            }
+        }
+    }
+
+    /**
+     * An interface for customizing the eglCreateWindowSurface and eglDestroySurface calls.
+     * <p>
+     * This interface must be implemented by clients wishing to call
+     * {@link DUIXTextureView#setEGLWindowSurfaceFactory(EGLWindowSurfaceFactory)}
+     */
+    public interface EGLWindowSurfaceFactory {
+        /**
+         * Creates the EGL window surface for the given native window.
+         * @param egl the EGL10 interface to create the surface with.
+         * @param display the display the surface is created for.
+         * @param config the config the surface is created with.
+         * @param nativeWindow the native window object the surface is created on.
+         * @return null if the surface cannot be constructed.
+         */
+        EGLSurface createWindowSurface(EGL10 egl, EGLDisplay display, EGLConfig config,
+                                       Object nativeWindow);
+
+        /**
+         * Destroys a window surface.
+         * @param egl the EGL10 interface to destroy the surface with.
+         * @param display the display the surface belongs to.
+         * @param surface the surface to destroy.
+         */
+        void destroySurface(EGL10 egl, EGLDisplay display, EGLSurface surface);
+    }
+
+    private static class DefaultWindowSurfaceFactory implements EGLWindowSurfaceFactory {
+
+        /**
+         * Creates the window surface with a null attribute list.
+         * @return the created surface, or null when eglCreateWindowSurface throws
+         * IllegalArgumentException.
+         */
+        public EGLSurface createWindowSurface(EGL10 egl, EGLDisplay display,
+                                              EGLConfig config, Object nativeWindow) {
+            try {
+                return egl.eglCreateWindowSurface(display, config, nativeWindow, null);
+            } catch (IllegalArgumentException e) {
+                // The surface flinger surface is not valid. That can happen when it has
+                // already been torn down but the application has not been told yet via
+                // SurfaceHolder.Callback.surfaceDestroyed. In theory the notification
+                // comes first; in practice it sometimes does not. See b/4588890
+                Log.e(TAG, "eglCreateWindowSurface", e);
+                return null;
+            }
+        }
+
+        /** Destroys the surface; the result of eglDestroySurface is not checked. */
+        public void destroySurface(EGL10 egl, EGLDisplay display,
+                                   EGLSurface surface) {
+            egl.eglDestroySurface(display, surface);
+        }
+    }
+
+    /**
+     * An interface for choosing an EGLConfig configuration from a list of
+     * potential configurations.
+     * <p>
+     * This interface must be implemented by clients wishing to call
+     * {@link DUIXTextureView#setEGLConfigChooser(EGLConfigChooser)}.
+     */
+    public interface EGLConfigChooser {
+        /**
+         * Chooses a configuration from the list. Implementers typically call
+         * {@link EGL10#eglChooseConfig} and iterate through the results; consult the
+         * EGL specification available from The Khronos Group to learn how to call
+         * eglChooseConfig. EglHelper.start() invokes this once EGL has been initialized.
+         * @param egl the EGL10 for the current display.
+         * @param display the current display.
+         * @return the chosen configuration.
+         */
+        EGLConfig chooseConfig(EGL10 egl, EGLDisplay display);
+    }
+
+    private abstract class BaseConfigChooser
+            implements EGLConfigChooser {
+        protected int[] mConfigSpec; // attribute list for eglChooseConfig, after filterConfigSpec
+
+        public BaseConfigChooser(int[] configSpec) {
+            mConfigSpec = filterConfigSpec(configSpec);
+        }
+
+        /**
+         * Asks EGL how many configurations match mConfigSpec, fetches them and lets the
+         * subclass pick one; throws IllegalArgumentException if any of these steps fails.
+         */
+        public EGLConfig chooseConfig(EGL10 egl, EGLDisplay display) {
+            int[] matchCount = new int[1];
+            boolean counted = egl.eglChooseConfig(display, mConfigSpec, null, 0, matchCount);
+            if (!counted) {
+                throw new IllegalArgumentException("eglChooseConfig failed");
+            }
+            int total = matchCount[0];
+            if (total <= 0) {
+                throw new IllegalArgumentException("No configs match configSpec");
+            }
+            EGLConfig[] candidates = new EGLConfig[total];
+            boolean fetched = egl.eglChooseConfig(display, mConfigSpec, candidates, total, matchCount);
+            if (!fetched) {
+                throw new IllegalArgumentException("eglChooseConfig#2 failed");
+            }
+            EGLConfig chosen = chooseConfig(egl, display, candidates);
+            if (chosen == null) {
+                throw new IllegalArgumentException("No config chosen");
+            }
+            return chosen;
+        }
+
+        /** Picks one entry of {@code configs}; a null result makes the two-argument overload throw. */
+        abstract EGLConfig chooseConfig(EGL10 egl, EGLDisplay display,
+                                        EGLConfig[] configs);
+
+        /** For client version 2, returns a copy with EGL_RENDERABLE_TYPE spliced in ahead of EGL_NONE. */
+        private int[] filterConfigSpec(int[] configSpec) {
+            if (mEGLContextClientVersion != 2) {
+                return configSpec;
+            }
+            // Relies on the spec being well formed (EGL_NONE as its last entry) and
+            // on no subclass defining EGL_RENDERABLE_TYPE itself.
+            int terminator = configSpec.length - 1;
+            int[] extended = new int[configSpec.length + 2];
+            System.arraycopy(configSpec, 0, extended, 0, terminator);
+            extended[terminator] = EGL10.EGL_RENDERABLE_TYPE;
+            extended[terminator + 1] = 4; /* EGL_OPENGL_ES2_BIT */
+            extended[terminator + 2] = EGL10.EGL_NONE;
+            return extended;
+        }
+    }
+
+    /**
+     * Selects the first configuration whose r, g, b and a sizes equal the
+     * requested ones and whose depth and stencil sizes are at least as large.
+     */
+    private class ComponentSizeChooser extends BaseConfigChooser {
+        // Subclasses can adjust these values:
+        protected int mRedSize;
+        protected int mGreenSize;
+        protected int mBlueSize;
+        protected int mAlphaSize;
+        protected int mDepthSize;
+        protected int mStencilSize;
+        private int[] mValue; // one-element out-parameter reused by findConfigAttrib
+
+        /**
+         * Builds the attribute list from the requested sizes and keeps them for chooseConfig.
+         */
+        public ComponentSizeChooser(int redSize, int greenSize, int blueSize,
+                                    int alphaSize, int depthSize, int stencilSize) {
+            super(new int[] {
+                    EGL10.EGL_RED_SIZE, redSize,
+                    EGL10.EGL_GREEN_SIZE, greenSize,
+                    EGL10.EGL_BLUE_SIZE, blueSize,
+                    EGL10.EGL_ALPHA_SIZE, alphaSize,
+                    EGL10.EGL_DEPTH_SIZE, depthSize,
+                    EGL10.EGL_STENCIL_SIZE, stencilSize,
+                    EGL10.EGL_NONE});
+            mRedSize = redSize;
+            mGreenSize = greenSize;
+            mBlueSize = blueSize;
+            mAlphaSize = alphaSize;
+            mDepthSize = depthSize;
+            mStencilSize = stencilSize;
+            mValue = new int[1];
+        }
+
+        /** Returns the first acceptable entry of {@code configs}, or null when none qualifies. */
+        @Override
+        public EGLConfig chooseConfig(EGL10 egl, EGLDisplay display,
+                                      EGLConfig[] configs) {
+            for (EGLConfig candidate : configs) {
+                // Depth and stencil are lower bounds; skip configs that fall short.
+                int depth = findConfigAttrib(egl, display, candidate, EGL10.EGL_DEPTH_SIZE, 0);
+                int stencil = findConfigAttrib(egl, display, candidate, EGL10.EGL_STENCIL_SIZE, 0);
+                if (depth < mDepthSize || stencil < mStencilSize) {
+                    continue;
+                }
+                // The colour channels have to match exactly.
+                int red = findConfigAttrib(egl, display, candidate, EGL10.EGL_RED_SIZE, 0);
+                int green = findConfigAttrib(egl, display, candidate, EGL10.EGL_GREEN_SIZE, 0);
+                int blue = findConfigAttrib(egl, display, candidate, EGL10.EGL_BLUE_SIZE, 0);
+                int alpha = findConfigAttrib(egl, display, candidate, EGL10.EGL_ALPHA_SIZE, 0);
+                boolean sameColour = red == mRedSize && green == mGreenSize
+                        && blue == mBlueSize && alpha == mAlphaSize;
+                if (sameColour) {
+                    return candidate;
+                }
+            }
+            return null; // nothing matched
+        }
+
+        /** Reads one attribute of {@code config}, or {@code defaultValue} if EGL cannot supply it. */
+        private int findConfigAttrib(EGL10 egl, EGLDisplay display,
+                                     EGLConfig config, int attribute, int defaultValue) {
+            boolean known = egl.eglGetConfigAttrib(display, config, attribute, mValue);
+            return known ? mValue[0] : defaultValue;
+        }
+    }
+
+    /**
+     * This class will choose an RGB_888 surface (no alpha, no stencil) with
+     * or without a depth buffer.
+     * When a depth buffer is requested it must be at least 16 bits deep.
+     */
+    private class SimpleEGLConfigChooser extends ComponentSizeChooser {
+        public SimpleEGLConfigChooser(boolean withDepthBuffer) {
+            super(8, 8, 8, 0, withDepthBuffer ? 16 : 0, 0);
+        }
+    }
+
+    /**
+     * An EGL helper class.
+     */
+
+    private static class EglHelper {
+        public EglHelper(WeakReference<DUIXTextureView> glSurfaceViewWeakRef) {
+            mGLSurfaceViewWeakRef = glSurfaceViewWeakRef; // held weakly; every use below null-checks get()
+        }
+
+        /**
+         * Initializes EGL: obtains the default display, picks a config through the
+         * view's config chooser and creates a context through its context factory.
+         */
+        public void start() {
+            if (LOG_EGL) {
+                Log.w("EglHelper", "start() tid=" + Thread.currentThread().getId());
+            }
+            /*
+             * Get an EGL instance
+             */
+            mEgl = (EGL10) EGLContext.getEGL();
+
+            /*
+             * Get to the default display.
+             */
+            mEglDisplay = mEgl.eglGetDisplay(EGL10.EGL_DEFAULT_DISPLAY);
+
+            if (mEglDisplay == EGL10.EGL_NO_DISPLAY) {
+                throw new RuntimeException("eglGetDisplay failed");
+            }
+
+            /*
+             * We can now initialize EGL for that display
+             */
+            int[] version = new int[2];
+            if(!mEgl.eglInitialize(mEglDisplay, version)) {
+                throw new RuntimeException("eglInitialize failed");
+            }
+            DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+            if (view == null) {
+                mEglConfig = null;
+                mEglContext = null;
+            } else {
+                mEglConfig = view.mEGLConfigChooser.chooseConfig(mEgl, mEglDisplay);
+
+                /*
+                 * Create an EGL context. We want to do this as rarely as we can, because an
+                 * EGL context is a somewhat heavy object.
+                 */
+                mEglContext = view.mEGLContextFactory.createContext(mEgl, mEglDisplay, mEglConfig);
+            }
+            if (mEglContext == null || mEglContext == EGL10.EGL_NO_CONTEXT) {
+                mEglContext = null;
+                throwEglException("createContext"); // NOTE: does not throw (its throw is commented out); start() returns with a null context
+            }
+            if (LOG_EGL) {
+                Log.w("EglHelper", "createContext " + mEglContext + " tid=" + Thread.currentThread().getId());
+            }
+
+            mEglSurface = null;
+        }
+
+        /**
+         * Create an EGL surface for the view's current SurfaceTexture. If a surface
+         * already exists, destroy it before creating the new surface.
+         *
+         * @return true if the surface was created and made current successfully.
+         */
+        public boolean createSurface() {
+            if (LOG_EGL) {
+                Log.w("EglHelper", "createSurface()  tid=" + Thread.currentThread().getId());
+            }
+            /*
+             * Check preconditions.
+             */
+            if (mEgl == null) {
+                throw new RuntimeException("egl not initialized");
+            }
+            if (mEglDisplay == null) {
+                throw new RuntimeException("eglDisplay not initialized");
+            }
+            if (mEglConfig == null) {
+                throw new RuntimeException("mEglConfig not initialized");
+            }
+
+            /*
+             *  Any surface created earlier is stale (for example after a size
+             *  change), so release it before creating a new one.
+             */
+            destroySurfaceImp();
+
+            /*
+             * Create an EGL surface we can render into.
+             */
+            DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+            if (view != null) {
+                mEglSurface = view.mEGLWindowSurfaceFactory.createWindowSurface(mEgl,
+                        mEglDisplay, mEglConfig, view.getSurfaceTexture());
+            } else {
+                mEglSurface = null;
+            }
+
+            if (mEglSurface == null || mEglSurface == EGL10.EGL_NO_SURFACE) {
+                int error = mEgl.eglGetError();
+                if (error == EGL10.EGL_BAD_NATIVE_WINDOW) {
+                    Log.e("EglHelper", "createWindowSurface returned EGL_BAD_NATIVE_WINDOW.");
+                }
+                return false;
+            }
+
+            /*
+             * Before we can issue GL commands, we need to make sure
+             * the context is current and bound to a surface.
+             */
+            if (!mEgl.eglMakeCurrent(mEglDisplay, mEglSurface, mEglSurface, mEglContext)) {
+                /*
+                 * Could not make the context current, probably because the underlying
+                 * surface of the view has been destroyed.
+                 */
+                logEglErrorAsWarning("EGLHelper", "eglMakeCurrent", mEgl.eglGetError());
+                return false;
+            }
+
+            return true;
+        }
+
+        /**
+         * Builds the GL interface of the current EGL context and decorates it:
+         * first with the view's GLWrapper, if one is set, then with GLDebugHelper
+         * when the view's debug flags request error checking or call logging.
+         * @return the GL object, wrapped as described above.
+         */
+        GL createGL() {
+            GL raw = mEglContext.getGL();
+            DUIXTextureView owner = mGLSurfaceViewWeakRef.get();
+            if (owner == null) {
+                return raw; // the view is gone: nothing to decorate with
+            }
+
+            // Optional client-supplied wrapper.
+            GL wrapped = (owner.mGLWrapper != null) ? owner.mGLWrapper.wrap(raw) : raw;
+
+            // Optional debug wrapper.
+            boolean checkErrors = (owner.mDebugFlags & DEBUG_CHECK_GL_ERROR) != 0;
+            boolean logCalls = (owner.mDebugFlags & DEBUG_LOG_GL_CALLS) != 0;
+            if (!checkErrors && !logCalls) {
+                return wrapped;
+            }
+
+            int helperFlags = checkErrors ? GLDebugHelper.CONFIG_CHECK_GL_ERROR : 0;
+            Writer callLog = logCalls ? new LogWriter() : null;
+            return GLDebugHelper.wrap(wrapped, helperFlags, callLog);
+        }
+
+        /**
+         * Display the current render surface.
+         * @return the EGL error code from eglSwapBuffers.
+         */
+        public int swap() {
+            if (! mEgl.eglSwapBuffers(mEglDisplay, mEglSurface)) {
+                return mEgl.eglGetError();
+            }
+            return EGL10.EGL_SUCCESS;
+        }
+
+        public void destroySurface() { // logging entry point; the actual work is in destroySurfaceImp()
+            if (LOG_EGL) {
+                Log.w("EglHelper", "destroySurface()  tid=" + Thread.currentThread().getId());
+            }
+            destroySurfaceImp();
+        }
+
+        private void destroySurfaceImp() {
+            if (mEglSurface == null || mEglSurface == EGL10.EGL_NO_SURFACE) {
+                return; // no surface is held
+            }
+            // Unbind context and surface from this thread before the surface goes away.
+            mEgl.eglMakeCurrent(mEglDisplay, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_CONTEXT);
+            DUIXTextureView owner = mGLSurfaceViewWeakRef.get();
+            if (owner != null) {
+                owner.mEGLWindowSurfaceFactory.destroySurface(mEgl, mEglDisplay, mEglSurface);
+            }
+            mEglSurface = null;
+        }
+
+        public void finish() { // destroys the context through the view's factory, then terminates the display
+            if (LOG_EGL) {
+                Log.w("EglHelper", "finish() tid=" + Thread.currentThread().getId());
+            }
+            if (mEglContext != null) {
+                DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+                if (view != null) {
+                    view.mEGLContextFactory.destroyContext(mEgl, mEglDisplay, mEglContext);
+                }
+                mEglContext = null; // cleared even when the view is gone and destroyContext was skipped
+            }
+            if (mEglDisplay != null) {
+                mEgl.eglTerminate(mEglDisplay);
+                mEglDisplay = null; // makes a repeated finish() skip eglTerminate
+            }
+        }
+
+        private void throwEglException(String function) { // convenience overload: reports the pending eglGetError() code
+            throwEglException(function, mEgl.eglGetError());
+        }
+
+        public static void throwEglException(String function, int error) {
+            // Despite the name, this helper only reports: the RuntimeException it used to
+            // throw is disabled, so callers carry on after an EGL failure.
+            // The failure is logged unconditionally so it is never lost silently;
+            // LOG_THREADS merely adds the reporting thread's id to the message.
+            String message = formatEglError(function, error);
+            Log.e("EglHelper", LOG_THREADS ? ("throwEglException tid=" + Thread.currentThread().getId() + " " + message) : message);
+        }
+
+        public static void logEglErrorAsWarning(String tag, String function, int error) { // warning-level log, never throws
+            Log.w(tag, formatEglError(function, error));
+        }
+
+        public static String formatEglError(String function, int error) { // error is appended as a decimal number
+            return function + " failed: " + error;
+        }
+
+        private WeakReference<DUIXTextureView> mGLSurfaceViewWeakRef;
+        EGL10 mEgl;
+        EGLDisplay mEglDisplay;
+        EGLSurface mEglSurface;
+        EGLConfig mEglConfig;
+        EGLContext mEglContext;
+
+    }
+
+    /**
+     * A generic GL Thread. Takes care of initializing EGL and GL. Delegates
+     * to a Renderer instance to do the actual drawing. Can be configured to
+     * render continuously or on request.
+     *
+     * All potentially blocking synchronization is done through the
+     * sGLThreadManager object. This avoids multiple-lock ordering issues.
+     *
+     */
+    static class GLThread extends Thread {
+        GLThread(WeakReference<DUIXTextureView> glSurfaceViewWeakRef) {
+            super();
+            // The view is only reachable weakly from this thread.
+            mGLSurfaceViewWeakRef = glSurfaceViewWeakRef;
+            mRenderMode = RENDERMODE_CONTINUOUSLY;
+            mRequestRender = true;
+            mWidth = mHeight = 0; // no surface size is known yet
+        }
+
+        @Override
+        public void run() {
+            setName("GLThread " + getId());
+            if (LOG_THREADS) {
+                Log.i("GLThread", "starting tid=" + getId());
+            }
+
+            try {
+                guardedRun();
+            } catch (InterruptedException e) {
+                // An interrupt simply ends the render loop: fall through and exit normally.
+            } finally {
+                sGLThreadManager.threadExiting(this); // runs on every exit path, including exceptions
+            }
+        }
+
+        /*
+         * Drops the EGL surface, if held. Call only inside synchronized(sGLThreadManager).
+         */
+        private void stopEglSurfaceLocked() {
+            if (!mHaveEglSurface) {
+                return;
+            }
+            mHaveEglSurface = false;
+            mEglHelper.destroySurface();
+        }
+
+        /*
+         * Finishes EGL, if a context is held. Call only inside synchronized(sGLThreadManager).
+         */
+        private void stopEglContextLocked() {
+            if (!mHaveEglContext) {
+                return;
+            }
+            mEglHelper.finish();
+            mHaveEglContext = false;
+            sGLThreadManager.releaseEglContextLocked(this);
+        }
+        private void guardedRun() throws InterruptedException { // render loop; returns once mShouldExit is observed
+            mEglHelper = new EglHelper(mGLSurfaceViewWeakRef);
+            mHaveEglContext = false;
+            mHaveEglSurface = false;
+            try {
+                GL10 gl = null;
+                boolean createEglContext = false;
+                boolean createEglSurface = false;
+                boolean createGlInterface = false;
+                boolean lostEglContext = false;
+                boolean sizeChanged = false;
+                boolean wantRenderNotification = false;
+                boolean doRenderNotification = false;
+                boolean askedToReleaseEglContext = false;
+                int w = 0;
+                int h = 0;
+                Runnable event = null;
+
+                while (true) {
+                    synchronized (sGLThreadManager) {
+                        while (true) {
+                            if (mShouldExit) {
+                                return;
+                            }
+
+                            if (! mEventQueue.isEmpty()) {
+                                event = mEventQueue.remove(0);
+                                break;
+                            }
+
+                            // Update the pause state.
+                            boolean pausing = false;
+                            if (mPaused != mRequestPaused) {
+                                pausing = mRequestPaused;
+                                mPaused = mRequestPaused;
+                                sGLThreadManager.notifyAll();
+                                if (LOG_PAUSE_RESUME) {
+                                    Log.i("GLThread", "mPaused is now " + mPaused + " tid=" + getId());
+                                }
+                            }
+
+                            // Do we need to give up the EGL context?
+                            if (mShouldReleaseEglContext) {
+                                if (LOG_SURFACE) {
+                                    Log.i("GLThread", "releasing EGL context because asked to tid=" + getId());
+                                }
+                                stopEglSurfaceLocked();
+                                stopEglContextLocked();
+                                mShouldReleaseEglContext = false;
+                                askedToReleaseEglContext = true;
+                            }
+
+                            // Have we lost the EGL context?
+                            if (lostEglContext) {
+                                stopEglSurfaceLocked();
+                                stopEglContextLocked();
+                                lostEglContext = false;
+                            }
+
+                            // When pausing, release the EGL surface:
+                            if (pausing && mHaveEglSurface) {
+                                if (LOG_SURFACE) {
+                                    Log.i("GLThread", "releasing EGL surface because paused tid=" + getId());
+                                }
+                                stopEglSurfaceLocked();
+                            }
+
+                            // When pausing, optionally release the EGL Context:
+                            if (pausing && mHaveEglContext) {
+                                DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+                                boolean preserveEglContextOnPause = view == null ?
+                                        false : view.mPreserveEGLContextOnPause;
+                                if (!preserveEglContextOnPause || sGLThreadManager.shouldReleaseEGLContextWhenPausing()) {
+                                    stopEglContextLocked();
+                                    if (LOG_SURFACE) {
+                                        Log.i("GLThread", "releasing EGL context because paused tid=" + getId());
+                                    }
+                                }
+                            }
+
+                            // When pausing, optionally terminate EGL:
+                            if (pausing) {
+                                if (sGLThreadManager.shouldTerminateEGLWhenPausing()) {
+                                    mEglHelper.finish();
+                                    if (LOG_SURFACE) {
+                                        Log.i("GLThread", "terminating EGL because paused tid=" + getId());
+                                    }
+                                }
+                            }
+
+                            // Have we lost the view's surface?
+                            if ((! mHasSurface) && (! mWaitingForSurface)) {
+                                if (LOG_SURFACE) {
+                                    Log.i("GLThread", "noticed surfaceView surface lost tid=" + getId());
+                                }
+                                if (mHaveEglSurface) {
+                                    stopEglSurfaceLocked();
+                                }
+                                mWaitingForSurface = true;
+                                mSurfaceIsBad = false;
+                                sGLThreadManager.notifyAll();
+                            }
+
+                            // Have we acquired the view's surface?
+                            if (mHasSurface && mWaitingForSurface) {
+                                if (LOG_SURFACE) {
+                                    Log.i("GLThread", "noticed surfaceView surface acquired tid=" + getId());
+                                }
+                                mWaitingForSurface = false;
+                                sGLThreadManager.notifyAll();
+                            }
+
+                            if (doRenderNotification) {
+                                if (LOG_SURFACE) {
+                                    Log.i("GLThread", "sending render notification tid=" + getId());
+                                }
+                                wantRenderNotification = false;
+                                doRenderNotification = false;
+                                mRenderComplete = true;
+                                sGLThreadManager.notifyAll();
+                            }
+
+                            // Ready to draw?
+                            if (readyToDraw()) {
+
+                                // If we don't have an EGL context, try to acquire one.
+                                if (! mHaveEglContext) {
+                                    if (askedToReleaseEglContext) {
+                                        askedToReleaseEglContext = false;
+                                    } else if (sGLThreadManager.tryAcquireEglContextLocked(this)) {
+                                        try {
+                                            mEglHelper.start();
+                                        } catch (RuntimeException t) {
+                                            sGLThreadManager.releaseEglContextLocked(this);
+                                            throw t;
+                                        }
+                                        mHaveEglContext = true;
+                                        createEglContext = true;
+
+                                        sGLThreadManager.notifyAll();
+                                    }
+                                }
+
+                                if (mHaveEglContext && !mHaveEglSurface) {
+                                    mHaveEglSurface = true;
+                                    createEglSurface = true;
+                                    createGlInterface = true;
+                                    sizeChanged = true;
+                                }
+
+                                if (mHaveEglSurface) {
+                                    if (mSizeChanged) {
+                                        sizeChanged = true;
+                                        w = mWidth;
+                                        h = mHeight;
+                                        wantRenderNotification = true;
+                                        if (LOG_SURFACE) {
+                                            Log.i("GLThread",
+                                                    "noticing that we want render notification tid="
+                                                            + getId());
+                                        }
+
+                                        // Destroy and recreate the EGL surface.
+                                        createEglSurface = true;
+
+                                        mSizeChanged = false;
+                                    }
+                                    mRequestRender = false; // the pending request is served by the frame drawn below
+                                    sGLThreadManager.notifyAll();
+                                    break;
+                                }
+                            }
+
+                            // By design, this is the only place in a GLThread thread where we wait().
+                            if (LOG_THREADS) {
+                                Log.i("GLThread", "waiting tid=" + getId()
+                                        + " mHaveEglContext: " + mHaveEglContext
+                                        + " mHaveEglSurface: " + mHaveEglSurface
+                                        + " mPaused: " + mPaused
+                                        + " mHasSurface: " + mHasSurface
+                                        + " mSurfaceIsBad: " + mSurfaceIsBad
+                                        + " mWaitingForSurface: " + mWaitingForSurface
+                                        + " mWidth: " + mWidth
+                                        + " mHeight: " + mHeight
+                                        + " mRequestRender: " + mRequestRender
+                                        + " mRenderMode: " + mRenderMode);
+                            }
+                            sGLThreadManager.wait();
+                        }
+                    } // end of synchronized(sGLThreadManager)
+
+                    if (event != null) {
+                        event.run(); // queued runnables execute on this thread, outside the sGLThreadManager lock
+                        event = null;
+                        continue;
+                    }
+
+                    if (createEglSurface) {
+                        if (LOG_SURFACE) {
+                            Log.w("GLThread", "egl createSurface");
+                        }
+                        if (!mEglHelper.createSurface()) {
+                            synchronized(sGLThreadManager) {
+                                mSurfaceIsBad = true;
+                                sGLThreadManager.notifyAll();
+                            }
+                            continue;
+                        }
+                        createEglSurface = false;
+                    }
+
+                    if (createGlInterface) {
+                        gl = (GL10) mEglHelper.createGL();
+
+                        sGLThreadManager.checkGLDriver(gl);
+                        createGlInterface = false;
+                    }
+
+                    if (createEglContext) {
+                        if (LOG_RENDERER) {
+                            Log.w("GLThread", "onSurfaceCreated");
+                        }
+                        DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+                        if (view != null) {
+                            view.mRenderer.onSurfaceCreated(gl, mEglHelper.mEglConfig);
+                        }
+                        createEglContext = false;
+                    }
+
+                    if (sizeChanged) {
+                        if (LOG_RENDERER) {
+                            Log.w("GLThread", "onSurfaceChanged(" + w + ", " + h + ")");
+                        }
+                        DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+                        if (view != null) {
+                            view.mRenderer.onSurfaceChanged(gl, w, h);
+                        }
+                        sizeChanged = false;
+                    }
+
+                    if (LOG_RENDERER_DRAW_FRAME) {
+                        Log.w("GLThread", "onDrawFrame tid=" + getId());
+                    }
+                    {
+                        DUIXTextureView view = mGLSurfaceViewWeakRef.get();
+                        if (view != null) {
+                            view.mRenderer.onDrawFrame(gl);
+                        }
+                    }
+                    int swapError = mEglHelper.swap();
+                    switch (swapError) {
+                        case EGL10.EGL_SUCCESS:
+                            break;
+                        case EGL11.EGL_CONTEXT_LOST:
+                            if (LOG_SURFACE) {
+                                Log.i("GLThread", "egl context lost tid=" + getId());
+                            }
+                            lostEglContext = true;
+                            break;
+                        default:
+                            // Other errors typically mean that the current surface is bad,
+                            // probably because the view's surface has been destroyed,
+                            // but we haven't been notified yet.
+                            // Log the error to help developers understand why rendering stopped.
+                            EglHelper.logEglErrorAsWarning("GLThread", "eglSwapBuffers", swapError);
+
+                            synchronized(sGLThreadManager) {
+                                mSurfaceIsBad = true;
+                                sGLThreadManager.notifyAll();
+                            }
+                            break;
+                    }
+
+                    if (wantRenderNotification) {
+                        doRenderNotification = true;
+                    }
+                }
+
+            } finally {
+                /*
+                 * Always release the EGL surface and context on the way out.
+                 */
+                synchronized (sGLThreadManager) {
+                    stopEglSurfaceLocked();
+                    stopEglContextLocked();
+                }
+            }
+        }
+
+        public boolean ableToDraw() { // true only when context and surface exist and readyToDraw() holds
+            return mHaveEglContext && mHaveEglSurface && readyToDraw();
+        }
+
+        private boolean readyToDraw() {
+            boolean surfaceUsable = !mPaused && mHasSurface && !mSurfaceIsBad;
+            boolean hasArea = mWidth > 0 && mHeight > 0;
+            return surfaceUsable && hasArea && (mRequestRender || mRenderMode == RENDERMODE_CONTINUOUSLY);
+        }
+
+        public void setRenderMode(int renderMode) {
+            if (renderMode < RENDERMODE_WHEN_DIRTY || renderMode > RENDERMODE_CONTINUOUSLY) {
+                throw new IllegalArgumentException("renderMode"); // outside the supported range
+            }
+            synchronized(sGLThreadManager) {
+                mRenderMode = renderMode;
+                sGLThreadManager.notifyAll(); // let waiting threads re-check their conditions
+            }
+        }
+
+        public int getRenderMode() {
+            synchronized(sGLThreadManager) { // same lock that guards the write in setRenderMode
+                return mRenderMode;
+            }
+        }
+
+        /**
+         * Flags that a frame should be rendered and wakes the threads waiting on the
+         * shared monitor.
+         */
+        public void requestRender() {
+            final GLThreadManager monitor = sGLThreadManager;
+            synchronized (monitor) {
+                mRequestRender = true;
+                monitor.notifyAll();
+            }
+        }
+
+        public void surfaceCreated() {
+            synchronized(sGLThreadManager) {
+                if (LOG_THREADS) {
+                    Log.i("GLThread", "surfaceCreated tid=" + getId());
+                }
+                mHasSurface = true;
+                sGLThreadManager.notifyAll();
+                while((mWaitingForSurface) && (!mExited)) {
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException e) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        public void surfaceDestroyed() {
+            synchronized(sGLThreadManager) {
+                if (LOG_THREADS) {
+                    Log.i("GLThread", "surfaceDestroyed tid=" + getId());
+                }
+                mHasSurface = false;
+                sGLThreadManager.notifyAll();
+                while((!mWaitingForSurface) && (!mExited)) {
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException e) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        public void onPause() {
+            synchronized (sGLThreadManager) {
+                if (LOG_PAUSE_RESUME) {
+                    Log.i("GLThread", "onPause tid=" + getId());
+                }
+                mRequestPaused = true;
+                sGLThreadManager.notifyAll();
+                while ((! mExited) && (! mPaused)) {
+                    if (LOG_PAUSE_RESUME) {
+                        Log.i("Main thread", "onPause waiting for mPaused.");
+                    }
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException ex) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        public void onResume() {
+            synchronized (sGLThreadManager) {
+                if (LOG_PAUSE_RESUME) {
+                    Log.i("GLThread", "onResume tid=" + getId());
+                }
+                mRequestPaused = false;
+                mRequestRender = true;
+                mRenderComplete = false;
+                sGLThreadManager.notifyAll();
+                while ((! mExited) && mPaused && (!mRenderComplete)) {
+                    if (LOG_PAUSE_RESUME) {
+                        Log.i("Main thread", "onResume waiting for !mPaused.");
+                    }
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException ex) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        public void onWindowResize(int w, int h) {
+            synchronized (sGLThreadManager) {
+                mWidth = w;
+                mHeight = h;
+                mSizeChanged = true;
+                mRequestRender = true;
+                mRenderComplete = false;
+                sGLThreadManager.notifyAll();
+
+                // Wait for thread to react to resize and render a frame
+                while (! mExited && !mPaused && !mRenderComplete
+                        && ableToDraw()) {
+                    if (LOG_SURFACE) {
+                        Log.i("Main thread", "onWindowResize waiting for render complete from tid=" + getId());
+                    }
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException ex) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        public void requestExitAndWait() {
+            // don't call this from GLThread thread or it is a guaranteed
+            // deadlock!
+            synchronized(sGLThreadManager) {
+                mShouldExit = true;
+                sGLThreadManager.notifyAll();
+                while (! mExited) {
+                    try {
+                        sGLThreadManager.wait();
+                    } catch (InterruptedException ex) {
+                        Thread.currentThread().interrupt();
+                    }
+                }
+            }
+        }
+
+        /**
+         * Flags that this thread should release its EGL context and wakes the threads
+         * waiting on the shared monitor. This method does not synchronize itself; the
+         * "Locked" suffix suggests the caller is expected to already hold the
+         * sGLThreadManager monitor (notifyAll() requires it).
+         */
+        public void requestReleaseEglContextLocked() {
+            mShouldReleaseEglContext = true;
+            sGLThreadManager.notifyAll();
+        }
+
+        /**
+         * Queue an "event" to be run on the GL rendering thread.
+         * @param r the runnable to be run on the GL rendering thread.
+         */
+        public void queueEvent(Runnable r) {
+            // Reject null up front so the queue never contains an unusable entry.
+            if (null == r) {
+                throw new IllegalArgumentException("r must not be null");
+            }
+            final GLThreadManager monitor = sGLThreadManager;
+            synchronized (monitor) {
+                // Enqueue under the monitor, then wake the waiting threads.
+                mEventQueue.add(r);
+                monitor.notifyAll();
+            }
+        }
+
+        // Once the thread is started, all accesses to the following member
+        // variables are protected by the sGLThreadManager monitor
+        // Set by requestExitAndWait() to ask the thread to stop.
+        private boolean mShouldExit;
+        // Set by GLThreadManager.threadExiting(); the blocking methods above stop waiting once it is true.
+        private boolean mExited;
+        // Set by onPause(), cleared by onResume().
+        private boolean mRequestPaused;
+        private boolean mPaused;
+        // Set by surfaceCreated(), cleared by surfaceDestroyed().
+        private boolean mHasSurface;
+        // Set when eglSwapBuffers reports an unexpected error.
+        private boolean mSurfaceIsBad;
+        private boolean mWaitingForSurface;
+        private boolean mHaveEglContext;
+        private boolean mHaveEglSurface;
+        // Set by requestReleaseEglContextLocked().
+        private boolean mShouldReleaseEglContext;
+        // Last size passed to onWindowResize().
+        private int mWidth;
+        private int mHeight;
+        private int mRenderMode;
+        // Set by requestRender(), onResume() and onWindowResize().
+        private boolean mRequestRender;
+        private boolean mRenderComplete;
+        // Runnables added by queueEvent().
+        private ArrayList<Runnable> mEventQueue = new ArrayList<Runnable>();
+        private boolean mSizeChanged = true;
+
+        // End of member variables protected by the sGLThreadManager monitor.
+
+        private EglHelper mEglHelper;
+
+        /**
+         * Set once at thread construction time, nulled out when the parent view is garbage
+         * collected. This weak reference allows the DUIXTextureView to be garbage collected
+         * while the GLThread is still alive.
+         */
+        private WeakReference<DUIXTextureView> mGLSurfaceViewWeakRef;
+
+    }
+
+    /**
+     * A {@link Writer} that collects characters into lines and sends each completed,
+     * non-empty line to the log at verbose level under the "DUIXTextureView" tag.
+     */
+    static class LogWriter extends Writer {
+
+        // Characters of the line currently being assembled.
+        private StringBuilder mBuilder = new StringBuilder();
+
+        /** Emits whatever is still buffered. */
+        @Override
+        public void close() {
+            flushBuilder();
+        }
+
+        /** Emits whatever is still buffered. */
+        @Override
+        public void flush() {
+            flushBuilder();
+        }
+
+        /**
+         * Appends {@code count} characters starting at {@code offset}; every newline
+         * character ends the current line and triggers a log call.
+         */
+        @Override
+        public void write(char[] buf, int offset, int count) {
+            int pos = offset;
+            for (int remaining = count; remaining > 0; remaining--, pos++) {
+                final char ch = buf[pos];
+                if (ch != '\n') {
+                    mBuilder.append(ch);
+                    continue;
+                }
+                flushBuilder();
+            }
+        }
+
+        /** Logs the buffered text, if any, and empties the buffer. */
+        private void flushBuilder() {
+            if (mBuilder.length() == 0) {
+                return;
+            }
+            Log.v("DUIXTextureView", mBuilder.toString());
+            mBuilder.setLength(0);
+        }
+    }
+
+
+    /**
+     * Guards configuration calls that are only legal before a GL thread exists.
+     *
+     * @throws IllegalStateException if {@code mGLThread} has already been assigned
+     */
+    private void checkRenderThreadState() {
+        if (mGLThread == null) {
+            return;
+        }
+        throw new IllegalStateException(
+                "setRenderer has already been called for this instance.");
+    }
+
+    private static class GLThreadManager {
+        private static String TAG = "GLThreadManager";
+
+        public synchronized void threadExiting(GLThread thread) {
+            if (LOG_THREADS) {
+                Log.i("GLThread", "exiting tid=" +  thread.getId());
+            }
+            thread.mExited = true;
+            if (mEglOwner == thread) {
+                mEglOwner = null;
+            }
+            notifyAll();
+        }
+
+        /*
+         * Tries once to acquire the right to use an EGL
+         * context. Does not block. Requires that we are already
+         * in the sGLThreadManager monitor when this is called.
+         *
+         * @return true if the right to use an EGL context was acquired.
+         */
+        public boolean tryAcquireEglContextLocked(GLThread thread) {
+            if (mEglOwner == thread || mEglOwner == null) {
+                mEglOwner = thread;
+                notifyAll();
+                return true;
+            }
+            checkGLESVersion();
+            if (mMultipleGLESContextsAllowed) {
+                return true;
+            }
+            // Notify the owning thread that it should release the context.
+            // TODO: implement a fairness policy. Currently
+            // if the owning thread is drawing continuously it will just
+            // reacquire the EGL context.
+            if (mEglOwner != null) {
+                mEglOwner.requestReleaseEglContextLocked();
+            }
+            return false;
+        }
+
+        /*
+         * Releases the EGL context. Requires that we are already in the
+         * sGLThreadManager monitor when this is called.
+         */
+        public void releaseEglContextLocked(GLThread thread) {
+            if (mEglOwner == thread) {
+                mEglOwner = null;
+            }
+            notifyAll();
+        }
+
+        public synchronized boolean shouldReleaseEGLContextWhenPausing() {
+            // Release the EGL context when pausing even if
+            // the hardware supports multiple EGL contexts.
+            // Otherwise the device could run out of EGL contexts.
+            return mLimitedGLESContexts;
+        }
+
+        public synchronized boolean shouldTerminateEGLWhenPausing() {
+            checkGLESVersion();
+            return !mMultipleGLESContextsAllowed;
+        }
+
+        public synchronized void checkGLDriver(GL10 gl) {
+            if (! mGLESDriverCheckComplete) {
+                checkGLESVersion();
+                String renderer = gl.glGetString(GL10.GL_RENDERER);
+                if (mGLESVersion < kGLES_20) {
+                    mMultipleGLESContextsAllowed =
+                            ! renderer.startsWith(kMSM7K_RENDERER_PREFIX);
+                    notifyAll();
+                }
+                mLimitedGLESContexts = !mMultipleGLESContextsAllowed;
+                if (LOG_SURFACE) {
+                    Log.w(TAG, "checkGLDriver renderer = \"" + renderer + "\" multipleContextsAllowed = "
+                            + mMultipleGLESContextsAllowed
+                            + " mLimitedGLESContexts = " + mLimitedGLESContexts);
+                }
+                mGLESDriverCheckComplete = true;
+            }
+        }
+
+        private void checkGLESVersion() {
+            if (! mGLESVersionCheckComplete) {
+//                mGLESVersion = SystemProperties.getInt(
+//                        "ro.opengles.version",
+//                        ConfigurationInfo.GL_ES_VERSION_UNDEFINED);
+//                if (mGLESVersion >= kGLES_20) {
+//                    mMultipleGLESContextsAllowed = true;
+//                }
+//                if (LOG_SURFACE) {
+//                    Log.w(TAG, "checkGLESVersion mGLESVersion =" +
+//                            " " + mGLESVersion + " mMultipleGLESContextsAllowed = " + mMultipleGLESContextsAllowed);
+//                }
+                mGLESVersionCheckComplete = true;
+            }
+        }
+
+        /**
+         * This check was required for some pre-Android-3.0 hardware. Android 3.0 provides
+         * support for hardware-accelerated views, therefore multiple EGL contexts are
+         * supported on all Android 3.0+ EGL drivers.
+         */
+        private boolean mGLESVersionCheckComplete;
+        private int mGLESVersion;
+        private boolean mGLESDriverCheckComplete;
+        private boolean mMultipleGLESContextsAllowed;
+        private boolean mLimitedGLESContexts;
+        private static final int kGLES_20 = 0x20000;
+        private static final String kMSM7K_RENDERER_PREFIX =
+                "Q3Dimension MSM7500 ";
+        private GLThread mEglOwner;
+    }
+
+    // Single shared manager; GLThread methods synchronize, wait and notify on this object.
+    private static final GLThreadManager sGLThreadManager = new GLThreadManager();
+
+    // Weak reference to this view instance.
+    private final WeakReference<DUIXTextureView> mThisWeakRef =
+            new WeakReference<DUIXTextureView>(this);
+    // Non-null once a GL thread exists; checkRenderThreadState() rejects further setup after that.
+    private GLThread mGLThread;
+    private Renderer mRenderer;
+    private boolean mDetached;
+    private EGLConfigChooser mEGLConfigChooser;
+    private EGLContextFactory mEGLContextFactory;
+    private EGLWindowSurfaceFactory mEGLWindowSurfaceFactory;
+    private GLWrapper mGLWrapper;
+    private int mDebugFlags;
+    private int mEGLContextClientVersion;
+    private boolean mPreserveEGLContextOnPause;
+}
\ No newline at end of file
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/ImageDrawer.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/ImageDrawer.java
new file mode 100644
index 0000000..7f30900
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/ImageDrawer.java
@@ -0,0 +1,176 @@
+package ai.guiji.duix.sdk.client.render;
+
+import android.opengl.GLES20;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+
+import ai.guiji.duix.sdk.client.bean.ImageFrame;
+import ai.guiji.duix.sdk.client.util.OpenGLUtil;
+
+
+/**
+ * Draws an {@link ImageFrame} as a full-viewport quad with OpenGL ES 2.0.
+ *
+ * <p>The frame's image buffer and mask buffer are uploaded as two RGB textures. The
+ * fragment shader swaps the red and blue channels of the image and uses the average of
+ * the mask's RGB channels as the output alpha.
+ *
+ * <p>All methods issue GL calls, so they presumably must run on a thread with a current
+ * GL context.
+ */
+public class ImageDrawer {
+
+    /** Texture object that receives the frame's image data. */
+    protected int mImageTexId;
+
+    /** Texture object that receives the frame's mask data. */
+    protected int mMaskTexId;
+
+    private final String vertexShaderCode =
+            "uniform mat4 uMVPMatrix;\n" +
+                    "attribute vec4 inputTextureCoordinate;\n" +
+                    " varying vec2 textureCoordinate;\n" +
+                    "attribute vec4 vPosition;\n" +
+                    "void main() {\n" +
+                    "  gl_Position = uMVPMatrix * vPosition;\n" +
+                    "  textureCoordinate = inputTextureCoordinate.xy;\n" +
+                    "}\n";
+
+    private final String fragmentShaderCode =
+            "precision mediump float;\n" +
+                    "varying vec2 textureCoordinate;\n" +
+                    "uniform sampler2D inputImageTexture;\n" +
+                    "uniform sampler2D inputImageTexture2;\n" +
+                    "vec4 imageColor;\n" +
+                    "vec4 maskColor;\n" +
+                    "float alpha;\n" +
+                    "void main() {\n" +
+                    "  imageColor = texture2D(inputImageTexture, textureCoordinate);\n" +
+                    "  maskColor = texture2D(inputImageTexture2, textureCoordinate);\n" +
+                    "  alpha = (maskColor.r + maskColor.g + maskColor.b) / 3.0;\n" +
+                    "  gl_FragColor = vec4(imageColor.b, imageColor.g, imageColor.r, alpha);\n" +
+                    "}\n";
+
+
+    // Vertex positions of the quad.
+    static float vertexData[] = {   // in counterclockwise order:
+            -1f, -1f, 0.0f, // bottom left
+            1f, -1f, 0.0f, // bottom right
+            -1f, 1f, 0.0f, // top left
+            1f, 1f, 0.0f,  // top right
+    };
+
+    private final FloatBuffer vertexBuffer; // buffer holding the vertices
+
+    // Texture coordinates, one entry per vertex position above.
+    static float textureData[] = {   // in counterclockwise order:
+            0f, 1f, 0.0f, // bottom left
+            1f, 1f, 0.0f, // bottom right
+            0f, 0f, 0.0f, // top left
+            1f, 0f, 0.0f,  // top right
+    };
+
+    private final FloatBuffer textureBuffer; // buffer holding the texture coordinates
+
+    // Number of float components stored per vertex.
+    static final int COORDS_PER_VERTEX = 3;
+
+    private final int vertexCount = vertexData.length / COORDS_PER_VERTEX;
+
+    // Byte distance between consecutive vertices.
+    private final int vertexStride = COORDS_PER_VERTEX * 4; // 4 bytes per float component
+
+    private int mProgram;
+
+    private int uMVPMatrixLoc;
+    private int maPositionLoc;
+    private int maTextureCoordLoc;
+
+    private int inputImageTextureLoc;
+
+    private int inputImageTexture2Loc;
+
+    /**
+     * Creates the two texture objects, fills the vertex and texture-coordinate buffers,
+     * builds the shader program and, if the program handle is positive, looks up its
+     * attribute and uniform locations.
+     */
+    public ImageDrawer() {
+        mImageTexId = OpenGLUtil.createTextureObject(GLES20.GL_TEXTURE_2D);
+        mMaskTexId = OpenGLUtil.createTextureObject(GLES20.GL_TEXTURE_2D);
+
+        ByteBuffer byteBuffer = ByteBuffer.allocateDirect(vertexData.length * 4);
+        byteBuffer.order(ByteOrder.nativeOrder());
+        vertexBuffer = byteBuffer.asFloatBuffer();
+        vertexBuffer.put(vertexData);
+        vertexBuffer.position(0);
+
+        byteBuffer = ByteBuffer.allocateDirect(textureData.length * 4);
+        byteBuffer.order(ByteOrder.nativeOrder());
+        textureBuffer = byteBuffer.asFloatBuffer();
+        textureBuffer.put(textureData);
+        textureBuffer.position(0);
+
+        mProgram = OpenGLUtil.createProgram(vertexShaderCode, fragmentShaderCode);
+
+        if (mProgram > 0) {
+            // Model-view-projection matrix uniform.
+            uMVPMatrixLoc = GLES20.glGetUniformLocation(mProgram, "uMVPMatrix");
+            OpenGLUtil.checkLocation(uMVPMatrixLoc, "uMVPMatrix");
+
+            // Vertex position attribute.
+            maPositionLoc = GLES20.glGetAttribLocation(mProgram, "vPosition");
+            OpenGLUtil.checkLocation(maPositionLoc, "vPosition");
+            // Texture coordinate attribute.
+            maTextureCoordLoc = GLES20.glGetAttribLocation(mProgram, "inputTextureCoordinate");
+            OpenGLUtil.checkLocation(maTextureCoordLoc, "inputTextureCoordinate");
+
+            inputImageTextureLoc = GLES20.glGetUniformLocation(mProgram, "inputImageTexture");
+            OpenGLUtil.checkLocation(inputImageTextureLoc, "inputImageTexture");
+
+            inputImageTexture2Loc = GLES20.glGetUniformLocation(mProgram, "inputImageTexture2");
+            OpenGLUtil.checkLocation(inputImageTexture2Loc, "inputImageTexture2");
+        }
+
+    }
+
+    /**
+     * Uploads the frame's image and mask buffers and draws the quad.
+     *
+     * <p>Does nothing when the shader program is not usable (creation failed or
+     * {@link #release()} was already called) or when {@code imageFrame} is null.
+     *
+     * @param imageFrame frame providing {@code width}, {@code height}, {@code rawBuffer}
+     *                   and {@code maskBuffer}; both buffers are uploaded as RGB,
+     *                   unsigned-byte data of the same dimensions
+     * @param mvpMatrix  16-element matrix passed to the {@code uMVPMatrix} uniform
+     */
+    public void draw(ImageFrame imageFrame, float[] mvpMatrix) {
+        // Without a valid program the uniform/attribute locations were never resolved,
+        // so every call below would only raise GL errors.
+        if (mProgram <= 0 || imageFrame == null) {
+            return;
+        }
+
+        // Select the shader program.
+        GLES20.glUseProgram(mProgram);
+
+        GLES20.glActiveTexture(GLES20.GL_TEXTURE0);
+        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mImageTexId);
+        GLES20.glPixelStorei(GLES20.GL_UNPACK_ALIGNMENT, 1); // 1-byte row alignment
+        GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGB, imageFrame.width, imageFrame.height, 0,
+                GLES20.GL_RGB, GLES20.GL_UNSIGNED_BYTE, imageFrame.rawBuffer);
+        GLES20.glUniform1i(inputImageTextureLoc, 0);
+
+        GLES20.glActiveTexture(GLES20.GL_TEXTURE1);
+        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mMaskTexId);
+        GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGB, imageFrame.width, imageFrame.height, 0,
+                GLES20.GL_RGB, GLES20.GL_UNSIGNED_BYTE, imageFrame.maskBuffer);
+        GLES20.glUniform1i(inputImageTexture2Loc, 1);
+
+        GLES20.glUniformMatrix4fv(uMVPMatrixLoc, 1, false, mvpMatrix, 0);
+
+        GLES20.glEnableVertexAttribArray(maPositionLoc);
+        GLES20.glEnableVertexAttribArray(maTextureCoordLoc);
+        // Vertex positions.
+        GLES20.glVertexAttribPointer(maPositionLoc, COORDS_PER_VERTEX, GLES20.GL_FLOAT, false, vertexStride, vertexBuffer);
+        // Texture coordinates.
+        GLES20.glVertexAttribPointer(maTextureCoordLoc, COORDS_PER_VERTEX, GLES20.GL_FLOAT, false, vertexStride, textureBuffer);
+        // Draw the quad as a triangle strip so the four vertices are shared.
+        GLES20.glDrawArrays(GLES20.GL_TRIANGLE_STRIP, 0, vertexCount);
+
+        // Disable the vertex arrays and unbind.
+        GLES20.glDisableVertexAttribArray(maPositionLoc);
+        GLES20.glDisableVertexAttribArray(maTextureCoordLoc);
+        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
+        GLES20.glUseProgram(0);
+    }
+
+    /**
+     * Deletes the shader program and both textures. Safe to call more than once: each
+     * handle is reset after deletion and skipped on later calls.
+     */
+    public void release() {
+        if (mProgram > 0) {
+            GLES20.glDeleteProgram(mProgram);
+            mProgram = -1;
+        }
+        if (mImageTexId != 0) {
+            GLES20.glDeleteTextures(1, new int[]{mImageTexId}, 0);
+            mImageTexId = 0;
+        }
+        if (mMaskTexId != 0) {
+            GLES20.glDeleteTextures(1, new int[]{mMaskTexId}, 0);
+            mMaskTexId = 0;
+        }
+
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/RenderSink.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/RenderSink.java
new file mode 100644
index 0000000..3070698
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/RenderSink.java
@@ -0,0 +1,12 @@
+package ai.guiji.duix.sdk.client.render;
+
+import ai.guiji.duix.sdk.client.bean.ImageFrame;
+
+/**
+ * Render sink: rendered data is handed back through this interface.
+ */
+public interface RenderSink {
+
+    /**
+     * Receives one frame.
+     *
+     * @param imageFrame the frame being delivered
+     */
+    void onVideoFrame(ImageFrame imageFrame);
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/TextureMatrix.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/TextureMatrix.java
new file mode 100644
index 0000000..f06262d
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/render/TextureMatrix.java
@@ -0,0 +1,161 @@
+package ai.guiji.duix.sdk.client.render;
+
+/**
+ * Predefined 16-element (4x4) matrices plus a lookup that selects one by rotation angle
+ * and optionally toggles its entries for horizontal / vertical flipping.
+ *
+ * <p>The notes on the constants read the arrays in column-major order (the usual OpenGL
+ * layout) - NOTE(review): confirm against the code that consumes these matrices.
+ */
+public class TextureMatrix {
+
+    /** Keeps x and maps y to (1 - y) when read as column-major. */
+    public static final float[] DINGDING_TEXTURE_MATRIX = {
+            1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, -1.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 1.0f, 0.0f, 1.0f
+    };
+
+    /**
+     * Same values as {@link #TEXTURE_MATRIX_270}. The original note says a centre-point
+     * offset still needs to be added.
+     */
+    public static final float[] TEXTURE_MATRIX = {
+            0.0f, 1.0f, 0.0f, 0.0f,
+            -1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /** Identity matrix, used for 0 / 360 degrees and any unrecognised angle. */
+    public static final float[] TEXTURE_MATRIX_0 = {
+            1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 1.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /** Matrix selected for 90 degrees (the original note calls it "90 degrees, flipped horizontally"). */
+    public static final float[] TEXTURE_MATRIX_90 = {
+            0.0f, -1.0f, 0.0f, 0.0f,
+            1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /** Swaps the x and y components; not selected by {@link #getRotation}. */
+    public static final float[] TEXTURE_MATRIX_90_V = {
+            0.0f, 1.0f, 0.0f, 0.0f,
+            1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /** Matrix selected for 180 degrees: negates x, y and z. */
+    public static final float[] TEXTURE_MATRIX_180 = {
+            -1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, -1.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, -1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /** Matrix selected for 270 degrees. */
+    public static final float[] TEXTURE_MATRIX_270 = {
+            0.0f, 1.0f, 0.0f, 0.0f,
+            -1.0f, 0.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 1.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 1.0f
+    };
+
+    /**
+     * Selects the matrix for {@code rotation} and applies the requested flips.
+     *
+     * <p>For 90 and 270 degrees the two flip flags are exchanged first. A horizontal flip
+     * then passes every even-indexed entry through {@link #flip(float)}; a vertical flip
+     * does the same for every odd-indexed entry.
+     *
+     * @param rotation       90, 180 or 270; any other value selects the identity matrix
+     * @param flipHorizontal whether to flip horizontally
+     * @param flipVertical   whether to flip vertically
+     * @return the shared constant array itself when no flip is applied, otherwise a
+     *         newly allocated 16-element array
+     */
+    public static float[] getRotation(final int rotation, boolean flipHorizontal,
+                                      boolean flipVertical) {
+        final float[] base;
+        final boolean exchangeFlips;
+        if (rotation == 90) {
+            base = TEXTURE_MATRIX_90;
+            exchangeFlips = true;
+        } else if (rotation == 180) {
+            base = TEXTURE_MATRIX_180;
+            exchangeFlips = false;
+        } else if (rotation == 270) {
+            base = TEXTURE_MATRIX_270;
+            exchangeFlips = true;
+        } else {
+            base = TEXTURE_MATRIX_0;
+            exchangeFlips = false;
+        }
+
+        // After the optional exchange, one flag governs the even slots, the other the odd slots.
+        final boolean toggleEven = exchangeFlips ? flipVertical : flipHorizontal;
+        final boolean toggleOdd = exchangeFlips ? flipHorizontal : flipVertical;
+        if (!toggleEven && !toggleOdd) {
+            return base;
+        }
+
+        final float[] flipped = new float[16];
+        for (int slot = 0; slot < flipped.length; slot++) {
+            final boolean evenSlot = (slot & 1) == 0;
+            final boolean toggle = evenSlot ? toggleEven : toggleOdd;
+            flipped[slot] = toggle ? flip(base[slot]) : base[slot];
+        }
+        return flipped;
+    }
+
+    /** Maps zero to one and every other value to zero. */
+    private static float flip(final float i) {
+        return (i == 0.0f) ? 1.0f : 0.0f;
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/thread/RenderThread.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/thread/RenderThread.java
new file mode 100644
index 0000000..acff7b1
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/thread/RenderThread.java
@@ -0,0 +1,460 @@
+package ai.guiji.duix.sdk.client.thread;
+
+import android.content.Context;
+import android.os.Handler;
+import android.os.Looper;
+import android.os.Message;
+import android.text.TextUtils;
+
+
+import java.io.File;
+import java.lang.ref.WeakReference;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import ai.guiji.duix.DuixNcnn;
+import ai.guiji.duix.sdk.client.audio.AudioPlayer;
+import ai.guiji.duix.sdk.client.bean.ImageFrame;
+import ai.guiji.duix.sdk.client.loader.ModelInfo;
+import ai.guiji.duix.sdk.client.loader.ModelInfoLoader;
+import ai.guiji.duix.sdk.client.render.RenderSink;
+import ai.guiji.duix.sdk.client.util.Logger;
+
+
+public class RenderThread extends Thread {
+
+    private static final int MSG_RENDER_STEP = 1;                   // request rendering of the next frame
+    private static final int MSG_STOP_RENDER = 2;                   // stop rendering
+    private static final int MSG_QUIT = 3;                          // quit the thread's looper
+    private static final int MSG_STOP_PUSH_AUDIO = 5;               // stop pushing audio
+    private static final int MSG_STOP_PLAY_AUDIO = 6;               // stop audio playback
+
+    private static final int MSG_REQUIRE_MOTION = 7;                // request playback of a named motion region
+    private static final int MSG_REQUIRE_MOTION_RANDOM = 8;         // request playback of a random motion region
+    private static final int MSG_START_PUSH_AUDIO = 11;             // start pushing audio
+    private static final int MSG_PUSH_AUDIO = 12;                   // push audio data to be played
+
+
+    private volatile boolean isRendering = false;                     // once false, the next render tick posts MSG_QUIT and the thread ends
+    RenderHandler mHandler;                                 // handler used to schedule this thread's events
+
+    private final Object mReadyFence = new Object();        // lock notified by run() after initialisation; also held while mHandler is cleared
+
+    private final Object mBnfFence = new Object();        // lock held while the NCNN instance is freed at the end of run()
+
+    private final Context mContext;
+    private DuixNcnn scrfdncnn;
+
+    private final RenderCallback callback;
+
+    private RenderSink mRenderSink;
+
+    private ConcurrentLinkedQueue<ModelInfo.Frame> mPreviewQueue;       // frames queued for playback
+
+    private boolean requireMotion = false;                  // a motion was requested for immediate playback
+    private ModelInfo.Region prepareActionRegion;           // motion region that replaces the queue on the next frame when requireMotion is set
+
+    private ModelInfo mModelInfo;                           // all information about the loaded model
+    private ByteBuffer rawBuffer;
+    private ByteBuffer maskBuffer;
+    private final File modelDir;
+
+    private AudioPlayer audioPlayer;
+    private long mCurrentBnfSession = -1;
+    private long mLastBnfSession = -1;
+
+    private float mVolume;
+
+    private int scrfRst;
+    private boolean isLip = false;      // whether the last frame went through the lip-sync path (reported to Reporter)
+
+    private Reporter mReporter;
+
+    public RenderThread(Context context, File modelDir, RenderSink renderSink, float volume, RenderCallback callback, Reporter reporter) {
+        mContext = context; // arguments are only stored here; model, player and handler are created in run()
+        mRenderSink = renderSink;
+        mVolume = volume;
+        mReporter = reporter;
+        this.modelDir = modelDir;
+        this.callback = callback;
+    }
+
+    public void setReporter(Reporter reporter){ // swaps the receiver of renderStep()'s per-frame statistics
+        mReporter = reporter;
+    }
+
+    /** Thread entry: sets up the looper and audio player, loads the model, renders until the looper quits, then frees resources. */
+    @Override
+    public void run() {
+        super.run();
+        Looper.prepare();
+        mHandler = new RenderHandler(this);
+        mPreviewQueue = new ConcurrentLinkedQueue<>();
+        audioPlayer = new AudioPlayer(new AudioPlayer.AudioPlayerCallback() {
+            @Override
+            public void onPlayStart() {
+                callback.onPlayStart();
+            }
+            @Override
+            public void onPlayEnd() {
+                mCurrentBnfSession = -1;
+                callback.onPlayEnd();
+            }
+            @Override
+            public void onPlayError(int code, String message) {
+                callback.onPlayError(code, message);
+            }
+        }, mVolume);
+        scrfdncnn = new DuixNcnn();
+        // getExternalFilesDir() may return null (shared storage unavailable); report that as a failed load.
+        File duixRoot = mContext.getExternalFilesDir("duix");
+        ModelInfo info = duixRoot == null ? null : ModelInfoLoader.load(mContext, scrfdncnn, duixRoot.getAbsolutePath() + "/model/gj_dh_res", modelDir.getAbsolutePath());
+        boolean allocated = false; // true once the native alloc() call has returned
+        boolean ready = false;     // true once the model and both frame buffers are usable
+        if (info != null) {
+            try {
+                scrfdncnn.alloc(0, 20, info.getWidth(), info.getHeight());
+                allocated = true;
+                scrfdncnn.initPcmex(0,10,20,50,0);
+                if (info.getModelkind() > 0){
+                    scrfdncnn.initMunetex(info.getUnetparam(), info.getUnetbin(), info.getUnetmsk(), info.getModelkind());
+                } else {
+                    scrfdncnn.initMunet(info.getUnetparam(), info.getUnetbin(), info.getUnetmsk());
+                }
+                scrfdncnn.initWenet(info.getWenetfn());
+                mModelInfo = info;
+                Logger.d("分辨率: " + mModelInfo.getWidth() + "x" + mModelInfo.getHeight());
+                rawBuffer = ByteBuffer.allocate(mModelInfo.getWidth() * mModelInfo.getHeight() * 3);
+                maskBuffer = ByteBuffer.allocate(mModelInfo.getWidth() * mModelInfo.getHeight() * 3);
+                if (!mModelInfo.isHasMask()) {
+                    Arrays.fill(maskBuffer.array(), (byte) 255); // model has no mask: use an all-white one
+                }
+                ready = true;
+                Logger.d("模型初始化完成");
+                if (callback != null) {
+                    callback.onInitResult(0, 0, mModelInfo.toString(), mModelInfo);
+                }
+            } catch (Exception e){
+                if (callback != null) {
+                    callback.onInitResult(-1002, -1001, "Model loading exception: " + e, null);
+                }
+            }
+        } else if (callback != null) {
+            callback.onInitResult(-1002, -1000, "Model configuration read exception", null);
+        }
+        synchronized (mReadyFence) {
+            mReadyFence.notify();
+        }
+        if (ready) { // without a loaded model renderStep() would dereference a null mModelInfo
+            isRendering = true;
+            handleAudioStep();
+            Looper.loop();
+        }
+        synchronized (mBnfFence) {
+            if (allocated) { scrfdncnn.free(0); } // NCNN is released last, and only if alloc() ran
+        }
+        Logger.d("NCNN释放");
+        if (audioPlayer != null) {
+            audioPlayer.release();
+            audioPlayer = null;
+        }
+        synchronized (mReadyFence) {
+            mHandler = null;
+        }
+    }
+
+    public void setVolume(float volume){
+        mVolume = volume; // remembered so a call made before run() creates the player is not lost
+        final AudioPlayer player = audioPlayer; // snapshot: run() nulls the field when it exits
+        if (player != null) { player.setVolume(volume); }
+    }
+
+    /** Posts MSG_STOP_RENDER to the render thread; does nothing when the thread has no handler. */
+    public void stopPreview() {
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.sendEmptyMessage(MSG_STOP_RENDER); }
+    }
+
+    /** Posts MSG_START_PUSH_AUDIO to the render thread; does nothing when the thread has no handler. */
+    public void startPush() {
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.sendEmptyMessage(MSG_START_PUSH_AUDIO); }
+    }
+
+    /**
+     * Queues one chunk of PCM audio for the render thread (MSG_PUSH_AUDIO); ignored when {@code data}
+     * is null or the thread has no handler.
+     */
+    public void pushAudio(byte[] data){
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null && data != null) { handler.obtainMessage(MSG_PUSH_AUDIO, data).sendToTarget(); }
+    }
+
+    /** Posts MSG_STOP_PUSH_AUDIO to the render thread; does nothing when the thread has no handler. */
+    public void stopPush() {
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.sendEmptyMessage(MSG_STOP_PUSH_AUDIO); }
+    }
+
+    /** Posts MSG_STOP_PLAY_AUDIO to the render thread; does nothing when the thread has no handler. */
+    public void stopPlayAudio(){
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.sendEmptyMessage(MSG_STOP_PLAY_AUDIO); }
+    }
+
+    /**
+     * Asks the render thread to play the motion region called {@code name}.
+     *
+     * @param now true to replace the queued frames on the next render step, false to append to the queue
+     */
+    public void requireMotion(String name, boolean now) {
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.obtainMessage(MSG_REQUIRE_MOTION, now ? 0 : 1, 0, name).sendToTarget(); }
+    }
+
+    /**
+     * Asks the render thread to play a randomly chosen motion region.
+     * {@code now}: true replaces the queued frames on the next render step, false appends to the queue.
+     */
+    public void requireRandomMotion(boolean now){
+        final RenderHandler handler = mHandler; // snapshot: run() nulls the field when it exits
+        if (handler != null) { handler.obtainMessage(MSG_REQUIRE_MOTION_RANDOM, now ? 0 : 1, 0).sendToTarget(); }
+    }
+
+    /**
+     * One tick of the render loop. While rendering: draws a frame and re-posts MSG_RENDER_STEP so ticks
+     * start 40 ms apart. After a stop request: empties the frame queue and posts MSG_QUIT.
+     */
+    private void handleAudioStep() {
+        if (!isRendering) {
+            if (mPreviewQueue != null) {
+                mPreviewQueue.clear();
+            }
+            if (mHandler != null) {
+                mHandler.sendEmptyMessage(MSG_QUIT);
+            }
+            return;
+        }
+        final long useTime = renderStep();
+        if (useTime > 40) {
+            Logger.w("渲染耗时过高: " + (useTime) + "(>40ms)");
+        }
+        if (mHandler != null) { mHandler.sendMessageDelayed(mHandler.obtainMessage(MSG_RENDER_STEP), Math.max(0L, 40 - useTime)); }
+    }
+
+    /**
+     * Renders one frame: takes the next queued frame (queueing the silence region forwards and reversed
+     * when the queue is empty), fills the raw/mask buffers through scrfdncnn and passes them to the sink.
+     * Returns the elapsed milliseconds, measured with System.nanoTime() so wall-clock changes cannot skew pacing.
+     */
+    private long renderStep() {
+        final long startNanos = System.nanoTime();
+        if (requireMotion) {
+            requireMotion = false; // an immediate motion was requested
+            if (prepareActionRegion != null){
+                mPreviewQueue.clear();
+                Logger.d("发现想要播放的动作区间region: " + prepareActionRegion);
+                mPreviewQueue.addAll(prepareActionRegion.frames);
+            }
+        }
+        if (mPreviewQueue.isEmpty()) {
+            ModelInfo.Region silenceRegion = mModelInfo.getSilenceRegion();
+            mPreviewQueue.addAll(silenceRegion.frames);
+            List<ModelInfo.Frame> copiedList = new ArrayList<>(silenceRegion.frames);
+            Collections.reverse(copiedList);
+            mPreviewQueue.addAll(copiedList);
+        }
+        final ModelInfo.Frame frame = mPreviewQueue.poll();
+        if (frame != null) {
+            final String imagePath = !TextUtils.isEmpty(frame.sgPath) ? frame.sgPath : frame.rawPath;
+            final String maskPath = !TextUtils.isEmpty(frame.maskPath) ? frame.maskPath : "";
+            final int bufferSize = mModelInfo.getWidth() * mModelInfo.getHeight() * 3;
+            int readyCnt = scrfdncnn.readycnt(mCurrentBnfSession);
+            if (readyCnt > 0 && audioPlayer != null){
+                if (mLastBnfSession != mCurrentBnfSession){
+                    mLastBnfSession = mCurrentBnfSession;
+                    audioPlayer.startPlay(); // first ready data of a new session: start playback
+                }
+                int bnfIndex = audioPlayer.getPlayIndex();
+                Logger.i("scrfdncnn readyCnt: " + readyCnt + " bnfIndex: " + bnfIndex);
+                scrfRst = scrfdncnn.filerst(mCurrentBnfSession, imagePath, maskPath, frame.rect, "", bnfIndex, rawBuffer.array(), maskBuffer.array(), bufferSize);
+                isLip = true;
+                if (scrfRst < 0){
+                    Logger.i("scrfdncnn.filerst bnf index: " + bnfIndex + " rst: " + scrfRst);
+                }
+            } else {
+                isLip = false;
+                scrfRst = scrfdncnn.fileload(imagePath, maskPath, mModelInfo.getWidth(), mModelInfo.getHeight(), rawBuffer.array(), maskBuffer.array(), bufferSize);
+                if (scrfRst < 0){
+                    Logger.i("scrfdncnn.fileload rst: " + scrfRst);
+                }
+            }
+            if (frame.startFlag && callback != null){
+                callback.onMotionPlayStart(frame.actionName);
+            }
+            if (frame.endFlag && callback != null){
+                callback.onMotionPlayComplete(frame.actionName);
+            }
+            if (mRenderSink != null) {
+                mRenderSink.onVideoFrame(new ImageFrame(rawBuffer, maskBuffer, mModelInfo.getWidth(), mModelInfo.getHeight()));
+            }
+        }
+        final long useTime = (System.nanoTime() - startNanos) / 1000000L;
+        if (mReporter != null){ mReporter.onRenderStat(scrfRst, isLip, useTime); }
+        return useTime;
+    }
+
+    private void handleStopRender() { // MSG_STOP_RENDER: clear isRendering so the next tick quits, or quit directly if already cleared
+        Logger.i("handleStopRender");
+        if (!isRendering) {
+            mHandler.sendEmptyMessage(MSG_QUIT);
+            return;
+        }
+        isRendering = false;
+    }
+
+    /** Handles MSG_START_PUSH_AUDIO: finishes a session whose id is positive, opens a new one and calls pushStart on the player. */
+    private void handleStartPushAudio(){
+        final long previousSession = mCurrentBnfSession;
+        if (previousSession > 0){ scrfdncnn.finsession(previousSession); }
+        mCurrentBnfSession = scrfdncnn.newsession();
+        if (audioPlayer != null && isRendering){
+            audioPlayer.pushStart();
+        }
+    }
+
+    /** Handles MSG_PUSH_AUDIO: feeds one PCM chunk to scrfdncnn and to the player; a null chunk is ignored instead of crashing the looper. */
+    private void handlePushAudio(byte[] data){
+        if (data == null || audioPlayer == null || !isRendering){ return; }
+        scrfdncnn.pushpcm(mCurrentBnfSession, data, data.length, 0);
+        audioPlayer.pushData(ByteBuffer.wrap(data));
+    }
+
+    private void handleStopPushAudio() { // MSG_STOP_PUSH_AUDIO: finsession on the current session, then pushDone on the player
+        final boolean nativeUsable = scrfdncnn != null && isRendering;
+        if (nativeUsable){
+            scrfdncnn.finsession(mCurrentBnfSession);
+        }
+        final AudioPlayer player = audioPlayer;
+        if (player != null){ player.pushDone(); }
+    }
+
+    /** Handles MSG_STOP_PLAY_AUDIO: finishes the current session, resets the session id to -1 and stops the player. */
+    private void handleStopPlayAudio(){
+        if (scrfdncnn == null || !isRendering){
+            return;
+        }
+        scrfdncnn.finsession(mCurrentBnfSession);
+        mCurrentBnfSession = -1;
+        if (audioPlayer != null){ audioPlayer.stop(); }
+    }
+
+    /**
+     * Handles MSG_REQUIRE_MOTION: looks up the region named {@code name} (last match wins); {@code now} arms it for the next render step, otherwise its frames are appended to the queue.
+     */
+    private void handleRequireMotion(String name, boolean now) {
+        ModelInfo.Region target = null;
+        for (ModelInfo.Region candidate : mModelInfo.getMotionRegions()){
+            if (name != null && name.equals(candidate.name)){ target = candidate; }
+        }
+        if (target == null){ return; }
+        if (!now){
+            Logger.d("在播放队列最后插入动作区间region: " + target);
+            mPreviewQueue.addAll(target.frames);
+            return;
+        }
+        prepareActionRegion = target;
+        requireMotion = true;
+    }
+
+    /** Handles MSG_REQUIRE_MOTION_RANDOM: picks a random motion region; does nothing when the model has none. */
+    private void handleRequireMotionRandom(boolean now){
+        if (mModelInfo.getMotionRegions().isEmpty()){ return; }
+        final int pick = new Random().nextInt(mModelInfo.getMotionRegions().size());
+        final ModelInfo.Region chosen = mModelInfo.getMotionRegions().get(pick);
+        if (!now){
+            Logger.d("在播放队列最后插入随机动作区间region: " + chosen);
+            mPreviewQueue.addAll(chosen.frames);
+            return;
+        }
+        requireMotion = true;
+        prepareActionRegion = chosen;
+    }
+
+    /** Handler created on the render thread (so it uses that thread's looper); holds the thread only weakly. */
+    static class RenderHandler extends Handler {
+
+        private final WeakReference<RenderThread> encoderWeakReference;
+
+        public RenderHandler(RenderThread render) {
+            encoderWeakReference = new WeakReference<>(render);
+        }
+
+        @Override
+        public void handleMessage(Message msg) {
+            final RenderThread owner = encoderWeakReference.get();
+            if (owner == null) {
+                return;
+            }
+            switch (msg.what) {
+                case MSG_RENDER_STEP:
+                    owner.handleAudioStep();
+                    break;
+                case MSG_STOP_RENDER:
+                    owner.handleStopRender();
+                    break;
+                case MSG_START_PUSH_AUDIO:
+                    owner.handleStartPushAudio();
+                    break;
+                case MSG_PUSH_AUDIO:
+                    owner.handlePushAudio((byte[]) msg.obj);
+                    break;
+                case MSG_STOP_PUSH_AUDIO:
+                    owner.handleStopPushAudio();
+                    break;
+                case MSG_STOP_PLAY_AUDIO:
+                    owner.handleStopPlayAudio();
+                    break;
+                case MSG_REQUIRE_MOTION:
+                    owner.handleRequireMotion((String) msg.obj, msg.arg1 == 0);
+                    break;
+                case MSG_REQUIRE_MOTION_RANDOM:
+                    owner.handleRequireMotionRandom(msg.arg1 == 0);
+                    break;
+                case MSG_QUIT:
+                    Logger.i("duix thread quit!");
+                    final Looper current = Looper.myLooper();
+                    if (current != null) {
+                        current.quit();
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+
+    }
+
+    public interface RenderCallback { // callbacks invoked by RenderThread
+        void onInitResult(int code, int subCode, String message, ModelInfo modelInfo); // 0/0 with the loaded model on success; -1002 with subCode -1000 or -1001 and a null model on failure
+        // Relayed from the AudioPlayer callback of the same name.
+        void onPlayStart();
+        // Relayed from the AudioPlayer callback; the thread resets its current session id to -1 first.
+        void onPlayEnd();
+        // Relayed unchanged from the AudioPlayer callback.
+        void onPlayError(int code, String msg);
+        // Called by renderStep() when the frame being rendered has startFlag set; name is frame.actionName.
+        void onMotionPlayStart(String name);
+        // Called by renderStep() when the frame being rendered has endFlag set; name is frame.actionName.
+        void onMotionPlayComplete(String name);
+    }
+
+    public interface Reporter { // receives one call at the end of every renderStep()
+        void onRenderStat(int resultCode, boolean isLip, long useTime); // last scrfdncnn result code, lip-sync flag, step duration in ms
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/DeviceUtils.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/DeviceUtils.java
new file mode 100644
index 0000000..14789da
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/DeviceUtils.java
@@ -0,0 +1,149 @@
+package ai.guiji.duix.sdk.client.util;
+
+import android.annotation.SuppressLint;
+import android.content.Context;
+import android.content.SharedPreferences;
+import android.content.pm.PackageInfo;
+import android.content.pm.PackageManager;
+import android.net.ConnectivityManager;
+import android.net.NetworkInfo;
+import android.provider.Settings;
+import android.text.TextUtils;
+import android.util.DisplayMetrics;
+import android.util.Log;
+import android.view.Display;
+import android.view.WindowManager;
+
+import java.util.UUID;
+
+public class DeviceUtils {
+
+    /** Returns this package's versionName as reported by the PackageManager, or "Unknown" if the package is not found. */
+    public static String getVersionName(Context context) {
+        try {
+            return context.getPackageManager().getPackageInfo(context.getPackageName(), 0).versionName;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+            return "Unknown";
+        }
+    }
+
+    /** Returns this package's versionCode as reported by the PackageManager, or 0 if the package is not found. */
+    public static int getVersionCode(Context context) {
+        try {
+            return context.getPackageManager().getPackageInfo(context.getPackageName(), 0).versionCode;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+            return 0;
+        }
+    }
+
+    /** Returns true when the active network reported by the ConnectivityManager has type TYPE_WIFI. */
+    @SuppressLint("MissingPermission")
+    public static boolean isWifi(Context mContext) {
+        final ConnectivityManager manager = (ConnectivityManager) mContext.getSystemService(Context.CONNECTIVITY_SERVICE);
+        if (manager == null) {
+            return false;
+        }
+        final NetworkInfo active = manager.getActiveNetworkInfo();
+        return active != null && active.getType() == ConnectivityManager.TYPE_WIFI;
+    }
+
+    // ANDROID_ID is defined by Settings.Secure, so it is read through that class rather than Settings.System.
+    private static String getAndroidID(Context context) {
+        return Settings.Secure.getString(context.getContentResolver(), Settings.Secure.ANDROID_ID);
+    }
+
+    // Builds a UUID string from ANDROID_ID's hash code (hash as the most significant bits, hash << 32 as the least).
+    // Returns null when ANDROID_ID is empty.
+    private static String getDeviceUUid(Context context) {
+        final String androidId = DeviceUtils.getAndroidID(context);
+        if (TextUtils.isEmpty(androidId)) { return null; }
+        final int hash = androidId.hashCode();
+        return new UUID(hash, ((long) hash << 32)).toString();
+    }
+
+    /** Returns the random UUID stored under "UUID" in the "device" preferences, generating and saving it when absent. */
+    private static String getAppUUid(Context context) {
+        final SharedPreferences prefs = context.getSharedPreferences("device", Context.MODE_PRIVATE);
+        final String stored = prefs.getString("UUID", null);
+        if (!TextUtils.isEmpty(stored)) { return stored; }
+        final String fresh = UUID.randomUUID().toString();
+        prefs.edit().putString("UUID", fresh).apply();
+        return fresh;
+    }
+
+    /**
+     * Returns the ANDROID_ID-derived UUID when one can be built, otherwise the random UUID stored in preferences.
+     */
+    public static String getUUID(Context context) {
+        Log.d("DeviceUtils", "getUUID: ");
+        final String deviceUuid = getDeviceUUid(context);
+        return TextUtils.isEmpty(deviceUuid) ? getAppUUid(context) : deviceUuid;
+    }
+
+    /**
+     * Returns whether the active network reports itself available; false for a null context, a missing
+     * ConnectivityManager or no active network.
+     */
+    @SuppressLint("MissingPermission")
+    public static boolean isNetworkConnected(Context context) {
+        if (context == null) { return false; }
+        ConnectivityManager manager = (ConnectivityManager) context.getSystemService(Context.CONNECTIVITY_SERVICE);
+        NetworkInfo active = manager != null ? manager.getActiveNetworkInfo() : null; // same null guard as isWifi()
+        return active != null && active.isAvailable();
+    }
+
+    /**
+     * Returns a new {@link ScreenInfo} built from the given context.
+     *
+     * @param context context passed to the {@link ScreenInfo} constructor
+     * @return a new {@link ScreenInfo}
+     */
+    public static ScreenInfo getScreenInfo(Context context) {
+        return new ScreenInfo(context);
+    }
+
+    /** Snapshot of the default display's pixel size and the resources' density scales, with unit converters. */
+    public static class ScreenInfo {
+
+        public final float screenWidthDp;   // screen width in dp
+        public final float screenHeightDp;  // screen height in dp
+        public final int screenWidthPx;     // screen width in px
+        public final int screenHeightPx;    // screen height in px
+        public final float uiScale;         // DisplayMetrics.density
+        public final float fontScale;       // DisplayMetrics.scaledDensity
+
+        public ScreenInfo(Context context) {
+            final DisplayMetrics display = new DisplayMetrics();
+            final WindowManager windowManager = (WindowManager) context.getSystemService(Context.WINDOW_SERVICE);
+            windowManager.getDefaultDisplay().getMetrics(display);
+            final DisplayMetrics resources = context.getResources().getDisplayMetrics();
+            screenWidthPx = display.widthPixels;
+            screenHeightPx = display.heightPixels;
+            // px2dp() reads uiScale, so both scales are assigned before the dp fields.
+            uiScale = resources.density;
+            fontScale = resources.scaledDensity;
+            screenWidthDp = px2dp(screenWidthPx);
+            screenHeightDp = px2dp(screenHeightPx);
+        }
+
+        public int dp2px(float dpValue) {
+            return (int) (dpValue * uiScale + 0.5f);
+        }
+
+        public int px2dp(float pxValue) {
+            return (int) (pxValue / uiScale + 0.5f);
+        }
+
+        public int sp2px(float spValue) {
+            return (int) (spValue * fontScale + 0.5f);
+        }
+
+        public int px2sp(float pxValue) {
+            return (int) (pxValue / fontScale + 0.5f);
+        }
+    }
+
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/FileUtil.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/FileUtil.java
new file mode 100644
index 0000000..6b23e9f
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/FileUtil.java
@@ -0,0 +1,23 @@
+package ai.guiji.duix.sdk.client.util;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+
+/** File-reading helper. */
+public class FileUtil {
+
+    /** Returns the file's text with line terminators removed; whatever was read before an I/O error is still returned. */
+    public static String readFile(String path){
+        final StringBuilder content = new StringBuilder();
+        try (BufferedReader reader = new BufferedReader(new FileReader(path))){
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                content.append(line);
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        return content.toString();
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/Logger.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/Logger.java
new file mode 100644
index 0000000..5240c21
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/Logger.java
@@ -0,0 +1,56 @@
+package ai.guiji.duix.sdk.client.util;
+
+import android.util.Log;
+
+import ai.guiji.duix.sdk.client.BuildConfig;
+
+/** Logging facade over android.util.Log: fixed tag "DUIX-SDK", messages written as "tag~text", gated by logFlag. */
+public class Logger {
+
+    private static boolean logFlag = BuildConfig.DEBUG;
+    private static String TAG = "DUIX-SDK";
+
+    public static void debugEnable(boolean enable) {
+        logFlag = enable;
+    }
+
+    // Error level.
+    public static void e(String text) {
+        e("", text);
+    }
+
+    public static void e(String tag, String text) {
+        if (!logFlag) { return; }
+        Log.e(TAG, tag + "~" + text);
+    }
+
+    // Info level.
+    public static void i(String text) {
+        i("", text);
+    }
+
+    public static void i(String tag, String text) {
+        if (!logFlag) { return; }
+        Log.i(TAG, tag + "~" + text);
+    }
+
+    // Debug level.
+    public static void d(String text) {
+        d("", text);
+    }
+
+    public static void d(String tag, String text) {
+        if (!logFlag) { return; }
+        Log.d(TAG, tag + "~" + text);
+    }
+
+    // Warning level.
+    public static void w(String text) {
+        w("", text);
+    }
+
+    public static void w(String tag, String text) {
+        if (!logFlag) { return; }
+        Log.w(TAG, tag + "~" + text);
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/MD5Util.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/MD5Util.java
new file mode 100644
index 0000000..15589a6
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/MD5Util.java
@@ -0,0 +1,172 @@
+package ai.guiji.duix.sdk.client.util;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/**
+ * MD5 hashing and hex-encoding helpers.
+ *
+ * @author tfq
+ * @datetime 2011-10-13
+ */
+public class MD5Util {
+    private static final char hexDigits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+            'a', 'b', 'c', 'd', 'e', 'f'};
+
+    /**
+     * Returns the lowercase hex MD5 digest of {@code s} encoded as UTF-8.
+     *
+     * @param s text to hash
+     * @return 32 hex characters, or {@code null} when {@code s} is null or MD5 is unavailable
+     */
+    public final static String md5Encode(final String s) {
+        if (s == null) {
+            return null;
+        }
+        try {
+            final MessageDigest digest = MessageDigest.getInstance("MD5");
+            // Explicit charset: getBytes() without one depends on the platform default.
+            final byte[] hash = digest.digest(s.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+            return toHexString(hash);
+        } catch (NoSuchAlgorithmException e) {
+            return null;
+        }
+    }
+
+    /**
+     * Hex-encodes {@code md} in lowercase, two characters per byte. No digest is computed here, so
+     * presumably callers pass bytes that already are one.
+     * Returns {@code null} when {@code md} is null.
+     */
+    public final static String md5Encode(final byte[] md) {
+        if (md == null) {
+            return null;
+        }
+        final StringBuilder hex = new StringBuilder(md.length * 2);
+        for (final byte value : md) {
+            hex.append(hexDigits[value >>> 4 & 0xf]).append(hexDigits[value & 0xf]);
+        }
+        return hex.toString();
+    }
+
+    // Lowercase hex encoding of b: two characters per byte, high nibble first.
+    // Throws NullPointerException when b is null.
+    private static String toHexString(final byte[] b) {
+        final StringBuilder hex = new StringBuilder(b.length * 2);
+        for (final byte value : b) {
+            hex.append(hexDigits[(value >> 4) & 0xf]).append(hexDigits[value & 0xf]);
+        }
+        return hex.toString();
+    }
+
+    /**
+     * Computes the MD5 of the file's contents as lowercase hex; {@code null} if it cannot be opened or read.
+     */
+    public final static String md5EncodeFile(final File file) {
+        final InputStream source;
+        try {
+            source = new FileInputStream(file);
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+            return null;
+        }
+        return md5EncodeFile(source);
+    }
+
+    /**
+     * Computes the MD5 of everything readable from {@code inputStream} and always closes the stream, even when reading fails.
+     * Returns the lowercase hex digest, or {@code null} on any failure (null stream, I/O error, MD5 unavailable).
+     */
+    public final static String md5EncodeFile(InputStream inputStream) {
+        try (InputStream is = inputStream) {
+            final MessageDigest md5 = MessageDigest.getInstance("MD5");
+            final byte[] buffer = new byte[1024];
+            for (int bytes = is.read(buffer); bytes > 0; bytes = is.read(buffer)) {
+                md5.update(buffer, 0, bytes);
+            }
+            return toHexString(md5.digest());
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+    /**
+     * Computes the MD5 digest of the file at path {@code fileName}.
+     *
+     * @return lowercase hex digest, or {@code null} if the file cannot be opened or read
+     */
+    public final static String md5EncodeFile(final String fileName) {
+        return md5EncodeFile(new File(fileName));
+    }
+
+
+    /**
+     * True when the file's MD5 matches {@code md5}, ignoring case and surrounding whitespace; false when either is null.
+     */
+    public final static boolean verifyMD5File(File file, String md5) {
+        final String actual = md5EncodeFile(file);
+        return actual != null && md5 != null && actual.toLowerCase().trim().equals(md5.toLowerCase().trim());
+    }
+
+
+    /**
+     * Computes the raw MD5 digest of {@code val} encoded as UTF-8 ({@code val} must not be null).
+     *
+     * @return the 16 digest bytes, or {@code null} if the MD5 algorithm is unavailable
+     */
+    public static byte[] MD5(String val) {
+        try {
+            final MessageDigest md5 = MessageDigest.getInstance("MD5");
+            // Explicit charset: getBytes() without one depends on the platform default.
+            return md5.digest(val.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+        } catch (NoSuchAlgorithmException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /**
+     * Hashes {@code inStr} with MD5 after narrowing every char to its low 8 bits, and returns the
+     * digest as 32 lowercase hex characters ("" when MD5 is unavailable).
+     */
+    public static String string2MD5(String inStr) {
+        final MessageDigest md5;
+        try {
+            md5 = MessageDigest.getInstance("MD5");
+        } catch (Exception e) {
+            System.out.println(e.toString());
+            e.printStackTrace();
+            return "";
+        }
+        final int length = inStr.length();
+        final byte[] narrowed = new byte[length];
+        for (int i = 0; i < length; i++) {
+            narrowed[i] = (byte) inStr.charAt(i);
+        }
+        final StringBuilder hex = new StringBuilder(32);
+        for (final byte b : md5.digest(narrowed)) {
+            final int unsigned = b & 0xff;
+            hex.append(hexDigits[unsigned >>> 4]).append(hexDigits[unsigned & 0xf]);
+        }
+        return hex.toString();
+    }
+
+    /**
+     * XORs every char of {@code inStr} with {@code 't'}. Applying the method twice restores the input.
+     * Despite the name, no MD5 digest is involved.
+     * @param inStr text to transform; must not be null
+     * @return the transformed text, same length as the input
+     */
+    public static String convertMD5(String inStr) {
+        final StringBuilder out = new StringBuilder(inStr.length());
+        for (int i = 0; i < inStr.length(); i++) {
+            out.append((char) (inStr.charAt(i) ^ 't'));
+        }
+        return out.toString();
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/OpenGLUtil.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/OpenGLUtil.java
new file mode 100644
index 0000000..295b8ff
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/OpenGLUtil.java
@@ -0,0 +1,537 @@
+/*
+ * Copyright 2014 Google Inc. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package ai.guiji.duix.sdk.client.util;
+
+import android.app.ActivityManager;
+import android.content.Context;
+import android.content.pm.ConfigurationInfo;
+import android.graphics.Bitmap;
+import android.opengl.GLES10;
+import android.opengl.GLES20;
+import android.opengl.GLES30;
+import android.opengl.GLUtils;
+import android.opengl.Matrix;
+import android.os.Build;
+import android.util.Log;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.util.Arrays;
+
+import javax.microedition.khronos.egl.EGL10;
+import javax.microedition.khronos.egl.EGLConfig;
+import javax.microedition.khronos.egl.EGLContext;
+import javax.microedition.khronos.egl.EGLDisplay;
+import javax.microedition.khronos.egl.EGLSurface;
+
+public class OpenGLUtil {
+    /** Log tag for every message emitted by this class. */
+    public static final String TAG = "OpenGLUtil";
+    /** Sentinel texture id: the loadTexture overloads generate a new texture when given this value. */
+    public static final int NO_TEXTURE = -1;
+    /** 16-element identity matrix, populated by the static initializer below. */
+    public static final float[] IDENTITY_MATRIX;
+    /** Size of a float in bytes; used to size the direct buffer in createFloatBuffer. */
+    private static final int SIZEOF_FLOAT = 4;
+
+    // Fill IDENTITY_MATRIX once at class-load time.
+    static {
+        IDENTITY_MATRIX = new float[16];
+        Matrix.setIdentityM(IDENTITY_MATRIX, 0);
+    }
+
+    /**
+     * Uploads a bitmap into a 2D texture and recycles the bitmap afterwards.
+     *
+     * @param img       bitmap to upload; it is recycled by this call
+     * @param usedTexId texture to update, or {@link #NO_TEXTURE} to generate a new one
+     * @return the id of the texture that received the pixels
+     */
+    public static int loadTexture(final Bitmap img, final int usedTexId) {
+        final boolean recycleAfterUpload = true;
+        return loadTexture(img, usedTexId, recycleAfterUpload);
+    }
+
+    /**
+     * Uploads a bitmap into a 2D texture.
+     * <p>
+     * When {@code usedTexId} is {@link #NO_TEXTURE} a texture is generated, configured with
+     * linear filtering and clamp-to-edge wrapping, and filled from {@code img}. Otherwise the
+     * given texture is bound and its contents are replaced starting at offset (0, 0).
+     *
+     * @param img       bitmap holding the pixels to upload
+     * @param usedTexId texture to update, or {@link #NO_TEXTURE} to generate a new one
+     * @param recycle   whether to call {@code img.recycle()} after issuing the upload
+     * @return the id of the texture that received the pixels
+     */
+    public static int loadTexture(final Bitmap img, final int usedTexId, final boolean recycle) {
+        final int[] ids = new int[1];
+        if (usedTexId != NO_TEXTURE) {
+            // Reuse the caller's texture and overwrite its pixels in place.
+            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, usedTexId);
+            GLUtils.texSubImage2D(GLES20.GL_TEXTURE_2D, 0, 0, 0, img);
+            ids[0] = usedTexId;
+        } else {
+            GLES20.glGenTextures(1, ids, 0);
+            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, ids[0]);
+            // (parameter name, value) pairs, applied in this order.
+            final int[][] sampling = {
+                    {GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR},
+                    {GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR},
+                    {GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE},
+                    {GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE},
+            };
+            for (int[] pair : sampling) {
+                GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, pair[0], pair[1]);
+            }
+
+            GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, img, 0);
+        }
+        if (recycle) {
+            img.recycle();
+        }
+        return ids[0];
+    }
+
+    /**
+     * Uploads raw RGBA bytes into a 2D texture.
+     * <p>
+     * When {@code usedTexId} is {@link #NO_TEXTURE} a texture is generated, configured with
+     * linear filtering and clamp-to-edge wrapping, and allocated at {@code width} x
+     * {@code height}. Otherwise the given texture is bound and the region starting at
+     * (0, 0) with that size is overwritten.
+     *
+     * @param data      pixel data, uploaded as {@code GL_RGBA} / {@code GL_UNSIGNED_BYTE}
+     * @param width     width of the uploaded region
+     * @param height    height of the uploaded region
+     * @param usedTexId texture to update, or {@link #NO_TEXTURE} to generate a new one
+     * @return the id of the texture that received the pixels
+     */
+    public static int loadTexture(final ByteBuffer data, final int width, final int height, final int usedTexId) {
+        final int[] ids = new int[1];
+        if (usedTexId != NO_TEXTURE) {
+            // Reuse the caller's texture and overwrite its pixels in place.
+            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, usedTexId);
+            GLES20.glTexSubImage2D(GLES20.GL_TEXTURE_2D, 0, 0, 0, width, height,
+                    GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, data);
+            ids[0] = usedTexId;
+            return ids[0];
+        }
+
+        GLES20.glGenTextures(1, ids, 0);
+        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, ids[0]);
+        // (parameter name, value) pairs, applied in this order.
+        final int[][] sampling = {
+                {GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR},
+                {GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR},
+                {GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE},
+                {GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE},
+        };
+        for (int[] pair : sampling) {
+            GLES20.glTexParameterf(GLES20.GL_TEXTURE_2D, pair[0], pair[1]);
+        }
+        GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height,
+                0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, data);
+        return ids[0];
+    }
+
+    /**
+     * Compiles a vertex and a fragment shader and links them into a program.
+     * <p>
+     * Every failure path releases the GL objects created so far and returns 0. After
+     * linking, both shader objects are flagged for deletion; GL keeps them alive for as long
+     * as they stay attached to the program, so the returned program remains usable.
+     *
+     * @param vertexSource   GLSL source of the vertex shader
+     * @param fragmentSource GLSL source of the fragment shader
+     * @return the linked program id, or 0 if compiling, creating or linking failed
+     */
+    public static int createProgram(String vertexSource, String fragmentSource) {
+        int vertexShader = loadShader(GLES20.GL_VERTEX_SHADER, vertexSource);
+        if (vertexShader == 0) {
+            return 0;
+        }
+        int pixelShader = loadShader(GLES20.GL_FRAGMENT_SHADER, fragmentSource);
+        if (pixelShader == 0) {
+            // The vertex shader compiled but will never be used: do not leak it.
+            GLES20.glDeleteShader(vertexShader);
+            return 0;
+        }
+
+        int program = GLES20.glCreateProgram();
+        checkGLError("glCreateProgram");
+        if (program == 0) {
+            Log.e(TAG, "Could not create program");
+            // Attaching to program 0 can only raise GL errors, so stop here.
+            GLES20.glDeleteShader(vertexShader);
+            GLES20.glDeleteShader(pixelShader);
+            return 0;
+        }
+        GLES20.glAttachShader(program, vertexShader);
+        checkGLError("glAttachShader");
+        GLES20.glAttachShader(program, pixelShader);
+        checkGLError("glAttachShader");
+        GLES20.glLinkProgram(program);
+
+        // The ids are not returned to the caller, so this is the only place that can release
+        // them. Attached shaders are merely flagged and are freed together with the program.
+        GLES20.glDeleteShader(vertexShader);
+        GLES20.glDeleteShader(pixelShader);
+
+        int[] linkStatus = new int[1];
+        GLES20.glGetProgramiv(program, GLES20.GL_LINK_STATUS, linkStatus, 0);
+        if (linkStatus[0] != GLES20.GL_TRUE) {
+            Log.e(TAG, "Could not link program: ");
+            Log.e(TAG, GLES20.glGetProgramInfoLog(program));
+            GLES20.glDeleteProgram(program);
+            program = 0;
+        }
+        return program;
+    }
+
+    /**
+     * Creates a shader of the given type and compiles {@code source} into it.
+     *
+     * @param shaderType shader type constant passed to {@code glCreateShader}
+     * @param source     GLSL source code
+     * @return the shader id, or 0 when compilation fails (the info log is written to the
+     *         error log and the shader object is deleted)
+     */
+    public static int loadShader(int shaderType, String source) {
+        final int handle = GLES20.glCreateShader(shaderType);
+        checkGLError("glCreateShader type=" + shaderType);
+        GLES20.glShaderSource(handle, source);
+        GLES20.glCompileShader(handle);
+
+        final int[] status = new int[1];
+        GLES20.glGetShaderiv(handle, GLES20.GL_COMPILE_STATUS, status, 0);
+        if (status[0] != 0) {
+            return handle;
+        }
+
+        // Compilation failed: report why, then discard the shader object.
+        Log.e(TAG, "Could not compile shader " + shaderType + ":");
+        Log.e(TAG, " " + GLES20.glGetShaderInfoLog(handle));
+        GLES20.glDeleteShader(handle);
+        return 0;
+    }
+
+    /**
+     * Reads one pending GL error and, if there is one, logs it together with {@code op}.
+     * Nothing is thrown; the error is only logged.
+     *
+     * @param op label of the operation that was just performed, used as the log prefix
+     */
+    public static void checkGLError(String op) {
+        final int code = GLES20.glGetError();
+        if (code == GLES20.GL_NO_ERROR) {
+            return;
+        }
+        Log.e(TAG, op + ": glError 0x" + Integer.toHexString(code));
+    }
+
+    /**
+     * Logs an error when a location lookup returned a negative value.
+     *
+     * @param location value returned by the lookup
+     * @param label    name that was looked up, used in the log message
+     */
+    public static void checkLocation(int location, String label) {
+        if (location >= 0) {
+            return;
+        }
+        Log.e(TAG, "Unable to locate '" + label + "' in program");
+    }
+
+    /**
+     * Generates a 2D texture with linear min/mag filtering and fills it from raw data.
+     * The new texture is left bound to {@code GL_TEXTURE_2D}.
+     *
+     * @param data   Image data, in a "direct" ByteBuffer.
+     * @param width  Texture width, in pixels (not bytes).
+     * @param height Texture height, in pixels.
+     * @param format Image data format (use constant appropriate for glTexImage2D(), e.g. GL_RGBA);
+     *               it is used as both the internal format and the source format.
+     * @return Handle to texture.
+     */
+    public static int createImageTexture(ByteBuffer data, int width, int height, int format) {
+        final int target = GLES20.GL_TEXTURE_2D;
+        final int[] generated = new int[1];
+        GLES20.glGenTextures(1, generated, 0);
+        final int texture = generated[0];
+        checkGLError("glGenTextures");
+
+        GLES20.glBindTexture(target, texture);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
+        checkGLError("loadImageTexture");
+
+        // Copy the buffer contents into level 0 of the texture, no border.
+        GLES20.glTexImage2D(target, 0, format, width, height, 0, format,
+                GLES20.GL_UNSIGNED_BYTE, data);
+        checkGLError("loadImageTexture");
+
+        return texture;
+    }
+
+    /**
+     * Generates a 2D texture with linear filtering and clamp-to-edge wrapping and fills it
+     * from a bitmap. The bitmap is not recycled, and the new texture is left bound to
+     * {@code GL_TEXTURE_2D}.
+     *
+     * @param bmp bitmap holding the pixels to upload
+     * @return handle of the generated texture
+     */
+    public static int createImageTexture(Bitmap bmp) {
+        final int target = GLES20.GL_TEXTURE_2D;
+        final int[] generated = new int[1];
+        GLES20.glGenTextures(1, generated, 0);
+        final int texture = generated[0];
+        checkGLError("glGenTextures");
+
+        GLES20.glBindTexture(target, texture);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
+        GLES20.glTexParameteri(target, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
+        checkGLError("loadImageTexture");
+
+        // Copy the bitmap into level 0 of the texture, no border.
+        GLUtils.texImage2D(target, 0, bmp, 0);
+        checkGLError("loadImageTexture");
+
+        return texture;
+    }
+
+    /**
+     * Copies a float array into a newly allocated direct buffer in native byte order.
+     *
+     * @param coords values to copy
+     * @return a direct FloatBuffer holding {@code coords}, positioned at 0
+     */
+    public static FloatBuffer createFloatBuffer(float[] coords) {
+        // SIZEOF_FLOAT bytes per element.
+        final int byteCount = coords.length * SIZEOF_FLOAT;
+        final FloatBuffer floats = ByteBuffer.allocateDirect(byteCount)
+                .order(ByteOrder.nativeOrder())
+                .asFloatBuffer();
+        floats.put(coords);
+        // Move back to the start so readers see the data just written.
+        floats.position(0);
+        return floats;
+    }
+
+    /**
+     * Writes GL version info to the log.
+     */
+    public static void printVersionInfo() {
+        Log.i(TAG, "vendor  : " + GLES20.glGetString(GLES20.GL_VENDOR));
+        Log.i(TAG, "renderer: " + GLES20.glGetString(GLES20.GL_RENDERER));
+        Log.i(TAG, "version : " + GLES20.glGetString(GLES20.GL_VERSION));
+
+        int[] values = new int[1];
+        GLES30.glGetIntegerv(GLES30.GL_MAJOR_VERSION, values, 0);
+        int majorVersion = values[0];
+        GLES30.glGetIntegerv(GLES30.GL_MINOR_VERSION, values, 0);
+        int minorVersion = values[0];
+        if (GLES30.glGetError() == GLES30.GL_NO_ERROR) {
+            Log.i(TAG, "OpenGL Version: " + majorVersion + "." + minorVersion);
+        }
+    }
+
+    /**
+     * Returns the OpenGL major version number. Must be called on the GL thread.
+     * <p>
+     * The value is read with {@code GL_MAJOR_VERSION}; if the query writes nothing, the
+     * result is 0.
+     *
+     * @return the major version number
+     */
+    public static int getGlMajorVersion() {
+        final int[] result = new int[1];
+        GLES30.glGetIntegerv(GLES30.GL_MAJOR_VERSION, result, 0);
+        return result[0];
+    }
+
+    /**
+     * Generates a texture, binds it to {@code textureTarget} and configures linear filtering
+     * and clamp-to-edge wrapping. No image data is uploaded, and the texture is left bound.
+     *
+     * @param textureTarget target to bind and configure
+     * @return the id of the generated texture
+     */
+    public static int createTextureObject(int textureTarget) {
+        final int[] generated = new int[1];
+        GLES20.glGenTextures(1, generated, 0);
+        checkGLError("glGenTextures");
+
+        final int id = generated[0];
+        GLES20.glBindTexture(textureTarget, id);
+        checkGLError("glBindTexture " + id);
+
+        // Filters are set through the float variant, wrap modes through the int variant.
+        GLES20.glTexParameterf(textureTarget, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
+        GLES20.glTexParameterf(textureTarget, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
+        GLES20.glTexParameteri(textureTarget, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
+        GLES20.glTexParameteri(textureTarget, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
+        checkGLError("glTexParameter");
+
+        return id;
+    }
+
+    /**
+     * Deletes all textures named in the array. A null or empty array is ignored.
+     *
+     * @param textureId texture ids to delete
+     */
+    public static void deleteTextures(int[] textureId) {
+        if (textureId == null || textureId.length == 0) {
+            return;
+        }
+        GLES20.glDeleteTextures(textureId.length, textureId, 0);
+    }
+
+    /**
+     * Generates framebuffers and RGBA colour textures and attaches each texture to the
+     * framebuffer at the same index.
+     * <p>
+     * Every framebuffer/texture pair is configured, not only index 0, so arrays longer than
+     * one element yield usable framebuffers throughout. If the arrays differ in length, only
+     * the first {@code min(fboTex.length, fboId.length)} pairs are wired together. On return
+     * texture 0 and framebuffer 0 are bound.
+     *
+     * @param fboTex receives the generated texture ids
+     * @param fboId  receives the generated framebuffer ids
+     * @param width  width of each colour texture
+     * @param height height of each colour texture
+     */
+    public static void createFrameBuffers(int[] fboTex, int[] fboId, int width, int height) {
+        GLES20.glGenFramebuffers(fboId.length, fboId, 0);
+        GLES20.glGenTextures(fboTex.length, fboTex, 0);
+
+        final int pairs = Math.min(fboId.length, fboTex.length);
+        for (int i = 0; i < pairs; i++) {
+            GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, fboId[i]);
+            GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, fboTex[i]);
+            // Allocate storage only; the texture is filled by rendering into the framebuffer.
+            GLES20.glTexImage2D(GLES20.GL_TEXTURE_2D, 0, GLES20.GL_RGBA, width, height, 0, GLES20.GL_RGBA, GLES20.GL_UNSIGNED_BYTE, null);
+            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
+            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
+            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
+            GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
+            GLES20.glFramebufferTexture2D(GLES20.GL_FRAMEBUFFER, GLES20.GL_COLOR_ATTACHMENT0, GLES20.GL_TEXTURE_2D, fboTex[i], 0);
+        }
+
+        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, 0);
+        GLES20.glBindFramebuffer(GLES20.GL_FRAMEBUFFER, 0);
+    }
+
+    /**
+     * Deletes all framebuffers named in the array. A null or empty array is ignored.
+     *
+     * @param fboId framebuffer ids to delete
+     */
+    public static void deleteFrameBuffers(int[] fboId) {
+        if (fboId == null || fboId.length == 0) {
+            return;
+        }
+        GLES20.glDeleteFramebuffers(fboId.length, fboId, 0);
+    }
+
+    /**
+     * Builds a scale-only matrix from the ratio between the view's and the texture's aspect
+     * ratios. Exactly one axis is scaled, and its factor is at least 1:
+     * Y by {@code ratio} when the ratio exceeds 1, otherwise X by {@code 1 / ratio}.
+     *
+     * @param viewWidth     width of the view
+     * @param viewHeight    height of the view
+     * @param textureWidth  width of the texture
+     * @param textureHeight height of the texture
+     * @return a new 16-element matrix; {@link #IDENTITY_MATRIX} itself is not modified
+     */
+    public static float[] changeMvpMatrixCrop(float viewWidth, float viewHeight, float textureWidth, float textureHeight) {
+        final float ratio = viewWidth * textureHeight / viewHeight / textureWidth;
+        final float scaleX;
+        final float scaleY;
+        if (ratio > 1) {
+            scaleX = 1F;
+            scaleY = ratio;
+        } else {
+            scaleX = 1F / ratio;
+            scaleY = 1F;
+        }
+        final float[] matrix = IDENTITY_MATRIX.clone();
+        Matrix.scaleM(matrix, 0, scaleX, scaleY, 1F);
+        return matrix;
+    }
+
+    /**
+     * Builds a scale-only matrix from the ratio between the view's and the texture's aspect
+     * ratios. Exactly one axis is scaled, and its factor is at most 1:
+     * X by {@code 1 / ratio} when the ratio exceeds 1, otherwise Y by {@code ratio}.
+     *
+     * @param viewWidth     width of the view
+     * @param viewHeight    height of the view
+     * @param textureWidth  width of the texture
+     * @param textureHeight height of the texture
+     * @return a new 16-element matrix; {@link #IDENTITY_MATRIX} itself is not modified
+     */
+    public static float[] changeMvpMatrixInside(float viewWidth, float viewHeight, float textureWidth, float textureHeight) {
+        final float ratio = viewWidth * textureHeight / viewHeight / textureWidth;
+        final float scaleX;
+        final float scaleY;
+        if (ratio > 1) {
+            scaleX = 1F / ratio;
+            scaleY = 1F;
+        } else {
+            scaleX = 1F;
+            scaleY = ratio;
+        }
+        final float[] matrix = IDENTITY_MATRIX.clone();
+        Matrix.scaleM(matrix, 0, scaleX, scaleY, 1F);
+        return matrix;
+    }
+
+    /**
+     * Reports which OpenGL ES major version the device supports, preferring 3.
+     * <p>
+     * The decision is based on {@code ConfigurationInfo.reqGlEsVersion}: values of
+     * {@code 0x30000} and above map to 3, everything else to 2. The raw values are also
+     * written to the debug log.
+     *
+     * @param context context used to obtain the {@link ActivityManager}
+     * @return 3 or 2
+     */
+    public static int getSupportGLVersion(Context context) {
+        final ActivityManager manager = (ActivityManager) context.getSystemService(Context.ACTIVITY_SERVICE);
+        final ConfigurationInfo info = manager.getDeviceConfigurationInfo();
+        final int version;
+        if (info.reqGlEsVersion >= 0x30000) {
+            version = 3;
+        } else {
+            version = 2;
+        }
+        final String readable = info.getGlEsVersion();
+        Log.d(TAG, "OpenGL ES Version: " + Integer.toHexString(info.reqGlEsVersion)
+                + ", glEsVersion: " + readable + ", return: " + version);
+        return version;
+    }
+
+    /**
+     * Queries {@code GL_MAX_TEXTURE_SIZE}.
+     * <p>
+     * On API 21+ the query runs inside a temporary 64x64 pbuffer surface and context that this
+     * method creates and destroys itself. As part of the cleanup it leaves no context current
+     * on the calling thread and calls {@code eglTerminate} on the default display. On older
+     * releases the value is read directly through GLES10 without any EGL setup.
+     *
+     * @return the reported maximum texture size, or 0 when the temporary EGL setup fails
+     */
+    public static int getMaxTextureSize() {
+        final int[] maxSize = new int[1];
+        if (Build.VERSION.SDK_INT < Build.VERSION_CODES.LOLLIPOP) {
+            GLES10.glGetIntegerv(GLES10.GL_MAX_TEXTURE_SIZE, maxSize, 0);
+            return maxSize[0];
+        }
+
+        final EGL10 egl = (EGL10) EGLContext.getEGL();
+        final EGLDisplay dpy = egl.eglGetDisplay(EGL10.EGL_DEFAULT_DISPLAY);
+        final int[] vers = new int[2];
+        if (!egl.eglInitialize(dpy, vers)) {
+            Log.e(TAG, "getMaxTextureSize: eglInitialize failed");
+            return 0;
+        }
+
+        EGLSurface surf = EGL10.EGL_NO_SURFACE;
+        EGLContext ctx = EGL10.EGL_NO_CONTEXT;
+        try {
+            final int[] configAttr = {
+                    EGL10.EGL_COLOR_BUFFER_TYPE, EGL10.EGL_RGB_BUFFER,
+                    EGL10.EGL_LEVEL, 0,
+                    EGL10.EGL_SURFACE_TYPE, EGL10.EGL_PBUFFER_BIT,
+                    EGL10.EGL_NONE
+            };
+            final EGLConfig[] configs = new EGLConfig[1];
+            final int[] numConfig = new int[1];
+            if (!egl.eglChooseConfig(dpy, configAttr, configs, 1, numConfig) || numConfig[0] == 0) {
+                // Without a config there is nothing to create a surface or context from.
+                Log.e(TAG, "getMaxTextureSize: no matching EGL config");
+                return 0;
+            }
+            final EGLConfig config = configs[0];
+
+            final int[] surfAttr = {
+                    EGL10.EGL_WIDTH, 64,
+                    EGL10.EGL_HEIGHT, 64,
+                    EGL10.EGL_NONE
+            };
+            surf = egl.eglCreatePbufferSurface(dpy, config, surfAttr);
+
+            final int EGL_CONTEXT_CLIENT_VERSION = 0x3098;  // missing in EGL10
+            final int[] ctxAttrib = {
+                    EGL_CONTEXT_CLIENT_VERSION, 1,
+                    EGL10.EGL_NONE
+            };
+            ctx = egl.eglCreateContext(dpy, config, EGL10.EGL_NO_CONTEXT, ctxAttrib);
+
+            final boolean surfaceOk = surf != null && surf != EGL10.EGL_NO_SURFACE;
+            final boolean contextOk = ctx != null && ctx != EGL10.EGL_NO_CONTEXT;
+            if (!surfaceOk || !contextOk || !egl.eglMakeCurrent(dpy, surf, surf, ctx)) {
+                Log.e(TAG, "getMaxTextureSize: could not make a temporary context current");
+                return 0;
+            }
+
+            GLES10.glGetIntegerv(GLES10.GL_MAX_TEXTURE_SIZE, maxSize, 0);
+            return maxSize[0];
+        } finally {
+            // Tear down in the original order, whichever path was taken above.
+            egl.eglMakeCurrent(dpy, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_SURFACE, EGL10.EGL_NO_CONTEXT);
+            if (surf != null && surf != EGL10.EGL_NO_SURFACE) {
+                egl.eglDestroySurface(dpy, surf);
+            }
+            if (ctx != null && ctx != EGL10.EGL_NO_CONTEXT) {
+                egl.eglDestroyContext(dpy, ctx);
+            }
+            egl.eglTerminate(dpy);
+        }
+    }
+
+    /**
+     * Helper function that compiles the shaders, links and validates the
+     * program, returning the program ID.
+     * <p>
+     * If either shader fails to compile, nothing is linked and 0 is returned. The shader
+     * objects are flagged for deletion once linking has been attempted; GL keeps attached
+     * shaders alive until the program itself is deleted.
+     *
+     * @param vertexShaderSource   GLSL source of the vertex shader
+     * @param fragmentShaderSource GLSL source of the fragment shader
+     * @return the linked program id, or 0 if compiling or linking failed
+     */
+    public static int buildProgram(String vertexShaderSource,
+                                   String fragmentShaderSource) {
+        // Compile the shaders, stopping at the first failure.
+        int vertexShader = compileVertexShader(vertexShaderSource);
+        if (vertexShader == 0) {
+            return 0;
+        }
+        int fragmentShader = compileFragmentShader(fragmentShaderSource);
+        if (fragmentShader == 0) {
+            GLES20.glDeleteShader(vertexShader);
+            return 0;
+        }
+
+        // Link them into a shader program.
+        int program = linkProgram(vertexShader, fragmentShader);
+
+        // The shader ids never leave this method, so release them here.
+        GLES20.glDeleteShader(vertexShader);
+        GLES20.glDeleteShader(fragmentShader);
+
+        // Validation is only meaningful for a program that actually linked.
+        if (program != 0) {
+            validateProgram(program);
+        }
+
+        return program;
+    }
+
+    /**
+     * Compiles {@code shaderCode} as a vertex shader.
+     *
+     * @param shaderCode GLSL source code
+     * @return the OpenGL shader object ID, or 0 on failure
+     */
+    public static int compileVertexShader(String shaderCode) {
+        final int shaderType = GLES20.GL_VERTEX_SHADER;
+        return compileShader(shaderType, shaderCode);
+    }
+
+    /**
+     * Compiles {@code shaderCode} as a fragment shader.
+     *
+     * @param shaderCode GLSL source code
+     * @return the OpenGL shader object ID, or 0 on failure
+     */
+    public static int compileFragmentShader(String shaderCode) {
+        final int shaderType = GLES20.GL_FRAGMENT_SHADER;
+        return compileShader(shaderType, shaderCode);
+    }
+
+    /**
+     * Links a vertex shader and a fragment shader together into an OpenGL
+     * program. The program info log is always written to the verbose log.
+     *
+     * @param vertexShaderId   id of a compiled vertex shader
+     * @param fragmentShaderId id of a compiled fragment shader
+     * @return the OpenGL program object ID, or 0 if the program could not be created or
+     *         linking failed (the program object is deleted in the latter case)
+     */
+    public static int linkProgram(int vertexShaderId, int fragmentShaderId) {
+        final int program = GLES20.glCreateProgram();
+        if (program == 0) {
+            Log.w(TAG, "Could not create new program");
+            return 0;
+        }
+
+        // Attach both stages, then link.
+        GLES20.glAttachShader(program, vertexShaderId);
+        GLES20.glAttachShader(program, fragmentShaderId);
+        GLES20.glLinkProgram(program);
+
+        final int[] status = new int[1];
+        GLES20.glGetProgramiv(program, GLES20.GL_LINK_STATUS, status, 0);
+
+        final String infoLog = GLES20.glGetProgramInfoLog(program);
+        Log.v(TAG, "Results of linking program:\n" + infoLog);
+
+        if (status[0] != 0) {
+            return program;
+        }
+
+        // Linking failed: a program in this state is useless, so free it.
+        GLES20.glDeleteProgram(program);
+        Log.w(TAG, "Linking of program failed.");
+        return 0;
+    }
+
+    /**
+     * Validates an OpenGL program and writes the result and the program info log to the
+     * verbose log. Should only be called when developing the application.
+     *
+     * @param programObjectId id of the program to validate
+     * @return {@code true} when the reported validate status is non-zero
+     */
+    public static boolean validateProgram(int programObjectId) {
+        GLES20.glValidateProgram(programObjectId);
+
+        final int[] status = new int[1];
+        GLES20.glGetProgramiv(programObjectId, GLES20.GL_VALIDATE_STATUS, status, 0);
+
+        final String report = "Results of validating program: " + status[0]
+                + "\nLog:" + GLES20.glGetProgramInfoLog(programObjectId);
+        Log.v(TAG, report);
+
+        return status[0] != 0;
+    }
+
+    /**
+     * Compiles a shader, returning the OpenGL object ID.
+     * <p>
+     * On a compile failure the shader's info log is written to the error log before the
+     * shader object is deleted, so the cause of the failure is not lost.
+     *
+     * @param type       shader type constant passed to {@code glCreateShader}
+     * @param shaderCode GLSL source code
+     * @return the shader object id, or 0 if the shader could not be created or compiled
+     */
+    private static int compileShader(int type, String shaderCode) {
+        // Create a new shader object.
+        final int shaderObjectId = GLES20.glCreateShader(type);
+
+        if (shaderObjectId == 0) {
+            Log.e(TAG, "Could not create new shader.");
+            return 0;
+        }
+
+        // Pass in the shader source and compile it.
+        GLES20.glShaderSource(shaderObjectId, shaderCode);
+        GLES20.glCompileShader(shaderObjectId);
+
+        // Get the compilation status.
+        final int[] compileStatus = new int[1];
+        GLES20.glGetShaderiv(shaderObjectId, GLES20.GL_COMPILE_STATUS,
+                compileStatus, 0);
+
+        // Verify the compile status.
+        if (compileStatus[0] == 0) {
+            Log.e(TAG, "Compilation of shader failed.");
+            // The info log must be read before the shader object is deleted.
+            Log.e(TAG, "Shader info log: " + GLES20.glGetShaderInfoLog(shaderObjectId));
+            GLES20.glDeleteShader(shaderObjectId);
+            return 0;
+        }
+
+        // Return the shader object ID.
+        return shaderObjectId;
+    }
+
+    /**
+     * Scales the matrix in place by 2 along X; Y and Z are left at factor 1.
+     *
+     * @param m matrix to modify
+     * @return the same array, for chaining
+     */
+    public static float[] scale(float[] m) {
+        final float factorX = 2.0F;
+        final float factorY = 1.0F;
+        Matrix.scaleM(m, 0, factorX, factorY, 1);
+        return m;
+    }
+
+    /**
+     * Rotates the matrix in place around the Z axis.
+     *
+     * @param m     matrix to modify
+     * @param angle rotation angle, passed unchanged to {@code Matrix.rotateM}
+     * @return the same array, for chaining
+     */
+    public static float[] rotate(float[] m, float angle) {
+        final float axisX = 0;
+        final float axisY = 0;
+        final float axisZ = 1;
+        Matrix.rotateM(m, 0, angle, axisX, axisY, axisZ);
+        return m;
+    }
+
+    /**
+     * Mirrors the matrix in place by scaling the selected axes by -1.
+     *
+     * @param m matrix to modify
+     * @param x whether to negate the X axis
+     * @param y whether to negate the Y axis
+     * @return the same array, for chaining; untouched when both flags are false
+     */
+    public static float[] flip(float[] m, boolean x, boolean y) {
+        if (!x && !y) {
+            return m;
+        }
+        final float signX = x ? -1 : 1;
+        final float signY = y ? -1 : 1;
+        Matrix.scaleM(m, 0, signX, signY, 1);
+        return m;
+    }
+
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/SystemUtils.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/SystemUtils.java
new file mode 100644
index 0000000..12d5485
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/SystemUtils.java
@@ -0,0 +1,71 @@
+package ai.guiji.duix.sdk.client.util;
+
+import android.content.Context;
+import android.content.pm.PackageManager;
+
+public class SystemUtils {
+
+    /**
+     * Reads the {@code versionCode} of the package that {@code context} belongs to.
+     *
+     * @param context context whose package is looked up
+     * @return the version code, or -1 when the package cannot be found
+     */
+    public static int getVersionCode(Context context) {
+        final PackageManager packages = context.getPackageManager();
+        final String self = context.getPackageName();
+        try {
+            return packages.getPackageInfo(self, 0).versionCode;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+        }
+        return -1;
+    }
+
+    /**
+     * Reads the {@code versionName} of the package that {@code context} belongs to.
+     *
+     * @param context context whose package is looked up
+     * @return the version name, or an empty string when the package cannot be found
+     */
+    public static String getVersionName(Context context) {
+        final PackageManager packages = context.getPackageManager();
+        final String self = context.getPackageName();
+        try {
+            return packages.getPackageInfo(self, 0).versionName;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+        }
+        return "";
+    }
+
+    /**
+     * Compares two dot-separated numeric version strings.
+     * <p>
+     * Segments are compared pairwise as integers. When one version has extra trailing
+     * segments, it is the greater one only if any of them is above zero, so
+     * {@code "1.0"} equals {@code "1"}.
+     *
+     * @param version1 first version, e.g. {@code "4.1.1"}
+     * @param version2 second version
+     * @return 1 if {@code version1} is greater, -1 if it is smaller, 0 if they are equal;
+     *         also -1 when either argument is null or contains a non-numeric segment
+     */
+    public static int compareVersion(String version1, String version2) {
+        try {
+            if (version1.equals(version2)) {
+                return 0;
+            }
+            String[] left = version1.split("\\.");
+            String[] right = version2.split("\\.");
+            int shared = Math.min(left.length, right.length);
+
+            for (int i = 0; i < shared; i++) {
+                // Integer.compare cannot overflow, unlike subtracting the two values.
+                int order = Integer.compare(Integer.parseInt(left[i]), Integer.parseInt(right[i]));
+                if (order != 0) {
+                    return order > 0 ? 1 : -1;
+                }
+            }
+
+            // All shared segments are equal: extra segments only count when positive.
+            for (int i = shared; i < left.length; i++) {
+                if (Integer.parseInt(left[i]) > 0) {
+                    return 1;
+                }
+            }
+            for (int i = shared; i < right.length; i++) {
+                if (Integer.parseInt(right[i]) > 0) {
+                    return -1;
+                }
+            }
+            return 0;
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return -1;
+    }
+}
diff --git a/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/ZipUtil.java b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/ZipUtil.java
new file mode 100644
index 0000000..29bb261
--- /dev/null
+++ b/duix-sdk/src/main/java/ai/guiji/duix/sdk/client/util/ZipUtil.java
@@ -0,0 +1,98 @@
+package ai.guiji.duix.sdk.client.util;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+public class ZipUtil {
+
+    /**
+     * Extracts a zip archive into a directory.
+     * <p>
+     * Each entry's canonical path is checked against the canonical path of the output
+     * directory, so entries that would land outside it (for example through {@code ../}) are
+     * logged and skipped. Both sides are canonicalized because the two forms can differ for
+     * the same directory, e.g. {@code /data/data/...} from {@code getCanonicalPath()} versus
+     * {@code /data/user/0/...} from {@code getAbsolutePath()}.
+     * <p>
+     * Files that already exist are left untouched and are not counted in the progress.
+     *
+     * @param zipFilePath path of the archive to read
+     * @param outOutPath  directory to extract into
+     * @param callback    optional progress listener; may be null
+     * @return {@code true} when the whole archive was processed, {@code false} if a directory
+     *         or file could not be created or any exception occurred
+     */
+    public static boolean unzip(String zipFilePath, String outOutPath, Callback callback) {
+        try (ZipInputStream inZip = new ZipInputStream(new FileInputStream(zipFilePath))) {
+            long total = 0;
+            if (callback != null) {
+                total = getZipSize(zipFilePath);
+            }
+
+            // Compare canonical against canonical, and require a separator after the root so
+            // that a sibling such as "<root>-other" does not pass the prefix test.
+            final String canonicalRoot = new File(outOutPath).getCanonicalPath();
+            final String rootPrefix = canonicalRoot.endsWith(File.separator)
+                    ? canonicalRoot : canonicalRoot + File.separator;
+
+            long currentSize = 0;
+            final byte[] buffer = new byte[2048];
+            ZipEntry zipEntry;
+            while ((zipEntry = inZip.getNextEntry()) != null) {
+                File target = new File(outOutPath + File.separator + zipEntry.getName());
+                String canonicalPath = target.getCanonicalPath();
+                boolean inside = canonicalPath.equals(canonicalRoot) || canonicalPath.startsWith(rootPrefix);
+                if (!inside) {
+                    Logger.e("绝对值路径比较异常忽略该地址: " + target.getAbsolutePath());
+                    continue;
+                }
+
+                if (zipEntry.isDirectory()) {
+                    if (!target.exists() && !target.mkdirs()) {
+                        return false;
+                    }
+                    continue;
+                }
+
+                if (target.exists()) {
+                    // Existing files are kept as they are.
+                    continue;
+                }
+                File parent = target.getParentFile();
+                if (parent != null && !parent.exists()) {
+                    parent.mkdirs();
+                }
+                if (!target.createNewFile()) {
+                    return false;
+                }
+                // try-with-resources closes the output file even if a read or write fails.
+                try (FileOutputStream out = new FileOutputStream(target)) {
+                    int len;
+                    while ((len = inZip.read(buffer)) != -1) {
+                        out.write(buffer, 0, len);
+                        if (callback != null) {
+                            currentSize += len;
+                            callback.onProgress(currentSize, total);
+                        }
+                    }
+                }
+            }
+            return true;
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return false;
+    }
+
+    /**
+     * Sums the sizes reported by {@code ZipEntry.getSize()} for every entry of an archive.
+     *
+     * @param filePath path of the zip archive
+     * @return the summed entry sizes, or 0 when the archive cannot be opened or read
+     */
+    private static long getZipSize(String filePath){
+        // try-with-resources releases the archive's file handle on every path.
+        try (ZipFile archive = new ZipFile(filePath)) {
+            long size = 0;
+            Enumeration<? extends ZipEntry> entries = archive.entries();
+            while (entries.hasMoreElements()) {
+                size += entries.nextElement().getSize();
+            }
+            return size;
+        } catch (IOException e) {
+            return 0;
+        }
+    }
+
+    /** Progress listener accepted by {@code unzip}. */
+    public interface Callback {
+        /**
+         * Invoked by {@code unzip} after each chunk it writes to an extracted file.
+         *
+         * @param current number of bytes written so far across all extracted files
+         * @param total   sum of the entry sizes computed by {@code getZipSize}
+         */
+        void onProgress(long current, long total);
+    }
+}
diff --git a/gradle.properties b/gradle.properties
new file mode 100644
index 0000000..38d44b7
--- /dev/null
+++ b/gradle.properties
@@ -0,0 +1,17 @@
+android.enableJetifier=true
+android.useAndroidX=true
+org.gradle.daemon=true
+org.gradle.jvmargs=-Xms2048m -Xmx6656m
+
+appcompatVersion=1.3.0
+recyclerviewVersion=1.2.1
+preferenceVersion=1.1.0
+annotationVersion=1.1.0
+ottoVersion=1.3.8
+ijkVersion=0.8.8
+core_ktx=1.5.0
+lifecycle_ktx=2.3.1
+android.injected.testOnly=false
+android.defaults.buildfeatures.buildconfig=true
+android.nonTransitiveRClass=false
+android.nonFinalResIds=false
\ No newline at end of file
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000..f6b961f
Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000..34e4582
--- /dev/null
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sun Aug 25 10:34:48 CST 2019
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.0-all.zip
diff --git a/settings.gradle b/settings.gradle
new file mode 100644
index 0000000..959ad2d
--- /dev/null
+++ b/settings.gradle
@@ -0,0 +1,2 @@
+include ':duix-sdk'
+include ':test'
diff --git a/test/.gitignore b/test/.gitignore
new file mode 100644
index 0000000..42afabf
--- /dev/null
+++ b/test/.gitignore
@@ -0,0 +1 @@
+/build
\ No newline at end of file
diff --git a/test/build.gradle b/test/build.gradle
new file mode 100644
index 0000000..f5e9fab
--- /dev/null
+++ b/test/build.gradle
@@ -0,0 +1,90 @@
+plugins {
+    id 'com.android.application'
+    id 'org.jetbrains.kotlin.android'
+}
+
+// Returns the current time formatted as "yyyy-MM-dd-HH-mm" in the GMT+08:00 time zone.
+// The release build type below embeds this value in the output APK file name.
+static def getCurrentTime() {
+    return new Date().format("yyyy-MM-dd-HH-mm", TimeZone.getTimeZone("GMT+08:00"))
+}
+
+android {
+    namespace 'ai.guiji.duix.test'
+    compileSdk 34
+
+    defaultConfig {
+        applicationId "ai.guiji.duix.test"
+        minSdk 24
+        targetSdk 34
+        versionCode 13
+        versionName '4.1.1'
+    }
+
+    signingConfigs {
+        release {
+            storeFile file('../demo.jks')
+            storePassword '123456'
+            keyAlias 'demo'
+            keyPassword '123456'
+        }
+    }
+
+    buildTypes {
+        debug {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+            signingConfig signingConfigs.release
+        }
+        release {
+            minifyEnabled true
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+            signingConfig signingConfigs.release
+
+            applicationVariants.all { variant ->
+                variant.outputs.all {
+                    outputFileName = "duix_mobile_test_${buildType.name}_${getCurrentTime()}_${defaultConfig.versionName}.apk"
+                }
+            }
+        }
+    }
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = '1.8'
+    }
+    buildFeatures {
+        viewBinding true
+    }
+    composeOptions {
+        kotlinCompilerExtensionVersion '1.4.3'
+    }
+    packaging {
+        resources {
+            excludes += '/META-INF/{AL2.0,LGPL2.1}'
+        }
+    }
+
+    lint {
+        baseline = file("lint-baseline.xml")
+    }
+    lintOptions{
+        checkReleaseBuilds false
+    }
+}
+
+dependencies {
+
+    implementation fileTree(include: ['*.jar', '*.aar'], dir: 'libs')
+    implementation 'androidx.core:core-ktx:1.12.0'
+    implementation 'androidx.appcompat:appcompat:1.2.0'
+    implementation 'com.google.android.material:material:1.4.0'
+    implementation 'androidx.constraintlayout:constraintlayout:2.1.1'
+    implementation "androidx.activity:activity:1.3.0"
+    implementation "androidx.fragment:fragment:1.3.0"
+
+    implementation 'com.github.bumptech.glide:glide:4.12.0'
+
+    implementation project(":duix-sdk")
+    implementation 'com.squareup.okhttp3:okhttp:4.10.0'
+}
\ No newline at end of file
diff --git a/test/lint-baseline.xml b/test/lint-baseline.xml
new file mode 100644
index 0000000..3654247
--- /dev/null
+++ b/test/lint-baseline.xml
@@ -0,0 +1,561 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<issues format="6" by="lint 8.1.2" type="baseline" client="gradle" dependencies="false" name="AGP (8.1.2)" variant="all" version="8.1.2">
+
+    <issue
+        id="CanvasSize"
+        message="Calling `Canvas.getHeight()` is usually wrong; you should be calling `getHeight()` instead"
+        errorLine1="                    canvas.getWidth(), canvas.getHeight(), Bitmap.Config.ARGB_8888);"
+        errorLine2="                                       ~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/ui/view/VisualizerView.java"
+            line="98"
+            column="40"/>
+    </issue>
+
+    <issue
+        id="CanvasSize"
+        message="Calling `Canvas.getWidth()` is usually wrong; you should be calling `getWidth()` instead"
+        errorLine1="                    canvas.getWidth(), canvas.getHeight(), Bitmap.Config.ARGB_8888);"
+        errorLine2="                    ~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/ui/view/VisualizerView.java"
+            line="98"
+            column="21"/>
+    </issue>
+
+    <issue
+        id="CustomViewStyleable"
+        message="By convention, the custom view (`VisualizerView`) and the declare-styleable (`visualizerView`) should have the same name (various editor features rely on this convention)"
+        errorLine1="        TypedArray args = context.obtainStyledAttributes(attrs, R.styleable.visualizerView);"
+        errorLine2="                                                                ~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/ui/view/VisualizerView.java"
+            line="73"
+            column="65"/>
+    </issue>
+
+    <issue
+        id="NotificationPermission"
+        message="When targeting Android 13 or higher, posting a permission requires holding the `POST_NOTIFICATIONS` permission (usage from com.bumptech.glide.request.target.NotificationTarget)">
+        <location
+            file="src/main/AndroidManifest.xml"/>
+    </issue>
+
+    <issue
+        id="SimpleDateFormat"
+        message="To get local formatting use `getDateInstance()`, `getDateTimeInstance()`, or `getTimeInstance()`, or use `new SimpleDateFormat(String template, Locale locale)` with for example `Locale.US` for ASCII dates."
+        errorLine1="        SimpleDateFormat fmt = new SimpleDateFormat(&quot;yyyyMMddHHmmssSSS&quot;);"
+        errorLine2="                               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/util/StringUtils.java"
+            line="170"
+            column="32"/>
+    </issue>
+
+    <issue
+        id="SimpleDateFormat"
+        message="To get local formatting use `getDateInstance()`, `getDateTimeInstance()`, or `getTimeInstance()`, or use `new SimpleDateFormat(String template, Locale locale)` with for example `Locale.US` for ASCII dates."
+        errorLine1="        SimpleDateFormat format = new SimpleDateFormat(&quot;yyyyMMddHHmmssSSS&quot;);"
+        errorLine2="                                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/util/StringUtils.java"
+            line="177"
+            column="35"/>
+    </issue>
+
+    <issue
+        id="SimpleDateFormat"
+        message="To get local formatting use `getDateInstance()`, `getDateTimeInstance()`, or `getTimeInstance()`, or use `new SimpleDateFormat(String template, Locale locale)` with for example `Locale.US` for ASCII dates."
+        errorLine1="        SimpleDateFormat format = new SimpleDateFormat(&quot;yyyy-MM&quot;);"
+        errorLine2="                                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/util/StringUtils.java"
+            line="183"
+            column="35"/>
+    </issue>
+
+    <issue
+        id="SimpleDateFormat"
+        message="To get local formatting use `getDateInstance()`, `getDateTimeInstance()`, or `getTimeInstance()`, or use `new SimpleDateFormat(String template, Locale locale)` with for example `Locale.US` for ASCII dates."
+        errorLine1="        SimpleDateFormat format = new SimpleDateFormat(&quot;MM-dd HH:mm&quot;);"
+        errorLine2="                                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/util/StringUtils.java"
+            line="189"
+            column="35"/>
+    </issue>
+
+    <issue
+        id="SimpleDateFormat"
+        message="To get local formatting use `getDateInstance()`, `getDateTimeInstance()`, or `getTimeInstance()`, or use `new SimpleDateFormat(String template, Locale locale)` with for example `Locale.US` for ASCII dates."
+        errorLine1="        SimpleDateFormat sdf = new SimpleDateFormat(&quot;yyyy-MM-dd&quot;);"
+        errorLine2="                               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/util/StringUtils.java"
+            line="365"
+            column="32"/>
+    </issue>
+
+    <issue
+        id="UseSwitchCompatOrMaterialXml"
+        message="Use `SwitchCompat` from AppCompat or `SwitchMaterial` from Material library"
+        errorLine1="            &lt;Switch"
+        errorLine2="            ^">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="157"
+            column="13"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of androidx.core:core-ktx than 1.12.0 is available: 1.13.0"
+        errorLine1=""
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="47"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of androidx.appcompat:appcompat than 1.2.0 is available: 1.6.1"
+        errorLine1="dependencies {"
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="48"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of com.google.android.material:material than 1.4.0 is available: 1.11.0"
+        errorLine1=""
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="49"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of androidx.constraintlayout:constraintlayout than 2.1.1 is available: 2.1.4"
+        errorLine1="    implementation fileTree(include: [&apos;*.jar&apos;, &apos;*.aar&apos;], dir: &apos;libs&apos;)"
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="50"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of androidx.activity:activity than 1.3.0 is available: 1.8.2"
+        errorLine1="    implementation &apos;androidx.core:core-ktx:1.12.0&apos;"
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="51"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of androidx.fragment:fragment than 1.3.0 is available: 1.5.7"
+        errorLine1="    implementation &apos;androidx.appcompat:appcompat:1.2.0&apos;"
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="52"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="GradleDependency"
+        message="A newer version of com.google.android.exoplayer:exoplayer than 2.14.2 is available: 2.18.5"
+        errorLine1=""
+        errorLine2="                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="build.gradle"
+            line="59"
+            column="20"/>
+    </issue>
+
+    <issue
+        id="SpUsage"
+        message="Should use &quot;`sp`&quot; instead of &quot;`dp`&quot; for text sizes"
+        errorLine1="        android:textSize=&quot;18dp&quot;"
+        errorLine2="        ~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="15"
+            column="9"/>
+    </issue>
+
+    <issue
+        id="DrawAllocation"
+        message="Avoid object allocations during draw/layout operations (preallocate and reuse instead)"
+        errorLine1="        canvas.drawBitmap(mCanvasBitmap, new Matrix(), null);"
+        errorLine2="                                         ~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/ui/view/VisualizerView.java"
+            line="116"
+            column="42"/>
+    </issue>
+
+    <issue
+        id="ObsoleteSdkInt"
+        message="Unnecessary; SDK_INT is never &lt; 24"
+        errorLine1="        if (Build.VERSION.SDK_INT &lt; Build.VERSION_CODES.M) {"
+        errorLine2="            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/java/ai/guiji/duix/test/ui/activity/BaseActivity.java"
+            line="89"
+            column="13"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.color.purple_200` appears to be unused"
+        errorLine1="    &lt;color name=&quot;purple_200&quot;>#FFBB86FC&lt;/color>"
+        errorLine2="           ~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/colors.xml"
+            line="3"
+            column="12"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.color.purple_500` appears to be unused"
+        errorLine1="    &lt;color name=&quot;purple_500&quot;>#FF6200EE&lt;/color>"
+        errorLine2="           ~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/colors.xml"
+            line="4"
+            column="12"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.color.purple_700` appears to be unused"
+        errorLine1="    &lt;color name=&quot;purple_700&quot;>#FF3700B3&lt;/color>"
+        errorLine2="           ~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/colors.xml"
+            line="5"
+            column="12"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.color.teal_200` appears to be unused"
+        errorLine1="    &lt;color name=&quot;teal_200&quot;>#FF03DAC5&lt;/color>"
+        errorLine2="           ~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/colors.xml"
+            line="6"
+            column="12"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.color.teal_700` appears to be unused"
+        errorLine1="    &lt;color name=&quot;teal_700&quot;>#FF018786&lt;/color>"
+        errorLine2="           ~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/colors.xml"
+            line="7"
+            column="12"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.drawable.ic_launcher_background` appears to be unused"
+        errorLine1="&lt;vector xmlns:android=&quot;http://schemas.android.com/apk/res/android&quot;"
+        errorLine2="^">
+        <location
+            file="src/main/res/drawable/ic_launcher_background.xml"
+            line="2"
+            column="1"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.drawable.ic_launcher_foreground` appears to be unused"
+        errorLine1="&lt;vector xmlns:android=&quot;http://schemas.android.com/apk/res/android&quot;"
+        errorLine2="^">
+        <location
+            file="src/main/res/drawable/ic_launcher_foreground.xml"
+            line="1"
+            column="1"/>
+    </issue>
+
+    <issue
+        id="UnusedResources"
+        message="The resource `R.string.need_record_permission` appears to be unused"
+        errorLine1="    &lt;string name=&quot;need_record_permission&quot;>需要授予语音权限以继续操作&lt;/string>"
+        errorLine2="            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/values/strings.xml"
+            line="15"
+            column="13"/>
+    </issue>
+
+    <issue
+        id="TextFields"
+        message="This text field does not specify an `inputType`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="47"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="TextFields"
+        message="This text field does not specify an `inputType`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="70"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="TextFields"
+        message="This text field does not specify an `inputType`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="93"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="TextFields"
+        message="This text field does not specify an `inputType`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="115"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="Autofill"
+        message="Missing `autofillHints` attribute"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="47"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="Autofill"
+        message="Missing `autofillHints` attribute"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="70"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="Autofill"
+        message="Missing `autofillHints` attribute"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="93"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="Autofill"
+        message="Missing `autofillHints` attribute"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="115"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="ContentDescription"
+        message="Missing `contentDescription` attribute on image"
+        errorLine1="    &lt;ImageView"
+        errorLine2="     ~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_call.xml"
+            line="7"
+            column="6"/>
+    </issue>
+
+    <issue
+        id="ContentDescription"
+        message="Missing `contentDescription` attribute on image"
+        errorLine1="            &lt;ImageView"
+        errorLine2="             ~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_call.xml"
+            line="33"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="ContentDescription"
+        message="Missing `contentDescription` attribute on image"
+        errorLine1="            &lt;ImageView"
+        errorLine2="             ~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_call.xml"
+            line="43"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="ContentDescription"
+        message="Missing `contentDescription` attribute on image"
+        errorLine1="            &lt;ImageView"
+        errorLine2="             ~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_call.xml"
+            line="93"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="ContentDescription"
+        message="Missing `contentDescription` attribute on image"
+        errorLine1="            &lt;ImageView"
+        errorLine2="             ~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_call.xml"
+            line="103"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="LabelFor"
+        message="Missing accessibility label: provide either a view with an `android:labelFor` that references this view or provide an `android:hint`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="47"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="LabelFor"
+        message="Missing accessibility label: provide either a view with an `android:labelFor` that references this view or provide an `android:hint`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="70"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="LabelFor"
+        message="Missing accessibility label: provide either a view with an `android:labelFor` that references this view or provide an `android:hint`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="93"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="LabelFor"
+        message="Missing accessibility label: provide either a view with an `android:labelFor` that references this view or provide an `android:hint`"
+        errorLine1="            &lt;EditText"
+        errorLine2="             ~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="115"
+            column="14"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;1214228509954805760&quot;, should use `@string` resource"
+        errorLine1="                android:text=&quot;1214228509954805760&quot;"
+        errorLine2="                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="51"
+            column="17"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;5b8b91f1-fb2d-4cdf-a8f0-5f6dd178687b&quot;, should use `@string` resource"
+        errorLine1="                android:text=&quot;5b8b91f1-fb2d-4cdf-a8f0-5f6dd178687b&quot;"
+        errorLine2="                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="74"
+            column="17"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;1764548680285933569&quot;, should use `@string` resource"
+        errorLine1="                android:text=&quot;1764548680285933569&quot;"
+        errorLine2="                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="97"
+            column="17"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;300&quot;, should use `@string` resource"
+        errorLine1="                android:text=&quot;300&quot;"
+        errorLine2="                ~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_main.xml"
+            line="119"
+            column="17"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;启动ASR&quot;, should use `@string` resource"
+        errorLine1="        android:text=&quot;启动ASR&quot;"
+        errorLine2="        ~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_test_asr.xml"
+            line="19"
+            column="9"/>
+    </issue>
+
+    <issue
+        id="HardcodedText"
+        message="Hardcoded string &quot;关闭ASR&quot;, should use `@string` resource"
+        errorLine1="        android:text=&quot;关闭ASR&quot;"
+        errorLine2="        ~~~~~~~~~~~~~~~~~~~~">
+        <location
+            file="src/main/res/layout/activity_test_asr.xml"
+            line="27"
+            column="9"/>
+    </issue>
+
+</issues>
diff --git a/test/proguard-rules.pro b/test/proguard-rules.pro
new file mode 100644
index 0000000..30e622b
--- /dev/null
+++ b/test/proguard-rules.pro
@@ -0,0 +1,26 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
+
+# OkHttp classes live in package okhttp3; com.squareup.okhttp3 is only the Maven group id and matches no class.
+-dontwarn okhttp3.**
+-keep class okhttp3.** { *; }
+-keep class ai.guiji.duix.DuixNcnn{*; }
\ No newline at end of file
diff --git a/test/src/main/AndroidManifest.xml b/test/src/main/AndroidManifest.xml
new file mode 100644
index 0000000..62f5af7
--- /dev/null
+++ b/test/src/main/AndroidManifest.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+    <!-- Permissions: network, microphone and camera; the camera hardware features are declared optional. -->
+    <uses-permission android:name="android.permission.INTERNET" />
+    <uses-permission android:name="android.permission.RECORD_AUDIO"/>
+    <uses-permission android:name="android.permission.CAMERA"/>
+    <uses-feature android:name="android.hardware.camera" android:required="false"/>
+    <uses-feature android:name="android.hardware.camera.front" android:required="false"/>
+
+    <application
+        android:name=".App"
+        android:allowBackup="true"
+        android:icon="@mipmap/ic_launcher"
+        android:networkSecurityConfig="@xml/network_security_config"
+        android:label="@string/app_name"
+        android:roundIcon="@mipmap/ic_launcher_round"
+        android:supportsRtl="true"
+        android:theme="@style/Theme.DUIX.Test">
+        <activity
+            android:name=".ui.activity.MainActivity"
+            android:exported="true">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+        <!-- Declared without an intent filter and not exported. -->
+        <activity
+            android:name=".ui.activity.CallActivity"
+            android:exported="false">
+        </activity>
+    </application>
+
+</manifest>
\ No newline at end of file
diff --git a/test/src/main/assets/bg/bg1.png b/test/src/main/assets/bg/bg1.png
new file mode 100644
index 0000000..57d25ed
Binary files /dev/null and b/test/src/main/assets/bg/bg1.png differ
diff --git a/test/src/main/assets/bg/bg2.jpg b/test/src/main/assets/bg/bg2.jpg
new file mode 100644
index 0000000..67dfb4b
Binary files /dev/null and b/test/src/main/assets/bg/bg2.jpg differ
diff --git a/test/src/main/assets/pcm/2.pcm b/test/src/main/assets/pcm/2.pcm
new file mode 100644
index 0000000..46f4670
Binary files /dev/null and b/test/src/main/assets/pcm/2.pcm differ
diff --git a/test/src/main/assets/pcm/222.pcm b/test/src/main/assets/pcm/222.pcm
new file mode 100644
index 0000000..32c54d3
Binary files /dev/null and b/test/src/main/assets/pcm/222.pcm differ
diff --git a/test/src/main/assets/wav/1.wav b/test/src/main/assets/wav/1.wav
new file mode 100644
index 0000000..4811eea
Binary files /dev/null and b/test/src/main/assets/wav/1.wav differ
diff --git a/test/src/main/assets/wav/2.wav b/test/src/main/assets/wav/2.wav
new file mode 100644
index 0000000..22a25d4
Binary files /dev/null and b/test/src/main/assets/wav/2.wav differ
diff --git a/test/src/main/java/ai/guiji/duix/test/App.java b/test/src/main/java/ai/guiji/duix/test/App.java
new file mode 100644
index 0000000..0c2a350
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/App.java
@@ -0,0 +1,31 @@
+package ai.guiji.duix.test;
+
+import android.app.Application;
+import android.text.TextUtils;
+
+import java.util.concurrent.TimeUnit;
+
+import okhttp3.OkHttpClient;
+
+public class App extends Application {
+
+    public static App mApp;
+    private static OkHttpClient mOkHttpClient;
+
+    @Override
+    public void onCreate() {
+        super.onCreate();
+        mApp = this;
+    }
+
+    public static OkHttpClient getOkHttpClient() {
+        if (mOkHttpClient == null) {
+            mOkHttpClient = new OkHttpClient.Builder()
+                    .connectTimeout(15, TimeUnit.SECONDS)
+                    .writeTimeout(15, TimeUnit.SECONDS)
+                    .readTimeout(15, TimeUnit.SECONDS)
+                    .build();
+        }
+        return mOkHttpClient;
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/audio/AudioRecorder.java b/test/src/main/java/ai/guiji/duix/test/audio/AudioRecorder.java
new file mode 100644
index 0000000..a1d71bb
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/audio/AudioRecorder.java
@@ -0,0 +1,110 @@
+package ai.guiji.duix.test.audio;
+
+import android.annotation.SuppressLint;
+import android.content.Context;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.util.Log;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
+
+import ai.guiji.duix.test.util.StringUtils;
+
+public class AudioRecorder {
+
+    private final String TAG = getClass().getSimpleName();
+
+    private final Context mContext;
+    private AudioRecord mAudioRecorder; //录音器
+    private final Executor mExecutor = Executors.newSingleThreadExecutor();
+    private final RecorderCallback callback;
+
+    public AudioRecorder(Context context, RecorderCallback callback){
+        this.mContext = context;
+        this.callback = callback;
+    }
+
+    @SuppressLint("MissingPermission")
+    public void start(){
+        int sampleRateInHz = 16000;
+        int channelConfig = AudioFormat.CHANNEL_IN_MONO;
+        int audioFormat = AudioFormat.ENCODING_PCM_16BIT;
+        //20ms audio for 16k/16bit/mono
+//        int WAVE_FRAM_SIZE = 20 * 2 * 1 * SAMPLE_RATE / 1000;
+        int minBufferSize = AudioRecord.getMinBufferSize(
+                sampleRateInHz,
+                channelConfig,
+                audioFormat
+        );
+        Log.d(TAG, "minBufferSize: " + minBufferSize);
+        mAudioRecorder = new AudioRecord(MediaRecorder.AudioSource.DEFAULT,
+                sampleRateInHz, channelConfig,
+                audioFormat,
+                minBufferSize);
+        if (mAudioRecorder.getState() != AudioRecord.STATE_UNINITIALIZED){
+            mAudioRecorder.startRecording();
+            mExecutor.execute(() -> {
+                long startTime = System.currentTimeMillis();
+                File cacheDir = mContext.getExternalCacheDir();
+                if (!cacheDir.exists()){
+                    if (!cacheDir.mkdirs()) Log.e(TAG, "mkdirs fail path: " + cacheDir.getAbsolutePath());
+                }
+                String pcmName = StringUtils.createFileName("record_", ".pcm");
+                File pcmFile = new File(cacheDir, pcmName);
+                try (FileOutputStream outputStream = new FileOutputStream(pcmFile)) {
+                    byte[] data = new byte[minBufferSize];
+                    while (mAudioRecorder.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING){
+                        int length = mAudioRecorder.read(data, 0, minBufferSize);
+                        if (length > 0){
+                            outputStream.write(data, 0, length);
+                            if (callback != null){
+                                callback.onReadData(data, 0, length);
+                            }
+                        }
+                    }
+                    Log.d(TAG, "Record done.");
+                    long diff = System.currentTimeMillis() - startTime;
+                    if (callback != null){
+                        if (diff > 200){
+                            callback.onFinish(pcmFile.getAbsolutePath());
+                        } else {
+                            callback.onRecordError(-2, "too short!");
+                        }
+                    }
+                } catch (Exception e) {
+                    Log.e(TAG, "Record error: " + e);
+                    if (callback != null){
+                        callback.onRecordError(-1, "Record error: " + e);
+                    }
+                }
+            });
+        }
+    }
+
+    public void stop(){
+        if (mAudioRecorder != null){
+            if (mAudioRecorder.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING){
+                mAudioRecorder.stop();
+            }
+        }
+    }
+
+    public void release(){
+        if (mAudioRecorder != null){
+            mAudioRecorder.release();
+            mAudioRecorder = null;
+        }
+    }
+
+    public interface RecorderCallback{
+        void onReadData(byte[] data, int offsetInBytes, int length);
+
+        void onRecordError(int code, String message);
+
+        void onFinish(String path);
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/audio/AudioResampler.java b/test/src/main/java/ai/guiji/duix/test/audio/AudioResampler.java
new file mode 100644
index 0000000..2c913dc
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/audio/AudioResampler.java
@@ -0,0 +1,70 @@
+package ai.guiji.duix.test.audio;
+
+/**
+ * Linear-interpolation resampler: 24 kHz 16-bit mono PCM -> 16 kHz 16-bit mono PCM.
+ * Used to convert the 24 kHz audio produced by Qwen-Omni-Realtime into the 16 kHz
+ * format that DUIX expects.
+ */
+public class AudioResampler {
+
+    private static final int INPUT_RATE = 24000;
+    private static final int OUTPUT_RATE = 16000;
+    private static final int BYTES_PER_SAMPLE = 2; // 16-bit samples
+
+    /**
+     * Resamples 24 kHz 16-bit little-endian mono PCM down to 16 kHz.
+     * <p>
+     * Output sample {@code i} is taken at source position {@code i * 24000 / 16000}; a
+     * fractional position is linearly interpolated between the two neighbouring input
+     * samples and truncated toward zero. A trailing odd byte in {@code input} is ignored.
+     *
+     * @param input 24 kHz 16-bit mono PCM; may be {@code null}
+     * @return 16 kHz 16-bit mono PCM, or an empty array when {@code input} is
+     *         {@code null} or holds less than one full sample
+     */
+    public static byte[] resample24kTo16k(byte[] input) {
+        if (input == null || input.length < BYTES_PER_SAMPLE) {
+            return new byte[0];
+        }
+
+        final int sourceCount = input.length / BYTES_PER_SAMPLE;
+        // ceil(sourceCount * OUTPUT_RATE / INPUT_RATE), computed exactly in integers.
+        final int targetCount =
+                (int) (((long) sourceCount * OUTPUT_RATE + INPUT_RATE - 1) / INPUT_RATE);
+        final byte[] resampled = new byte[targetCount * BYTES_PER_SAMPLE];
+
+        for (int out = 0; out < targetCount; out++) {
+            // Source position as a rational number: left + remainder / OUTPUT_RATE.
+            final long scaled = (long) out * INPUT_RATE;
+            final int left = (int) (scaled / OUTPUT_RATE);
+            final long remainder = scaled % OUTPUT_RATE;
+            final int right = Math.min(left + 1, sourceCount - 1);
+
+            final long a = getShortLE(input, left * BYTES_PER_SAMPLE);
+            final long b = getShortLE(input, right * BYTES_PER_SAMPLE);
+            // a + (b - a) * remainder / OUTPUT_RATE, truncated toward zero as a whole.
+            final short mixed = (short) ((a * OUTPUT_RATE + (b - a) * remainder) / OUTPUT_RATE);
+            putShortLE(resampled, out * BYTES_PER_SAMPLE, mixed);
+        }
+        return resampled;
+    }
+
+    /** Reads a little-endian 16-bit sample at {@code offset}; yields 0 if it would run past the array. */
+    private static short getShortLE(byte[] data, int offset) {
+        if (offset + 1 >= data.length) {
+            return 0;
+        }
+        final int low = data[offset] & 0xFF;
+        final int high = data[offset + 1] << 8;
+        return (short) (low | high);
+    }
+
+    /** Writes {@code value} as little-endian 16-bit at {@code offset}; does nothing if it would not fit. */
+    private static void putShortLE(byte[] data, int offset, short value) {
+        if (offset + 1 >= data.length) {
+            return;
+        }
+        data[offset] = (byte) (value & 0xFF);
+        data[offset + 1] = (byte) ((value >> 8) & 0xFF);
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/audio/StreamingAudioRecorder.java b/test/src/main/java/ai/guiji/duix/test/audio/StreamingAudioRecorder.java
new file mode 100644
index 0000000..da7b0b9
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/audio/StreamingAudioRecorder.java
@@ -0,0 +1,150 @@
+package ai.guiji.duix.test.audio;
+
+import android.annotation.SuppressLint;
+import android.content.Context;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.util.Log;
+
+import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * Streaming microphone recorder that hands out PCM chunks in real time so they can be
+ * sent to the OmniRealtime API. Captures 16 kHz, 16-bit, mono audio, which is the input
+ * format the API requires.
+ */
+public class StreamingAudioRecorder {
+
+    private static final String TAG = "StreamingAudioRecorder";
+    private static final int SAMPLE_RATE = 16000;
+    private static final int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
+    private static final int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
+    // A 20 ms frame would be 16000 * 2 * 0.02 = 640 bytes; 3200 bytes (100 ms) is used
+    // instead to match the documentation sample.
+    private static final int CHUNK_SIZE = 3200;
+
+    private final Context mContext;
+    // Written by start()/release() and read by the recording thread, hence volatile.
+    private volatile AudioRecord mAudioRecord;
+    private final Executor mExecutor = Executors.newSingleThreadExecutor();
+    private final StreamingCallback callback;
+    private final AtomicBoolean isRecording = new AtomicBoolean(false);
+
+    /**
+     * @param context  stored, but not otherwise used by this class
+     * @param callback receiver of audio chunks and errors; may be {@code null}
+     */
+    public StreamingAudioRecorder(Context context, StreamingCallback callback) {
+        this.mContext = context;
+        this.callback = callback;
+    }
+
+    /**
+     * Opens the microphone and starts delivering chunks of at most {@code CHUNK_SIZE} bytes
+     * to {@link StreamingCallback#onAudioData} from a background thread. Does nothing when
+     * recording is already in progress; failures go to {@link StreamingCallback#onError}.
+     */
+    @SuppressLint("MissingPermission")
+    public void start() {
+        // compareAndSet turns the "already recording" check and the claim into one atomic
+        // step, so two concurrent start() calls cannot both proceed.
+        if (!isRecording.compareAndSet(false, true)) {
+            Log.w(TAG, "Already recording");
+            return;
+        }
+        // A recorder left over from an earlier start()/stop() cycle would otherwise leak.
+        releaseRecorder();
+
+        int minBufferSize = AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_CONFIG, AUDIO_FORMAT);
+        int bufferSize = Math.max(minBufferSize, CHUNK_SIZE * 2);
+
+        final AudioRecord recorder = new AudioRecord(
+                MediaRecorder.AudioSource.VOICE_COMMUNICATION,
+                SAMPLE_RATE,
+                CHANNEL_CONFIG,
+                AUDIO_FORMAT,
+                bufferSize
+        );
+
+        if (recorder.getState() != AudioRecord.STATE_INITIALIZED) {
+            Log.e(TAG, "AudioRecord init failed");
+            // Free the native resources and clear the flag so a later start() can retry.
+            recorder.release();
+            isRecording.set(false);
+            if (callback != null) {
+                callback.onError("麦克风初始化失败");
+            }
+            return;
+        }
+
+        mAudioRecord = recorder;
+        recorder.startRecording();
+
+        // The loop works on the captured local reference, so release() clearing the field
+        // on another thread cannot cause a NullPointerException here.
+        mExecutor.execute(() -> {
+            byte[] buffer = new byte[CHUNK_SIZE];
+            while (isRecording.get() && recorder.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING) {
+                int read = recorder.read(buffer, 0, buffer.length);
+                if (read < 0) {
+                    // Negative results are AudioRecord error codes. Leave the loop instead
+                    // of spinning, and report only if this recorder is still the active one
+                    // (i.e. the error was not caused by stop()/release()).
+                    Log.e(TAG, "AudioRecord read error: " + read);
+                    if (mAudioRecord == recorder && isRecording.compareAndSet(true, false)
+                            && callback != null) {
+                        callback.onError("麦克风读取失败: " + read);
+                    }
+                    break;
+                }
+                if (read > 0 && callback != null) {
+                    byte[] data = new byte[read];
+                    System.arraycopy(buffer, 0, data, 0, read);
+                    callback.onAudioData(data);
+                }
+            }
+            Log.d(TAG, "Recording thread ended");
+        });
+    }
+
+    /** Stops capturing; the recorder stays allocated until {@link #release()}. */
+    public void stop() {
+        isRecording.set(false);
+        AudioRecord recorder = mAudioRecord;
+        if (recorder != null && recorder.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING) {
+            recorder.stop();
+        }
+    }
+
+    /** Stops capturing and frees the underlying {@link AudioRecord}. */
+    public void release() {
+        stop();
+        releaseRecorder();
+    }
+
+    /** Releases the current {@link AudioRecord}, if any, and clears the field. */
+    private void releaseRecorder() {
+        AudioRecord recorder = mAudioRecord;
+        mAudioRecord = null;
+        if (recorder != null) {
+            recorder.release();
+        }
+    }
+
+    /** @return whether recording is currently flagged as active */
+    public boolean isRecording() {
+        return isRecording.get();
+    }
+
+    /** Receives the captured audio and error notifications. */
+    public interface StreamingCallback {
+        /** Called on the recording thread with a fresh copy of the bytes just read. */
+        void onAudioData(byte[] pcmData);
+
+        /** Called with a human-readable message when recording cannot start or continue. */
+        void onError(String message);
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/camera/CameraFrameCapture.java b/test/src/main/java/ai/guiji/duix/test/camera/CameraFrameCapture.java
new file mode 100644
index 0000000..b4815e3
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/camera/CameraFrameCapture.java
@@ -0,0 +1,365 @@
+package ai.guiji.duix.test.camera;
+
+import android.content.Context;
+import android.content.pm.PackageManager;
+import android.graphics.ImageFormat;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCaptureSession;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraDevice;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.CaptureRequest;
+import android.media.Image;
+import android.media.ImageReader;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.util.Log;
+import android.view.Surface;
+import android.view.TextureView;
+
+import androidx.annotation.NonNull;
+import androidx.core.content.ContextCompat;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * Front-camera frame grabber that emits roughly one JPEG per second for OmniRealtime.
+ * 480P/720P is recommended and a single frame must stay below 500 KB.
+ * <p>
+ * Camera callbacks run on an internal background thread while {@link #start()} and
+ * {@link #stop()} are called from outside, so the fields shared between them are volatile.
+ * Error paths do not reset the running flag: call {@link #stop()} before retrying
+ * {@link #start()} after an error.
+ */
+public class CameraFrameCapture {
+
+    private static final String TAG = "CameraFrameCapture";
+    private static final int TARGET_WIDTH = 640;   // roughly 480P
+    private static final int TARGET_HEIGHT = 480;
+    private static final int JPEG_QUALITY = 85;
+    private static final long CAPTURE_INTERVAL_MS = 1000; // one frame per second
+
+    private final Context context;
+    private final FrameCallback callback;
+    private final TextureView previewView;
+    private volatile Surface previewSurface;
+    private CameraManager cameraManager;
+    private String frontCameraId;
+    private volatile CameraDevice cameraDevice;
+    private volatile CameraCaptureSession captureSession;
+    private volatile ImageReader imageReader;
+    private HandlerThread backgroundThread;
+    private volatile Handler backgroundHandler;
+    private final AtomicBoolean isCapturing = new AtomicBoolean(false);
+    private final AtomicBoolean isRunning = new AtomicBoolean(false);
+
+    /** Creates a capture without an on-screen preview. */
+    public CameraFrameCapture(Context context, FrameCallback callback) {
+        this(context, callback, null);
+    }
+
+    /**
+     * @param context     any context; only its application context is kept
+     * @param callback    receiver of JPEG frames and errors; may be {@code null}
+     * @param previewView optional view showing the live preview; may be {@code null}
+     */
+    public CameraFrameCapture(Context context, FrameCallback callback, TextureView previewView) {
+        this.context = context.getApplicationContext();
+        this.callback = callback;
+        this.previewView = previewView;
+    }
+
+    /**
+     * Locates the front camera, starts the background thread and opens the camera (once
+     * the preview surface is available, if a preview view was supplied). Problems are
+     * reported through {@link FrameCallback#onError}. Does nothing when already running.
+     */
+    public void start() {
+        if (isRunning.get()) {
+            Log.w(TAG, "Already running");
+            return;
+        }
+        if (ContextCompat.checkSelfPermission(context, android.Manifest.permission.CAMERA) != PackageManager.PERMISSION_GRANTED) {
+            if (callback != null) callback.onError("需要相机权限");
+            return;
+        }
+        cameraManager = (CameraManager) context.getSystemService(Context.CAMERA_SERVICE);
+        if (cameraManager == null) {
+            if (callback != null) callback.onError("相机不可用");
+            return;
+        }
+        try {
+            for (String id : cameraManager.getCameraIdList()) {
+                CameraCharacteristics chars = cameraManager.getCameraCharacteristics(id);
+                Integer facing = chars.get(CameraCharacteristics.LENS_FACING);
+                if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+                    frontCameraId = id;
+                    break;
+                }
+            }
+            if (frontCameraId == null) {
+                if (callback != null) callback.onError("未找到前置摄像头");
+                return;
+            }
+        } catch (CameraAccessException e) {
+            Log.e(TAG, "Camera access error: " + e.getMessage());
+            if (callback != null) callback.onError("相机访问失败: " + e.getMessage());
+            return;
+        }
+
+        backgroundThread = new HandlerThread("CameraFrameCapture");
+        backgroundThread.start();
+        backgroundHandler = new Handler(backgroundThread.getLooper());
+
+        imageReader = ImageReader.newInstance(TARGET_WIDTH, TARGET_HEIGHT, ImageFormat.JPEG, 2);
+        imageReader.setOnImageAvailableListener(reader -> {
+            try (Image image = reader.acquireLatestImage()) {
+                if (image != null && callback != null) {
+                    byte[] jpeg = imageToJpeg(image);
+                    // Frames of 500 KB or more are dropped silently.
+                    if (jpeg != null && jpeg.length > 0 && jpeg.length < 500 * 1024) {
+                        callback.onFrame(jpeg);
+                    }
+                }
+            } catch (Exception e) {
+                Log.e(TAG, "Image process error: " + e.getMessage());
+            }
+        }, backgroundHandler);
+
+        isRunning.set(true);
+        if (previewView != null) {
+            if (previewView.isAvailable()) {
+                setupPreviewSurface();
+                openCamera();
+            } else {
+                previewView.setSurfaceTextureListener(new TextureView.SurfaceTextureListener() {
+                    @Override
+                    public void onSurfaceTextureAvailable(android.graphics.SurfaceTexture surface, int width, int height) {
+                        // stop() may have been called while waiting for the surface.
+                        if (!isRunning.get()) return;
+                        setupPreviewSurface();
+                        openCamera();
+                    }
+                    @Override
+                    public void onSurfaceTextureSizeChanged(android.graphics.SurfaceTexture surface, int width, int height) {}
+                    @Override
+                    public boolean onSurfaceTextureDestroyed(android.graphics.SurfaceTexture surface) {
+                        return true;
+                    }
+                    @Override
+                    public void onSurfaceTextureUpdated(android.graphics.SurfaceTexture surface) {}
+                });
+            }
+        } else {
+            openCamera();
+        }
+    }
+
+    /** Wraps the preview view's SurfaceTexture in a Surface with the target buffer size. */
+    private void setupPreviewSurface() {
+        if (previewView != null && previewView.getSurfaceTexture() != null) {
+            android.graphics.SurfaceTexture st = previewView.getSurfaceTexture();
+            st.setDefaultBufferSize(TARGET_WIDTH, TARGET_HEIGHT);
+            previewSurface = new Surface(st);
+        }
+    }
+
+    /** Opens the front camera; its state callbacks are delivered on the background handler. */
+    private void openCamera() {
+        try {
+            cameraManager.openCamera(frontCameraId, new CameraDevice.StateCallback() {
+                @Override
+                public void onOpened(@NonNull CameraDevice camera) {
+                    cameraDevice = camera;
+                    if (!isRunning.get()) {
+                        // stop() ran while the camera was still opening. Publishing the
+                        // device first and re-checking afterwards guarantees that either
+                        // stop() or this branch closes it, so it cannot leak.
+                        camera.close();
+                        cameraDevice = null;
+                        return;
+                    }
+                    createCaptureSession();
+                }
+
+                @Override
+                public void onDisconnected(@NonNull CameraDevice camera) {
+                    camera.close();
+                    cameraDevice = null;
+                }
+
+                @Override
+                public void onError(@NonNull CameraDevice camera, int error) {
+                    Log.e(TAG, "Camera error: " + error);
+                    camera.close();
+                    cameraDevice = null;
+                    if (callback != null) callback.onError("相机打开失败");
+                }
+            }, backgroundHandler);
+        } catch (CameraAccessException e) {
+            Log.e(TAG, "Open camera error: " + e.getMessage());
+            if (callback != null) callback.onError("相机打开失败: " + e.getMessage());
+        } catch (SecurityException e) {
+            Log.e(TAG, "Camera permission denied: " + e.getMessage());
+            if (callback != null) callback.onError("需要相机权限");
+        }
+    }
+
+    /** Creates a session over the JPEG reader surface and, if present, the preview surface. */
+    private void createCaptureSession() {
+        // Work on local snapshots: stop() may null the fields from another thread.
+        CameraDevice device = cameraDevice;
+        ImageReader reader = imageReader;
+        if (device == null || reader == null || !isRunning.get()) return;
+        try {
+            java.util.List<Surface> surfaces = new java.util.ArrayList<>();
+            surfaces.add(reader.getSurface());
+            Surface preview = previewSurface;
+            if (preview != null) {
+                surfaces.add(preview);
+            }
+            device.createCaptureSession(
+                    surfaces,
+                    new CameraCaptureSession.StateCallback() {
+                        @Override
+                        public void onConfigured(@NonNull CameraCaptureSession session) {
+                            if (!isRunning.get()) {
+                                // Stopped while the session was being configured.
+                                session.close();
+                                return;
+                            }
+                            captureSession = session;
+                            isCapturing.set(true);
+                            startPreview();
+                            scheduleNextCapture();
+                        }
+
+                        @Override
+                        public void onConfigureFailed(@NonNull CameraCaptureSession session) {
+                            Log.e(TAG, "Capture session configure failed");
+                            if (callback != null) callback.onError("相机配置失败");
+                        }
+                    },
+                    backgroundHandler
+            );
+        } catch (CameraAccessException | IllegalStateException | IllegalArgumentException e) {
+            // The unchecked ones occur when stop() closes the camera, reader or surface
+            // concurrently; treat them like any other failed attempt.
+            Log.e(TAG, "Create session error: " + e.getMessage());
+        }
+    }
+
+    /** Starts the repeating preview request; a no-op when there is no preview surface. */
+    private void startPreview() {
+        CameraCaptureSession activeSession = captureSession;
+        CameraDevice device = cameraDevice;
+        Surface preview = previewSurface;
+        if (activeSession == null || device == null || preview == null) return;
+        try {
+            CaptureRequest.Builder builder = device.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+            builder.addTarget(preview);
+            builder.set(CaptureRequest.CONTROL_AF_MODE, CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+            activeSession.setRepeatingRequest(builder.build(), null, backgroundHandler);
+        } catch (CameraAccessException | IllegalStateException | IllegalArgumentException e) {
+            Log.e(TAG, "Start preview error: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Issues one JPEG capture and, when it completes, re-posts itself after
+     * {@code CAPTURE_INTERVAL_MS}. The chain ends as soon as the capture is stopped.
+     */
+    private void scheduleNextCapture() {
+        CameraCaptureSession activeSession = captureSession;
+        CameraDevice device = cameraDevice;
+        ImageReader reader = imageReader;
+        // Captured once so the completion callback never dereferences the field, which
+        // stop() sets to null.
+        final Handler handler = backgroundHandler;
+        if (!isCapturing.get() || !isRunning.get() || activeSession == null || device == null
+                || reader == null || handler == null) return;
+        try {
+            CaptureRequest.Builder builder = device.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+            builder.addTarget(reader.getSurface());
+            builder.set(CaptureRequest.CONTROL_AF_MODE, CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+            // JPEG_QUALITY used to be declared but never applied to the request.
+            builder.set(CaptureRequest.JPEG_QUALITY, (byte) JPEG_QUALITY);
+            CaptureRequest request = builder.build();
+            activeSession.capture(request, new CameraCaptureSession.CaptureCallback() {
+                @Override
+                public void onCaptureCompleted(@NonNull CameraCaptureSession session,
+                        @NonNull android.hardware.camera2.CaptureRequest request,
+                        @NonNull android.hardware.camera2.TotalCaptureResult result) {
+                    if (isRunning.get()) {
+                        handler.postDelayed(CameraFrameCapture.this::scheduleNextCapture, CAPTURE_INTERVAL_MS);
+                    }
+                }
+            }, handler);
+        } catch (CameraAccessException | IllegalStateException | IllegalArgumentException e) {
+            Log.e(TAG, "Capture error: " + e.getMessage());
+        }
+    }
+
+    /** Copies the bytes of the image's first plane; returns {@code null} if it has no planes. */
+    private byte[] imageToJpeg(Image image) {
+        Image.Plane[] planes = image.getPlanes();
+        if (planes.length == 0) return null;
+        ByteBuffer buffer = planes[0].getBuffer();
+        byte[] data = new byte[buffer.remaining()];
+        buffer.get(data);
+        return data;
+    }
+
+    /** Stops capturing and releases the session, camera, reader, surface and thread. */
+    public void stop() {
+        isRunning.set(false);
+        isCapturing.set(false);
+        // Snapshot and clear the fields first so concurrent callbacks see them as gone.
+        CameraCaptureSession session = captureSession;
+        CameraDevice device = cameraDevice;
+        ImageReader reader = imageReader;
+        Surface preview = previewSurface;
+        captureSession = null;
+        cameraDevice = null;
+        imageReader = null;
+        previewSurface = null;
+        try {
+            if (session != null) session.close();
+            if (device != null) device.close();
+            if (reader != null) reader.close();
+            if (preview != null) preview.release();
+        } catch (Exception e) {
+            Log.e(TAG, "Stop error: " + e.getMessage());
+        }
+        HandlerThread thread = backgroundThread;
+        if (thread != null && thread.isAlive()) {
+            thread.quitSafely();
+            // Joining from the background thread itself (stop() invoked inside one of
+            // its callbacks) could only time out, so skip it in that case.
+            if (Thread.currentThread() != thread) {
+                try {
+                    thread.join(500);
+                } catch (InterruptedException ignored) {
+                    // Preserve the interrupt for the caller.
+                    Thread.currentThread().interrupt();
+                }
+            }
+            backgroundThread = null;
+        }
+        backgroundHandler = null;
+    }
+
+    /** @return {@code true} from a successful {@link #start()} until {@link #stop()} */
+    public boolean isRunning() {
+        return isRunning.get();
+    }
+
+    /** Receives captured frames and error notifications. */
+    public interface FrameCallback {
+        void onFrame(byte[] jpegData);
+
+        void onError(String message);
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/net/SyncDownloadFile.java b/test/src/main/java/ai/guiji/duix/test/net/SyncDownloadFile.java
new file mode 100644
index 0000000..b2dedf8
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/net/SyncDownloadFile.java
@@ -0,0 +1,103 @@
+package ai.guiji.duix.test.net;
+
+import android.util.Log;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+
+import ai.guiji.duix.test.App;
+import okhttp3.Call;
+import okhttp3.Request;
+import okhttp3.Response;
+import okhttp3.ResponseBody;
+
+/**
+ * Synchronously downloads a small file. It blocks on network and disk I/O, so do not
+ * call it directly on the UI thread.
+ */
+public class SyncDownloadFile {
+
+    private String url;
+    private String path;
+    private Callback callback;
+
+    /**
+     * @param url      address to fetch
+     * @param path     destination file; data is first written to {@code path + ".tmp"}
+     * @param callback optional progress listener; may be {@code null}
+     */
+    public SyncDownloadFile(String url, String path, Callback callback) {
+        this.url = url;
+        this.path = path;
+        this.callback = callback;
+    }
+
+    /**
+     * Fetches the URL into a temporary file and renames it to the destination path.
+     *
+     * @return {@code true} only if the server answered 200, the body was written completely
+     *         and the rename succeeded; {@code false} otherwise (exceptions are logged)
+     */
+    public boolean download() {
+        File tmpFile = new File(path + ".tmp");
+        boolean success = false;
+        // try-with-resources closes the response on every path; before, it leaked on
+        // non-200 answers and both streams leaked when reading or writing threw.
+        try (Response response = openCall().execute()) {
+            ResponseBody body = response.body();
+            if (response.code() != 200 || body == null) {
+                return false;
+            }
+            File parent = tmpFile.getParentFile();
+            if (parent != null && !parent.exists() && !parent.mkdirs()) {
+                return false;
+            }
+            if (tmpFile.exists()) {
+                tmpFile.delete();
+            }
+            // -1 when the server does not announce the body size.
+            long contentLength = body.contentLength();
+            try (InputStream is = body.byteStream();
+                 FileOutputStream fileOutputStream = new FileOutputStream(tmpFile)) {
+                long downloadLength = 0;
+                int len;
+                byte[] data = new byte[8192];
+                while ((len = is.read(data)) != -1) {
+                    fileOutputStream.write(data, 0, len);
+                    downloadLength += len;
+                    // A percentage only exists for a known, positive total size; an
+                    // unknown size (-1) used to yield negative progress values.
+                    if (callback != null && contentLength > 0) {
+                        callback.onProgress((int) (downloadLength * 100 / contentLength));
+                    }
+                }
+                fileOutputStream.flush();
+            }
+            success = tmpFile.renameTo(new File(path));
+            return success;
+        } catch (Exception e) {
+            Log.e("123", "SyncDownloadFile error: " + e.getMessage());
+            return false;
+        } finally {
+            // Never leave a partial download behind.
+            if (!success && tmpFile.exists()) {
+                tmpFile.delete();
+            }
+        }
+    }
+
+    /** Builds the OkHttp call for the URL using the client from {@code App.getOkHttpClient()}. */
+    private Call openCall() {
+        Request request = new Request.Builder()
+                .url(url)
+                .build();
+        return App.getOkHttpClient().newCall(request);
+    }
+
+    /** Receives download progress as an integer percentage. */
+    public interface Callback {
+        void onProgress(int progress);
+    }
+
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/realtime/OmniRealtimeClient.java b/test/src/main/java/ai/guiji/duix/test/realtime/OmniRealtimeClient.java
new file mode 100644
index 0000000..c09f74b
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/realtime/OmniRealtimeClient.java
@@ -0,0 +1,356 @@
+package ai.guiji.duix.test.realtime;
+
+import android.util.Base64;
+import android.util.Log;
+
+import org.json.JSONException;
+import org.json.JSONObject;
+
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.Response;
+import okhttp3.WebSocket;
+import okhttp3.WebSocketListener;
+
+/**
+ * WebSocket client for Qwen-Omni-Realtime, wired to the
+ * qwen3-omni-flash-realtime-2025-12-01 model.
+ */
+public class OmniRealtimeClient {
+
+    private static final String TAG = "OmniRealtimeClient";
+
+    // Demo configuration (hard-coded).
+    // SECURITY: a credential committed to source control is readable by anyone who has the
+    // repository or the APK. Rotate this key and supply the real one through
+    // OmniRealtimeClient(String, Callback) instead of relying on this default.
+    private static final String API_KEY = "sk-7c32083ace99472e97209e611b9ecbb1";
+    private static final String WS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=qwen3-omni-flash-realtime-2025-12-01";
+
+    private final OkHttpClient okHttpClient;
+    private final String apiKey;
+    // Read by sendEvent() on whichever thread calls it and replaced by connect() and
+    // disconnect(), hence volatile.
+    private volatile WebSocket webSocket;
+    private final Callback callback;
+    private final AtomicBoolean isConnected = new AtomicBoolean(false);
+    private final AtomicBoolean userSpeaking = new AtomicBoolean(false);
+    // Keeps event ids unique even when several events are built in the same millisecond.
+    private final java.util.concurrent.atomic.AtomicLong eventSequence =
+            new java.util.concurrent.atomic.AtomicLong();
+
+    /** Creates a client that authenticates with the built-in demo key. */
+    public OmniRealtimeClient(Callback callback) {
+        this(API_KEY, callback);
+    }
+
+    /**
+     * @param apiKey   key sent in the {@code Authorization: Bearer} header
+     * @param callback receiver of connection, transcript and audio events; may be {@code null}
+     */
+    public OmniRealtimeClient(String apiKey, Callback callback) {
+        this.apiKey = apiKey;
+        this.callback = callback;
+        this.okHttpClient = new OkHttpClient.Builder()
+                .connectTimeout(30, TimeUnit.SECONDS)
+                .readTimeout(120, TimeUnit.MINUTES)
+                .writeTimeout(30, TimeUnit.SECONDS)
+                .pingInterval(30, TimeUnit.SECONDS)
+                .build();
+    }
+
+    /** Opens the WebSocket unless already connected; outcomes arrive through the callback. */
+    public void connect() {
+        if (isConnected.get()) {
+            Log.w(TAG, "Already connected");
+            return;
+        }
+        Request request = new Request.Builder()
+                .url(WS_URL)
+                .addHeader("Authorization", "Bearer " + apiKey)
+                .build();
+
+        webSocket = okHttpClient.newWebSocket(request, new WebSocketListener() {
+            @Override
+            public void onOpen(WebSocket webSocket, Response response) {
+                Log.i(TAG, "WebSocket connected");
+                isConnected.set(true);
+                sendSessionUpdate();
+                if (callback != null) {
+                    callback.onConnected();
+                }
+            }
+
+            @Override
+            public void onMessage(WebSocket webSocket, String text) {
+                handleMessage(text);
+            }
+
+            @Override
+            public void onFailure(WebSocket webSocket, Throwable t, Response response) {
+                Log.e(TAG, "WebSocket error: " + t.getMessage());
+                isConnected.set(false);
+                if (callback != null) {
+                    callback.onError("连接失败: " + t.getMessage());
+                }
+            }
+
+            @Override
+            public void onClosing(WebSocket webSocket, int code, String reason) {
+                Log.i(TAG, "WebSocket closing: " + code + " " + reason);
+                // Answer a server-initiated close so the closing handshake completes;
+                // otherwise onClosed() is not delivered until this side closes as well.
+                webSocket.close(1000, null);
+            }
+
+            @Override
+            public void onClosed(WebSocket webSocket, int code, String reason) {
+                Log.i(TAG, "WebSocket closed: " + code + " " + reason);
+                isConnected.set(false);
+                if (callback != null) {
+                    callback.onDisconnected();
+                }
+            }
+        });
+    }
+
+    /** Sends the session.update event: modalities, voice, audio formats, instructions, server VAD. */
+    private void sendSessionUpdate() {
+        try {
+            JSONObject session = new JSONObject();
+            session.put("modalities", new org.json.JSONArray().put("text").put("audio"));
+            session.put("voice", "Ethan");
+            session.put("input_audio_format", "pcm");
+            session.put("output_audio_format", "pcm");
+            session.put("instructions", "你是一名面向中国老年群体的心理陪伴 AI。你的职责是为老人提供温和、耐心、尊重、稳定的情感陪伴、情绪支持、回忆唤起、生活关怀和风险识别服务。你不是医生，也不是心理治疗师，不能替代专业诊疗，但应在必要时建议联系家属、社区、医生或急救资源。\n" +
+                    "\n" +
+                    "请始终遵循以下原则：\n" +
+                    "\n" +
+                    "先共情，再回应问题；\n" +
+                    "\n" +
+                    "先接住情绪，再给建议；\n" +
+                    "\n" +
+                    "使用简短、朴素、生活化的中文，始终用“您”称呼；\n" +
+                    "\n" +
+                    "不说教、不训诫、不否定、不敷衍；\n" +
+                    "\n" +
+                    "不把老人当小孩，不制造虚假亲密；\n" +
+                    "\n" +
+                    "一次不提供超过两个建议；\n" +
+                    "\n" +
+                    "鼓励老人表达感受，可适度引导回忆人生经历、兴趣、家庭和过往成就；\n" +
+                    "\n" +
+                    "对孤独、丧偶、被忽视、无价值感、失眠、焦虑、怕拖累家人等议题保持高度敏感；\n" +
+                    "\n" +
+                    "若出现自杀、自伤、重度绝望、幻觉妄想、意识混乱、虐待、严重医疗异常等风险，必须立即建议联系真人帮助和专业机构；\n" +
+                    "\n" +
+                    "不编造医疗知识、政策信息、机构电话，不确定时明确说明。\n" +
+                    "\n" +
+                    "你的回复风格应温和、平稳、尊重、有耐心。每次回答尽量包含：一句共情、一句理解或复述、一句温和追问或一个微小建议。核心目标是让老人感到：我被听见了，我被尊重了，有人愿意陪我。");
+
+            // Server-side VAD mode
+            JSONObject turnDetection = new JSONObject();
+            turnDetection.put("type", "server_vad");
+            turnDetection.put("threshold", 0.5);
+            turnDetection.put("silence_duration_ms", 800);
+            session.put("turn_detection", turnDetection);
+
+            JSONObject event = new JSONObject();
+            event.put("event_id", nextEventId());
+            event.put("type", "session.update");
+            event.put("session", session);
+
+            sendEvent(event);
+        } catch (JSONException e) {
+            Log.e(TAG, "Failed to build session.update: " + e.getMessage());
+        }
+    }
+
+    /** Parses one server event and dispatches it to the matching callback; unknown types are logged. */
+    private void handleMessage(String text) {
+        try {
+            JSONObject event = new JSONObject(text);
+            String type = event.optString("type", "");
+
+            switch (type) {
+                case "session.created":
+                case "session.updated":
+                    Log.d(TAG, "Session: " + type);
+                    break;
+
+                case "input_audio_buffer.speech_started":
+                    Log.d(TAG, "User started speaking");
+                    userSpeaking.set(true);
+                    if (callback != null) {
+                        callback.onUserSpeechStarted();
+                    }
+                    break;
+
+                case "input_audio_buffer.speech_stopped":
+                    Log.d(TAG, "User stopped speaking");
+                    userSpeaking.set(false);
+                    if (callback != null) {
+                        callback.onUserSpeechStopped();
+                    }
+                    break;
+
+                case "conversation.item.input_audio_transcription.completed":
+                    String userTranscript = event.optString("transcript", "");
+                    Log.d(TAG, "User said: " + userTranscript);
+                    if (callback != null) {
+                        callback.onUserTranscript(userTranscript);
+                    }
+                    break;
+
+                case "response.audio_transcript.delta":
+                    String textDelta = event.optString("delta", "");
+                    if (callback != null && !textDelta.isEmpty()) {
+                        callback.onAssistantTextDelta(textDelta);
+                    }
+                    break;
+
+                case "response.audio_transcript.done":
+                    String fullTranscript = event.optString("transcript", "");
+                    Log.d(TAG, "Assistant said: " + fullTranscript);
+                    if (callback != null) {
+                        callback.onAssistantTranscriptDone(fullTranscript);
+                    }
+                    break;
+
+                case "response.audio.delta":
+                    String audioB64 = event.optString("delta", "");
+                    if (!audioB64.isEmpty() && callback != null && !userSpeaking.get()) {
+                        byte[] audioData;
+                        try {
+                            audioData = Base64.decode(audioB64, Base64.NO_WRAP);
+                        } catch (IllegalArgumentException e) {
+                            // Malformed base64 must not propagate out of the listener.
+                            Log.e(TAG, "Invalid audio payload: " + e.getMessage());
+                            break;
+                        }
+                        callback.onAssistantAudioDelta(audioData);
+                    }
+                    break;
+
+                case "response.audio.done":
+                    Log.d(TAG, "Audio generation done");
+                    if (callback != null) {
+                        callback.onAssistantAudioDone();
+                    }
+                    break;
+
+                case "response.done":
+                    Log.d(TAG, "Response done");
+                    if (callback != null) {
+                        callback.onResponseDone();
+                    }
+                    break;
+
+                case "error":
+                    JSONObject errorObj = event.optJSONObject("error");
+                    String errMsg = errorObj != null
+                            ? errorObj.optString("message", "Unknown error") : "Unknown error";
+                    Log.e(TAG, "Server error: " + errMsg);
+                    if (callback != null) {
+                        callback.onError(errMsg);
+                    }
+                    break;
+
+                default:
+                    Log.v(TAG, "Unhandled event: " + type);
+            }
+        } catch (JSONException e) {
+            Log.e(TAG, "Parse message error: " + e.getMessage());
+        }
+    }
+
+    /** Sends a base64-encoded PCM chunk as an input_audio_buffer.append event; dropped when not connected. */
+    public void appendAudio(byte[] pcmData) {
+        if (!isConnected.get() || webSocket == null) return;
+        try {
+            String audioB64 = Base64.encodeToString(pcmData, Base64.NO_WRAP);
+            JSONObject event = new JSONObject();
+            event.put("event_id", nextEventId());
+            event.put("type", "input_audio_buffer.append");
+            event.put("audio", audioB64);
+            sendEvent(event);
+        } catch (JSONException e) {
+            Log.e(TAG, "Failed to append audio: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Appends image data to the image buffer (used for video calls).
+     * Image format: JPG/JPEG, 480P or 720P recommended, at most 500 KB per image.
+     */
+    public void appendImage(byte[] jpegData) {
+        if (!isConnected.get() || webSocket == null) return;
+        try {
+            String imageB64 = Base64.encodeToString(jpegData, Base64.NO_WRAP);
+            JSONObject event = new JSONObject();
+            event.put("event_id", nextEventId());
+            event.put("type", "input_image_buffer.append");
+            event.put("image", imageB64);
+            sendEvent(event);
+        } catch (JSONException e) {
+            Log.e(TAG, "Failed to append image: " + e.getMessage());
+        }
+    }
+
+    /** Serialises the event and sends it over the socket; dropped when not connected. */
+    private void sendEvent(JSONObject event) {
+        // Read the field once: disconnect() may null it between a check and the send.
+        WebSocket socket = webSocket;
+        if (socket != null && isConnected.get()) {
+            socket.send(event.toString());
+        }
+    }
+
+    /** Builds an event id from the wall-clock millis plus a per-client sequence number. */
+    private String nextEventId() {
+        return "event_" + System.currentTimeMillis() + "_" + eventSequence.incrementAndGet();
+    }
+
+    /** Closes the socket with code 1000 and marks the client as disconnected. */
+    public void disconnect() {
+        WebSocket socket = webSocket;
+        webSocket = null;
+        if (socket != null) {
+            socket.close(1000, "bye");
+        }
+        isConnected.set(false);
+    }
+
+    /** @return whether the socket is currently flagged as open */
+    public boolean isConnected() {
+        return isConnected.get();
+    }
+
+    /** Receives connection state changes and the events parsed from server messages. */
+    public interface Callback {
+        void onConnected();
+
+        void onDisconnected();
+
+        void onError(String message);
+
+        void onUserSpeechStarted();
+
+        void onUserSpeechStopped();
+
+        void onUserTranscript(String transcript);
+
+        void onAssistantTextDelta(String delta);
+
+        void onAssistantTranscriptDone(String transcript);
+
+        void onAssistantAudioDelta(byte[] pcmData);
+
+        void onAssistantAudioDone();
+
+        void onResponseDone();
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/render/DebugSink.java b/test/src/main/java/ai/guiji/duix/test/render/DebugSink.java
new file mode 100644
index 0000000..65dcd41
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/render/DebugSink.java
@@ -0,0 +1,24 @@
+package ai.guiji.duix.test.render;
+
+import ai.guiji.duix.sdk.client.bean.ImageFrame;
+import ai.guiji.duix.sdk.client.render.RenderSink;
+
+/** {@link RenderSink} that hands every frame it receives to an optional {@link VideoFrameCallback}. */
+public class DebugSink implements RenderSink {
+    /** Receiver of forwarded frames; frames are dropped while this is {@code null}. */
+    VideoFrameCallback callback;
+
+    public DebugSink(VideoFrameCallback callback) {
+        this.callback = callback;
+    }
+
+    @Override
+    public void onVideoFrame(ImageFrame imageFrame) {
+        if (callback == null) {
+            return;
+        }
+        callback.onVideoFrame(imageFrame);
+    }
+    /** Listener notified with each {@link ImageFrame} passed to {@link DebugSink#onVideoFrame}. */
+    public interface VideoFrameCallback { void onVideoFrame(ImageFrame imageFrame); }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/service/StorageService.java b/test/src/main/java/ai/guiji/duix/test/service/StorageService.java
new file mode 100644
index 0000000..6bd407c
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/service/StorageService.java
@@ -0,0 +1,216 @@
+package ai.guiji.duix.test.service;
+
+import android.content.Context;
+import android.content.res.AssetManager;
+import android.os.Environment;
+import android.util.Log;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
+
+import ai.guiji.duix.sdk.client.util.MD5Util;
+import ai.guiji.duix.test.net.SyncDownloadFile;
+import ai.guiji.duix.test.util.StringUtils;
+import ai.guiji.duix.test.util.ZipUtil;
+
+
+public class StorageService {
+
+    public interface Callback { // progress and result listener for downloadAndUnzip() and unpack()
+        /** Passed to SyncDownloadFile as its progress listener; NOTE(review): value range is not visible here. */
+        void onDownloadProgress(int progress);
+        /** Passed to ZipUtil.unzip as its progress listener; NOTE(review): value range is not visible here. */
+        void onUnzipProgress(int progress);
+        /** @param path downloadAndUnzip reports its targetPath argument, unpack the absolute path of the copy */
+        void onComplete(String path);
+        /** @param msg short description of what failed */
+        void onError(String msg);
+    }
+
+    /**
+     * 下载zip文件并解压
+     *
+     * @param context
+     * @param url
+     * @param targetPath
+     * @param uuid
+     * @param callback
+     */
+    public static void downloadAndUnzip(Context context, String url, String targetPath, String uuid, Callback callback, boolean deleteZip) {
+        Executor executor = Executors.newSingleThreadExecutor();
+        executor.execute(() -> {
+            File cacheDir = context.getExternalCacheDir();
+            if (!cacheDir.exists()) {
+                cacheDir.mkdirs();
+            }
+            File zipFile = new File(cacheDir, MD5Util.string2MD5(url));
+            boolean result = true;
+            if (!zipFile.exists()) {
+                Log.d("123", "zip not found, try download.");
+                result = new SyncDownloadFile(url, zipFile.getAbsolutePath(), callback::onDownloadProgress).download();
+                Log.d("123", "download file success.");
+            } else {
+                Log.d("123", "found cache zip file.");
+            }
+            if (result) {
+                Log.e("123", "try unzip file.");
+                File targetDirFile = new File(targetPath);
+                if (targetDirFile.exists()) {
+                    Log.e("123", "delete old files.");
+                    deleteContents(targetDirFile);
+                }
+                // 拿到目标路径的父级
+                File targetParentDir = new File(targetPath).getParentFile();
+                if (!targetParentDir.exists()) {
+                    targetParentDir.mkdirs();
+                }
+                result = ZipUtil.unzip(zipFile.getAbsolutePath(), targetParentDir.getAbsolutePath(), callback::onUnzipProgress);
+                if (result) {
+                    Log.d("123", "unzip file complete.");
+                    // 这里时候targetDirFile应该是存在的
+                    if (targetDirFile.exists()) {
+                        File uuidFile = new File(targetDirFile, "uuid");
+                        try {
+                            OutputStream out = new FileOutputStream(uuidFile);
+                            byte[] uuidBytes = uuid.getBytes();
+                            out.write(uuidBytes, 0, uuidBytes.length);
+                            out.flush();
+                            out.close();
+                            if (deleteZip && zipFile.exists()){
+                                zipFile.delete();
+                            }
+                            callback.onComplete(targetPath);
+                        } catch (Exception e) {
+                            callback.onError("touch uuid file error!");
+                        }
+                    } else {
+                        callback.onError("unzip dir not found!");
+                    }
+                } else {
+                    if (zipFile.exists()){
+                        zipFile.delete();
+                    }
+                    callback.onError("unzip file error!");
+                }
+            } else {
+                callback.onError("zip file download error");
+            }
+        });
+    }
+
+    /**
+     * Copies an asset directory to external storage on a one-shot background thread.
+     * @param context    source of the AssetManager and of the external files directory
+     * @param sourcePath path inside the assets directory
+     * @param targetPath path relative to the app's external files directory
+     * @param callback   onComplete receives the absolute path of the copy, onError the failure text
+     */
+    public static void unpack(Context context, String sourcePath, final String targetPath, final Callback callback) {
+        java.util.concurrent.ExecutorService executor = Executors.newSingleThreadExecutor();
+        executor.execute(() -> {
+            try {
+                callback.onComplete(sync(context, sourcePath, targetPath));
+            } catch (final IOException e) {
+                callback.onError("拷贝文件异常: " + e);
+            }
+        });
+        executor.shutdown(); // the queued task still runs; without this each call keeps an idle worker thread alive
+    }
+
+    /** Recursively deletes everything inside {@code dir} (but not {@code dir} itself); true if nothing failed. */
+    public static boolean deleteContents(File dir) {
+        final File[] entries = dir.listFiles();
+        if (entries == null) {
+            return true; // not a listable directory: nothing to delete
+        }
+        boolean allDeleted = true;
+        for (final File entry : entries) {
+            // Directories are emptied first, since File.delete() refuses non-empty directories.
+            final boolean emptied = !entry.isDirectory() || deleteContents(entry);
+            final boolean removed = entry.delete();
+            allDeleted = allDeleted && emptied && removed;
+        }
+        return allDeleted;
+    }
+
+    /**
+     * Copies the asset tree {@code sourcePath} into {@code <externalFilesDir>/targetPath} unless the
+     * copy already there carries the same {@code uuid} marker; returns the absolute path of the copy.
+     */
+    private static String sync(Context context, String sourcePath, String targetPath) throws IOException {
+        AssetManager assetManager = context.getAssets();
+        File externalFilesDir = context.getExternalFilesDir(null);
+        if (externalFilesDir == null) {
+            throw new IOException("cannot get external files dir, "
+                    + "external storage state is " + Environment.getExternalStorageState());
+        }
+        File targetDir = new File(externalFilesDir, targetPath);
+        String resultPath = new File(targetDir, sourcePath).getAbsolutePath();
+        String sourceUUID;
+        try (InputStream in = assetManager.open(sourcePath + "/uuid")) {
+            sourceUUID = readLine(in);
+        }
+        try (InputStream in = new FileInputStream(new File(targetDir, sourcePath + "/uuid"))) {
+            // readLine() yields null for an empty marker file, so never call equals() on its result
+            if (sourceUUID != null && sourceUUID.equals(readLine(in))) return resultPath;
+        } catch (FileNotFoundException ignored) {
+            // no marker yet: nothing usable has been unpacked, fall through to a fresh copy
+        }
+        deleteContents(targetDir);
+        copyAssets(assetManager, sourcePath, targetDir);
+        copyFile(assetManager, sourcePath + "/uuid", targetDir); // copyAssets skips uuid, so it is written last
+        return resultPath;
+    }
+
+    private static String readLine(InputStream is) throws IOException { // first line, or null when the stream is empty
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is))) { return reader.readLine(); } // also closes is
+    }
+
+    private static void copyAssets(AssetManager assetManager, String path, File outPath) throws IOException {
+        String[] assets = assetManager.list(path);
+        if (assets == null) {
+            return;
+        }
+        if (assets.length == 0) {
+            if (!path.endsWith("uuid"))
+                copyFile(assetManager, path, outPath);
+        } else {
+            File dir = new File(outPath, path);
+            if (!dir.exists()) {
+                Log.d("123", "Making directory " + dir.getAbsolutePath());
+                if (!dir.mkdirs()) {
+                    Log.d("123", "Failed to create directory " + dir.getAbsolutePath());
+                }
+            }
+            for (String asset : assets) {
+                copyAssets(assetManager, path + "/" + asset, outPath);
+            }
+        }
+    }
+
+    /**
+     * Copies the asset {@code fileName} to {@code outPath/fileName}; the parent directory must exist.
+     * Both streams are closed even if the copy fails part-way.
+     */
+    private static void copyFile(AssetManager assetManager, String fileName, File outPath) throws IOException {
+        Log.d("123", "Copy " + fileName + " to " + outPath);
+        try (InputStream in = assetManager.open(fileName);
+             OutputStream out = new FileOutputStream(outPath + "/" + fileName)) {
+            byte[] buffer = new byte[4000];
+            int read;
+            while ((read = in.read(buffer)) != -1) {
+                out.write(buffer, 0, read);
+            }
+        }
+    }
+
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/activity/BaseActivity.java b/test/src/main/java/ai/guiji/duix/test/ui/activity/BaseActivity.java
new file mode 100644
index 0000000..a3b1597
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/activity/BaseActivity.java
@@ -0,0 +1,119 @@
+package ai.guiji.duix.test.ui.activity;
+
+import android.content.pm.PackageManager;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Message;
+import android.util.Log;
+import android.view.WindowManager;
+
+import androidx.activity.result.ActivityResultLauncher;
+import androidx.activity.result.contract.ActivityResultContracts;
+import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+import androidx.appcompat.app.AppCompatActivity;
+import androidx.core.content.ContextCompat;
+
+import java.util.ArrayList;
+import java.util.List;
+
+
+public abstract class BaseActivity extends AppCompatActivity implements Handler.Callback {
+
+    public final String TAG = getClass().getName();
+    protected BaseActivity mContext;
+    protected Handler mHandler;
+
+    @Override
+    protected void onCreate(@Nullable Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        mContext = this;
+        HandlerThread mHandlerThread = new HandlerThread(TAG);
+        mHandlerThread.start();
+        mHandler = new Handler(mHandlerThread.getLooper(), this);
+    }
+
+    @Override
+    protected void onDestroy() {
+        super.onDestroy();
+        Log.i(TAG, "onDestroy");
+        // Quit the looper of the background thread started in onCreate(); skip if it never existed.
+        if (mHandler == null || mHandler.getLooper() == null) return;
+        mHandler.getLooper().quit();
+    }
+
+    @Override
+    public boolean handleMessage(@NonNull Message msg) { // Handler.Callback entry point for mHandler
+        onMessage(msg); // subclasses process the message in onMessage()
+        return false;
+    }
+
+    // Hook for subclasses: receives every message that handleMessage() gets; no-op by default (could be abstract).
+    protected void onMessage(@NonNull Message msg) {
+        // intentionally empty
+    }
+
+    protected void keepScreenOn() { // adds FLAG_KEEP_SCREEN_ON to this activity's window
+        getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+    }
+
+    // Permissions and request code of the most recent requestPermission() call (not saved across recreation).
+    private String[] mRequestPermissions;
+    private int mRequestPermissionCode;
+    // Launcher for the system permission dialog; reports the overall outcome through permissionsGet().
+    ActivityResultLauncher<String[]> permissionLauncher = registerForActivityResult(new ActivityResultContracts.RequestMultiplePermissions(),
+            result -> {
+                // A result can be delivered to a recreated activity, where no request has been recorded.
+                if (mRequestPermissions == null) {
+                    return;
+                }
+                boolean allGranted = true;
+                for (String permission : mRequestPermissions) {
+                    // Re-check every originally requested permission, not only the ones in this result.
+                    if (permission != null && ContextCompat.checkSelfPermission(mContext, permission)
+                            != PackageManager.PERMISSION_GRANTED) {
+                        allGranted = false;
+                    }
+                }
+                permissionsGet(allGranted, mRequestPermissionCode);
+            });
+
+    /**
+     * Requests runtime permissions and reports the outcome through {@link #permissionsGet(boolean, int)}.
+     * Reports success immediately when {@code permissions} is null, on pre-M devices, or when
+     * everything is already granted; otherwise only the missing permissions are requested.
+     *
+     * @param permissions permissions to ensure; null entries are skipped
+     * @param code        caller-chosen request code echoed back to permissionsGet
+     */
+    public void requestPermission(String[] permissions, int code) {
+        if (null == permissions || Build.VERSION.SDK_INT < Build.VERSION_CODES.M) {
+            permissionsGet(true, code);
+            return;
+        }
+        mRequestPermissions = permissions;
+        mRequestPermissionCode = code;
+        List<String> missing = new ArrayList<>();
+        for (String permission : permissions) {
+            // Null entries are ignored, as in the launcher callback; checking a null permission would throw.
+            if (permission != null && ContextCompat.checkSelfPermission(mContext, permission)
+                    != PackageManager.PERMISSION_GRANTED) {
+                missing.add(permission);
+            }
+        }
+        if (missing.isEmpty()) {
+            permissionsGet(true, code);
+            return;
+        }
+        permissionLauncher.launch(missing.toArray(new String[0]));
+    }
+
+    // Permission result hook: get is true when all requested permissions are granted; code echoes the request code.
+    public void permissionsGet(boolean get, int code) {
+        // no-op by default; subclasses override to react
+    }
+
+
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/activity/CallActivity.kt b/test/src/main/java/ai/guiji/duix/test/ui/activity/CallActivity.kt
new file mode 100644
index 0000000..508ebfa
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/activity/CallActivity.kt
@@ -0,0 +1,490 @@
+package ai.guiji.duix.test.ui.activity
+
+import ai.guiji.duix.sdk.client.Constant
+import ai.guiji.duix.sdk.client.DUIX
+import ai.guiji.duix.sdk.client.loader.ModelInfo
+import ai.guiji.duix.sdk.client.render.DUIXRenderer
+import ai.guiji.duix.sdk.client.thread.RenderThread
+import ai.guiji.duix.test.R
+import ai.guiji.duix.test.audio.AudioResampler
+import ai.guiji.duix.test.audio.StreamingAudioRecorder
+import ai.guiji.duix.test.camera.CameraFrameCapture
+import ai.guiji.duix.test.databinding.ActivityCallBinding
+import ai.guiji.duix.test.realtime.OmniRealtimeClient
+import ai.guiji.duix.test.ui.adapter.MotionAdapter
+import ai.guiji.duix.test.ui.dialog.AudioRecordDialog
+import ai.guiji.duix.test.util.StringUtils
+import android.Manifest
+import android.annotation.SuppressLint
+import android.opengl.GLSurfaceView
+import android.os.Bundle
+import android.os.Handler
+import android.os.Looper
+import android.text.TextUtils
+import android.util.Log
+import android.view.View
+import android.widget.CompoundButton
+import android.widget.Toast
+import androidx.core.content.ContextCompat
+import com.bumptech.glide.Glide
+import java.io.File
+import java.io.FileInputStream
+import java.io.FileOutputStream
+
+
+class CallActivity : BaseActivity() {
+
+    companion object {
+        const val GL_CONTEXT_VERSION = 2
+    }
+
+    private var modelUrl = ""
+    private var debug = false
+    private var mMessage = ""
+
+    /**
+     * Prepends a timestamped line to the on-screen debug log; does nothing unless debug mode is on.
+     */
+    @SuppressLint("SetTextI18n")
+    private fun applyMessage(msg: String) {
+        if (!debug) return
+        runOnUiThread {
+            binding.tvDebug.visibility = View.VISIBLE
+            // Start over once the log exceeds 10000 characters so the text does not grow without bound.
+            val history = if (mMessage.length > 10000) "" else mMessage
+            mMessage = "${StringUtils.dateToStringMS4()} $msg\n$history"
+            binding.tvDebug.text = mMessage
+        }
+    }
+
+    private lateinit var binding: ActivityCallBinding
+    private var duix: DUIX? = null
+    private var mDUIXRender: DUIXRenderer? = null
+    private var mModelInfo: ModelInfo?=null     // 加载的模型信息
+
+    // AI 实时对话
+    private var omniClient: OmniRealtimeClient? = null
+    private var streamingRecorder: StreamingAudioRecorder? = null
+    private var cameraFrameCapture: CameraFrameCapture? = null
+    private var isAIConversationActive = false
+    private var isCameraEnabled = false
+    private val mainHandler = Handler(Looper.getMainLooper())
+    private var assistantTextBuffer = StringBuilder()
+    private var hasStartedPushForResponse = false
+
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        keepScreenOn()
+//        val audioManager = mContext.getSystemService(AUDIO_SERVICE) as AudioManager
+//        audioManager.mode = AudioManager.MODE_IN_COMMUNICATION
+//        audioManager.isSpeakerphoneOn = true
+        binding = ActivityCallBinding.inflate(layoutInflater)
+        setContentView(binding.root)
+
+        modelUrl = intent.getStringExtra("modelUrl") ?: "" // empty string when the extra is missing
+        debug = intent.getBooleanExtra("debug", false)
+
+        Glide.with(mContext).load("file:///android_asset/bg/bg1.png").into(binding.ivBg)
+
+        binding.glTextureView.setEGLContextClientVersion(GL_CONTEXT_VERSION)
+        binding.glTextureView.setEGLConfigChooser(8, 8, 8, 8, 16, 0)
+//        binding.glTextureView.preserveEGLContextOnPause = true
+        binding.glTextureView.isOpaque = false
+
+        binding.switchMute.setOnCheckedChangeListener(object : CompoundButton.OnCheckedChangeListener {
+            override fun onCheckedChanged(
+                buttonView: CompoundButton?,
+                isChecked: Boolean,
+            ) {
+                if (isChecked) { // checked = muted
+                    duix?.setVolume(0.0F)
+                } else {
+                    duix?.setVolume(1.0F)
+                }
+            }
+        })
+
+        binding.btnRecord.setOnClickListener {
+            requestPermission(arrayOf(Manifest.permission.RECORD_AUDIO), 1) // code 1: permissionsGet() opens the record dialog
+        }
+
+        binding.btnPlayPCM.setOnClickListener {
+            applyMessage("start play pcm")
+            playPCMStream()
+        }
+
+        binding.btnPlayWAV.setOnClickListener {
+            applyMessage("start play wav")
+            playWAVFile()
+        }
+
+        binding.btnRandomMotion.setOnClickListener {
+            applyMessage("start random motion")
+            duix?.startRandomMotion(true)
+        }
+        binding.btnAIConversation.setOnClickListener {
+            requestPermission(arrayOf(Manifest.permission.RECORD_AUDIO), 2) // code 2: permissionsGet() toggles the AI conversation
+        }
+
+        binding.btnCameraToggle.setOnClickListener {
+            if (isCameraEnabled) {
+                toggleCamera() // switching off needs no permission check
+            } else {
+                requestPermission(arrayOf(Manifest.permission.CAMERA), 3) // code 3: permissionsGet() toggles the camera
+            }
+        }
+
+        mDUIXRender =
+            DUIXRenderer(
+                mContext,
+                binding.glTextureView
+            )
+        binding.glTextureView.setRenderer(mDUIXRender)
+        binding.glTextureView.renderMode =
+            GLSurfaceView.RENDERMODE_WHEN_DIRTY      // must only be called after the renderer has been set
+
+        duix = DUIX(mContext, modelUrl, mDUIXRender) { event, msg, info ->
+            when (event) {
+                Constant.CALLBACK_EVENT_INIT_READY -> {
+                    mModelInfo = info as ModelInfo // NOTE(review): unchecked cast; assumes INIT_READY always carries a ModelInfo
+                    Log.i(TAG, "CALLBACK_EVENT_INIT_READY: $mModelInfo")
+                    initOk()
+                }
+
+                Constant.CALLBACK_EVENT_INIT_ERROR -> {
+                    runOnUiThread {
+                        applyMessage("init error: $msg")
+                        Log.e(TAG, "CALLBACK_EVENT_INIT_ERROR: $msg")
+                        Toast.makeText(mContext, "Initialization exception: $msg", Toast.LENGTH_SHORT).show()
+                    }
+                }
+
+                Constant.CALLBACK_EVENT_AUDIO_PLAY_START -> {
+                    applyMessage("callback audio play start")
+                    Log.i(TAG, "CALLBACK_EVENT_AUDIO_PLAY_START")
+                }
+
+                Constant.CALLBACK_EVENT_AUDIO_PLAY_END -> {
+                    applyMessage("callback audio play end")
+                    Log.i(TAG, "CALLBACK_EVENT_PLAY_END")
+                }
+
+                Constant.CALLBACK_EVENT_AUDIO_PLAY_ERROR -> {
+                    applyMessage("callback audio play error: $msg")
+                    Log.e(TAG, "CALLBACK_EVENT_PLAY_ERROR: $msg")
+                }
+
+                Constant.CALLBACK_EVENT_MOTION_START -> {
+                    applyMessage("callback motion play start")
+                    Log.e(TAG, "CALLBACK_EVENT_MOTION_START")
+                }
+
+                Constant.CALLBACK_EVENT_MOTION_END -> {
+                    applyMessage("callback motion play end")
+                    Log.e(TAG, "CALLBACK_EVENT_MOTION_END")
+                }
+            }
+        }
+        // Rendering status callback
+//        duix?.setReporter(object : RenderThread.Reporter {
+//            override fun onRenderStat(
+//                resultCode: Int,
+//                isLip: Boolean,
+//                useTime: Long,
+//            ) {
+//
+//            }
+//        })
+        applyMessage("start init")
+        duix?.init() // the outcome is reported through the callback above (INIT_READY / INIT_ERROR)
+    }
+
+    /** Called after CALLBACK_EVENT_INIT_READY: enables the controls and lists the model's named motions. */
+    private fun initOk() {
+        Log.i(TAG, "init ok")
+        applyMessage("init ok")
+        runOnUiThread {
+            binding.btnRecord.isEnabled = true
+            binding.btnPlayPCM.isEnabled = true
+            binding.btnPlayWAV.isEnabled = true
+            binding.switchMute.isEnabled = true
+            binding.btnAIConversation.isEnabled = true
+
+            // The raw PCM/WAV playback buttons are only shown in debug mode.
+            if (debug) {
+                binding.btnPlayPCM.visibility = View.VISIBLE
+                binding.btnPlayWAV.visibility = View.VISIBLE
+            }
+
+            val modelInfo = mModelInfo
+            if (modelInfo == null || modelInfo.motionRegions.isEmpty()) {
+                return@runOnUiThread
+            }
+            // Keep only regions with a non-empty name, skipping the placeholder name "unknown".
+            val names = ArrayList<String>()
+            modelInfo.motionRegions
+                .filter { !TextUtils.isEmpty(it.name) && "unknown" != it.name }
+                .forEach { names.add(it.name) }
+            if (names.isNotEmpty()) {
+                binding.rvMotion.adapter = MotionAdapter(names, object : MotionAdapter.Callback {
+                    override fun onClick(name: String, now: Boolean) {
+                        applyMessage("start [${name}] motion")
+                        duix?.startMotion(name, now)
+                    }
+                })
+            }
+            // Random motion is offered whenever the model has any motion region, named or not.
+            binding.btnRandomMotion.visibility = View.VISIBLE
+            binding.tvMotionTips.visibility = View.VISIBLE
+        }
+    }
+
+
+    override fun onDestroy() {
+        super.onDestroy()
+        stopAIConversation() // releases camera, recorder and socket if a conversation is still running
+        duix?.release()
+    }
+
+    /** Streams assets/pcm/2.pcm to the avatar in 320-byte chunks on a background thread. */
+    private fun playPCMStream() {
+        Thread {
+            duix?.startPush()
+            // use {} closes the asset stream even if reading or pushing throws
+            assets.open("pcm/2.pcm").use { input ->
+                val buffer = ByteArray(320)
+                var length: Int
+                while (input.read(buffer).also { length = it } > 0) {
+                    duix?.pushPcm(buffer.copyOfRange(0, length))
+                }
+            }
+            duix?.stopPush()
+        }.start()
+    }
+
+    /**
+     * Plays assets/wav/1.wav through the SDK; the asset is first copied (once) to the external cache directory.
+     */
+    private fun playWAVFile() {
+        Thread {
+            val wavName = "1.wav"
+            val cacheDir = mContext.externalCacheDir
+            if (cacheDir == null) {
+                // Shared storage is not available, so there is nowhere to copy the file to.
+                Log.e(TAG, "play wav failed: external cache dir unavailable")
+                return@Thread
+            }
+            val wavFile = File(cacheDir, wavName)
+            if (!wavFile.exists()) {
+                cacheDir.mkdirs()
+                // copy assets -> sd card; use {} closes both streams even if the copy fails
+                assets.open("wav/$wavName").use { input ->
+                    FileOutputStream(wavFile).use { output -> input.copyTo(output) }
+                }
+            }
+            duix?.playAudio(wavFile.absolutePath)
+        }.start()
+    }
+
+    // Routes a permission result by request code: 1 = record dialog, 2 = AI conversation, 3 = camera.
+    override fun permissionsGet(get: Boolean, code: Int) {
+        super.permissionsGet(get, code)
+        if (code !in 1..3) return // not one of this activity's request codes
+        // Denied: every feature shows the same hint instead of starting.
+        if (!get) return Toast.makeText(mContext, R.string.need_permission_continue, Toast.LENGTH_SHORT).show()
+        when (code) { 1 -> showRecordDialog(); 2 -> toggleAIConversation(); else -> toggleCamera() }
+    }
+
+    /** Starts the AI conversation when idle, stops it when one is running. */
+    private fun toggleAIConversation() {
+        when {
+            isAIConversationActive -> stopAIConversation()
+            else -> startAIConversation()
+        }
+    }
+
+    /**
+     * Opens the realtime session. Once connected, microphone audio is streamed to the service;
+     * replies update the subtitle, and their audio is passed through AudioResampler.resample24kTo16k
+     * and pushed to the avatar. Does nothing but show a toast when the avatar is not ready.
+     */
+    private fun startAIConversation() {
+        if (duix?.isReady() != true) {
+            Toast.makeText(mContext, "数字人未就绪", Toast.LENGTH_SHORT).show()
+            return
+        }
+        applyMessage("AI对话: 连接中...")
+        binding.tvSubtitle.visibility = View.VISIBLE
+        binding.tvSubtitle.text = "正在连接..."
+        binding.btnAIConversation.text = getString(R.string.ai_fab_stop)
+        // Start from a clean state: a flag left over from an interrupted reply would make the next reply skip startPush().
+        hasStartedPushForResponse = false
+
+        omniClient = OmniRealtimeClient(object : OmniRealtimeClient.Callback {
+            override fun onConnected() {
+                mainHandler.post {
+                    // The conversation may have been stopped before this runnable ran: do not start the microphone.
+                    if (!isAIConversationActive) return@post
+                    binding.tvSubtitle.text = "已连接，请对着麦克风说话"
+                    binding.btnCameraToggle.visibility = View.VISIBLE
+                    binding.btnCameraToggle.background = ContextCompat.getDrawable(mContext, R.drawable.shape_circle_camera_off)
+                    binding.btnCameraToggle.contentDescription = getString(R.string.camera_off)
+                    isCameraEnabled = false
+                    applyMessage("AI对话: 已连接")
+                    streamingRecorder = StreamingAudioRecorder(mContext, object : StreamingAudioRecorder.StreamingCallback {
+                        override fun onAudioData(pcmData: ByteArray) {
+                            omniClient?.appendAudio(pcmData)
+                        }
+                        override fun onError(message: String) {
+                            mainHandler.post {
+                                Toast.makeText(mContext, message, Toast.LENGTH_SHORT).show()
+                            }
+                        }
+                    })
+                    streamingRecorder?.start()
+                }
+            }
+
+            override fun onDisconnected() {
+                mainHandler.post { stopAIConversation() }
+            }
+
+            override fun onError(message: String) {
+                mainHandler.post {
+                    Toast.makeText(mContext, "AI错误: $message", Toast.LENGTH_SHORT).show()
+                    binding.tvSubtitle.text = "错误: $message"
+                }
+            }
+
+            override fun onUserSpeechStarted() {
+                mainHandler.post { binding.tvSubtitle.text = "正在聆听..." }
+            }
+
+            override fun onUserSpeechStopped() {
+                mainHandler.post { binding.tvSubtitle.text = "思考中..." }
+            }
+
+            override fun onUserTranscript(transcript: String) {
+                mainHandler.post { binding.tvSubtitle.text = "您: $transcript" }
+            }
+
+            override fun onAssistantTextDelta(delta: String) {
+                mainHandler.post {
+                    assistantTextBuffer.append(delta)
+                    binding.tvSubtitle.text = assistantTextBuffer.toString()
+                }
+            }
+
+            override fun onAssistantTranscriptDone(transcript: String) {
+                mainHandler.post {
+                    assistantTextBuffer.clear()
+                    assistantTextBuffer.append(transcript)
+                    binding.tvSubtitle.text = transcript
+                }
+            }
+
+            override fun onAssistantAudioDelta(pcmData: ByteArray) {
+                if (!hasStartedPushForResponse) {
+                    hasStartedPushForResponse = true
+                    duix?.startPush()
+                }
+                val resampled = AudioResampler.resample24kTo16k(pcmData)
+                if (resampled.isNotEmpty()) {
+                    duix?.pushPcm(resampled)
+                }
+            }
+
+            override fun onAssistantAudioDone() {
+                if (hasStartedPushForResponse) {
+                    duix?.stopPush()
+                    hasStartedPushForResponse = false
+                }
+            }
+
+            override fun onResponseDone() {
+                // Close the push session on the calling thread, like onAssistantAudioDone(); deferring it to the
+                // main thread could end the push that the next reply's first audio chunk has just started.
+                onAssistantAudioDone()
+                mainHandler.post { assistantTextBuffer.clear() }
+            }
+        })
+
+        isAIConversationActive = true
+        omniClient?.connect()
+    }
+
+    /** Tears down camera, microphone, socket and playback, then resets the conversation UI. */
+    private fun stopAIConversation() {
+        isAIConversationActive = false
+        stopCamera()
+        streamingRecorder?.release()
+        streamingRecorder = null
+        omniClient?.disconnect()
+        omniClient = null
+        duix?.stopAudio()
+        assistantTextBuffer.clear()
+        hasStartedPushForResponse = false // an interrupted reply never reaches onAssistantAudioDone(), so reset here
+        mainHandler.post {
+            if (isFinishing || isDestroyed) return@post // the activity is going away: no UI left to update
+            binding.btnAIConversation.text = getString(R.string.ai_fab_start)
+            binding.btnCameraToggle.visibility = View.GONE
+            binding.tvSubtitle.visibility = View.GONE
+            binding.tvSubtitle.text = ""
+            applyMessage("AI对话: 已结束")
+        }
+    }
+
+    private fun toggleCamera() { // flips camera streaming; ignored unless an AI conversation is running
+        if (!isAIConversationActive || omniClient == null) return
+        isCameraEnabled = !isCameraEnabled
+        if (isCameraEnabled) {
+            binding.cameraPreview.visibility = View.VISIBLE
+            cameraFrameCapture = CameraFrameCapture(mContext, object : CameraFrameCapture.FrameCallback {
+                override fun onFrame(jpegData: ByteArray) { omniClient?.appendImage(jpegData) }
+                override fun onError(message: String) {
+                    mainHandler.post {
+                        Toast.makeText(mContext, message, Toast.LENGTH_SHORT).show()
+                        stopCamera() // capture failed: also reset the button below, or it keeps showing "on"
+                        binding.btnCameraToggle.background = ContextCompat.getDrawable(mContext, R.drawable.shape_circle_camera_off)
+                        binding.btnCameraToggle.contentDescription = getString(R.string.camera_off)
+                    }
+                }
+            }, binding.cameraPreview)
+            cameraFrameCapture?.start()
+            binding.btnCameraToggle.background = ContextCompat.getDrawable(mContext, R.drawable.shape_circle_camera_on)
+            binding.btnCameraToggle.contentDescription = getString(R.string.camera_on)
+        } else {
+            stopCamera()
+            binding.btnCameraToggle.background = ContextCompat.getDrawable(mContext, R.drawable.shape_circle_camera_off)
+            binding.btnCameraToggle.contentDescription = getString(R.string.camera_off)
+        }
+    }
+
+    private fun stopCamera() { // null-safe: also callable when no capture is running; does not touch the toggle button
+        cameraFrameCapture?.stop()
+        cameraFrameCapture = null
+        isCameraEnabled = false
+        binding.cameraPreview.visibility = View.GONE // hide the preview view
+    }
+
+    /** Shows the recorder dialog and pushes the file it reports to the avatar in 320-byte chunks. */
+    private fun showRecordDialog() {
+        val audioRecordDialog = AudioRecordDialog(mContext, object : AudioRecordDialog.Listener {
+            override fun onFinish(path: String) {
+                Thread {
+                    duix?.startPush()
+                    // use {} closes the recording even if reading or pushing throws
+                    FileInputStream(path).use { input ->
+                        val buffer = ByteArray(320)
+                        var length: Int
+                        while (input.read(buffer).also { length = it } > 0) {
+                            duix?.pushPcm(buffer.copyOfRange(0, length))
+                        }
+                    }
+                    duix?.stopPush()
+                }.start()
+            }
+        })
+        audioRecordDialog.show()
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/activity/MainActivity.kt b/test/src/main/java/ai/guiji/duix/test/ui/activity/MainActivity.kt
new file mode 100644
index 0000000..15b5ef7
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/activity/MainActivity.kt
@@ -0,0 +1,203 @@
+package ai.guiji.duix.test.ui.activity
+
+import ai.guiji.duix.sdk.client.BuildConfig
+import ai.guiji.duix.sdk.client.VirtualModelUtil
+import ai.guiji.duix.test.R
+import ai.guiji.duix.test.databinding.ActivityMainBinding
+import ai.guiji.duix.test.ui.dialog.LoadingDialog
+import ai.guiji.duix.test.ui.dialog.ModelSelectorDialog
+import android.annotation.SuppressLint
+import android.content.Intent
+import android.os.Bundle
+import android.text.TextUtils
+import android.widget.Toast
+import java.io.File
+
+
+/**
+ * Entry screen of the test app: reads the base-config and model URLs, downloads
+ * whichever resource VirtualModelUtil reports as missing, then opens [CallActivity].
+ */
+class MainActivity : BaseActivity() {
+
+    private lateinit var binding: ActivityMainBinding
+    private var mLoadingDialog: LoadingDialog?=null
+    // Last percentage written to the loading dialog; lets showProgress() skip repeats.
+    private var mLastProgress = 0
+
+    // Preset model archives offered by the model selector dialog.
+    val models = arrayListOf(
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/bendi3_20240518.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/airuike_20240409.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/675429759852613_7f8d9388a4213080b1820b83dd057cfb_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/674402003804229_f6e86fb375c4f1f1b82b24f7ee4e7cb4_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/674400178376773_3925e756433c5a9caa9b9d54147ae4ab_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/674397294927941_6e297e18a4bdbe35c07a6ae48a1f021f_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/674393494597701_f49fcf68f5afdb241d516db8a7d88a7b_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/651705983152197_ccf3256b2449c76e77f94276dffcb293_optim_m80.zip",
+        "https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/627306542239813_1871244b5e6912efc636ba31ea4c5c6d_optim_m80.zip",
+    )
+
+    private var mBaseConfigUrl = ""
+    private var mModelUrl = ""
+
+    @SuppressLint("SetTextI18n")
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        binding = ActivityMainBinding.inflate(layoutInflater)
+        setContentView(binding.root)
+
+        binding.tvSdkVersion.text = "SDK Version: ${BuildConfig.VERSION_NAME}"
+
+        binding.btnMoreModel.setOnClickListener {
+            // Picking a preset only fills the URL field; nothing is downloaded until Play.
+            ModelSelectorDialog(mContext, models, object : ModelSelectorDialog.Listener{
+                override fun onSelect(url: String) {
+                    binding.etUrl.setText(url)
+                }
+            }).show()
+        }
+        binding.btnPlay.setOnClickListener {
+            play()
+        }
+    }
+
+    /**
+     * Reads both URL fields, rejects empty ones with a toast, then starts the check/download chain.
+     */
+    private fun play(){
+        mBaseConfigUrl = binding.etBaseConfig.text.toString()
+        mModelUrl = binding.etUrl.text.toString()
+        if (TextUtils.isEmpty(mBaseConfigUrl)){
+            Toast.makeText(mContext, R.string.base_config_cannot_be_empty, Toast.LENGTH_SHORT).show()
+            return
+        }
+        if (TextUtils.isEmpty(mModelUrl)){
+            Toast.makeText(mContext, R.string.model_url_cannot_be_empty, Toast.LENGTH_SHORT).show()
+            return
+        }
+        checkBaseConfig()
+    }
+
+    /**
+     * Continues with the model check when the base-config check passes, otherwise downloads it first.
+     */
+    private fun checkBaseConfig(){
+        if (VirtualModelUtil.checkBaseConfig(mContext)){
+            checkModel()
+        } else {
+            baseConfigDownload()
+        }
+    }
+
+    /**
+     * Opens the play page when the model check passes, otherwise downloads the model first.
+     */
+    private fun checkModel(){
+        if (VirtualModelUtil.checkModel(mContext, mModelUrl)){
+            jumpPlayPage()
+        } else {
+            modelDownload()
+        }
+    }
+
+    /**
+     * Launches [CallActivity] with the chosen model URL and the state of the debug switch.
+     */
+    private fun jumpPlayPage(){
+        val intent = Intent(mContext, CallActivity::class.java)
+        intent.putExtra("modelUrl", mModelUrl)
+        intent.putExtra("debug", binding.switchDebug.isChecked)
+        startActivity(intent)
+    }
+
+    /**
+     * Replaces any previous loading dialog with a fresh one and shows it.
+     */
+    private fun showLoading(){
+        mLoadingDialog?.dismiss()
+        mLoadingDialog = LoadingDialog(mContext, "Start downloading")
+        mLoadingDialog?.show()
+    }
+
+    /**
+     * Shows "<label>(<percent>%)" in the loading dialog whenever the percentage changes.
+     * Reports with a non-positive [total] are ignored: the previous inline
+     * `current * 100 / total` threw ArithmeticException when total was 0.
+     */
+    private fun showProgress(label: String, current: Long, total: Long){
+        if (total <= 0) return
+        val progress = (current * 100 / total).toInt()
+        if (progress == mLastProgress) return
+        mLastProgress = progress
+        runOnUiThread {
+            if (mLoadingDialog?.isShowing == true){
+                mLoadingDialog?.setContent("$label(${progress}%)")
+            }
+        }
+    }
+
+    /**
+     * Dismisses the loading dialog on the UI thread, then runs [next] on that thread.
+     */
+    private fun finishLoading(next: () -> Unit){
+        runOnUiThread {
+            mLoadingDialog?.dismiss()
+            next()
+        }
+    }
+
+    /**
+     * Downloads the base config with progress feedback, then moves on to the model check.
+     */
+    private fun baseConfigDownload(){
+        showLoading()
+        VirtualModelUtil.baseConfigDownload(mContext, mBaseConfigUrl, object :
+            VirtualModelUtil.ModelDownloadCallback {
+            override fun onDownloadProgress(url: String?, current: Long, total: Long) {
+                showProgress("Config download", current, total)
+            }
+
+            override fun onUnzipProgress(url: String?, current: Long, total: Long) {
+                showProgress("Config unzip", current, total)
+            }
+
+            override fun onDownloadComplete(url: String?, dir: File?) {
+                finishLoading { checkModel() }
+            }
+
+            override fun onDownloadFail(url: String?, code: Int, msg: String?) {
+                finishLoading {
+                    Toast.makeText(mContext, "BaseConfig download error: $msg", Toast.LENGTH_SHORT).show()
+                }
+            }
+        })
+    }
+
+    /**
+     * Downloads the model archive with progress feedback, then opens the play page.
+     */
+    private fun modelDownload(){
+        showLoading()
+        VirtualModelUtil.modelDownload(mContext, mModelUrl, object : VirtualModelUtil.ModelDownloadCallback{
+            override fun onDownloadProgress(url: String?, current: Long, total: Long) {
+                showProgress("Model download", current, total)
+            }
+
+            override fun onUnzipProgress(url: String?, current: Long, total: Long) {
+                showProgress("Model unzip", current, total)
+            }
+
+            override fun onDownloadComplete(url: String?, dir: File?) {
+                finishLoading { jumpPlayPage() }
+            }
+
+            override fun onDownloadFail(url: String?, code: Int, msg: String?) {
+                finishLoading {
+                    Toast.makeText(mContext, "Model download error: $msg", Toast.LENGTH_SHORT).show()
+                }
+            }
+        })
+    }
+
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/adapter/ModelSelectorAdapter.kt b/test/src/main/java/ai/guiji/duix/test/ui/adapter/ModelSelectorAdapter.kt
new file mode 100644
index 0000000..288d7d3
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/adapter/ModelSelectorAdapter.kt
@@ -0,0 +1,39 @@
+package ai.guiji.duix.test.ui.adapter
+
+import ai.guiji.duix.test.databinding.ItemModelSelectorBinding
+import android.annotation.SuppressLint
+import android.view.LayoutInflater
+import android.view.ViewGroup
+import androidx.recyclerview.widget.RecyclerView
+
+
+/** RecyclerView adapter that lists model URLs and reports the tapped one through [Callback]. */
+class ModelSelectorAdapter(
+    private val mList: ArrayList<String>,
+    private val callback: Callback
+) : RecyclerView.Adapter<ModelSelectorAdapter.ItemHolder>() {
+
+    class ItemHolder(val itemBinding: ItemModelSelectorBinding) :
+        RecyclerView.ViewHolder(itemBinding.root)
+
+    override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): ItemHolder {
+        val itemBinding =
+            ItemModelSelectorBinding.inflate(LayoutInflater.from(parent.context), parent, false)
+        return ItemHolder(itemBinding)
+    }
+
+    @SuppressLint("SetTextI18n")
+    override fun onBindViewHolder(holder: ItemHolder, position: Int) {
+        // Capture the item, not `position`: the index can be stale by the time of the click.
+        val url = mList[position]
+        holder.itemBinding.tvModelUrl.text = url
+        holder.itemBinding.tvModelUrl.setOnClickListener { callback.onClick(url) }
+    }
+
+    override fun getItemCount(): Int = mList.size
+
+    /** Receives the URL shown in the row the user tapped. */
+    interface Callback {
+        fun onClick(url: String)
+    }
+}
\ No newline at end of file
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/adapter/MotionAdapter.kt b/test/src/main/java/ai/guiji/duix/test/ui/adapter/MotionAdapter.kt
new file mode 100644
index 0000000..8b24e52
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/adapter/MotionAdapter.kt
@@ -0,0 +1,39 @@
+package ai.guiji.duix.test.ui.adapter
+
+import ai.guiji.duix.test.databinding.ItemMotionButtonBinding
+import android.annotation.SuppressLint
+import android.view.LayoutInflater
+import android.view.ViewGroup
+import androidx.recyclerview.widget.RecyclerView
+
+
+/** RecyclerView adapter that shows one button per motion name and reports taps through [Callback]. */
+class MotionAdapter(
+    private val mList: ArrayList<String>,
+    private val callback: Callback
+) : RecyclerView.Adapter<MotionAdapter.ItemHolder>() {
+
+    class ItemHolder(val itemBinding: ItemMotionButtonBinding) :
+        RecyclerView.ViewHolder(itemBinding.root)
+
+    override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): ItemHolder {
+        val itemBinding =
+            ItemMotionButtonBinding.inflate(LayoutInflater.from(parent.context), parent, false)
+        return ItemHolder(itemBinding)
+    }
+
+    @SuppressLint("SetTextI18n")
+    override fun onBindViewHolder(holder: ItemHolder, position: Int) {
+        // Capture the item, not `position`: the index can be stale by the time of the click.
+        val name = mList[position]
+        holder.itemBinding.btnMotion.text = name
+        holder.itemBinding.btnMotion.setOnClickListener { callback.onClick(name, true) }
+    }
+
+    override fun getItemCount(): Int = mList.size
+
+    /** Receives the tapped motion name; this adapter always passes `now = true`. */
+    interface Callback {
+        fun onClick(name: String, now: Boolean)
+    }
+}
\ No newline at end of file
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/dialog/AudioRecordDialog.kt b/test/src/main/java/ai/guiji/duix/test/ui/dialog/AudioRecordDialog.kt
new file mode 100644
index 0000000..11b1a21
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/dialog/AudioRecordDialog.kt
@@ -0,0 +1,96 @@
+package ai.guiji.duix.test.ui.dialog
+
+import ai.guiji.duix.test.R
+import ai.guiji.duix.test.audio.AudioRecorder
+import ai.guiji.duix.test.databinding.DialogAudioRecordBinding
+import android.annotation.SuppressLint
+import android.app.Dialog
+import android.content.Context
+import android.graphics.Color
+import android.graphics.drawable.ColorDrawable
+import android.os.Bundle
+import android.view.Gravity
+import android.view.MotionEvent
+import android.view.ViewGroup
+import android.view.Window
+import android.widget.Toast
+
+/**
+ * Bottom-aligned press-and-hold recording dialog: a recorder runs while the touch area
+ * is held, and the path it reports on finish is handed to [Listener.onFinish].
+ */
+class AudioRecordDialog(
+    private val mContext: Context,
+    private val listener: Listener
+) : Dialog(mContext, R.style.dialog_center) {
+
+    private var binding: DialogAudioRecordBinding
+    private var audioRecorder: AudioRecorder?=null
+
+    init {
+        requestWindowFeature(Window.FEATURE_NO_TITLE)
+        binding = DialogAudioRecordBinding.inflate(layoutInflater)
+        setContentView(binding.root)
+    }
+
+    @SuppressLint("ClickableViewAccessibility")
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        window?.let {
+            it.setGravity(Gravity.BOTTOM)
+            it.setBackgroundDrawable(ColorDrawable(Color.TRANSPARENT))
+            it.setLayout(ViewGroup.LayoutParams.MATCH_PARENT, ViewGroup.LayoutParams.WRAP_CONTENT)
+        }
+
+        binding.tvTouch.setOnTouchListener { _, event ->
+            when (event?.action) {
+                MotionEvent.ACTION_DOWN -> startRecord()
+                // A cancelled gesture never delivers ACTION_UP, so it has to stop the recorder too.
+                MotionEvent.ACTION_UP, MotionEvent.ACTION_CANCEL -> stopRecord()
+            }
+            false
+        }
+
+        setCancelable(true)
+        setCanceledOnTouchOutside(true)
+    }
+
+    /** Starts a new recording, first releasing any recorder left over from an earlier press. */
+    private fun startRecord(){
+        audioRecorder?.release()
+        audioRecorder = AudioRecorder(mContext, object : AudioRecorder.RecorderCallback{
+            override fun onReadData(data: ByteArray, offsetInBytes: Int, length: Int) {
+            }
+
+            override fun onRecordError(code: Int, message: String) {
+                binding.layoutFrame.post {
+                    Toast.makeText(mContext, message, Toast.LENGTH_SHORT).show()
+                    audioRecorder?.release()
+                }
+            }
+
+            override fun onFinish(path: String) {
+                binding.layoutFrame.post {
+                    audioRecorder?.release()
+                    listener.onFinish(path)
+                    dismiss()
+                }
+            }
+        })
+        audioRecorder?.start()
+    }
+
+    private fun stopRecord(){
+        audioRecorder?.stop()
+    }
+
+    override fun dismiss() {
+        super.dismiss()
+        audioRecorder?.release()
+    }
+
+    /** Receives the path reported by the recorder when a recording finishes. */
+    interface Listener {
+        fun onFinish(path: String)
+    }
+}
\ No newline at end of file
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/dialog/LoadingDialog.kt b/test/src/main/java/ai/guiji/duix/test/ui/dialog/LoadingDialog.kt
new file mode 100644
index 0000000..1b30ca0
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/dialog/LoadingDialog.kt
@@ -0,0 +1,50 @@
+package ai.guiji.duix.test.ui.dialog
+
+import ai.guiji.duix.test.R
+import ai.guiji.duix.test.databinding.DialogLoadingBinding
+import android.app.Dialog
+import android.content.Context
+import android.os.Bundle
+import android.text.TextUtils
+import android.view.Window
+import android.view.animation.Animation
+import android.view.animation.AnimationUtils
+import android.view.animation.LinearInterpolator
+
+
+/** Non-cancelable progress dialog with an animated icon; its binding is created in onCreate, i.e. on first show(). */
+class LoadingDialog(private var mContext: Context, private var content: String = "") :
+    Dialog(mContext, R.style.dialog_center) {
+
+    private lateinit var binding: DialogLoadingBinding
+
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        requestWindowFeature(Window.FEATURE_NO_TITLE)
+        binding = DialogLoadingBinding.inflate(layoutInflater)
+        super.setContentView(binding.root)
+
+        if (!TextUtils.isEmpty(content)) binding.tvContent.text = content
+        setCancelable(false)
+        setCanceledOnTouchOutside(false)
+    }
+
+    /** Updates the status text; before the first show() the text is stored and applied in onCreate. */
+    fun setContent(content: String){
+        this.content = content
+        if (::binding.isInitialized) binding.tvContent.text = content
+    }
+
+    override fun show() {
+        super.show()
+        val animation: Animation = AnimationUtils.loadAnimation(mContext, R.anim.rotate)
+        animation.interpolator = LinearInterpolator()
+        binding.ivProgress.startAnimation(animation)
+    }
+
+    override fun dismiss() {
+        super.dismiss()
+        // dismiss() may be called on a dialog that was never shown, where binding does not exist yet.
+        if (::binding.isInitialized) binding.ivProgress.clearAnimation()
+    }
+}
\ No newline at end of file
diff --git a/test/src/main/java/ai/guiji/duix/test/ui/dialog/ModelSelectorDialog.kt b/test/src/main/java/ai/guiji/duix/test/ui/dialog/ModelSelectorDialog.kt
new file mode 100644
index 0000000..42a306b
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/ui/dialog/ModelSelectorDialog.kt
@@ -0,0 +1,39 @@
+package ai.guiji.duix.test.ui.dialog
+
+import ai.guiji.duix.test.R
+import ai.guiji.duix.test.databinding.DialogModelSelectorBinding
+import ai.guiji.duix.test.ui.adapter.ModelSelectorAdapter
+import android.app.Dialog
+import android.content.Context
+import android.os.Bundle
+import android.view.Window
+
+
+/** Cancelable dialog that lists [models]; the tapped URL is forwarded to [Listener.onSelect]. */
+class ModelSelectorDialog(mContext: Context, val models: ArrayList<String>, private val listener: Listener) :
+    Dialog(mContext, R.style.dialog_center) {
+
+    private lateinit var binding: DialogModelSelectorBinding
+    private var mAdapter: ModelSelectorAdapter?=null
+
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        requestWindowFeature(Window.FEATURE_NO_TITLE)
+        binding = DialogModelSelectorBinding.inflate(layoutInflater)
+        super.setContentView(binding.root)
+        // On a row tap the dialog is dismissed first, then the selection is reported.
+        mAdapter = ModelSelectorAdapter(models, object : ModelSelectorAdapter.Callback{
+            override fun onClick(url: String) {
+                dismiss()
+                listener.onSelect(url)
+            }
+        })
+        binding.rvModels.adapter = mAdapter
+        setCancelable(true)
+        setCanceledOnTouchOutside(true)
+    }
+
+    interface Listener {
+        fun onSelect(url: String) // url: the list entry the user tapped
+    }
+}
\ No newline at end of file
diff --git a/test/src/main/java/ai/guiji/duix/test/util/StringUtils.java b/test/src/main/java/ai/guiji/duix/test/util/StringUtils.java
new file mode 100644
index 0000000..1ddec8e
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/util/StringUtils.java
@@ -0,0 +1,396 @@
+package ai.guiji.duix.test.util;
+
+import static java.lang.Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS;
+import static java.lang.Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS;
+import static java.lang.Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT;
+import static java.lang.Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
+import static java.lang.Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
+import static java.lang.Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B;
+
+import java.math.BigDecimal;
+import java.net.URLEncoder;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class StringUtils {
+
+    /**
+     * Formats a duration in seconds as zero-padded {@code HH:mm:ss}.
+     * Hours are not capped at two digits, so 360000 seconds yields {@code 100:00:00}.
+     * Negative input is not supported.
+     *
+     * @param seconds non-negative duration in seconds
+     * @return the formatted time, e.g. 3661 gives {@code 01:01:01}
+     */
+    public static String secondsToHMS(int seconds) {
+        // Plain division/modulo replaces the former "> 60" guards, which left exactly
+        // 60 seconds (or 60 minutes) unconverted and produced "00:00:60" / "00:60:00".
+        int hour = seconds / 3600;
+        int min = (seconds % 3600) / 60;
+        int sec = seconds % 60;
+        StringBuilder time = new StringBuilder();
+        if (hour < 10) {
+            time.append("0");
+        }
+        time.append(hour);
+        time.append(":");
+        if (min < 10) {
+            time.append("0");
+        }
+        time.append(min);
+        time.append(":");
+        if (sec < 10) {
+            time.append("0");
+        }
+        time.append(sec);
+        return time.toString();
+    }
+
+    /**
+     * Formats a non-negative duration in seconds as zero-padded {@code mm:ss}.
+     * Minutes are not wrapped into hours, so 3600 seconds yields {@code 60:00}.
+     *
+     * @param seconds duration in seconds
+     * @return the formatted time, e.g. 125 gives {@code 02:05}
+     */
+    public static String secondsToMS(int seconds) {
+        // Always split with division/modulo: the former "seconds > 60" guard skipped
+        // exactly 60 seconds and returned "00:60" instead of "01:00".
+        int min = seconds / 60;
+        int sec = seconds % 60;
+        StringBuilder time = new StringBuilder();
+        if (min < 10) {
+            time.append("0");
+        }
+        time.append(min);
+        time.append(":");
+        if (sec < 10) {
+            time.append("0");
+        }
+        time.append(sec);
+        return time.toString();
+    }
+
+    /**
+     * Renders a duration as Chinese text using the largest non-zero unit and the next
+     * smaller one (days+hours, hours+minutes, minutes+seconds), or seconds alone.
+     *
+     * @param mss duration in seconds (no millisecond conversion is performed)
+     * @return e.g. "1天1时", "1时1分钟", "1分钟1秒" or "5秒"
+     */
+    public static String formatDateTime(long mss) {
+        final long secondsPerDay = 60 * 60 * 24;
+        final long secondsPerHour = 60 * 60;
+        long days = mss / secondsPerDay;
+        long hours = (mss % secondsPerDay) / secondsPerHour;
+        long minutes = (mss % secondsPerHour) / 60;
+        long seconds = mss % 60;
+        return days > 0 ? days + "天" + hours + "时"
+                : hours > 0 ? hours + "时" + minutes + "分钟"
+                : minutes > 0 ? minutes + "分钟" + seconds + "秒"
+                : seconds + "秒";
+    }
+
+    /**
+     * Renders a duration as Chinese text using minutes and seconds only; hours and
+     * days are never split off.
+     *
+     * @param mss duration in seconds
+     * @return "N秒" below one minute, "N分钟" for whole minutes, otherwise "N分钟M秒"
+     */
+    public static String formatDateTime2(long mss) {
+        if (mss < 60) {
+            return mss + "秒";
+        }
+        long wholeMinutes = mss / 60;
+        long remainder = mss % 60;
+        String minutePart = wholeMinutes + "分钟";
+        return remainder == 0 ? minutePart : minutePart + remainder + "秒";
+    }
+
+    // Seconds below one minute, otherwise whole minutes only (leftover seconds are dropped).
+    public static String formatDateTime3(long mss) {
+        return mss >= 60 ? (mss / 60) + "分钟" : mss + "秒";
+    }
+
+    // Seconds below one minute, otherwise minutes ("分") plus any leftover seconds.
+    public static String formatDateTime4(long mss) {
+        return mss < 60 ? mss + "秒" : (mss / 60) + "分" + (mss % 60 != 0 ? (mss % 60) + "秒" : "");
+    }
+
+    /**
+     * Formats a duration in seconds as a zero-padded timestamp.
+     *
+     * @param mss duration in seconds
+     * @return {@code HH:mm:ss} when at least one hour long, otherwise {@code mm:ss}
+     */
+    public static String formatVideoDateTime(long mss) {
+        long hours = mss / 3600;
+        long minutes = (mss % 3600) / 60;
+        long seconds = mss % 60;
+        String clock = (minutes < 10 ? "0" : "") + minutes + ":" + (seconds < 10 ? "0" : "") + seconds;
+        if (hours > 0) {
+            clock = (hours < 10 ? "0" : "") + hours + ":" + clock;
+        }
+        return clock;
+    }
+
+    /**
+     * Formats a byte count as a whole number followed by a unit.
+     *
+     * <p>The value is divided by 1024 (integer division, so fractions are dropped) until
+     * the next division would give 0 or the largest unit is reached:
+     * <ul>
+     *   <li>1536 gives "1kb"</li>
+     *   <li>0 gives "0byte"</li>
+     *   <li>1024^4 (1 TiB) gives "1024g", because "g" is the largest unit</li>
+     * </ul>
+     *
+     * @param size size in bytes
+     * @return the truncated value directly followed by "byte", "kb", "mb" or "g"
+     */
+    public static String formatSize(long size) {
+        String[] units = {"byte", "kb", "mb", "g"};
+        long value = size;
+        int unitIndex = 0;
+        // Step up one unit while something remains after dividing and a larger unit exists.
+        while (unitIndex < units.length - 1 && value / 1024 != 0) {
+            value /= 1024;
+            unitIndex++;
+        }
+        return value + units[unitIndex];
+    }
+
+    /**
+     * Builds {@code prefix + yyyyMMddHHmmssSSS + suffix} from the current time.
+     */
+    public static String createFileName(String prefix, String suffix) {
+        String timestamp = new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date());
+        return prefix + timestamp + suffix;
+    }
+
+    /** Formats an epoch-millisecond timestamp as {@code yyyyMMddHHmmssSSS}. */
+    public static String dateToStringMS(long date) {
+        SimpleDateFormat pattern = new SimpleDateFormat("yyyyMMddHHmmssSSS");
+        return pattern.format(new Date(date));
+    }
+
+    /** Returns the current date formatted as {@code yyyy-MM}. */
+    public static String dateToStringMS2() {
+        SimpleDateFormat pattern = new SimpleDateFormat("yyyy-MM");
+        return pattern.format(new Date());
+    }
+
+    /** Returns the current date and time formatted as {@code MM-dd HH:mm}. */
+    public static String dateToStringMS3() {
+        SimpleDateFormat pattern = new SimpleDateFormat("MM-dd HH:mm");
+        return pattern.format(new Date());
+    }
+
+    /** Returns the current time of day formatted as {@code HH:mm:ss.SSS}. */
+    public static String dateToStringMS4() {
+        SimpleDateFormat pattern = new SimpleDateFormat("HH:mm:ss.SSS");
+        return pattern.format(new Date());
+    }
+
+    /**
+     * Drops an all-zero fraction from the end of a numeric string ("12.00" gives "12").
+     * A fraction that contains a non-zero digit is left untouched ("1.50" stays "1.50"),
+     * and text without such a suffix is returned unchanged.
+     *
+     * @param str text to clean, may be null
+     * @return the cleaned text, or "" when {@code str} is null
+     */
+    public static String filterUnUselessZero(String str) {
+        if (str == null) {
+            return "";
+        }
+        Matcher trailing = Pattern.compile("\\.(0+)$").matcher(str);
+        // replace() removes every literal occurrence of the matched ".0…" text.
+        return trailing.find() ? str.replace(trailing.group(0), "") : str;
+    }
+
+    /**
+     * Tells whether a string is null, blank, or parses to the number zero.
+     * @return true for null/blank/zero; false for other numbers and for non-numeric text
+     */
+    public static boolean isNullOrZero(String str) {
+        if (str == null || str.trim().isEmpty()) {
+            return true;
+        }
+        try {
+            return Double.parseDouble(str) == 0;
+        } catch (Exception ignored) {
+            return false; // not a number, therefore not zero
+        }
+    }
+
+    /**
+     * Tells whether a string is null or contains nothing but whitespace.
+     *
+     * @return true when {@code str} is null or trims to the empty string
+     */
+    public static boolean isEmpty(String str) {
+        return str == null || str.trim().isEmpty();
+    }
+
+    /** URL-encodes {@code paramString} as UTF-8; returns "" for null or empty input or when encoding fails. */
+    public String urlEncoded(String paramString) {
+        // isEmpty() compares content; the former == "" only compared object references.
+        if (paramString == null || paramString.isEmpty()) {
+            return "";
+        }
+        try {
+            return URLEncoder.encode(paramString, "UTF-8");
+        } catch (Exception e) {
+            e.printStackTrace();
+            return "";
+        }
+    }
+
+    /**
+     * Numerically compares two decimal strings.
+     * @return {@code v1 >= v2}, or null when either argument cannot be parsed as a number
+     */
+    public static Boolean isEnough(String v1, String v2) {
+        try {
+            return Double.parseDouble(v1) >= Double.parseDouble(v2);
+        } catch (Exception ignored) {
+            return null;
+        }
+    }
+
+    /**
+     * Marks the ranges of {@code str1} that differ from {@code str2}, ignoring punctuation
+     * and symbols: str1 is walked by its original indices while str2 is compared with its
+     * symbols already removed.
+     *
+     * @return alternating [start, end) indices into str1; empty when nothing differs
+     */
+    public static List<Integer> compareTwoStr(String str1, String str2) {
+        List<Integer> mark = new ArrayList<>();
+        String str1WithoutSymbol = removeSymbol(str1);
+        String str2WithoutSymbol = removeSymbol(str2);
+        int symbolCount = 0;
+        char[] str1Char = str1.toCharArray();
+        char[] str2Char = str2WithoutSymbol.toCharArray();
+        for (int i = 0; i < str1Char.length; i++) {
+            if ("".equals(removeSymbol(String.valueOf(str1Char[i])))) {
+                symbolCount++;
+                continue;
+            }
+            // str2 has nothing left to compare (only possible when it is empty after symbol
+            // removal): open a range here instead of indexing past the end of str2Char.
+            if (i - symbolCount >= str2Char.length) {
+                mark.add(i);
+                break;
+            }
+            boolean differs = str1Char[i] != str2Char[i - symbolCount];
+            boolean rangeOpen = mark.size() % 2 != 0;
+            if (differs != rangeOpen) {
+                mark.add(i); // opens a range on the first mismatch, closes it on the next match
+            }
+            // Last character of str2 consumed: stop early.
+            if (i - symbolCount == str2Char.length - 1) {
+                // Flag the rest of str1 only when it still holds non-symbol characters.
+                if (0 == mark.size() % 2 && i + 1 < str1Char.length && !"".equals(removeSymbol(str1.substring(i + 1)))) {
+                    mark.add(i + 1);
+                }
+                break;
+            }
+        }
+        if (0 != mark.size() % 2) {
+            mark.add(str1Char.length);
+        }
+        if (0 == mark.size() && str1WithoutSymbol.length() < str2WithoutSymbol.length()) {
+            mark.add(0);
+            mark.add(str1Char.length);
+        }
+        return mark;
+    }
+
+    /**
+     * Removes every Unicode punctuation (category P) and symbol (category S) character.
+     *
+     * @param input text to strip; must not be null
+     * @return {@code input} without punctuation and symbol characters
+     */
+    public static String removeSymbol(String input) {
+        return Pattern.compile("\\p{P}|\\p{S}").matcher(input).replaceAll("");
+    }
+
+    /** Rounds {@code value} half-up to {@code accuracy} decimal places. */
+    public static double formatDouble(double value, int accuracy) {
+        // valueOf() starts from the shortest decimal text of the double; new BigDecimal(double)
+        // used the exact binary expansion, so 1.005 became 1.00499... and rounded down.
+        return BigDecimal.valueOf(value).setScale(accuracy, java.math.RoundingMode.HALF_UP).doubleValue();
+    }
+
+    /**
+     * Tells whether a string consists only of allowed characters: CJK ideographs in the
+     * range U+4E00..U+9FA5, ASCII letters and ASCII digits.
+     *
+     * <p>The empty string passes, because there is no character that could fail.
+     *
+     * @param str text to check; must not be null
+     * @return true when every char is allowed, false at the first char that is not
+     */
+    public static boolean isCharOrLetter(String str) {
+        Pattern allowed = Pattern.compile("[\\u4e00-\\u9fa5a-zA-Z0-9]*");
+        for (char ch : str.toCharArray()) {
+            // Each char is matched on its own, so a supplementary character (a surrogate
+            // pair) is seen as two surrogates and rejected.
+            if (!allowed.matcher(String.valueOf(ch)).matches()) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Tells whether a char belongs to one of the CJK Unicode blocks treated as Chinese here.
+     *
+     * @param checkChar UTF-16 code unit to classify
+     * @return true when its block is CJK unified ideographs (or extension A / B),
+     *         compatibility ideographs, compatibility forms or radicals supplement
+     */
+    public static boolean checkCharContainChinese(char checkChar) {
+        Character.UnicodeBlock block = Character.UnicodeBlock.of(checkChar);
+        return block == CJK_UNIFIED_IDEOGRAPHS || block == CJK_COMPATIBILITY_IDEOGRAPHS
+                || block == CJK_COMPATIBILITY_FORMS || block == CJK_RADICALS_SUPPLEMENT
+                || block == CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
+                || block == CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B;
+    }
+
+    /**
+     * Re-formats a date string as {@code yyyy-MM-dd}; returns "" when it cannot be parsed.
+     */
+    public static String formatDateString(String time) {
+        SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
+        try {
+            return dayFormat.format(dayFormat.parse(time));
+        } catch (Exception e) {
+            e.printStackTrace();
+            return "";
+        }
+    }
+
+    // Strips trailing zeros, then a trailing dot, from text whose first "." is not at index 0.
+    public static String subZeroAndDot(String s) {
+        if (s.indexOf('.') <= 0) {
+            return s;
+        }
+        return s.replaceAll("0+?$", "").replaceAll("[.]$", "");
+    }
+    // Whether the text contains a special character
+    public static boolean isSpecialChar(String str) {
+        // find() reports a hit anywhere in str: a space, tab, CR or LF, or any of the
+        // ASCII / full-width punctuation marks listed in the character class.
+        String specialChars = "[ _`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。，、？]|\n|\r|\t";
+        return Pattern.compile(specialChars).matcher(str).find();
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/util/SystemUtils.java b/test/src/main/java/ai/guiji/duix/test/util/SystemUtils.java
new file mode 100644
index 0000000..1b65bff
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/util/SystemUtils.java
@@ -0,0 +1,71 @@
+package ai.guiji.duix.test.util;
+
+import android.content.Context;
+import android.content.pm.PackageManager;
+
+public class SystemUtils {
+
+    public static int getVersionCode(Context context) { // versionCode of the package that context belongs to
+        try {
+            return context.getPackageManager().getPackageInfo(context.getPackageName(), 0).versionCode;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+            return -1; // fallback when the package lookup fails
+        }
+    }
+
+    public static String getVersionName(Context context) { // versionName of the package that context belongs to
+        try {
+            return context.getPackageManager().getPackageInfo(context.getPackageName(), 0).versionName;
+        } catch (PackageManager.NameNotFoundException e) {
+            e.printStackTrace();
+            return ""; // fallback when the package lookup fails
+        }
+    }
+
+    /**
+     * Compares two dot-separated numeric version strings segment by segment.
+     *
+     * <p>Segments are compared as integers, so "1.10" is newer than "1.2". Segments that
+     * one version lacks count as 0, so "1.0" equals "1.0.0".
+     *
+     * @param version1 first version, e.g. "1.2.10"
+     * @param version2 second version
+     * @return 1 if version1 is newer, -1 if it is older, 0 if they are equal; -1 is also
+     *         returned when an argument is null or a segment is not an integer
+     */
+    public static int compareVersion(String version1, String version2) {
+        try {
+            // Fast path; a null version1 throws here and ends up in the catch block.
+            if (version1.equals(version2)) {
+                return 0;
+            }
+            String[] parts1 = version1.split("\\.");
+            String[] parts2 = version2.split("\\.");
+            int common = Math.min(parts1.length, parts2.length);
+            for (int i = 0; i < common; i++) {
+                // Integer.compare cannot overflow, unlike subtracting the two segments.
+                int order = Integer.compare(Integer.parseInt(parts1[i]), Integer.parseInt(parts2[i]));
+                if (order != 0) {
+                    return order;
+                }
+            }
+            // Shared prefix is equal: the longer version wins only if an extra segment is > 0.
+            for (int i = common; i < parts1.length; i++) {
+                if (Integer.parseInt(parts1[i]) > 0) {
+                    return 1;
+                }
+            }
+            for (int i = common; i < parts2.length; i++) {
+                if (Integer.parseInt(parts2[i]) > 0) {
+                    return -1;
+                }
+            }
+            return 0;
+        } catch (Exception e) {
+            // Unusable input is reported as -1, the same value the original returned.
+            e.printStackTrace();
+            return -1;
+        }
+    }
+}
diff --git a/test/src/main/java/ai/guiji/duix/test/util/ZipUtil.java b/test/src/main/java/ai/guiji/duix/test/util/ZipUtil.java
new file mode 100644
index 0000000..0b6f53d
--- /dev/null
+++ b/test/src/main/java/ai/guiji/duix/test/util/ZipUtil.java
@@ -0,0 +1,101 @@
+package ai.guiji.duix.test.util;
+
+import android.util.Log;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+public class ZipUtil {
+
+    //解压缩文件
+    // 这里unzip需要使用canonicalPath做校验，但是该方法取的路径可能和getAbsolutePath()方式获取的不一致
+    // getCanonicalPath()   /data/data/.......
+    // getAbsolutePath      /data/user/0/......
+    public static boolean unzip(String zipFilePath, String outOutPath, Callback callback) {
+        try (FileInputStream fis = new FileInputStream(zipFilePath)) {
+            long total = 0;
+            if (callback != null){
+                total = getZipSize(zipFilePath);
+            }
+            ZipInputStream inZip = new ZipInputStream(fis);
+            long currentSize = 0;
+            ZipEntry zipEntry;
+            String szName = "";
+            while ((zipEntry = inZip.getNextEntry()) != null) {
+                szName = zipEntry.getName();
+                if (zipEntry.isDirectory()) {
+                    szName = szName.substring(0, szName.length() - 1);
+                    File folder = new File(outOutPath + File.separator + szName);
+                    String canonicalPath = folder.getCanonicalPath();
+                    if (!canonicalPath.startsWith(outOutPath)) {
+                        Log.e("123", "绝对值路径比较异常忽略该地址: " + folder.getAbsolutePath());
+                    } else {
+                        if (!folder.exists()) {
+                            if (!folder.mkdirs()) {
+                                return false;
+                            }
+                        }
+                    }
+                } else {
+                    File file = new File(outOutPath + File.separator + szName);
+                    String canonicalPath = file.getCanonicalPath();
+                    if (!canonicalPath.startsWith(outOutPath)) {
+                        Log.e("123", "绝对值路径比较异常忽略该地址: " + file.getAbsolutePath());
+                    } else {
+                        if (!file.exists()) {
+                            if (!file.getParentFile().exists()) {
+                                file.getParentFile().mkdirs();
+                            }
+                            if (!file.createNewFile()) {
+                                return false;
+                            }
+                            FileOutputStream out = new FileOutputStream(file);
+                            int len;
+                            byte[] buffer = new byte[2048];
+                            while ((len = inZip.read(buffer)) != -1) {
+                                out.write(buffer, 0, len);
+                                out.flush();
+                                if (callback != null) {
+                                    currentSize += len;
+                                    int progress = (int)(currentSize * 100.0f / total);
+                                    callback.onProgress(progress); // 通过回调函数更新进度
+                                }
+                            }
+                            out.close();
+                        }
+                    }
+                }
+            }
+            inZip.close();
+            return true;
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return false;
+    }
+
+    private static long getZipSize(String filePath){
+        long size = 0;
+        ZipFile f;
+        try {
+            f = new ZipFile(filePath);
+            Enumeration<? extends ZipEntry> en = f.entries();
+            while (en.hasMoreElements()) {
+                size += en.nextElement().getSize();
+            }
+        } catch (IOException e) {
+            size = 0;
+        }
+        return size;
+    }
+
+    public interface Callback {
+        void onProgress(int progress);
+    }
+}
diff --git a/test/src/main/res/anim/rotate.xml b/test/src/main/res/anim/rotate.xml
new file mode 100644
index 0000000..cdf1458
--- /dev/null
+++ b/test/src/main/res/anim/rotate.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<set xmlns:android="http://schemas.android.com/apk/res/android">
+    <rotate
+        android:duration="1000"
+        android:fromDegrees="0"
+        android:toDegrees="+359"
+        android:repeatCount="-1"
+        android:repeatMode="restart"
+        android:pivotX="50%"
+        android:pivotY="50%"
+        android:interpolator="@android:anim/overshoot_interpolator"
+        />
+</set>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/ic_launcher_background.xml b/test/src/main/res/drawable/ic_launcher_background.xml
new file mode 100644
index 0000000..07d5da9
--- /dev/null
+++ b/test/src/main/res/drawable/ic_launcher_background.xml
@@ -0,0 +1,170 @@
+<?xml version="1.0" encoding="utf-8"?>
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path
+        android:fillColor="#3DDC84"
+        android:pathData="M0,0h108v108h-108z" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M9,0L9,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,0L19,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,0L29,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,0L39,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,0L49,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,0L59,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,0L69,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,0L79,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M89,0L89,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M99,0L99,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,9L108,9"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,19L108,19"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,29L108,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,39L108,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,49L108,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,59L108,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,69L108,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,79L108,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,89L108,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,99L108,99"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,29L89,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,39L89,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,49L89,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,59L89,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,69L89,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,79L89,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,19L29,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,19L39,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,19L49,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,19L59,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,19L69,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,19L79,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+</vector>
diff --git a/test/src/main/res/drawable/ic_launcher_foreground.xml b/test/src/main/res/drawable/ic_launcher_foreground.xml
new file mode 100644
index 0000000..2b068d1
--- /dev/null
+++ b/test/src/main/res/drawable/ic_launcher_foreground.xml
@@ -0,0 +1,30 @@
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:aapt="http://schemas.android.com/aapt"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
+        <aapt:attr name="android:fillColor">
+            <gradient
+                android:endX="85.84757"
+                android:endY="92.4963"
+                android:startX="42.9492"
+                android:startY="49.59793"
+                android:type="linear">
+                <item
+                    android:color="#44000000"
+                    android:offset="0.0" />
+                <item
+                    android:color="#00000000"
+                    android:offset="1.0" />
+            </gradient>
+        </aapt:attr>
+    </path>
+    <path
+        android:fillColor="#FFFFFF"
+        android:fillType="nonZero"
+        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
+        android:strokeWidth="1"
+        android:strokeColor="#00000000" />
+</vector>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/selector_60_primary.xml b/test/src/main/res/drawable/selector_60_primary.xml
new file mode 100644
index 0000000..2b4ce9c
--- /dev/null
+++ b/test/src/main/res/drawable/selector_60_primary.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<selector xmlns:android="http://schemas.android.com/apk/res/android">
+    <item android:drawable="@drawable/shape_corners_60_primary" android:state_pressed="false" />
+    <item android:drawable="@drawable/shape_corners_60_99_primary" android:state_pressed="true" />
+</selector>
diff --git a/test/src/main/res/drawable/shape_circle_camera_off.xml b/test/src/main/res/drawable/shape_circle_camera_off.xml
new file mode 100644
index 0000000..1a55f79
--- /dev/null
+++ b/test/src/main/res/drawable/shape_circle_camera_off.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android"
+    android:shape="oval">
+    <solid android:color="#66444444" />
+</shape>
diff --git a/test/src/main/res/drawable/shape_circle_camera_on.xml b/test/src/main/res/drawable/shape_circle_camera_on.xml
new file mode 100644
index 0000000..d454605
--- /dev/null
+++ b/test/src/main/res/drawable/shape_circle_camera_on.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android"
+    android:shape="oval">
+    <solid android:color="#CC147BF7" />
+</shape>
diff --git a/test/src/main/res/drawable/shape_circle_fab.xml b/test/src/main/res/drawable/shape_circle_fab.xml
new file mode 100644
index 0000000..d454605
--- /dev/null
+++ b/test/src/main/res/drawable/shape_circle_fab.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android"
+    android:shape="oval">
+    <solid android:color="#CC147BF7" />
+</shape>
diff --git a/test/src/main/res/drawable/shape_common_btn.xml b/test/src/main/res/drawable/shape_common_btn.xml
new file mode 100644
index 0000000..d2984a6
--- /dev/null
+++ b/test/src/main/res/drawable/shape_common_btn.xml
@@ -0,0 +1,4 @@
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+    <corners android:radius="7dp" />
+    <solid android:color="#661a1e1c" />
+</shape>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/shape_corners_60_99_primary.xml b/test/src/main/res/drawable/shape_corners_60_99_primary.xml
new file mode 100644
index 0000000..d4e150c
--- /dev/null
+++ b/test/src/main/res/drawable/shape_corners_60_99_primary.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+    <corners android:radius="60dp" />
+    <solid android:color="@color/primary_99" />
+</shape>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/shape_corners_60_primary.xml b/test/src/main/res/drawable/shape_corners_60_primary.xml
new file mode 100644
index 0000000..fa3f139
--- /dev/null
+++ b/test/src/main/res/drawable/shape_corners_60_primary.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+    <corners android:radius="60dp" />
+    <solid android:color="@color/primary" />
+</shape>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/shape_solid_b3000000_radius_7.xml b/test/src/main/res/drawable/shape_solid_b3000000_radius_7.xml
new file mode 100644
index 0000000..f0f6d53
--- /dev/null
+++ b/test/src/main/res/drawable/shape_solid_b3000000_radius_7.xml
@@ -0,0 +1,4 @@
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+    <corners android:radius="7dp" />
+    <solid android:color="#b3000000" />
+</shape>
\ No newline at end of file
diff --git a/test/src/main/res/drawable/shape_tl_tr_32_ffffffff.xml b/test/src/main/res/drawable/shape_tl_tr_32_ffffffff.xml
new file mode 100644
index 0000000..9ffaec5
--- /dev/null
+++ b/test/src/main/res/drawable/shape_tl_tr_32_ffffffff.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <corners
+        android:topLeftRadius="32dp"
+        android:topRightRadius="32dp" />
+
+    <solid android:color="#ffffffff" />
+</shape>
diff --git a/test/src/main/res/drawable/shape_tl_tr_bl_20_ff147bf7.xml b/test/src/main/res/drawable/shape_tl_tr_bl_20_ff147bf7.xml
new file mode 100644
index 0000000..740842c
--- /dev/null
+++ b/test/src/main/res/drawable/shape_tl_tr_bl_20_ff147bf7.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <corners android:topLeftRadius="20dp" android:topRightRadius="20dp"
+        android:bottomLeftRadius="20dp"/>
+
+    <solid android:color="#ff147bf7" />
+</shape>
diff --git a/test/src/main/res/drawable/shape_tl_tr_br_20_ffd9eaff.xml b/test/src/main/res/drawable/shape_tl_tr_br_20_ffd9eaff.xml
new file mode 100644
index 0000000..0d461f6
--- /dev/null
+++ b/test/src/main/res/drawable/shape_tl_tr_br_20_ffd9eaff.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<shape xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <corners
+        android:bottomRightRadius="20dp"
+        android:topLeftRadius="20dp"
+        android:topRightRadius="20dp" />
+
+    <solid android:color="#ffd9eaff" />
+</shape>
diff --git a/test/src/main/res/layout/activity_call.xml b/test/src/main/res/layout/activity_call.xml
new file mode 100644
index 0000000..2219ea2
--- /dev/null
+++ b/test/src/main/res/layout/activity_call.xml
@@ -0,0 +1,180 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:fitsSystemWindows="true"
+    android:layout_height="match_parent">
+
+    <ImageView
+        android:id="@+id/iv_bg"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:scaleType="centerCrop" />
+
+    <ai.guiji.duix.sdk.client.render.DUIXTextureView
+        android:id="@+id/glTextureView"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent" />
+
+    <TextView
+        android:id="@+id/tvDebug"
+        android:textSize="12sp"
+        android:visibility="gone"
+        android:textColor="@color/white"
+        android:background="#66000000"
+        android:padding="6dp"
+        app:layout_constraintTop_toTopOf="parent"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"/>
+
+
+    <androidx.appcompat.widget.SwitchCompat
+        android:id="@+id/switchMute"
+        android:text="@string/mute"
+        android:layout_margin="12dp"
+        android:enabled="false"
+        app:layout_constraintTop_toTopOf="parent"
+        app:layout_constraintEnd_toEndOf="parent"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"/>
+
+    <TextView
+        android:id="@+id/tvMotionTips"
+        android:visibility="invisible"
+        android:layout_margin="12dp"
+        app:layout_constraintStart_toStartOf="parent"
+        app:layout_constraintTop_toTopOf="parent"
+        android:text="@string/support_actions"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"/>
+
+    <androidx.recyclerview.widget.RecyclerView
+        android:id="@+id/rvMotion"
+        app:layout_constraintTop_toBottomOf="@+id/tvMotionTips"
+        app:layout_constraintStart_toStartOf="parent"
+        android:layout_marginStart="12dp"
+        app:layoutManager="androidx.recyclerview.widget.LinearLayoutManager"
+        android:orientation="vertical"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"/>
+
+    <Button
+        android:id="@+id/btnRandomMotion"
+        android:text="@string/random_play_action"
+        android:textSize="13sp"
+        android:visibility="invisible"
+        android:textColor="@color/white"
+        app:layout_constraintTop_toBottomOf="@+id/rvMotion"
+        app:layout_constraintStart_toStartOf="parent"
+        android:background="@drawable/shape_common_btn"
+        android:layout_margin="6dp"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"/>
+
+    <TextView
+        android:id="@+id/tvSubtitle"
+        android:text=""
+        android:textSize="16sp"
+        android:textColor="@color/white"
+        android:shadowColor="#1a1e1f"
+        android:shadowRadius="4"
+        android:gravity="center"
+        android:visibility="gone"
+        android:background="#99000000"
+        android:padding="12dp"
+        android:layout_width="0dp"
+        android:layout_height="wrap_content"
+        app:layout_constraintBottom_toTopOf="@+id/btnAIConversation"
+        app:layout_constraintStart_toStartOf="parent"
+        app:layout_constraintEnd_toEndOf="parent"
+        android:layout_marginHorizontal="24dp"
+        android:layout_marginBottom="12dp"
+        />
+
+    <Button
+        android:id="@+id/btnAIConversation"
+        android:text="@string/ai_fab_start"
+        android:textSize="14sp"
+        android:textColor="@color/white"
+        android:background="@drawable/shape_circle_fab"
+        android:elevation="8dp"
+        android:layout_width="56dp"
+        android:layout_height="56dp"
+        app:layout_constraintBottom_toBottomOf="parent"
+        app:layout_constraintStart_toStartOf="parent"
+        app:layout_constraintEnd_toEndOf="parent"
+        android:layout_marginBottom="32dp"
+        android:contentDescription="@string/ai_conversation"
+        />
+
+    <Button
+        android:id="@+id/btnCameraToggle"
+        android:text="📷"
+        android:textSize="18sp"
+        android:textColor="@color/white"
+        android:background="@drawable/shape_circle_camera_off"
+        android:elevation="8dp"
+        android:layout_width="48dp"
+        android:layout_height="48dp"
+        android:visibility="gone"
+        app:layout_constraintBottom_toBottomOf="@+id/btnAIConversation"
+        app:layout_constraintStart_toEndOf="@+id/btnAIConversation"
+        app:layout_constraintEnd_toEndOf="parent"
+        android:layout_marginStart="16dp"
+        android:layout_marginEnd="24dp"
+        android:contentDescription="@string/camera_off"
+        />
+
+    <TextureView
+        android:id="@+id/cameraPreview"
+        android:layout_width="120dp"
+        android:layout_height="160dp"
+        android:visibility="gone"
+        android:scaleX="-1"
+        app:layout_constraintStart_toStartOf="parent"
+        app:layout_constraintTop_toTopOf="parent"
+        android:layout_marginStart="16dp"
+        android:layout_marginTop="48dp"
+        />
+
+    <Button
+        android:id="@+id/btnRecord"
+        android:text="@string/record"
+        android:enabled="false"
+        android:visibility="gone"
+        android:textSize="13sp"
+        android:textColor="@color/white"
+        app:layout_constraintBottom_toBottomOf="parent"
+        android:layout_marginBottom="48dp"
+        android:background="@drawable/shape_common_btn"
+        android:layout_margin="6dp"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"/>
+
+    <Button
+        android:id="@+id/btnPlayPCM"
+        android:text="@string/play_pcm"
+        android:visibility="gone"
+        app:layout_constraintBottom_toTopOf="@+id/btnAIConversation"
+        android:background="@drawable/shape_common_btn"
+        android:layout_margin="6dp"
+        android:textSize="13sp"
+        android:textColor="@color/white"
+        android:enabled="false"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"/>
+
+    <Button
+        android:id="@+id/btnPlayWAV"
+        android:text="@string/play_wav"
+        android:visibility="gone"
+        app:layout_constraintBottom_toTopOf="@+id/btnPlayPCM"
+        android:background="@drawable/shape_common_btn"
+        android:layout_margin="6dp"
+        android:textSize="13sp"
+        android:textColor="@color/white"
+        android:enabled="false"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"/>
+
+</androidx.constraintlayout.widget.ConstraintLayout>
\ No newline at end of file
diff --git a/test/src/main/res/layout/activity_main.xml b/test/src/main/res/layout/activity_main.xml
new file mode 100644
index 0000000..99044d7
--- /dev/null
+++ b/test/src/main/res/layout/activity_main.xml
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:padding="12dp">
+
+    <TextView
+        android:id="@+id/tv_title"
+        android:layout_width="match_parent"
+        android:layout_height="120dp"
+        android:gravity="center"
+        android:text="@string/app_name"
+        android:textColor="@color/black"
+        android:textSize="18sp"
+        android:textStyle="italic|bold"
+        app:layout_constraintTop_toTopOf="parent" /> <!-- title text is sized in sp so it follows the user's font scale -->
+
+    <TextView
+        android:id="@+id/tv_sdk_version"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:textSize="12sp"
+        app:layout_constraintEnd_toEndOf="parent"
+        app:layout_constraintTop_toBottomOf="@+id/tv_title" />
+
+    <ScrollView
+        android:layout_width="match_parent"
+        android:layout_height="0dp"
+        app:layout_constraintBottom_toBottomOf="parent"
+        app:layout_constraintTop_toBottomOf="@+id/tv_sdk_version">
+
+        <androidx.constraintlayout.widget.ConstraintLayout
+            android:layout_width="match_parent"
+            android:layout_height="wrap_content">
+
+            <TextView
+                android:id="@+id/tvDownloadTips"
+                app:layout_constraintTop_toTopOf="parent"
+                android:text="@string/main_download_tips"
+                android:textSize="13sp"
+                android:textColor="@color/black"
+                android:textStyle="bold"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"/>
+
+            <TextView
+                android:id="@+id/tvBaseConfig"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:layout_marginTop="12dp"
+                android:text="@string/base_config_url"
+                android:textSize="13sp"
+                app:layout_constraintTop_toBottomOf="@+id/tvDownloadTips"
+                app:layout_constraintStart_toStartOf="parent"/>
+
+            <EditText
+                android:id="@+id/etBaseConfig"
+                android:layout_marginTop="12dp"
+                android:layout_width="match_parent"
+                android:layout_height="48dp"
+                android:text="https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/gj_dh_res.zip"
+                android:textSize="13sp"
+                app:layout_constraintTop_toBottomOf="@+id/tvBaseConfig"
+                />
+
+            <TextView
+                android:id="@+id/tvModelUrl"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:layout_marginTop="32dp"
+                android:text="@string/model_url"
+                android:textSize="13sp"
+                app:layout_constraintStart_toStartOf="parent"
+                app:layout_constraintTop_toBottomOf="@+id/etBaseConfig"  />
+
+            <Button
+                android:id="@+id/btnMoreModel"
+                android:text="@string/more"
+                android:layout_marginTop="12dp"
+                android:layout_width="wrap_content"
+                android:layout_height="48dp"
+                app:layout_constraintTop_toBottomOf="@+id/tvModelUrl"
+                app:layout_constraintEnd_toEndOf="parent"
+                />
+
+            <EditText
+                android:id="@+id/etUrl"
+                android:layout_width="0dp"
+                android:layout_height="48dp"
+                android:text="https://github.com/duixcom/Duix-Mobile/releases/download/v1.0.0/bendi3_20240518.zip"
+                android:textSize="13sp"
+                app:layout_constraintStart_toStartOf="parent"
+                app:layout_constraintTop_toTopOf="@+id/btnMoreModel"
+                app:layout_constraintEnd_toStartOf="@+id/btnMoreModel"
+                />
+
+            <Button
+                android:id="@+id/btnPlay"
+                android:layout_width="match_parent"
+                android:layout_height="48dp"
+                android:layout_marginTop="20dp"
+                android:text="@string/play"
+                app:layout_constraintEnd_toEndOf="parent"
+                app:layout_constraintTop_toBottomOf="@+id/etUrl" />
+
+            <androidx.appcompat.widget.SwitchCompat
+                android:id="@+id/switchDebug"
+                android:text="@string/debug_message"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:textSize="13sp"
+                android:layout_margin="12dp"
+                app:layout_constraintEnd_toEndOf="parent"
+                app:layout_constraintTop_toBottomOf="@+id/btnPlay"
+                />
+
+        </androidx.constraintlayout.widget.ConstraintLayout>
+    </ScrollView>
+
+</androidx.constraintlayout.widget.ConstraintLayout>
\ No newline at end of file
diff --git a/test/src/main/res/layout/dialog_audio_record.xml b/test/src/main/res/layout/dialog_audio_record.xml
new file mode 100644
index 0000000..160b5c2
--- /dev/null
+++ b/test/src/main/res/layout/dialog_audio_record.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:fitsSystemWindows="true"
+    android:paddingHorizontal="16dp"
+    android:paddingVertical="16dp">
+
+
+    <androidx.constraintlayout.widget.ConstraintLayout
+        android:id="@+id/layoutFrame"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"
+        app:layout_constraintBottom_toBottomOf="parent">
+
+        <TextView
+            android:id="@+id/tvTouch"
+            android:layout_width="120dp"
+            android:layout_height="120dp"
+            android:paddingVertical="12dp"
+            android:clickable="true"
+            android:background="@drawable/selector_60_primary"
+            android:gravity="center"
+            android:text="@string/touch_me_record"
+            android:textColor="@color/white"
+            android:textSize="12sp"
+            app:layout_constraintEnd_toEndOf="parent"
+            app:layout_constraintStart_toStartOf="parent"
+            app:layout_constraintTop_toTopOf="parent" />
+
+    </androidx.constraintlayout.widget.ConstraintLayout>
+
+
+</androidx.constraintlayout.widget.ConstraintLayout>
+
diff --git a/test/src/main/res/layout/dialog_loading.xml b/test/src/main/res/layout/dialog_loading.xml
new file mode 100644
index 0000000..25b257c
--- /dev/null
+++ b/test/src/main/res/layout/dialog_loading.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:gravity="center">
+
+    <LinearLayout
+        android:layout_width="wrap_content"
+        android:layout_height="138dp"
+        android:background="@drawable/shape_solid_b3000000_radius_7"
+        android:gravity="center"
+        android:minWidth="138dp"
+        android:orientation="vertical"
+        android:paddingBottom="2dp">
+
+        <ImageView
+            android:id="@+id/iv_progress"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:src="@mipmap/iv_progress" />
+        <!-- Loading caption; text size is in sp so it follows the user's font-scale setting. -->
+        <TextView
+            android:id="@+id/tv_content"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_marginTop="12dp"
+            android:gravity="center"
+            android:singleLine="true"
+            android:text="@string/loading"
+            android:layout_marginHorizontal="12dp"
+            android:textColor="@color/white"
+            android:textSize="12sp"
+            android:textStyle="bold" />
+    </LinearLayout>
+</LinearLayout>
diff --git a/test/src/main/res/layout/dialog_model_selector.xml b/test/src/main/res/layout/dialog_model_selector.xml
new file mode 100644
index 0000000..ef05396
--- /dev/null
+++ b/test/src/main/res/layout/dialog_model_selector.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:gravity="center">
+
+    <androidx.recyclerview.widget.RecyclerView
+        android:id="@+id/rvModels"
+        app:layoutManager="androidx.recyclerview.widget.LinearLayoutManager"
+        android:orientation="vertical"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"
+        app:layout_constraintTop_toTopOf="parent"
+        />
+</androidx.constraintlayout.widget.ConstraintLayout>
diff --git a/test/src/main/res/layout/item_model_selector.xml b/test/src/main/res/layout/item_model_selector.xml
new file mode 100644
index 0000000..07de1b8
--- /dev/null
+++ b/test/src/main/res/layout/item_model_selector.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:padding="6dp"
+    xmlns:app="http://schemas.android.com/apk/res-auto">
+
+    <TextView
+        android:id="@+id/tvModelUrl"
+        app:layout_constraintTop_toTopOf="parent"
+        android:textSize="13sp"
+        android:padding="6dp"
+        android:background="@drawable/shape_common_btn"
+        android:textColor="@color/white"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"/>
+
+</androidx.constraintlayout.widget.ConstraintLayout>
\ No newline at end of file
diff --git a/test/src/main/res/layout/item_motion_button.xml b/test/src/main/res/layout/item_motion_button.xml
new file mode 100644
index 0000000..4db2319
--- /dev/null
+++ b/test/src/main/res/layout/item_motion_button.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<androidx.constraintlayout.widget.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    xmlns:app="http://schemas.android.com/apk/res-auto">
+
+    <Button
+        android:id="@+id/btnMotion"
+        app:layout_constraintStart_toStartOf="parent"
+        app:layout_constraintTop_toTopOf="parent"
+        android:background="@drawable/shape_common_btn"
+        android:textSize="13sp"
+        android:textColor="@color/white"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"/>
+
+</androidx.constraintlayout.widget.ConstraintLayout>
\ No newline at end of file
diff --git a/test/src/main/res/mipmap-hdpi/ic_launcher.webp b/test/src/main/res/mipmap-hdpi/ic_launcher.webp
new file mode 100644
index 0000000..c209e78
Binary files /dev/null and b/test/src/main/res/mipmap-hdpi/ic_launcher.webp differ
diff --git a/test/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/test/src/main/res/mipmap-hdpi/ic_launcher_round.webp
new file mode 100644
index 0000000..b2dfe3d
Binary files /dev/null and b/test/src/main/res/mipmap-hdpi/ic_launcher_round.webp differ
diff --git a/test/src/main/res/mipmap-mdpi/ic_launcher.webp b/test/src/main/res/mipmap-mdpi/ic_launcher.webp
new file mode 100644
index 0000000..4f0f1d6
Binary files /dev/null and b/test/src/main/res/mipmap-mdpi/ic_launcher.webp differ
diff --git a/test/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/test/src/main/res/mipmap-mdpi/ic_launcher_round.webp
new file mode 100644
index 0000000..62b611d
Binary files /dev/null and b/test/src/main/res/mipmap-mdpi/ic_launcher_round.webp differ
diff --git a/test/src/main/res/mipmap-xhdpi/ic_launcher.webp b/test/src/main/res/mipmap-xhdpi/ic_launcher.webp
new file mode 100644
index 0000000..948a307
Binary files /dev/null and b/test/src/main/res/mipmap-xhdpi/ic_launcher.webp differ
diff --git a/test/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/test/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000..1b9a695
Binary files /dev/null and b/test/src/main/res/mipmap-xhdpi/ic_launcher_round.webp differ
diff --git a/test/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/test/src/main/res/mipmap-xxhdpi/ic_launcher.webp
new file mode 100644
index 0000000..28d4b77
Binary files /dev/null and b/test/src/main/res/mipmap-xxhdpi/ic_launcher.webp differ
diff --git a/test/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/test/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000..9287f50
Binary files /dev/null and b/test/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp differ
diff --git a/test/src/main/res/mipmap-xxhdpi/iv_progress.png b/test/src/main/res/mipmap-xxhdpi/iv_progress.png
new file mode 100644
index 0000000..1fcc79f
Binary files /dev/null and b/test/src/main/res/mipmap-xxhdpi/iv_progress.png differ
diff --git a/test/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/test/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
new file mode 100644
index 0000000..aa7d642
Binary files /dev/null and b/test/src/main/res/mipmap-xxxhdpi/ic_launcher.webp differ
diff --git a/test/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/test/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000..9126ae3
Binary files /dev/null and b/test/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp differ
diff --git a/test/src/main/res/values-zh-rCN/strings.xml b/test/src/main/res/values-zh-rCN/strings.xml
new file mode 100644
index 0000000..92e3057
--- /dev/null
+++ b/test/src/main/res/values-zh-rCN/strings.xml
@@ -0,0 +1,29 @@
+<resources>
+    <string name="app_name">DUIX Demo</string>
+    <string name="play">播放</string>
+    <string name="base_config_url">基础配置Url:</string>
+    <string name="model_url">模型Url:</string>
+    <string name="more">更多</string>
+    <string name="base_config_cannot_be_empty">基础配置url不能为空</string>
+    <string name="model_url_cannot_be_empty">模型url不能为空</string>
+    <string name="loading">加载中...</string>
+    <string name="mute">静音</string>
+    <string name="ai_tips">本内容由AI生成，仅供参考</string>
+    <string name="support_actions">支持的动作:</string>
+    <string name="record">录音</string>
+    <string name="play_pcm">播放PCM流</string>
+    <string name="play_wav">播放WAV文件</string>
+    <string name="random_play_action">随机播放动作</string>
+    <string name="touch_me_record">按住我收音</string>
+    <string name="need_permission_continue">需要授权以继续使用</string>
+    <string name="play_stop">停止播放</string>
+    <string name="ai_conversation">AI 实时对话</string>
+    <string name="ai_conversation_stop">结束 AI 对话</string>
+    <string name="ai_fab_start">AI</string>
+    <string name="ai_fab_stop">停</string>
+    <string name="camera_on">镜头开</string>
+    <string name="camera_off">镜头关</string>
+    <string name="main_download_tips">Github地址可能下载失败，您可以考虑使用代理或者将文件缓存到自己的存储服务</string>
+    <string name="debug_message">调试信息</string>
+
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/values/attrs.xml b/test/src/main/res/values/attrs.xml
new file mode 100644
index 0000000..6bce460
--- /dev/null
+++ b/test/src/main/res/values/attrs.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <declare-styleable name="visualizerView">
+        <attr name="numColumns" format="integer" />
+        <attr name="renderColor" format="color" />
+        <attr name="renderRange">
+            <enum name="top" value="0" />
+            <enum name="bottom" value="1" />
+            <enum name="both" value="2" />
+        </attr>
+        <attr name="renderType">
+            <flag name="bar" value="0x1" />
+            <flag name="pixel" value="0x2" />
+            <flag name="fade" value="0x4" />
+        </attr>
+    </declare-styleable>
+
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/values/colors.xml b/test/src/main/res/values/colors.xml
new file mode 100644
index 0000000..9d797fb
--- /dev/null
+++ b/test/src/main/res/values/colors.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <color name="purple_200">#FFBB86FC</color>
+    <color name="purple_500">#FF6200EE</color>
+    <color name="purple_700">#FF3700B3</color>
+    <color name="teal_200">#FF03DAC5</color>
+    <color name="teal_700">#FF018786</color>
+    <color name="black">#FF000000</color>
+    <color name="white">#FFFFFFFF</color>
+    <color name="primary">#5a7bbe</color>
+    <color name="primary_99">#995a7bbe</color>
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/values/strings.xml b/test/src/main/res/values/strings.xml
new file mode 100644
index 0000000..c2857c7
--- /dev/null
+++ b/test/src/main/res/values/strings.xml
@@ -0,0 +1,28 @@
+<resources>
+    <string name="app_name">DUIX Demo</string>
+    <string name="play">Play</string>
+    <string name="base_config_url">BaseConfig Url:</string>
+    <string name="model_url">Model Url:</string>
+    <string name="more">More</string>
+    <string name="base_config_cannot_be_empty">BaseConfig url cannot be empty</string>
+    <string name="model_url_cannot_be_empty">Model url cannot be empty</string>
+    <string name="loading">Loading...</string>
+    <string name="mute">Mute</string>
+    <string name="ai_tips">This content is generated by AI and is for reference only</string>
+    <string name="support_actions">Supported actions:</string>
+    <string name="record">Record</string>
+    <string name="play_pcm">Play PCM stream</string>
+    <string name="play_wav">Play WAV file</string>
+    <string name="random_play_action">Random play action</string>
+    <string name="touch_me_record">Press and hold</string>
+    <string name="need_permission_continue">Permission is required to continue using</string>
+    <string name="play_stop">Stop playing</string>
+    <string name="ai_conversation">AI real-time conversation</string>
+    <string name="ai_conversation_stop">End AI conversation</string>
+    <string name="ai_fab_start">AI</string>
+    <string name="ai_fab_stop">Stop</string>
+    <string name="camera_on">Camera on</string>
+    <string name="camera_off">Camera off</string>
+    <string name="main_download_tips">The GitHub url may fail to download, you can consider using a proxy or caching the file to your own storage service</string>
+    <string name="debug_message">Debug message</string>
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/values/style.xml b/test/src/main/res/values/style.xml
new file mode 100644
index 0000000..8160f5e
--- /dev/null
+++ b/test/src/main/res/values/style.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <style name="dialog_center" parent="Theme.AppCompat.Light.Dialog">
+        <item name="android:windowFrame">@null</item>
+        <item name="android:windowIsFloating">true</item>
+        <item name="android:windowIsTranslucent">true</item>
+        <item name="android:windowNoTitle">true</item>
+        <item name="android:background">@android:color/transparent</item>
+        <item name="android:windowBackground">@android:color/transparent</item>
+        <item name="android:backgroundDimAmount">0.5</item>
+        <item name="android:backgroundDimEnabled">true</item>
+    </style>
+
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/values/themes.xml b/test/src/main/res/values/themes.xml
new file mode 100644
index 0000000..8122445
--- /dev/null
+++ b/test/src/main/res/values/themes.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <style name="Theme.DUIX.Test" parent="Theme.AppCompat.Light.NoActionBar">
+        <item name="android:windowLightStatusBar">true</item>
+        <item name="android:windowBackground">@android:color/white</item>
+        <item name="android:windowNoTitle">true</item>
+        <item name="android:windowIsTranslucent">false</item>
+        <item name="android:windowIsFloating">false</item>
+        <item name="colorPrimary">#fff</item>
+        <item name="colorPrimaryDark">#4349a9</item>
+        <item name="colorAccent">#4349a9</item>
+        <item name="android:windowTranslucentStatus">true</item>
+    </style>
+</resources>
\ No newline at end of file
diff --git a/test/src/main/res/xml/network_security_config.xml b/test/src/main/res/xml/network_security_config.xml
new file mode 100644
index 0000000..dca93c0
--- /dev/null
+++ b/test/src/main/res/xml/network_security_config.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<network-security-config>
+    <base-config cleartextTrafficPermitted="true" /> <!-- NOTE(review): base-config permits unencrypted HTTP to every host; confirm this is intended beyond the demo, or scope it with a domain-config -->
+</network-security-config>
\ No newline at end of file